mirror of
https://github.com/Mueller-Patrick/Betterzon.git
synced 2026-04-26 23:30:11 +00:00
BETTERZON-56: Adding crawler load-balancing script (#28)
This commit is contained in:
@@ -0,0 +1,59 @@
|
||||
import json
|
||||
import requests
|
||||
import os
|
||||
|
||||
import sql
|
||||
|
||||
|
||||
def call_crawlers() -> bool:
    """
    Fetches the list of all product IDs to crawl, splits it evenly across the
    registered crawler instances and calls each instance to start it.

    :return: True if every crawler instance responded with HTTP 200
    """
    product_ids = sql.getProductsToCrawl()

    # crawler_urls = ['crawl.p4ddy.com', 'crawl.betterzon.xyz']
    crawler_urls = ['http://localhost:22026']

    balanced_lists = []

    products_per_crawler = len(product_ids) // len(crawler_urls)
    rest = len(product_ids) % len(crawler_urls)

    # Distribute available products over available crawler instances
    for crawler_id in range(len(crawler_urls)):
        amount_of_prods = products_per_crawler

        # If we e.g. have 7 products but 2 crawlers, the first needs to crawl 4 products and the 2nd 3
        if crawler_id < rest:
            amount_of_prods += 1

        # Assign the required amount of product ids to the current crawler and remove them from the
        # list of all product ids
        balanced_lists.append(product_ids[:amount_of_prods])
        product_ids = product_ids[amount_of_prods:]

    # Make the callouts to the instances
    successful = 0
    for crawler_id in range(len(crawler_urls)):
        prods = balanced_lists[crawler_id]
        url = crawler_urls[crawler_id]

        # Send request. CRAWLER_ACCESS_KEY must be set in the environment;
        # a missing key raises KeyError rather than silently sending None.
        data = {
            'key': os.environ['CRAWLER_ACCESS_KEY'],
            'products': prods
        }
        headers = {'content-type': 'application/json', 'accept': 'application/json'}

        try:
            # Timeout so a single unreachable instance cannot hang the whole dispatch
            resp = requests.post(url=url, data=json.dumps(data), headers=headers, timeout=30)
        except requests.RequestException:
            # Unreachable/erroring crawler counts as a failed call; keep calling the rest
            continue

        if resp.status_code == 200:
            successful += 1

    return successful == len(crawler_urls)
|
||||
|
||||
|
||||
# Script entry point: dispatch all products to the registered crawler instances.
# The boolean result of call_crawlers() is intentionally discarded here.
if __name__ == '__main__':
    call_crawlers()
|
||||
@@ -1,2 +1 @@
|
||||
pymysql
|
||||
logging
|
||||
|
||||
@@ -40,3 +40,20 @@ def getShopsToCrawl() -> [int]:
|
||||
vendor_ids = list(map(lambda x: x[0], cur.fetchall()))
|
||||
|
||||
return vendor_ids
|
||||
|
||||
def getProductsToCrawl() -> [int]:
    """
    Queries the list of product IDs and returns them

    :return: The list of IDs
    """
    connection = __getConnection__()
    cursor = connection.cursor()

    cursor.execute('SELECT product_id FROM products')

    # Each fetched row is a one-element tuple; pull out the bare IDs
    return [row[0] for row in cursor.fetchall()]
|
||||
|
||||
Reference in New Issue
Block a user