Betterzon/Crawler-Loadbalancer/dailycrawl.py

60 lines
1.7 KiB
Python
Raw Permalink Normal View History

import json
import requests
import os
import sql
def call_crawlers() -> bool:
"""
Fetches the list of all shops, does some load balancing magic and calls all registered crawler
instances to start them
:return: If the calls have been successful
"""
product_ids = sql.getProductsToCrawl()
# crawler_urls = ['crawl.p4ddy.com', 'crawl.betterzon.xyz']
crawler_urls = ['http://localhost:22026']
balanced_lists = []
products_per_crawler = len(product_ids) // len(crawler_urls)
rest = len(product_ids) % len(crawler_urls)
# Distrubute available products over available crawler instances
for crawler_id in range(len(crawler_urls)):
amount_of_prods = products_per_crawler
# If we e.g. have 7 products but 2 crawlers, the first needs to crawl 4 products and the 2nd 3
if crawler_id < rest:
amount_of_prods += 1
# Assign the required amount of product ids to the current crawler and remove them from the
# list of all product ids
balanced_lists.append(product_ids[:amount_of_prods])
product_ids = product_ids[amount_of_prods:]
# Make the callouts to the instances
successful = 0
for crawler_id in range(len(crawler_urls)):
prods = balanced_lists[crawler_id]
url = crawler_urls[crawler_id]
# Send request
data = {
'key': os.environ['CRAWLER_ACCESS_KEY'],
'products': prods
}
headers = {'content-type': 'application/json', 'accept': 'application/json'}
resp = requests.post(url=url, data=json.dumps(data), headers=headers)
if resp.status_code == 200:
successful += 1
return successful == len(crawler_urls)
if __name__ == '__main__':
call_crawlers()