mirror of
https://github.com/Mueller-Patrick/Betterzon.git
synced 2024-12-23 20:15:13 +00:00
60 lines
1.7 KiB
Python
60 lines
1.7 KiB
Python
|
import json
|
||
|
import requests
|
||
|
import os
|
||
|
|
||
|
import sql
|
||
|
|
||
|
|
||
|
def call_crawlers() -> bool:
|
||
|
"""
|
||
|
Fetches the list of all shops, does some load balancing magic and calls all registered crawler
|
||
|
instances to start them
|
||
|
:return: If the calls have been successful
|
||
|
"""
|
||
|
product_ids = sql.getProductsToCrawl()
|
||
|
|
||
|
# crawler_urls = ['crawl.p4ddy.com', 'crawl.betterzon.xyz']
|
||
|
crawler_urls = ['http://localhost:22026']
|
||
|
|
||
|
balanced_lists = []
|
||
|
|
||
|
products_per_crawler = len(product_ids) // len(crawler_urls)
|
||
|
rest = len(product_ids) % len(crawler_urls)
|
||
|
|
||
|
# Distrubute available products over available crawler instances
|
||
|
for crawler_id in range(len(crawler_urls)):
|
||
|
amount_of_prods = products_per_crawler
|
||
|
|
||
|
# If we e.g. have 7 products but 2 crawlers, the first needs to crawl 4 products and the 2nd 3
|
||
|
if crawler_id < rest:
|
||
|
amount_of_prods += 1
|
||
|
|
||
|
# Assign the required amount of product ids to the current crawler and remove them from the
|
||
|
# list of all product ids
|
||
|
balanced_lists.append(product_ids[:amount_of_prods])
|
||
|
product_ids = product_ids[amount_of_prods:]
|
||
|
|
||
|
# Make the callouts to the instances
|
||
|
successful = 0
|
||
|
for crawler_id in range(len(crawler_urls)):
|
||
|
prods = balanced_lists[crawler_id]
|
||
|
url = crawler_urls[crawler_id]
|
||
|
|
||
|
# Send request
|
||
|
data = {
|
||
|
'key': os.environ['CRAWLER_ACCESS_KEY'],
|
||
|
'products': prods
|
||
|
}
|
||
|
headers = {'content-type': 'application/json', 'accept': 'application/json'}
|
||
|
|
||
|
resp = requests.post(url=url, data=json.dumps(data), headers=headers)
|
||
|
|
||
|
if resp.status_code == 200:
|
||
|
successful += 1
|
||
|
|
||
|
return successful == len(crawler_urls)
|
||
|
|
||
|
|
||
|
if __name__ == '__main__':
|
||
|
call_crawlers()
|