Betterzon/Crawler-Loadbalancer/dailycrawl.py

import json
import os

import requests

import sql


def call_crawlers() -> bool:
    """
    Fetches the list of all products that are due for crawling, distributes
    them evenly across all registered crawler instances and calls each
    instance to start it
    :return: True if all crawler instances were called successfully
    """
    product_ids = sql.getProductsToCrawl()

    # crawler_urls = ['crawl.p4ddy.com', 'crawl.betterzon.xyz']
    crawler_urls = ['http://localhost:22026']

    balanced_lists = []

    products_per_crawler = len(product_ids) // len(crawler_urls)
    rest = len(product_ids) % len(crawler_urls)
    # Distribute the available products over the available crawler instances
    for crawler_id in range(len(crawler_urls)):
        amount_of_prods = products_per_crawler

        # If we e.g. have 7 products but 2 crawlers, the first crawler has to
        # handle 4 products and the second one 3
        if crawler_id < rest:
            amount_of_prods += 1

        # Assign the required amount of product ids to the current crawler and
        # remove them from the list of remaining product ids
        balanced_lists.append(product_ids[:amount_of_prods])
        product_ids = product_ids[amount_of_prods:]
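    # At this point every product id appears in exactly one balanced list,
    # e.g. (hypothetical ids) [[1, 2, 3, 4], [5, 6, 7]] for the
    # 7-products-and-2-crawlers case from the comment above.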
    # Make the callouts to the instances
    successful = 0
    for crawler_id in range(len(crawler_urls)):
        prods = balanced_lists[crawler_id]
        url = crawler_urls[crawler_id]

        # Send the request
        data = {
            'key': os.environ['CRAWLER_ACCESS_KEY'],
            'products': prods
        }
        headers = {'content-type': 'application/json', 'accept': 'application/json'}

        try:
            resp = requests.post(url=url, data=json.dumps(data), headers=headers)
        except requests.exceptions.RequestException:
            # A single unreachable instance should not abort the whole run
            continue

        if resp.status_code == 200:
            successful += 1

    return successful == len(crawler_urls)

if __name__ == '__main__':
    call_crawlers()
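
# Example invocation (hypothetical values; CRAWLER_ACCESS_KEY has to be set
# in the environment, see above):
#   CRAWLER_ACCESS_KEY=<secret> python dailycrawl.py
# Each crawler instance then receives a POST request with a JSON body like
#   {"key": "<secret>", "products": [42, 43, 44]}
# and is expected to respond with HTTP 200 once it has started crawling.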