Betterzon/Crawler-Loadbalancer/dailycrawl.py

import json
import requests
import os

import sql


def _split_evenly(items, bucket_count):
    """
    Splits a sequence into consecutive slices whose sizes differ by at most one
    :param items: The sequence to split (needs to support len() and slicing)
    :param bucket_count: The number of slices to produce, has to be greater than 0
    :return: A list of bucket_count slices, the leading slices receive the surplus elements
    """
    base_size, surplus = divmod(len(items), bucket_count)

    buckets = []
    start = 0
    for bucket_id in range(bucket_count):
        # If we e.g. have 7 items but 2 buckets, the first gets 4 items and the 2nd 3
        end = start + base_size + (1 if bucket_id < surplus else 0)
        buckets.append(items[start:end])
        start = end

    return buckets


def call_crawlers() -> bool:
    """
    Fetches the products to crawl from sql.getProductsToCrawl(), distributes them evenly over
    all registered crawler instances and POSTs each instance its share as a JSON body of the
    form {'key': <CRAWLER_ACCESS_KEY>, 'products': [...]}.

    A crawler that cannot be reached or that answers with a status other than 200 is logged
    and counted as failed; the remaining crawlers are still called.
    :return: If the calls to all crawler instances have been successful
    :raises KeyError: If the environment variable CRAWLER_ACCESS_KEY is not set
    """
    # Local import so this function does not rely on a file-level logging import
    import logging

    logger = logging.getLogger(__name__)

    product_ids = sql.getProductsToCrawl()

    # Alternative (remote) instances: ['crawl.p4ddy.com', 'crawl.betterzon.xyz']
    crawler_urls = ['http://localhost:22026']

    # Distribute available products over available crawler instances
    balanced_lists = _split_evenly(product_ids, len(crawler_urls))

    # Identical for every request, so only build them once
    access_key = os.environ['CRAWLER_ACCESS_KEY']
    headers = {'content-type': 'application/json', 'accept': 'application/json'}

    # Make the callouts to the instances
    successful = 0
    for url, prods in zip(crawler_urls, balanced_lists):
        payload = json.dumps({'key': access_key, 'products': prods})

        try:
            # NOTE(review): no timeout is passed, so this waits as long as the crawler needs to
            # answer. Confirm how quickly crawlers reply before adding one.
            resp = requests.post(url=url, data=payload, headers=headers)
        except requests.RequestException as err:
            # One unreachable instance must not prevent the other instances from being started
            logger.warning('Crawler %s could not be called: %s', url, err)
            continue

        if resp.status_code == 200:
            successful += 1
        else:
            logger.warning('Crawler %s answered with status code %s', url, resp.status_code)

    return successful == len(crawler_urls)


if __name__ == '__main__':
    # Report the outcome through the exit status so whatever runs this script can detect a
    # failed run: 0 if every crawler accepted its call, 1 otherwise
    raise SystemExit(0 if call_crawlers() else 1)
BETTERZON-56: Adding crawler load-balancing script (#28) 2021-04-14 16:52:22 +00:00			`import json`
			`import requests`
			`import os`

			`import sql`


			`def call_crawlers() -> bool:`
			`"""`
			`Fetches the list of all shops, does some load balancing magic and calls all registered crawler`
			`instances to start them`
			`:return: If the calls have been successful`
			`"""`
			`product_ids = sql.getProductsToCrawl()`

			`# crawler_urls = ['crawl.p4ddy.com', 'crawl.betterzon.xyz']`
			`crawler_urls = ['http://localhost:22026']`

			`balanced_lists = []`

			`products_per_crawler = len(product_ids) // len(crawler_urls)`
			`rest = len(product_ids) % len(crawler_urls)`

			`# Distribute available products over available crawler instances`
			`for crawler_id in range(len(crawler_urls)):`
			`amount_of_prods = products_per_crawler`

			`# If we e.g. have 7 products but 2 crawlers, the first needs to crawl 4 products and the 2nd 3`
			`if crawler_id < rest:`
			`amount_of_prods += 1`

			`# Assign the required amount of product ids to the current crawler and remove them from the`
			`# list of all product ids`
			`balanced_lists.append(product_ids[:amount_of_prods])`
			`product_ids = product_ids[amount_of_prods:]`

			`# Make the callouts to the instances`
			`successful = 0`
			`for crawler_id in range(len(crawler_urls)):`
			`prods = balanced_lists[crawler_id]`
			`url = crawler_urls[crawler_id]`

			`# Send request`
			`data = {`
			`'key': os.environ['CRAWLER_ACCESS_KEY'],`
			`'products': prods`
			`}`
			`headers = {'content-type': 'application/json', 'accept': 'application/json'}`

			`resp = requests.post(url=url, data=json.dumps(data), headers=headers)`

			`if resp.status_code == 200:`
			`successful += 1`

			`return successful == len(crawler_urls)`


			`if __name__ == '__main__':`
			`call_crawlers()`