mirror of
https://github.com/Mueller-Patrick/Betterzon.git
synced 2026-04-26 23:30:11 +00:00
BETTERZON-56: Adding crawler load-balancing script (#28)
This commit is contained in:
@@ -0,0 +1,59 @@
|
||||
import json
|
||||
import requests
|
||||
import os
|
||||
|
||||
import sql
|
||||
|
||||
|
||||
def call_crawlers() -> bool:
    """
    Fetches the list of all product IDs to crawl, splits it evenly across the
    registered crawler instances and calls each instance to start it.

    :return: True if every crawler instance responded with HTTP 200
    """
    product_ids = sql.getProductsToCrawl()

    # crawler_urls = ['crawl.p4ddy.com', 'crawl.betterzon.xyz']
    crawler_urls = ['http://localhost:22026']

    balanced_lists = []

    products_per_crawler = len(product_ids) // len(crawler_urls)
    rest = len(product_ids) % len(crawler_urls)

    # Distribute available products over available crawler instances
    for crawler_id in range(len(crawler_urls)):
        amount_of_prods = products_per_crawler

        # If we e.g. have 7 products but 2 crawlers, the first needs to crawl 4 products and the 2nd 3
        if crawler_id < rest:
            amount_of_prods += 1

        # Assign the required amount of product ids to the current crawler and remove them from the
        # list of all product ids
        balanced_lists.append(product_ids[:amount_of_prods])
        product_ids = product_ids[amount_of_prods:]

    # Make the callouts to the instances
    successful = 0
    for crawler_id in range(len(crawler_urls)):
        prods = balanced_lists[crawler_id]
        url = crawler_urls[crawler_id]

        # Send request. CRAWLER_ACCESS_KEY must be set in the environment;
        # a missing key raises KeyError rather than silently sending None.
        data = {
            'key': os.environ['CRAWLER_ACCESS_KEY'],
            'products': prods
        }
        headers = {'content-type': 'application/json', 'accept': 'application/json'}

        try:
            # Timeout so a single unreachable instance cannot hang the whole dispatch
            resp = requests.post(url=url, data=json.dumps(data), headers=headers, timeout=30)
        except requests.RequestException:
            # Unreachable/erroring crawler counts as a failed call; keep calling the rest
            continue

        if resp.status_code == 200:
            successful += 1

    return successful == len(crawler_urls)
|
||||
|
||||
|
||||
# Script entry point: dispatch all products to the registered crawler instances.
# The boolean result of call_crawlers() is intentionally discarded here.
if __name__ == '__main__':
    call_crawlers()
|
||||
@@ -1,2 +1 @@
|
||||
pymysql
|
||||
logging
|
||||
|
||||
@@ -40,3 +40,20 @@ def getShopsToCrawl() -> [int]:
|
||||
vendor_ids = list(map(lambda x: x[0], cur.fetchall()))
|
||||
|
||||
return vendor_ids
|
||||
|
||||
def getProductsToCrawl() -> [int]:
    """
    Queries the list of product IDs and returns them

    :return: The list of IDs
    """
    connection = __getConnection__()
    cursor = connection.cursor()

    cursor.execute('SELECT product_id FROM products')

    # Each fetched row is a one-element tuple; pull out the bare IDs
    return [row[0] for row in cursor.fetchall()]
|
||||
|
||||
Reference in New Issue
Block a user