BETTERZON-56: Adding crawler load-balancing script (#28)

Patrick 2021-04-14 18:52:22 +02:00 committed by GitHub
parent 04d12955cd
commit f5fd1825d7
6 changed files with 108 additions and 5 deletions

View File

@@ -0,0 +1,59 @@
import json
import os

import requests

import sql


def call_crawlers() -> bool:
    """
    Fetches the list of all products to crawl, does some load balancing magic and calls all
    registered crawler instances to start them
    :return: Whether all calls have been successful
    """
    product_ids = sql.getProductsToCrawl()

    # crawler_urls = ['crawl.p4ddy.com', 'crawl.betterzon.xyz']
    crawler_urls = ['http://localhost:22026']

    balanced_lists = []

    products_per_crawler = len(product_ids) // len(crawler_urls)
    rest = len(product_ids) % len(crawler_urls)

    # Distribute available products over available crawler instances
    for crawler_id in range(len(crawler_urls)):
        amount_of_prods = products_per_crawler

        # If we e.g. have 7 products but 2 crawlers, the first needs to crawl 4 products and the 2nd 3
        if crawler_id < rest:
            amount_of_prods += 1

        # Assign the required amount of product ids to the current crawler and remove them from the
        # list of all product ids
        balanced_lists.append(product_ids[:amount_of_prods])
        product_ids = product_ids[amount_of_prods:]

    # Make the callouts to the instances
    successful = 0
    for crawler_id in range(len(crawler_urls)):
        prods = balanced_lists[crawler_id]
        url = crawler_urls[crawler_id]

        # Send request
        data = {
            'key': os.environ['CRAWLER_ACCESS_KEY'],
            'products': prods
        }
        headers = {'content-type': 'application/json', 'accept': 'application/json'}

        resp = requests.post(url=url, data=json.dumps(data), headers=headers)

        if resp.status_code == 200:
            successful += 1

    return successful == len(crawler_urls)


if __name__ == '__main__':
    call_crawlers()
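
For illustration, the splitting arithmetic above can be checked in isolation. A minimal sketch (the helper name balance is hypothetical, not part of the commit) that reproduces the 7-products / 2-crawlers example from the comment:

def balance(product_ids: list, num_crawlers: int) -> list:
    # Same floor-division / remainder scheme as call_crawlers()
    per_crawler = len(product_ids) // num_crawlers
    rest = len(product_ids) % num_crawlers
    lists = []
    for crawler_id in range(num_crawlers):
        # The first `rest` crawlers take one extra product each
        amount = per_crawler + (1 if crawler_id < rest else 0)
        lists.append(product_ids[:amount])
        product_ids = product_ids[amount:]
    return lists

print(balance([1, 2, 3, 4, 5, 6, 7], 2))  # [[1, 2, 3, 4], [5, 6, 7]]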

View File

@@ -1,2 +1 @@
 pymysql
-logging

View File

@@ -40,3 +40,20 @@ def getShopsToCrawl() -> [int]:
     vendor_ids = list(map(lambda x: x[0], cur.fetchall()))

     return vendor_ids
+
+
+def getProductsToCrawl() -> [int]:
+    """
+    Queries the list of product IDs and returns them
+    :return: The list of IDs
+    """
+    conn = __getConnection__()
+    cur = conn.cursor()
+
+    query = 'SELECT product_id FROM products'
+
+    cur.execute(query)
+
+    # Extract the IDs from the returned tuples into a list
+    product_ids = list(map(lambda x: x[0], cur.fetchall()))
+    return product_ids
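
For reference, pymysql's cursor.fetchall() yields the result rows as one-element tuples here, which the map/lambda flattens into a plain list of IDs; a standalone illustration with stubbed rows:

# Stand-in for the (product_id,) tuples cursor.fetchall() yields
rows = [(1,), (2,), (3,)]
product_ids = list(map(lambda x: x[0], rows))
print(product_ids)  # [1, 2, 3]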

View File

@@ -1,14 +1,24 @@
 from flask import Flask
-from flask_restful import Resource, Api
+from flask_restful import Resource, Api, reqparse

 app = Flask(__name__)
 api = Api(app)

+# To parse request data
+parser = reqparse.RequestParser()
+parser.add_argument('key')
+parser.add_argument('products')
+

 class CrawlerApi(Resource):
     def get(self):
         return {'Hallo': 'Betterzon'}

+    def post(self):
+        # Accept crawler request here
+        args = parser.parse_args()
+        return args
+

 api.add_resource(CrawlerApi, '/')
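
The load-balancing script and this endpoint now share a simple contract: a JSON body with a 'key' (the access key) and a 'products' list. A minimal sketch of exercising the endpoint, assuming the API is running on localhost:22026 as hard-coded in call_crawlers (the post handler currently just echoes the parsed arguments back):

import requests

# 'some-access-key' is a placeholder; the calling script reads the real
# value from the CRAWLER_ACCESS_KEY environment variable
resp = requests.post(
    'http://localhost:22026/',
    json={'key': 'some-access-key', 'products': [1, 2, 3]},
)
print(resp.status_code, resp.json())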

View File

@@ -2,4 +2,3 @@ pymysql
 flask
 flask-sqlalchemy
 flask_restful
-logging

View File

@@ -25,7 +25,7 @@ def __getConnection__() -> pymysql.Connection:
         return


-def getProductsForShop(vendor_id: int) -> [{}]:
+def getProductsForVendor(vendor_id: int) -> [{}]:
     """
     Queries the product links for all products of the given shop
     :param vendor_id: The vendor / shop to query products for
@@ -43,8 +43,27 @@ def getProductsForShop(vendor_id: int) -> [{}]:
     return products


+def getProductLinksForProduct(product_id: int) -> [{}]:
+    """
+    Queries all the product links for the given product
+    :param product_id: The product to query data for
+    :return: A list of product objects, each having the following parameters:
+             product_id, vendor_id, url
+    """
+    conn = __getConnection__()
+    cur = conn.cursor()
+
+    query = 'SELECT vendor_id, url FROM product_links WHERE product_id = %s'
+
+    cur.execute(query, (product_id,))
+
+    products = list(map(lambda x: {'product_id': product_id, 'vendor_id': x[0], 'url': x[1]}, cur.fetchall()))
+
+    return products
+
+
-def insertShopData(data_to_insert: [tuple]) -> bool:
+def insertData(data_to_insert: [tuple]) -> bool:
     """
     Inserts the given list of tuples into the DB
     :param data_to_insert: A list of tuples, where each tuple has to contain product id, vendor id and the price