From f5fd1825d7cbb8ac4eb3f48b5065835bdbddf977 Mon Sep 17 00:00:00 2001
From: Patrick <50352812+Mueller-Patrick@users.noreply.github.com>
Date: Wed, 14 Apr 2021 18:52:22 +0200
Subject: [PATCH] BETTERZON-56: Adding crawler load-balancing script (#28)

---
 Crawler-Loadbalancer/dailycrawl.py    | 59 +++++++++++++++++++++++++++
 Crawler-Loadbalancer/requirements.txt |  1 -
 Crawler-Loadbalancer/sql.py           | 17 ++++++++
 Crawler/api.py                        | 12 +++++-
 Crawler/requirements.txt              |  1 -
 Crawler/sql.py                        | 23 ++++++++++-
 6 files changed, 108 insertions(+), 5 deletions(-)
 create mode 100644 Crawler-Loadbalancer/dailycrawl.py

diff --git a/Crawler-Loadbalancer/dailycrawl.py b/Crawler-Loadbalancer/dailycrawl.py
new file mode 100644
index 0000000..a3da39c
--- /dev/null
+++ b/Crawler-Loadbalancer/dailycrawl.py
@@ -0,0 +1,59 @@
+import json
+import requests
+import os
+
+import sql
+
+
+def call_crawlers() -> bool:
+    """
+    Fetches the list of all product IDs, distributes them evenly across the registered crawler
+    instances and calls each instance to start crawling its share
+    :return: Whether all crawler instances accepted their request
+    """
+    product_ids = sql.getProductsToCrawl()
+
+    # crawler_urls = ['crawl.p4ddy.com', 'crawl.betterzon.xyz']
+    crawler_urls = ['http://localhost:22026']
+
+    balanced_lists = []
+
+    products_per_crawler = len(product_ids) // len(crawler_urls)
+    rest = len(product_ids) % len(crawler_urls)
+
+    # Distribute the available products over the available crawler instances
+    for crawler_id in range(len(crawler_urls)):
+        amount_of_prods = products_per_crawler
+
+        # If we have e.g. 7 products but 2 crawlers, the first one needs to crawl 4 products and the second one 3
+        if crawler_id < rest:
+            amount_of_prods += 1
+
+        # Assign the required amount of product ids to the current crawler and remove them from the
+        # list of all product ids
+        balanced_lists.append(product_ids[:amount_of_prods])
+        product_ids = product_ids[amount_of_prods:]
+
+    # Make the callouts to the instances
+    successful = 0
+    for crawler_id in range(len(crawler_urls)):
+        prods = balanced_lists[crawler_id]
+        url = crawler_urls[crawler_id]
+
+        # Send request
+        data = {
+            'key': os.environ['CRAWLER_ACCESS_KEY'],
+            'products': prods
+        }
+        headers = {'content-type': 'application/json', 'accept': 'application/json'}
+
+        resp = requests.post(url=url, data=json.dumps(data), headers=headers)
+
+        if resp.status_code == 200:
+            successful += 1
+
+    return successful == len(crawler_urls)
+
+
+if __name__ == '__main__':
+    call_crawlers()
diff --git a/Crawler-Loadbalancer/requirements.txt b/Crawler-Loadbalancer/requirements.txt
index 89437b8..d4a7eda 100644
--- a/Crawler-Loadbalancer/requirements.txt
+++ b/Crawler-Loadbalancer/requirements.txt
@@ -1,2 +1 @@
 pymysql
-logging
diff --git a/Crawler-Loadbalancer/sql.py b/Crawler-Loadbalancer/sql.py
index 69e11f8..13fa354 100644
--- a/Crawler-Loadbalancer/sql.py
+++ b/Crawler-Loadbalancer/sql.py
@@ -40,3 +40,20 @@ def getShopsToCrawl() -> [int]:
     vendor_ids = list(map(lambda x: x[0], cur.fetchall()))
 
     return vendor_ids
+
+
+def getProductsToCrawl() -> [int]:
+    """
+    Queries the list of product IDs and returns them
+    :return: The list of IDs
+    """
+    conn = __getConnection__()
+    cur = conn.cursor()
+
+    query = 'SELECT product_id FROM products'
+
+    cur.execute(query)
+
+    # Extract the IDs from the returned tuples into a list
+    product_ids = list(map(lambda x: x[0], cur.fetchall()))
+
+    return product_ids
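
The distribution step in call_crawlers() above can be checked in isolation. The sketch below
(balance() is a hypothetical helper, not part of this patch) reproduces the
floor-division-plus-remainder split: with 7 products and 2 crawlers, the first instance
receives 4 product IDs and the second 3.

    def balance(product_ids: [int], num_crawlers: int) -> [[int]]:
        # Every crawler gets len(product_ids) // num_crawlers products;
        # the first len(product_ids) % num_crawlers crawlers get one extra.
        per_crawler, rest = divmod(len(product_ids), num_crawlers)
        balanced_lists = []
        for crawler_id in range(num_crawlers):
            amount_of_prods = per_crawler + (1 if crawler_id < rest else 0)
            balanced_lists.append(product_ids[:amount_of_prods])
            product_ids = product_ids[amount_of_prods:]
        return balanced_lists

    assert balance([1, 2, 3, 4, 5, 6, 7], 2) == [[1, 2, 3, 4], [5, 6, 7]]
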
diff --git a/Crawler/api.py b/Crawler/api.py
index b28ba1b..92617c4 100644
--- a/Crawler/api.py
+++ b/Crawler/api.py
@@ -1,14 +1,24 @@
 from flask import Flask
-from flask_restful import Resource, Api
+from flask_restful import Resource, Api, reqparse
 
 app = Flask(__name__)
 api = Api(app)
 
+# To parse request data
+parser = reqparse.RequestParser()
+parser.add_argument('key')
+parser.add_argument('products')
+
 
 class CrawlerApi(Resource):
     def get(self):
         return {'Hallo': 'Betterzon'}
 
+    def post(self):
+        # Accept a crawl request from the load balancer
+        args = parser.parse_args()
+        return args
+
 
 api.add_resource(CrawlerApi, '/')
 
diff --git a/Crawler/requirements.txt b/Crawler/requirements.txt
index 70a6f57..0b9c558 100644
--- a/Crawler/requirements.txt
+++ b/Crawler/requirements.txt
@@ -2,4 +2,3 @@ pymysql
 flask
 flask-sqlalchemy
 flask_restful
-logging
diff --git a/Crawler/sql.py b/Crawler/sql.py
index 11c30c2..e6f4e61 100644
--- a/Crawler/sql.py
+++ b/Crawler/sql.py
@@ -25,7 +25,7 @@ def __getConnection__() -> pymysql.Connection:
     return
 
 
-def getProductsForShop(vendor_id: int) -> [{}]:
+def getProductsForVendor(vendor_id: int) -> [{}]:
     """
     Queries the product links for all products of the given vendor
     :param vendor_id: The vendor / shop to query products for
@@ -43,8 +43,27 @@
     return products
 
 
+def getProductLinksForProduct(product_id: int) -> [{}]:
+    """
+    Queries all the product links for the given product
+    :param product_id: The product to query data for
+    :return: A list of product objects, each having the following parameters:
+             product_id, vendor_id, url
+    """
+    conn = __getConnection__()
+    cur = conn.cursor()
 
-def insertShopData(data_to_insert: [tuple]) -> bool:
+    query = 'SELECT vendor_id, url FROM product_links WHERE product_id = %s'
+
+    cur.execute(query, (product_id,))
+
+    products = list(map(lambda x: {'product_id': product_id, 'vendor_id': x[0], 'url': x[1]}, cur.fetchall()))
+
+    return products
+
+
+
+def insertData(data_to_insert: [tuple]) -> bool:
     """
     Inserts the given list of tuples into the DB
     :param data_to_insert: A list of tuples, where each tuple has to contain product id, vendor id and the price
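
For reference, the POST endpoint added to Crawler/api.py parses the 'key' and 'products'
arguments and, for now, simply echoes them back; the access key is sent but not yet
validated. Assuming a crawler instance is listening on port 22026 (the placeholder URL in
dailycrawl.py) and an illustrative key value, a call from the load balancer looks like this:

    import json
    import requests

    # 'secret' is a stand-in for the load balancer's CRAWLER_ACCESS_KEY value
    data = {'key': 'secret', 'products': [1, 2, 3]}
    headers = {'content-type': 'application/json', 'accept': 'application/json'}

    resp = requests.post('http://localhost:22026', data=json.dumps(data), headers=headers)
    print(resp.status_code, resp.json())  # echoes the parsed 'key' and 'products' back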