From f5fd1825d7cbb8ac4eb3f48b5065835bdbddf977 Mon Sep 17 00:00:00 2001
From: Patrick <50352812+Mueller-Patrick@users.noreply.github.com>
Date: Wed, 14 Apr 2021 18:52:22 +0200
Subject: [PATCH] BETTERZON-56: Adding crawler load-balancing script (#28)

---
 Crawler-Loadbalancer/dailycrawl.py    | 59 +++++++++++++++++++++++++++
 Crawler-Loadbalancer/requirements.txt |  1 -
 Crawler-Loadbalancer/sql.py           | 17 ++++++++
 Crawler/api.py                        | 12 +++++-
 Crawler/requirements.txt              |  1 -
 Crawler/sql.py                        | 23 ++++++++++-
 6 files changed, 108 insertions(+), 5 deletions(-)
 create mode 100644 Crawler-Loadbalancer/dailycrawl.py

diff --git a/Crawler-Loadbalancer/dailycrawl.py b/Crawler-Loadbalancer/dailycrawl.py
new file mode 100644
index 0000000..a3da39c
--- /dev/null
+++ b/Crawler-Loadbalancer/dailycrawl.py
@@ -0,0 +1,59 @@
+import json
+import requests
+import os
+
+import sql
+
+
+def call_crawlers() -> bool:
+    """
+    Fetches the list of all product IDs, distributes them evenly across the registered crawler
+    instances and calls each instance to start crawling its share
+    :return: Whether all crawler instances accepted their request
+    """
+    product_ids = sql.getProductsToCrawl()
+
+    # crawler_urls = ['crawl.p4ddy.com', 'crawl.betterzon.xyz']
+    crawler_urls = ['http://localhost:22026']
+
+    balanced_lists = []
+
+    products_per_crawler = len(product_ids) // len(crawler_urls)
+    rest = len(product_ids) % len(crawler_urls)
+
+    # Distribute the available products over the available crawler instances
+    for crawler_id in range(len(crawler_urls)):
+        amount_of_prods = products_per_crawler
+
+        # If we have e.g. 7 products but 2 crawlers, the first one needs to crawl 4 products and the second one 3
+        if crawler_id < rest:
+            amount_of_prods += 1
+
+        # Assign the required amount of product ids to the current crawler and remove them from the
+        # list of all product ids
+        balanced_lists.append(product_ids[:amount_of_prods])
+        product_ids = product_ids[amount_of_prods:]
+
+    # Make the callouts to the instances
+    successful = 0
+    for crawler_id in range(len(crawler_urls)):
+        prods = balanced_lists[crawler_id]
+        url = crawler_urls[crawler_id]
+
+        # Send request
+        data = {
+            'key': os.environ['CRAWLER_ACCESS_KEY'],
+            'products': prods
+        }
+        headers = {'content-type': 'application/json', 'accept': 'application/json'}
+
+        resp = requests.post(url=url, data=json.dumps(data), headers=headers)
+
+        if resp.status_code == 200:
+            successful += 1
+
+    return successful == len(crawler_urls)
+
+
+if __name__ == '__main__':
+    call_crawlers()
diff --git a/Crawler-Loadbalancer/requirements.txt b/Crawler-Loadbalancer/requirements.txt
index 89437b8..d4a7eda 100644
--- a/Crawler-Loadbalancer/requirements.txt
+++ b/Crawler-Loadbalancer/requirements.txt
@@ -1,2 +1 @@
 pymysql
-logging
diff --git a/Crawler-Loadbalancer/sql.py b/Crawler-Loadbalancer/sql.py
index 69e11f8..13fa354 100644
--- a/Crawler-Loadbalancer/sql.py
+++ b/Crawler-Loadbalancer/sql.py
@@ -40,3 +40,20 @@ def getShopsToCrawl() -> [int]:
     vendor_ids = list(map(lambda x: x[0], cur.fetchall()))
 
     return vendor_ids
+
+
+def getProductsToCrawl() -> [int]:
+    """
+    Queries the list of product IDs and returns them
+    :return: The list of IDs
+    """
+    conn = __getConnection__()
+    cur = conn.cursor()
+
+    query = 'SELECT product_id FROM products'
+
+    cur.execute(query)
+
+    # Extract the IDs from the returned tuples into a list
+    product_ids = list(map(lambda x: x[0], cur.fetchall()))
+
+    return product_ids
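
The distribution step in call_crawlers() above can be checked in isolation. The sketch below
(balance() is a hypothetical helper, not part of this patch) reproduces the
floor-division-plus-remainder split: with 7 products and 2 crawlers, the first instance
receives 4 product IDs and the second 3.

    def balance(product_ids: [int], num_crawlers: int) -> [[int]]:
        # Every crawler gets len(product_ids) // num_crawlers products;
        # the first len(product_ids) % num_crawlers crawlers get one extra.
        per_crawler, rest = divmod(len(product_ids), num_crawlers)
        balanced_lists = []
        for crawler_id in range(num_crawlers):
            amount_of_prods = per_crawler + (1 if crawler_id < rest else 0)
            balanced_lists.append(product_ids[:amount_of_prods])
            product_ids = product_ids[amount_of_prods:]
        return balanced_lists

    assert balance([1, 2, 3, 4, 5, 6, 7], 2) == [[1, 2, 3, 4], [5, 6, 7]]
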
diff --git a/Crawler/api.py b/Crawler/api.py
index b28ba1b..92617c4 100644
--- a/Crawler/api.py
+++ b/Crawler/api.py
@@ -1,14 +1,24 @@
 from flask import Flask
-from flask_restful import Resource, Api
+from flask_restful import Resource, Api, reqparse
 
 app = Flask(__name__)
 api = Api(app)
 
+# To parse request data
+parser = reqparse.RequestParser()
+parser.add_argument('key')
+parser.add_argument('products')
+
 
 class CrawlerApi(Resource):
     def get(self):
         return {'Hallo': 'Betterzon'}
 
+    def post(self):
+        # Accept a crawl request from the load balancer
+        args = parser.parse_args()
+        return args
+
 
 api.add_resource(CrawlerApi, '/')
 
diff --git a/Crawler/requirements.txt b/Crawler/requirements.txt
index 70a6f57..0b9c558 100644
--- a/Crawler/requirements.txt
+++ b/Crawler/requirements.txt
@@ -2,4 +2,3 @@ pymysql
 flask
 flask-sqlalchemy
 flask_restful
-logging
diff --git a/Crawler/sql.py b/Crawler/sql.py
index 11c30c2..e6f4e61 100644
--- a/Crawler/sql.py
+++ b/Crawler/sql.py
@@ -25,7 +25,7 @@ def __getConnection__() -> pymysql.Connection:
     return
 
 
-def getProductsForShop(vendor_id: int) -> [{}]:
+def getProductsForVendor(vendor_id: int) -> [{}]:
     """
     Queries the product links for all products of the given vendor
     :param vendor_id: The vendor / shop to query products for
@@ -43,8 +43,27 @@
     return products
 
 
+def getProductLinksForProduct(product_id: int) -> [{}]:
+    """
+    Queries all the product links for the given product
+    :param product_id: The product to query data for
+    :return: A list of product objects, each having the following parameters:
+             product_id, vendor_id, url
+    """
+    conn = __getConnection__()
+    cur = conn.cursor()
 
-def insertShopData(data_to_insert: [tuple]) -> bool:
+    query = 'SELECT vendor_id, url FROM product_links WHERE product_id = %s'
+
+    cur.execute(query, (product_id,))
+
+    products = list(map(lambda x: {'product_id': product_id, 'vendor_id': x[0], 'url': x[1]}, cur.fetchall()))
+
+    return products
+
+
+
+def insertData(data_to_insert: [tuple]) -> bool:
     """
     Inserts the given list of tuples into the DB
     :param data_to_insert: A list of tuples, where each tuple has to contain product id, vendor id and the price
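
For reference, the POST endpoint added to Crawler/api.py parses the 'key' and 'products'
arguments and, for now, simply echoes them back; the access key is sent but not yet
validated. Assuming a crawler instance is listening on port 22026 (the placeholder URL in
dailycrawl.py) and an illustrative key value, a call from the load balancer looks like this:

    import json
    import requests

    # 'secret' is a stand-in for the load balancer's CRAWLER_ACCESS_KEY value
    data = {'key': 'secret', 'products': [1, 2, 3]}
    headers = {'content-type': 'application/json', 'accept': 'application/json'}

    resp = requests.post('http://localhost:22026', data=json.dumps(data), headers=headers)
    print(resp.status_code, resp.json())  # echoes the parsed 'key' and 'products' back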