BETTERZON-56: Adding crawler load-balancing script (#28)

Patrick 2021-04-14 18:52:22 +02:00 committed by GitHub
parent 04d12955cd
commit f5fd1825d7
6 changed files with 108 additions and 5 deletions

View File

@@ -0,0 +1,59 @@
import json
import os

import requests

import sql  # local DB helper module (see sql.py below)


def call_crawlers() -> bool:
    """
    Fetches the list of products to crawl, distributes it evenly across all registered
    crawler instances and calls each of them to start crawling
    :return: Whether all calls were successful
    """
    product_ids = sql.getProductsToCrawl()
    # crawler_urls = ['crawl.p4ddy.com', 'crawl.betterzon.xyz']
    crawler_urls = ['http://localhost:22026']

    balanced_lists = []

    products_per_crawler = len(product_ids) // len(crawler_urls)
    rest = len(product_ids) % len(crawler_urls)

    # Distribute the available products over the available crawler instances
    for crawler_id in range(len(crawler_urls)):
        amount_of_prods = products_per_crawler

        # If we e.g. have 7 products but 2 crawlers, the first needs to crawl 4 products and the 2nd 3
        if crawler_id < rest:
            amount_of_prods += 1

        # Assign the required amount of product ids to the current crawler and remove them from the
        # list of all product ids
        balanced_lists.append(product_ids[:amount_of_prods])
        product_ids = product_ids[amount_of_prods:]

    # Make the callouts to the instances
    successful = 0
    for crawler_id in range(len(crawler_urls)):
        prods = balanced_lists[crawler_id]
        url = crawler_urls[crawler_id]

        # Send the request
        data = {
            'key': os.environ['CRAWLER_ACCESS_KEY'],
            'products': prods
        }
        headers = {'content-type': 'application/json', 'accept': 'application/json'}
        resp = requests.post(url=url, data=json.dumps(data), headers=headers)

        if resp.status_code == 200:
            successful += 1

    return successful == len(crawler_urls)


if __name__ == '__main__':
    call_crawlers()
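For clarity, a minimal standalone sketch of the splitting arithmetic above, using made-up product IDs and a hypothetical second crawler URL: with 7 products and 2 crawlers, the first instance receives 4 IDs and the second 3.

product_ids = [1, 2, 3, 4, 5, 6, 7]  # made-up IDs for illustration
crawler_urls = ['http://localhost:22026', 'http://localhost:22027']  # second URL is hypothetical

products_per_crawler = len(product_ids) // len(crawler_urls)  # 3
rest = len(product_ids) % len(crawler_urls)  # 1 product left over

balanced_lists = []
for crawler_id in range(len(crawler_urls)):
    # The first `rest` crawlers take one extra product each
    amount_of_prods = products_per_crawler + (1 if crawler_id < rest else 0)
    balanced_lists.append(product_ids[:amount_of_prods])
    product_ids = product_ids[amount_of_prods:]

print(balanced_lists)  # [[1, 2, 3, 4], [5, 6, 7]]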

View File

@@ -1,2 +1 @@
pymysql
logging

View File

@@ -40,3 +40,20 @@ def getShopsToCrawl() -> [int]:
    vendor_ids = list(map(lambda x: x[0], cur.fetchall()))

    return vendor_ids


def getProductsToCrawl() -> [int]:
    """
    Queries the list of product IDs and returns them
    :return: The list of IDs
    """
    conn = __getConnection__()
    cur = conn.cursor()

    query = 'SELECT product_id FROM products'
    cur.execute(query)

    # Extract the IDs from the returned tuples into a list
    product_ids = list(map(lambda x: x[0], cur.fetchall()))

    return product_ids
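The fetchall/map pattern used here turns the cursor's row tuples into a flat ID list; a minimal illustration with made-up rows:

rows = [(17,), (42,), (108,)]  # shape of cur.fetchall(): one single-element tuple per row (IDs made up)
product_ids = list(map(lambda x: x[0], rows))
print(product_ids)  # [17, 42, 108]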

View File

@@ -1,14 +1,24 @@
from flask import Flask
from flask_restful import Resource, Api
from flask_restful import Resource, Api, reqparse

app = Flask(__name__)
api = Api(app)

# To parse request data
parser = reqparse.RequestParser()
parser.add_argument('key')
parser.add_argument('products')


class CrawlerApi(Resource):
    def get(self):
        return {'Hallo': 'Betterzon'}

    def post(self):
        # Accept the crawler request here
        args = parser.parse_args()
        return args


api.add_resource(CrawlerApi, '/')
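A minimal sketch of exercising this endpoint the way call_crawlers() does (URL, key and product IDs are placeholders); the post() handler simply echoes the parsed arguments back:

import json
import requests

data = {'key': 'placeholder-access-key', 'products': [1, 2, 3]}  # placeholder values
headers = {'content-type': 'application/json', 'accept': 'application/json'}
resp = requests.post(url='http://localhost:22026/', data=json.dumps(data), headers=headers)
print(resp.status_code, resp.json())
# Note: without a type/action on the 'products' argument, reqparse may coerce the list to a string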

View File

@@ -2,4 +2,3 @@ pymysql
flask
flask-sqlalchemy
flask_restful
logging

View File

@@ -25,7 +25,7 @@ def __getConnection__() -> pymysql.Connection:
        return


def getProductsForShop(vendor_id: int) -> [{}]:
def getProductsForVendor(vendor_id: int) -> [{}]:
    """
    Queries the product links for all products of the given shop
    :param vendor_id: The vendor / shop to query products for
@@ -43,8 +43,27 @@ def getProductsForShop(vendor_id: int) -> [{}]:
    return products


def getProductLinksForProduct(product_id: int) -> [{}]:
    """
    Queries all the product links for the given product
    :param product_id: The product to query data for
    :return: A list of product objects, each having the following parameters:
        product_id, vendor_id, url
    """
    conn = __getConnection__()
    cur = conn.cursor()

    query = 'SELECT vendor_id, url FROM product_links WHERE product_id = %s'
    cur.execute(query, (product_id,))

    products = list(map(lambda x: {'product_id': product_id, 'vendor_id': x[0], 'url': x[1]}, cur.fetchall()))

    return products


def insertShopData(data_to_insert: [tuple]) -> bool:
def insertData(data_to_insert: [tuple]) -> bool:
    """
    Inserts the given list of tuples into the DB
    :param data_to_insert: A list of tuples, where each tuple has to contain product id, vendor id and the price