mirror of
https://github.com/Mueller-Patrick/Betterzon.git
synced 2024-12-22 03:35:13 +00:00
BETTERZON-56: Adding crawler load-balancing script (#28)
This commit is contained in:
parent
04d12955cd
commit
f5fd1825d7
59
Crawler-Loadbalancer/dailycrawl.py
Normal file
59
Crawler-Loadbalancer/dailycrawl.py
Normal file
|
@ -0,0 +1,59 @@
|
|||
import json
|
||||
import requests
|
||||
import os
|
||||
|
||||
import sql
|
||||
|
||||
|
||||
def call_crawlers() -> bool:
    """
    Fetches the list of all product IDs, splits it as evenly as possible across the
    registered crawler instances and sends each instance its share via HTTP POST.

    A crawler only counts as successful if it answers with HTTP status 200. An instance
    that cannot be reached or does not answer in time is counted as failed; the remaining
    instances are still called.

    :return: True if every crawler instance accepted its request, False otherwise
    :raises KeyError: If the CRAWLER_ACCESS_KEY environment variable is not set
    """
    product_ids = sql.getProductsToCrawl()

    # crawler_urls = ['crawl.p4ddy.com', 'crawl.betterzon.xyz']
    crawler_urls = ['http://localhost:22026']

    # Without a timeout a single hanging crawler would block this script forever
    timeout_seconds = 30

    products_per_crawler, rest = divmod(len(product_ids), len(crawler_urls))

    # Distribute available products over available crawler instances
    balanced_lists = []
    start = 0
    for crawler_id in range(len(crawler_urls)):
        # If we e.g. have 7 products but 2 crawlers, the first needs to crawl 4 products
        # and the 2nd 3
        amount_of_prods = products_per_crawler + (1 if crawler_id < rest else 0)

        # Hand the next slice of product ids to the current crawler; moving a start
        # offset avoids copying the remaining list on every iteration
        balanced_lists.append(product_ids[start:start + amount_of_prods])
        start += amount_of_prods

    # Values that are identical for every callout are prepared once
    access_key = os.environ['CRAWLER_ACCESS_KEY']
    headers = {'content-type': 'application/json', 'accept': 'application/json'}

    # Make the callouts to the instances
    successful = 0
    for url, prods in zip(crawler_urls, balanced_lists):
        payload = json.dumps({
            'key': access_key,
            'products': prods
        })

        try:
            resp = requests.post(url=url, data=payload, headers=headers, timeout=timeout_seconds)
        except requests.RequestException:
            # Unreachable / timed-out instance: count it as failed (it is simply not
            # added to `successful`) and keep calling the other instances
            continue

        if resp.status_code == 200:
            successful += 1

    return successful == len(crawler_urls)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Propagate the outcome as the process exit status (0 = every crawler accepted its
    # request, 1 = at least one did not) so a scheduler can detect a failed run
    raise SystemExit(0 if call_crawlers() else 1)
|
|
@ -1,2 +1 @@
|
|||
pymysql
|
||||
logging
|
||||
|
|
|
@ -40,3 +40,20 @@ def getShopsToCrawl() -> [int]:
|
|||
vendor_ids = list(map(lambda x: x[0], cur.fetchall()))
|
||||
|
||||
return vendor_ids
|
||||
|
||||
def getProductsToCrawl() -> [int]:
    """
    Queries the IDs of all rows in the products table and returns them

    :return: The list of product IDs
    """
    conn = __getConnection__()

    query = 'SELECT product_id FROM products'

    # Using the cursor as a context manager closes it even if the query raises
    with conn.cursor() as cur:
        cur.execute(query)
        rows = cur.fetchall()

    # NOTE(review): the connection itself is left open, as before - confirm whether
    # __getConnection__ opens a new connection per call and should be closed here

    # Extract the IDs from the returned tuples into a list
    return [row[0] for row in rows]
|
||||
|
|
|
@ -1,14 +1,24 @@
|
|||
from flask import Flask
|
||||
from flask_restful import Resource, Api
|
||||
from flask_restful import Resource, Api, reqparse
|
||||
|
||||
app = Flask(__name__)
|
||||
api = Api(app)
|
||||
|
||||
# To parse request data
|
||||
parser = reqparse.RequestParser()
|
||||
parser.add_argument('key')
|
||||
parser.add_argument('products')
|
||||
|
||||
|
||||
class CrawlerApi(Resource):
    """
    REST resource of the crawler service
    """

    def get(self):
        """
        Answers with a static greeting

        :return: The greeting payload
        """
        greeting = {'Hallo': 'Betterzon'}
        return greeting

    def post(self):
        """
        Parses the incoming request with the module-level request parser

        :return: The parsed request arguments
        """
        # Accept crawler request here
        return parser.parse_args()
|
||||
|
||||
|
||||
api.add_resource(CrawlerApi, '/')
|
||||
|
||||
|
|
|
@ -2,4 +2,3 @@ pymysql
|
|||
flask
|
||||
flask-sqlalchemy
|
||||
flask_restful
|
||||
logging
|
||||
|
|
|
@ -25,7 +25,7 @@ def __getConnection__() -> pymysql.Connection:
|
|||
return
|
||||
|
||||
|
||||
def getProductsForShop(vendor_id: int) -> [{}]:
|
||||
def getProductsForVendor(vendor_id: int) -> [{}]:
|
||||
"""
|
||||
Queries the product links for all products of the given shop
|
||||
:param vendor_id: The vendor / shop to query products for
|
||||
|
@ -43,8 +43,27 @@ def getProductsForShop(vendor_id: int) -> [{}]:
|
|||
|
||||
return products
|
||||
|
||||
def getProductLinksForProduct(product_id: int) -> [{}]:
    """
    Queries all the product links for the given product

    :param product_id: The product to query data for
    :return: A list of product objects, each having the following parameters:
        product_id, vendor_id, url
    """
    conn = __getConnection__()

    query = 'SELECT vendor_id, url FROM product_links WHERE product_id = %s'

    # Using the cursor as a context manager closes it even if the query raises
    with conn.cursor() as cur:
        cur.execute(query, (product_id,))
        rows = cur.fetchall()

    # NOTE(review): the connection itself is left open, as before - confirm whether
    # __getConnection__ opens a new connection per call and should be closed here

    # Each row holds (vendor_id, url); the product id is the one that was queried for
    return [{'product_id': product_id, 'vendor_id': row[0], 'url': row[1]} for row in rows]
|
||||
|
||||
|
||||
|
||||
def insertData(data_to_insert: [tuple]) -> bool:
|
||||
"""
|
||||
Inserts the given list of tuples into the DB
|
||||
:param data_to_insert: A list of tuples, where each tuple has to contain product id, vendor id and the price
|
||||
|
|
Loading…
Reference in New Issue
Block a user