mirror of
https://github.com/Mueller-Patrick/Betterzon.git
synced 2024-11-21 22:03:58 +00:00
BETTERZON-56: Adding crawler load-balancing script (#28)
This commit is contained in:
parent
04d12955cd
commit
f5fd1825d7
59
Crawler-Loadbalancer/dailycrawl.py
Normal file
59
Crawler-Loadbalancer/dailycrawl.py
Normal file
|
@ -0,0 +1,59 @@
|
||||||
|
import json
|
||||||
|
import requests
|
||||||
|
import os
|
||||||
|
|
||||||
|
import sql
|
||||||
|
|
||||||
|
|
||||||
|
def call_crawlers(*, timeout=None) -> bool:
    """
    Fetches the list of all products, distributes them evenly over all registered crawler
    instances and calls every instance to start crawling its share

    :param timeout: Optional timeout in seconds that is handed to requests.post for every
        callout. The default of None waits without a limit
    :return: If the calls to all crawler instances have been successful
    """
    product_ids = sql.getProductsToCrawl()

    # crawler_urls = ['crawl.p4ddy.com', 'crawl.betterzon.xyz']
    crawler_urls = ['http://localhost:22026']

    # Distribute available products over available crawler instances
    balanced_lists = _balance_products(product_ids, len(crawler_urls))

    # Read the key once up front, so a missing key fails before any instance has been called
    access_key = os.environ['CRAWLER_ACCESS_KEY']
    headers = {'content-type': 'application/json', 'accept': 'application/json'}

    # Make the callouts to the instances
    successful = 0
    for url, prods in zip(crawler_urls, balanced_lists):
        # Send request
        data = {
            'key': access_key,
            'products': prods
        }

        try:
            resp = requests.post(url=url, data=json.dumps(data), headers=headers, timeout=timeout)
        except requests.RequestException:
            # An unreachable instance counts as a failed call, but must not keep the
            # remaining instances from being started
            continue

        if resp.status_code == 200:
            successful += 1

    return successful == len(crawler_urls)


def _balance_products(product_ids, crawler_count):
    """
    Splits the given product IDs into one consecutive chunk per crawler instance

    :param product_ids: The list of all product IDs to distribute
    :param crawler_count: The amount of crawler instances, has to be greater than 0
    :return: A list with crawler_count lists of product IDs, in the original order
    """
    products_per_crawler, rest = divmod(len(product_ids), crawler_count)

    balanced_lists = []
    start = 0
    for crawler_id in range(crawler_count):
        # If we e.g. have 7 products but 2 crawlers, the first needs to crawl 4 products and the 2nd 3
        amount_of_prods = products_per_crawler + (1 if crawler_id < rest else 0)

        balanced_lists.append(product_ids[start:start + amount_of_prods])
        start += amount_of_prods

    return balanced_lists
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Report failed callouts through the exit status, so that whatever schedules this
    # script can notice that not all crawler instances have been started
    raise SystemExit(0 if call_crawlers() else 1)
|
|
@ -1,2 +1 @@
|
||||||
pymysql
|
pymysql
|
||||||
logging
|
|
||||||
|
|
|
@ -40,3 +40,20 @@ def getShopsToCrawl() -> [int]:
|
||||||
vendor_ids = list(map(lambda x: x[0], cur.fetchall()))
|
vendor_ids = list(map(lambda x: x[0], cur.fetchall()))
|
||||||
|
|
||||||
return vendor_ids
|
return vendor_ids
|
||||||
|
|
||||||
|
def getProductsToCrawl() -> [int]:
    """
    Queries the IDs of all products and returns them

    :return: The list of product IDs
    """
    conn = __getConnection__()

    query = 'SELECT product_id FROM products'

    # The context manager closes the cursor again, even if the query raises
    # NOTE(review): the connection itself is left open as before - confirm whether
    # __getConnection__ opens a new connection per call, in which case it should be closed too
    with conn.cursor() as cur:
        cur.execute(query)

        # Extract the IDs from the returned rows into a list
        product_ids = [row[0] for row in cur.fetchall()]

    return product_ids
|
||||||
|
|
|
@ -1,14 +1,24 @@
|
||||||
from flask import Flask
|
from flask import Flask
|
||||||
from flask_restful import Resource, Api
|
from flask_restful import Resource, Api, reqparse
|
||||||
|
|
||||||
app = Flask(__name__)
|
app = Flask(__name__)
|
||||||
api = Api(app)
|
api = Api(app)
|
||||||
|
|
||||||
|
# To parse request data
parser = reqparse.RequestParser()
parser.add_argument('key')
# The load balancer sends the product IDs as a JSON list. action='append' makes reqparse
# hand back the whole list and type=int converts every entry to an int
parser.add_argument('products', type=int, action='append')
|
||||||
|
|
||||||
|
|
||||||
class CrawlerApi(Resource):
    """
    REST resource of the crawler, handling GET and POST requests
    """

    def get(self):
        """
        Answers a GET request with a static greeting

        :return: The greeting object
        """
        greeting = {'Hallo': 'Betterzon'}
        return greeting

    def post(self):
        """
        Accepts a crawler request and hands the parsed request arguments back

        :return: The arguments parsed from the request
        """
        # Accept crawler request here
        return parser.parse_args()
|
||||||
|
|
||||||
|
|
||||||
api.add_resource(CrawlerApi, '/')
|
api.add_resource(CrawlerApi, '/')
|
||||||
|
|
||||||
|
|
|
@ -2,4 +2,3 @@ pymysql
|
||||||
flask
|
flask
|
||||||
flask-sqlalchemy
|
flask-sqlalchemy
|
||||||
flask_restful
|
flask_restful
|
||||||
logging
|
|
||||||
|
|
|
@ -25,7 +25,7 @@ def __getConnection__() -> pymysql.Connection:
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
def getProductsForShop(vendor_id: int) -> [{}]:
|
def getProductsForVendor(vendor_id: int) -> [{}]:
|
||||||
"""
|
"""
|
||||||
Queries the product links for all products of the given shop
|
Queries the product links for all products of the given shop
|
||||||
:param vendor_id: The vendor / shop to query products for
|
:param vendor_id: The vendor / shop to query products for
|
||||||
|
@ -43,8 +43,27 @@ def getProductsForShop(vendor_id: int) -> [{}]:
|
||||||
|
|
||||||
return products
|
return products
|
||||||
|
|
||||||
|
def getProductLinksForProduct(product_id: int) -> [{}]:
    """
    Queries all the product links for the given product

    :param product_id: The product to query data for
    :return: A list of product objects, each having the following parameters:
        product_id, vendor_id, url
    """
    conn = __getConnection__()

    query = 'SELECT vendor_id, url FROM product_links WHERE product_id = %s'

    # The context manager closes the cursor again, even if the query raises
    # NOTE(review): the connection itself is left open as before - confirm whether
    # __getConnection__ opens a new connection per call, in which case it should be closed too
    with conn.cursor() as cur:
        cur.execute(query, (product_id,))

        # Every returned row holds the vendor ID first and the URL second, as selected above
        products = [
            {'product_id': product_id, 'vendor_id': row[0], 'url': row[1]}
            for row in cur.fetchall()
        ]

    return products
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def insertData(data_to_insert: [tuple]) -> bool:
|
||||||
"""
|
"""
|
||||||
Inserts the given list of tuples into the DB
|
Inserts the given list of tuples into the DB
|
||||||
:param dataToInsert: A list of tuples, where each tuple has to contain product id, vendor id and the price
|
:param dataToInsert: A list of tuples, where each tuple has to contain product id, vendor id and the price
|
||||||
|
|
Loading…
Reference in New Issue
Block a user