diff --git a/.gitignore b/.gitignore
index ff35e3c..d099c29 100644
--- a/.gitignore
+++ b/.gitignore
@@ -27,6 +27,7 @@ speed-measure-plugin*.json
 !Backend.iml
 !CucumberTests.iml
 !Crawler.iml
+!Crawler-Loadbalancer.iml
 
 # Include IntelliJ modules
 !/.idea/modules.xml
diff --git a/.idea/modules.xml b/.idea/modules.xml
index 44a6847..53cf20e 100644
--- a/.idea/modules.xml
+++ b/.idea/modules.xml
@@ -5,6 +5,7 @@
diff --git a/Backend/src/models/products/products.router.ts b/Backend/src/models/products/products.router.ts
index f2f3353..03649de 100644
--- a/Backend/src/models/products/products.router.ts
+++ b/Backend/src/models/products/products.router.ts
@@ -69,6 +69,25 @@ productsRouter.get('/search/:term', async (req: Request, res: Response) => {
     }
 });
 
+// GET products/list/[1,2,3]
+
+productsRouter.get('/list/:ids', async (req: Request, res: Response) => {
+    const ids: number[] = JSON.parse(req.params.ids);
+
+    if (!ids) {
+        res.status(400).send('Missing parameters.');
+        return;
+    }
+
+    try {
+        const products: Products = await ProductService.findList(ids);
+
+        res.status(200).send(products);
+    } catch (e) {
+        res.status(404).send(e.message);
+    }
+});
+
 // GET products/bestDeals
diff --git a/Backend/src/models/products/products.service.ts b/Backend/src/models/products/products.service.ts
index 8cca5b0..2c612e2 100644
--- a/Backend/src/models/products/products.service.ts
+++ b/Backend/src/models/products/products.service.ts
@@ -122,6 +122,29 @@ export const findBySearchTerm = async (term: string): Promise<Products> => {
     return prodRows;
 };
 
+export const findList = async (ids: number[]): Promise<Products> => {
+    let conn;
+    let prodRows = [];
+    try {
+        conn = await pool.getConnection();
+        const rows = await conn.query('SELECT product_id, name, asin, is_active, short_description, long_description, image_guid, date_added, last_modified, manufacturer_id, selling_rank, category_id FROM products WHERE product_id IN (?)', [ids]);
+        for (let row in rows) {
+            if (row !== 'meta') {
+                prodRows.push(rows[row]);
+            }
+        }
+
+    } catch (err) {
+        throw err;
+    } finally {
+        if (conn) {
+            conn.end();
+        }
+    }
+
+    return prodRows;
+};
+
 // export const create = async (newItem: Product): Promise => {
 //     let conn;
 //     try {
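A minimal sketch of how the new list route could be exercised from a client, assuming the router is mounted under /products (as the route comment suggests); the base URL is a placeholder, not part of this change:

import json
import requests

# Placeholder base URL; point this at the running Betterzon backend.
BASE_URL = 'http://localhost:3000'

# The route expects a JSON array of product IDs directly in the path,
# e.g. GET /products/list/[1,2,3]
ids = [1, 2, 3]
resp = requests.get(BASE_URL + '/products/list/' + json.dumps(ids, separators=(',', ':')))

if resp.status_code == 200:
    print(resp.json())  # the product rows returned by ProductService.findList()
else:
    print(resp.status_code, resp.text)  # 400 for missing IDs, 404 if the lookup failed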
diff --git a/Crawler-Loadbalancer/Crawler-Loadbalancer.iml b/Crawler-Loadbalancer/Crawler-Loadbalancer.iml
new file mode 100644
index 0000000..ad3c0a3
--- /dev/null
+++ b/Crawler-Loadbalancer/Crawler-Loadbalancer.iml
@@ -0,0 +1,9 @@
diff --git a/Crawler-Loadbalancer/dailycrawl.py b/Crawler-Loadbalancer/dailycrawl.py
new file mode 100644
index 0000000..a3da39c
--- /dev/null
+++ b/Crawler-Loadbalancer/dailycrawl.py
@@ -0,0 +1,59 @@
+import json
+import requests
+import os
+
+import sql
+
+
+def call_crawlers() -> bool:
+    """
+    Fetches the list of all products to crawl, distributes them evenly across the registered
+    crawler instances and calls each instance to start it
+    :return: Whether all crawler instances were called successfully
+    """
+    product_ids = sql.getProductsToCrawl()
+
+    # crawler_urls = ['crawl.p4ddy.com', 'crawl.betterzon.xyz']
+    crawler_urls = ['http://localhost:22026']
+
+    balanced_lists = []
+
+    products_per_crawler = len(product_ids) // len(crawler_urls)
+    rest = len(product_ids) % len(crawler_urls)
+
+    # Distribute the available products over the available crawler instances
+    for crawler_id in range(len(crawler_urls)):
+        amount_of_prods = products_per_crawler
+
+        # If we have e.g. 7 products but 2 crawlers, the first needs to crawl 4 products and the second 3
+        if crawler_id < rest:
+            amount_of_prods += 1
+
+        # Assign the required amount of product ids to the current crawler and remove them from the
+        # list of all product ids
+        balanced_lists.append(product_ids[:amount_of_prods])
+        product_ids = product_ids[amount_of_prods:]
+
+    # Make the callouts to the instances
+    successful = 0
+    for crawler_id in range(len(crawler_urls)):
+        prods = balanced_lists[crawler_id]
+        url = crawler_urls[crawler_id]
+
+        # Send request
+        data = {
+            'key': os.environ['CRAWLER_ACCESS_KEY'],
+            'products': prods
+        }
+        headers = {'content-type': 'application/json', 'accept': 'application/json'}
+
+        resp = requests.post(url=url, data=json.dumps(data), headers=headers)
+
+        if resp.status_code == 200:
+            successful += 1
+
+    return successful == len(crawler_urls)
+
+
+if __name__ == '__main__':
+    call_crawlers()
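The loop above gives every crawler the integer share of the products and hands one extra to the first len(product_ids) % len(crawler_urls) instances. A standalone sketch of the same splitting scheme, handy for checking the 7-products / 2-crawlers case from the comment; the helper name is illustrative and not part of the module:

def split_evenly(product_ids, num_crawlers):
    # Same scheme as call_crawlers(): everyone gets the integer share,
    # and the first 'rest' crawlers take one extra product.
    per_crawler = len(product_ids) // num_crawlers
    rest = len(product_ids) % num_crawlers
    batches = []
    for crawler_id in range(num_crawlers):
        amount = per_crawler + (1 if crawler_id < rest else 0)
        batches.append(product_ids[:amount])
        product_ids = product_ids[amount:]
    return batches


print(split_evenly([1, 2, 3, 4, 5, 6, 7], 2))  # [[1, 2, 3, 4], [5, 6, 7]]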
diff --git a/Crawler-Loadbalancer/requirements.txt b/Crawler-Loadbalancer/requirements.txt
new file mode 100644
index 0000000..d4a7eda
--- /dev/null
+++ b/Crawler-Loadbalancer/requirements.txt
@@ -0,0 +1 @@
+pymysql
diff --git a/Crawler-Loadbalancer/sql.py b/Crawler-Loadbalancer/sql.py
new file mode 100644
index 0000000..13fa354
--- /dev/null
+++ b/Crawler-Loadbalancer/sql.py
@@ -0,0 +1,59 @@
+import pymysql
+import os
+import logging
+
+
+def __getConnection__() -> pymysql.Connection:
+    """
+    Opens a new pymysql connection and returns it
+    :return: A pymysql Connection object
+    """
+    logger = logging.getLogger()
+    try:
+        conn = pymysql.connect(
+            user=os.environ['BETTERZON_CRAWLER_USER'],
+            password=os.environ['BETTERZON_CRAWLER_PASSWORD'],
+            host=os.environ['BETTERZON_CRAWLER_HOST'],
+            port=3306,
+            database=os.environ['BETTERZON_CRAWLER_DB']
+        )
+
+        return conn
+    except pymysql.Error as e:
+        logger.error('SQL Connection error: %s', e)
+        return
+
+
+def getShopsToCrawl() -> [int]:
+    """
+    Queries the list of vendor IDs and returns them
+    :return: The list of IDs
+    """
+    conn = __getConnection__()
+    cur = conn.cursor()
+
+    query = 'SELECT vendor_id FROM vendors'
+
+    cur.execute(query)
+
+    # Extract the IDs from the returned tuples into a list
+    vendor_ids = list(map(lambda x: x[0], cur.fetchall()))
+
+    return vendor_ids
+
+def getProductsToCrawl() -> [int]:
+    """
+    Queries the list of product IDs and returns them
+    :return: The list of IDs
+    """
+    conn = __getConnection__()
+    cur = conn.cursor()
+
+    query = 'SELECT product_id FROM products'
+
+    cur.execute(query)
+
+    # Extract the IDs from the returned tuples into a list
+    product_ids = list(map(lambda x: x[0], cur.fetchall()))
+
+    return product_ids
diff --git a/Crawler/Crawler.iml b/Crawler/Crawler.iml
index db1dd1c..8568e2d 100644
--- a/Crawler/Crawler.iml
+++ b/Crawler/Crawler.iml
@@ -2,13 +2,13 @@
diff --git a/Crawler/api.py b/Crawler/api.py
index b28ba1b..92617c4 100644
--- a/Crawler/api.py
+++ b/Crawler/api.py
@@ -1,14 +1,24 @@
 from flask import Flask
-from flask_restful import Resource, Api
+from flask_restful import Resource, Api, reqparse
 
 app = Flask(__name__)
 api = Api(app)
 
+# To parse request data
+parser = reqparse.RequestParser()
+parser.add_argument('key')
+parser.add_argument('products')
+
 
 class CrawlerApi(Resource):
     def get(self):
         return {'Hallo': 'Betterzon'}
 
+    def post(self):
+        # Accept crawler request here
+        args = parser.parse_args()
+        return args
+
 
 api.add_resource(CrawlerApi, '/')
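The request body the parser above accepts is exactly what dailycrawl.py sends: an access 'key' and a 'products' list. A small sketch of such a call against a locally running crawler instance; the key is a placeholder value, and the port matches the one used by the load balancer:

import json
import requests

payload = {
    'key': 'placeholder-access-key',  # real callers read this from CRAWLER_ACCESS_KEY
    'products': [1, 2, 3]
}
headers = {'content-type': 'application/json', 'accept': 'application/json'}

resp = requests.post('http://localhost:22026/', data=json.dumps(payload), headers=headers)

# The post() handler currently just echoes the parsed arguments back.
print(resp.status_code, resp.json())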
diff --git a/Crawler/crawler.py b/Crawler/crawler.py
new file mode 100644
index 0000000..99ff867
--- /dev/null
+++ b/Crawler/crawler.py
@@ -0,0 +1,78 @@
+import sql
+
+
+def crawl(product_ids: [int]) -> dict:
+    """
+    Crawls the given list of products and saves the results to SQL
+    :param product_ids: The list of product IDs to fetch
+    :return: A dict with the following fields:
+        total_crawls: number of total crawl tries (products * vendors per product)
+        successful_crawls: number of successful crawls
+        products_with_problems: list of products that have not been crawled successfully
+    """
+    total_crawls = 0
+    successful_crawls = 0
+    products_with_problems = []
+
+    # Iterate over every product that has to be crawled
+    for product_id in product_ids:
+        # Get all links for this product
+        product_links = sql.getProductLinksForProduct(product_id)
+
+        crawled_data = []
+
+        # Iterate over every link / vendor
+        for product_vendor_info in product_links:
+            total_crawls += 1
+
+            # Call the appropriate vendor crawling function and append the result to the list of crawled data
+            if product_vendor_info['vendor_id'] == 1:
+                # Amazon
+                crawled_data.append(__crawl_amazon__(product_vendor_info))
+            elif product_vendor_info['vendor_id'] == 2:
+                # Apple
+                crawled_data.append(__crawl_apple__(product_vendor_info))
+            elif product_vendor_info['vendor_id'] == 3:
+                # Media Markt
+                crawled_data.append(__crawl_mediamarkt__(product_vendor_info))
+            else:
+                products_with_problems.append(product_vendor_info)
+                continue
+
+            successful_crawls += 1
+
+        # Insert data to SQL
+        sql.insertData(crawled_data)
+
+    return {
+        'total_crawls': total_crawls,
+        'successful_crawls': successful_crawls,
+        'products_with_problems': products_with_problems
+    }
+
+
+def __crawl_amazon__(product_info: dict) -> tuple:
+    """
+    Crawls the price for the given product from Amazon
+    :param product_info: A dict with product info containing product_id, vendor_id, url
+    :return: A tuple with the crawled data, containing (product_id, vendor_id, price_in_cents)
+    """
+    return (product_info['product_id'], product_info['vendor_id'], 123)
+
+
+def __crawl_apple__(product_info: dict) -> tuple:
+    """
+    Crawls the price for the given product from Apple
+    :param product_info: A dict with product info containing product_id, vendor_id, url
+    :return: A tuple with the crawled data, containing (product_id, vendor_id, price_in_cents)
+    """
+    return (product_info['product_id'], product_info['vendor_id'], 123)
+
+
+def __crawl_mediamarkt__(product_info: dict) -> tuple:
+    """
+    Crawls the price for the given product from Media Markt
+    :param product_info: A dict with product info containing product_id, vendor_id, url
+    :return: A tuple with the crawled data, containing (product_id, vendor_id, price_in_cents)
+    """
+    pass
diff --git a/Crawler/requirements.txt b/Crawler/requirements.txt
index 04ca272..0b9c558 100644
--- a/Crawler/requirements.txt
+++ b/Crawler/requirements.txt
@@ -1,4 +1,4 @@
 pymysql
 flask
 flask-sqlalchemy
-flask_restful
\ No newline at end of file
+flask_restful
diff --git a/Crawler/sql.py b/Crawler/sql.py
new file mode 100644
index 0000000..1cf3a58
--- /dev/null
+++ b/Crawler/sql.py
@@ -0,0 +1,89 @@
+import logging
+
+import pymysql
+import os
+
+
+def __getConnection__() -> pymysql.Connection:
+    """
+    Opens a new pymysql connection and returns it
+    :return: A pymysql Connection object
+    """
+    logger = logging.getLogger()
+    try:
+        conn = pymysql.connect(
+            user=os.environ['BETTERZON_CRAWLER_USER'],
+            password=os.environ['BETTERZON_CRAWLER_PASSWORD'],
+            host=os.environ['BETTERZON_CRAWLER_HOST'],
+            port=3306,
+            database=os.environ['BETTERZON_CRAWLER_DB']
+        )
+
+        return conn
+    except pymysql.Error as e:
+        logger.error('SQL Connection error: %s', e)
+        return
+
+
+def getProductsForVendor(vendor_id: int) -> [dict]:
+    """
+    Queries the product links for all products of the given shop
+    :param vendor_id: The vendor / shop to query products for
+    :return: A list of product objects, each having the following parameters:
+        product_id, vendor_id, url
+    """
+    conn = __getConnection__()
+    cur = conn.cursor()
+
+    query = 'SELECT product_id, url FROM product_links WHERE vendor_id = %s'
+
+    cur.execute(query, (vendor_id,))
+
+    products = list(map(lambda x: {'product_id': x[0], 'vendor_id': vendor_id, 'url': x[1]}, cur.fetchall()))
+
+    return products
+
+def getProductLinksForProduct(product_id: int) -> [dict]:
+    """
+    Queries all the product links for the given product
+    :param product_id: The product to query data for
+    :return: A list of product objects, each having the following parameters:
+        product_id, vendor_id, url
+    """
+    conn = __getConnection__()
+    cur = conn.cursor()
+
+    query = 'SELECT vendor_id, url FROM product_links WHERE product_id = %s'
+
+    cur.execute(query, (product_id,))
+
+    products = list(map(lambda x: {'product_id': product_id, 'vendor_id': x[0], 'url': x[1]}, cur.fetchall()))
+
+    return products
+
+
+def insertData(data_to_insert: [tuple]) -> bool:
+    """
+    Inserts the given list of tuples into the DB
+    :param data_to_insert: A list of tuples, where each tuple has to contain product id, vendor id and the price
+        in exactly this order
+    :return: If the insert was successful
+    """
+    conn = __getConnection__()
+    cur = conn.cursor()
+
+    query = 'INSERT INTO prices (product_id, vendor_id, price_in_cents, timestamp) VALUES (%s, %s, %s, NOW())'
+
+    affectedRows = cur.executemany(query, data_to_insert)
+
+    if affectedRows != len(data_to_insert):
+        # Something went wrong, revert the changes
+        conn.rollback()
+    else:
+        conn.commit()
+
+    cur.close()
+    conn.close()
+
+    return affectedRows == len(data_to_insert)
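The tuples passed to insertData() have to match the (product_id, vendor_id, price_in_cents) triples produced by the __crawl_*__ helpers in crawler.py; the timestamp column is filled in by NOW() inside the query. A short usage sketch with made-up IDs and prices:

import sql

rows = [
    (1, 1, 34999),  # product 1, vendor 1 (Amazon), price in cents
    (1, 2, 35900),  # product 1, vendor 2 (Apple)
]

if sql.insertData(rows):
    print('all rows inserted')
else:
    print('insert failed, changes were rolled back')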
diff --git a/doku/AC_AddProducts.drawio b/doku/AC_AddProducts.drawio
new file mode 100644
index 0000000..aa2fff0
--- /dev/null
+++ b/doku/AC_AddProducts.drawio
@@ -0,0 +1,137 @@
diff --git a/doku/AC_AddProducts.png b/doku/AC_AddProducts.png
new file mode 100644
index 0000000..1386a5f
Binary files /dev/null and b/doku/AC_AddProducts.png differ
diff --git a/doku/AC_Administration.drawio b/doku/AC_Administration.drawio
new file mode 100644
index 0000000..b2e81c2
--- /dev/null
+++ b/doku/AC_Administration.drawio
@@ -0,0 +1,72 @@
diff --git a/doku/AC_Administration.png b/doku/AC_Administration.png
new file mode 100644
index 0000000..eeabc3c
Binary files /dev/null and b/doku/AC_Administration.png differ
diff --git a/doku/AC_Crawler.drawio b/doku/AC_Crawler.drawio
index 84cd5f1..5cbcc88 100644
--- a/doku/AC_Crawler.drawio
+++ b/doku/AC_Crawler.drawio
@@ -1,142 +1,190 @@
diff --git a/doku/AC_Crawler.png b/doku/AC_Crawler.png
index 498c96f..185fa41 100644
Binary files a/doku/AC_Crawler.png and b/doku/AC_Crawler.png differ