diff --git a/.gitignore b/.gitignore index ff35e3c..d099c29 100644 --- a/.gitignore +++ b/.gitignore @@ -27,6 +27,7 @@ speed-measure-plugin*.json !Backend.iml !CucumberTests.iml !Crawler.iml +!Crawler-Loadbalancer.iml # Include IntelliJ modules !/.idea/modules.xml diff --git a/.idea/modules.xml b/.idea/modules.xml index 44a6847..53cf20e 100644 --- a/.idea/modules.xml +++ b/.idea/modules.xml @@ -5,6 +5,7 @@ + diff --git a/Crawler-Loadbalancer/Crawler-Loadbalancer.iml b/Crawler-Loadbalancer/Crawler-Loadbalancer.iml new file mode 100644 index 0000000..ad3c0a3 --- /dev/null +++ b/Crawler-Loadbalancer/Crawler-Loadbalancer.iml @@ -0,0 +1,9 @@ + + + + + + + + + \ No newline at end of file diff --git a/Crawler-Loadbalancer/requirements.txt b/Crawler-Loadbalancer/requirements.txt new file mode 100644 index 0000000..89437b8 --- /dev/null +++ b/Crawler-Loadbalancer/requirements.txt @@ -0,0 +1,2 @@ +pymysql +logging diff --git a/Crawler-Loadbalancer/sql.py b/Crawler-Loadbalancer/sql.py new file mode 100644 index 0000000..69e11f8 --- /dev/null +++ b/Crawler-Loadbalancer/sql.py @@ -0,0 +1,42 @@ +import pymysql +import os +import logging + + +def __getConnection__() -> pymysql.Connection: + """ + Opens a new pymysql connection and returns it + :return: A pymysql Connection object + """ + logger = logging.getLogger() + try: + conn = pymysql.connect( + user=os.environ['BETTERZON_CRAWLER_USER'], + password=os.environ['BETTERZON_CRAWLER_PASSWORD'], + host=os.environ['BETTERZON_CRAWLER_HOST'], + port=3306, + database=os.environ['BETTERZON_CRAWLER_DB'] + ) + + return conn + except pymysql.Error as e: + logger.error('SQL Connection error: %s', e) + return + + +def getShopsToCrawl() -> [int]: + """ + Queries the list of vendor IDs and returns them + :return: The list of IDs + """ + conn = __getConnection__() + cur = conn.cursor() + + query = 'SELECT vendor_id FROM vendors' + + cur.execute(query) + + # Extract the IDs from the returned tuples into a list + vendor_ids = list(map(lambda x: x[0], cur.fetchall())) + + return vendor_ids diff --git a/Crawler/Crawler.iml b/Crawler/Crawler.iml index db1dd1c..8568e2d 100644 --- a/Crawler/Crawler.iml +++ b/Crawler/Crawler.iml @@ -2,13 +2,13 @@ - + - + \ No newline at end of file diff --git a/Crawler/requirements.txt b/Crawler/requirements.txt index 04ca272..70a6f57 100644 --- a/Crawler/requirements.txt +++ b/Crawler/requirements.txt @@ -1,4 +1,5 @@ pymysql flask flask-sqlalchemy -flask_restful \ No newline at end of file +flask_restful +logging diff --git a/Crawler/sql.py b/Crawler/sql.py new file mode 100644 index 0000000..11c30c2 --- /dev/null +++ b/Crawler/sql.py @@ -0,0 +1,70 @@ +import logging + +import pymysql +import os + + +def __getConnection__() -> pymysql.Connection: + """ + Opens a new pymysql connection and returns it + :return: A pymysql Connection object + """ + logger = logging.getLogger() + try: + conn = pymysql.connect( + user=os.environ['BETTERZON_CRAWLER_USER'], + password=os.environ['BETTERZON_CRAWLER_PASSWORD'], + host=os.environ['BETTERZON_CRAWLER_HOST'], + port=3306, + database=os.environ['BETTERZON_CRAWLER_DB'] + ) + + return conn + except pymysql.Error as e: + logger.error('SQL Connection error: %s', e) + return + + +def getProductsForShop(vendor_id: int) -> [{}]: + """ + Queries the product links for all products of the given shop + :param vendor_id: The vendor / shop to query products for + :return: A list of product objects, each having the following parameters: + product_id, vendor_id, url + """ + conn = __getConnection__() + cur = conn.cursor() + + query = 'SELECT product_id, url FROM product_links WHERE vendor_id = %s' + + cur.execute(query, (vendor_id,)) + + products = list(map(lambda x: {'product_id': x[0], 'vendor_id': vendor_id, 'url': x[1]}, cur.fetchall())) + + return products + + +def insertShopData(data_to_insert: [tuple]) -> bool: + """ + Inserts the given list of tuples into the DB + :param dataToInsert: A list of tuples, where each tuple has to contain product id, vendor id and the price + in exactly this order + :return: If the insert was successful + """ + conn = __getConnection__() + cur = conn.cursor() + + query = 'INSERT INTO prices (product_id, vendor_id, price_in_cents, timestamp) VALUES (%s, %s, %s, NOW())' + + affectedRows = cur.executemany(query, data_to_insert) + + if affectedRows != len(data_to_insert): + # Something went wrong, revert the changes + conn.rollback() + else: + conn.commit() + + cur.close() + conn.close() + + return affectedRows == len(data_to_insert)