From 26ba21156aba2a6d8de8766a095460d5b0ecf533 Mon Sep 17 00:00:00 2001
From: henningxtro
Date: Wed, 19 May 2021 00:46:14 +0200
Subject: [PATCH] BETTERZON-58 (#53)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* BETTERZON-58: Basic functionality with scrapy
* Added independent crawler function, yielding the price
* Moved logic to amazon.py
* Moved scrapy files to unused folder
* Added basic Amazon crawler using beautifulsoup4
* Connected API to crawler
* Fixed string concatenation for the SQL statement in getProductLinksForProduct
* BETTERZON-58: Fixing SQL insert
* BETTERZON-58: Adding access key verification
* BETTERZON-58: Fixing the API endpoint of the crawler - the list of products in the API request was treated like a string, hence only the first product was crawled
* Added another selector for the price on Amazon (does not work for books)

Co-authored-by: root
Co-authored-by: Patrick Müller
Co-authored-by: Patrick <50352812+Mueller-Patrick@users.noreply.github.com>
---
 Crawler/Crawler.iml                                |   4 +-
 Crawler/api.py                                     |  15 +-
 Crawler/crawler.py                                 | 185 ++++++++++--------
 Crawler/crawler/spiders/amazon.py                  |  66 -------
 Crawler/requirements.txt                           |   6 +-
 Crawler/sql.py                                     |   1 -
 Crawler/unused/scrapy/amazonspider.py              |  33 ++++
 .../{ => unused/scrapy}/crawler/__init__.py        |   0
 Crawler/{ => unused/scrapy}/crawler/items.py       |   0
 .../scrapy}/crawler/middlewares.py                 |   0
 .../{ => unused/scrapy}/crawler/pipelines.py       |   0
 .../{ => unused/scrapy}/crawler/settings.py        |   0
 Crawler/{ => unused/scrapy}/scrapy.cfg             |   2 +-
 .../scrapy}/spiders/__init__.py                    |   0
 Crawler/unused/scrapy/spiders/amazon.py            |  25 +++
 15 files changed, 184 insertions(+), 153 deletions(-)
 delete mode 100644 Crawler/crawler/spiders/amazon.py
 create mode 100644 Crawler/unused/scrapy/amazonspider.py
 rename Crawler/{ => unused/scrapy}/crawler/__init__.py (100%)
 rename Crawler/{ => unused/scrapy}/crawler/items.py (100%)
 rename Crawler/{ => unused/scrapy}/crawler/middlewares.py (100%)
 rename Crawler/{ => unused/scrapy}/crawler/pipelines.py (100%)
 rename Crawler/{ => unused/scrapy}/crawler/settings.py (100%)
 rename Crawler/{ => unused/scrapy}/scrapy.cfg (92%)
 rename Crawler/{crawler => unused/scrapy}/spiders/__init__.py (100%)
 create mode 100644 Crawler/unused/scrapy/spiders/amazon.py

diff --git a/Crawler/Crawler.iml b/Crawler/Crawler.iml
index 8568e2d..db1dd1c 100644
--- a/Crawler/Crawler.iml
+++ b/Crawler/Crawler.iml
@@ -2,13 +2,13 @@
 [IntelliJ module XML; the tag content of this hunk was not recoverable]
\ No newline at end of file

diff --git a/Crawler/api.py b/Crawler/api.py
index 92617c4..7b7e0c2 100644
--- a/Crawler/api.py
+++ b/Crawler/api.py
@@ -1,13 +1,17 @@
+import os
+
 from flask import Flask
 from flask_restful import Resource, Api, reqparse
 
+import crawler
+
 app = Flask(__name__)
 api = Api(app)
 
 # To parse request data
 parser = reqparse.RequestParser()
-parser.add_argument('key')
-parser.add_argument('products')
+parser.add_argument('key', type=str)
+parser.add_argument('products', type=int, action='append')
 
 
 class CrawlerApi(Resource):
@@ -17,7 +21,12 @@ class CrawlerApi(Resource):
     def post(self):
         # Accept crawler request here
         args = parser.parse_args()
-        return args
+        access_key = os.getenv('CRAWLER_ACCESS_KEY')
+        if args['key'] == access_key:
+            crawler.crawl(args['products'])
+            return {'message': 'success'}
+        else:
+            return {'message': 'Wrong access key'}
 
 
 api.add_resource(CrawlerApi, '/')
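For reference, a minimal sketch of how a client would trigger this endpoint. The host and port are assumptions for a local Flask run; the key must match the CRAWLER_ACCESS_KEY environment variable on the server, and 'products' is declared with action='append', so each ID is sent as a repeated form field:

    import requests

    # hypothetical local deployment on Flask's default port
    response = requests.post(
        'http://localhost:5000/',
        data={'key': 'my-secret-key', 'products': [1, 2, 3]},
    )
    print(response.json())  # {'message': 'success'} when the key is correct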
diff --git a/Crawler/crawler.py b/Crawler/crawler.py
index 99ff867..45bd15a 100644
--- a/Crawler/crawler.py
+++ b/Crawler/crawler.py
@@ -1,78 +1,107 @@
-import sql
-
-
-def crawl(product_ids: [int]) -> dict:
-    """
-    Crawls the given list of products and saves the results to sql
-    :param products: The list of product IDs to fetch
-    :return: A dict with the following fields:
-        total_crawls: number of total crawl tries (products * vendors per product)
-        successful_crawls: number of successful products
-        products_with_problems: list of products that have not been crawled successfully
-    """
-    total_crawls = 0
-    successful_crawls = 0
-    products_with_problems = []
-
-    # Iterate over every product that has to be crawled
-    for product_id in product_ids:
-        # Get all links for this product
-        product_links = sql.getProductLinksForProduct(product_id)
-
-        crawled_data = []
-
-        # Iterate over every link / vendor
-        for product_vendor_info in product_links:
-            total_crawls += 1
-
-            # Call the appropriate vendor crawling function and append the result to the list of crawled data
-            if product_vendor_info['vendor_id'] == 1:
-                # Amazon
-                crawled_data.append(__crawl_amazon__(product_vendor_info))
-            elif product_vendor_info['vendor_id'] == 2:
-                # Apple
-                crawled_data.append(__crawl_apple__(product_vendor_info))
-            elif product_vendor_info['vendor_id'] == 3:
-                # Media Markt
-                crawled_data.append(__crawl_mediamarkt__(product_vendor_info))
-            else:
-                products_with_problems.append(product_vendor_info)
-                continue
-
-            successful_crawls += 1
-
-        # Insert data to SQL
-        sql.insertData(crawled_data)
-
-    return {
-        'total_crawls': total_crawls,
-        'successful_crawls': successful_crawls,
-        'products_with_problems': products_with_problems
-    }
-
-
-def __crawl_amazon__(product_info: dict) -> tuple:
-    """
-    Crawls the price for the given product from amazon
-    :param product_info: A dict with product info containing product_id, vendor_id, url
-    :return: A tuple with the crawled data, containing (product_id, vendor_id, price_in_cents)
-    """
-    return (product_info['product_id'], product_info['vendor_id'], 123)
-
-
-def __crawl_apple__(product_info: dict) -> tuple:
-    """
-    Crawls the price for the given product from apple
-    :param product_info: A dict with product info containing product_id, vendor_id, url
-    :return: A tuple with the crawled data, containing (product_id, vendor_id, price_in_cents)
-    """
-    return (product_info['product_id'], product_info['vendor_id'], 123)
-
-
-def __crawl_mediamarkt__(product_info: dict) -> tuple:
-    """
-    Crawls the price for the given product from media markt
-    :param product_info: A dict with product info containing product_id, vendor_id, url
-    :return: A tuple with the crawled data, containing (product_id, vendor_id, price_in_cents)
-    """
-    pass
+import sql
+import requests
+from bs4 import BeautifulSoup
+
+HEADERS = ({'User-Agent':
+                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 '
+                'Safari/537.36'})
+
+
+def crawl(product_ids: [int]) -> dict:
+    """
+    Crawls the given list of products and saves the results to sql
+    :param product_ids: The list of product IDs to fetch
+    :return: A dict with the following fields:
+        total_crawls: number of total crawl tries (products * vendors per product)
+        successful_crawls: number of successful products
+        products_with_problems: list of products that have not been crawled successfully
+    """
+    total_crawls = 0
+    successful_crawls = 0
+    products_with_problems = []
+
+    # Iterate over every product that has to be crawled
+    for product_id in product_ids:
+        # Get all links for this product
+        product_links = sql.getProductLinksForProduct(product_id)
+
+        crawled_data = []
+
+        # Iterate over every link / vendor
+        for product_vendor_info in product_links:
+            total_crawls += 1
+
+            # Call the appropriate vendor crawling function
+            if product_vendor_info['vendor_id'] == 1:
+                # Amazon
+                data = __crawl_amazon__(product_vendor_info)
+            elif product_vendor_info['vendor_id'] == 2:
+                # Apple
+                data = __crawl_apple__(product_vendor_info)
+            elif product_vendor_info['vendor_id'] == 3:
+                # Media Markt
+                data = __crawl_mediamarkt__(product_vendor_info)
+            else:
+                products_with_problems.append(product_vendor_info)
+                continue
+
+            # Only count the crawl as successful if a price was actually found
+            if data:
+                crawled_data.append(data)
+                successful_crawls += 1
+            else:
+                products_with_problems.append(product_vendor_info)
+
+        # Insert data to SQL
+        sql.insertData(crawled_data)
+
+    return {
+        'total_crawls': total_crawls,
+        'successful_crawls': successful_crawls,
+        'products_with_problems': products_with_problems
+    }
+
+
+def __crawl_amazon__(product_info: dict) -> tuple:
+    """
+    Crawls the price for the given product from amazon
+    :param product_info: A dict with product info containing product_id, vendor_id, url
+    :return: A tuple with the crawled data, containing (product_id, vendor_id, price_in_cents)
+    """
+    page = requests.get(product_info['url'], headers=HEADERS)
+    soup = BeautifulSoup(page.content, features="lxml")
+
+    # Try the regular price selector first, then the buybox price (neither works for books)
+    price_tag = soup.find(id='priceblock_ourprice') or soup.find(id='price_inside_buybox')
+    try:
+        price = int(price_tag.get_text().replace(".", "").replace(",", "").replace("€", "").strip())
+    except (AttributeError, ValueError):
+        # AttributeError: no price tag found; ValueError: the text is not a plain number
+        price = -1
+
+    if price != -1:
+        return (product_info['product_id'], product_info['vendor_id'], price)
+    else:
+        return None
+
+
+def __crawl_apple__(product_info: dict) -> tuple:
+    """
+    Crawls the price for the given product from apple
+    :param product_info: A dict with product info containing product_id, vendor_id, url
+    :return: A tuple with the crawled data, containing (product_id, vendor_id, price_in_cents)
+    """
+    # Not implemented yet
+    pass
+
+
+def __crawl_mediamarkt__(product_info: dict) -> tuple:
+    """
+    Crawls the price for the given product from media markt
+    :param product_info: A dict with product info containing product_id, vendor_id, url
+    :return: A tuple with the crawled data, containing (product_id, vendor_id, price_in_cents)
+    """
+    # Not implemented yet
+    pass
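The Amazon parser derives the cent amount purely from string replacement. A quick sketch of how a German-locale price string collapses to cents under that chain (the sample value is illustrative):

    text = "1.234,56 €"
    # drop the thousands separator, the decimal comma and the currency sign, then trim
    cents = int(text.replace(".", "").replace(",", "").replace("€", "").strip())
    print(cents)  # 123456, i.e. 1234.56 EUR expressed in cents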
diff --git a/Crawler/crawler/spiders/amazon.py b/Crawler/crawler/spiders/amazon.py
deleted file mode 100644
index 12ea3d5..0000000
--- a/Crawler/crawler/spiders/amazon.py
+++ /dev/null
@@ -1,66 +0,0 @@
-# -*- coding: utf-8 -*-
-import scrapy
-from urllib.parse import urlencode
-from urllib.parse import urljoin
-import re
-import json
-
-queries = ['iphone']
-API = ''
-
-
-def get_url(url):
-    payload = {'api_key': API, 'url': url, 'country_code': 'us'}
-    proxy_url = 'http://api.scraperapi.com/?' + urlencode(payload)
-    return proxy_url
-
-
-class AmazonSpider(scrapy.Spider):
-    name = 'amazon'
-
-    def start_requests(self):
-        for query in queries:
-            url = 'https://www.amazon.de/s?' + urlencode({'k': query})
-            yield scrapy.Request(url=url, callback=self.parse_keyword_response)
-
-    def parse_keyword_response(self, response):
-        products = response.xpath('//*[@data-asin]')
-
-        for product in products:
-            asin = product.xpath('@data-asin').extract_first()
-            product_url = f"https://www.amazon.de/dp/{asin}"
-            yield scrapy.Request(url=product_url, callback=self.parse_product_page, meta={'asin': asin})
-
-        next_page = response.xpath('//li[@class="a-last"]/a/@href').extract_first()
-        if next_page:
-            url = urljoin("https://www.amazon.de", next_page)
-            yield scrapy.Request(url=url, callback=self.parse_keyword_response)
-
-    def parse_product_page(self, response):
-        asin = response.meta['asin']
-        title = response.xpath('//*[@id="productTitle"]/text()').extract_first()
-        image = re.search('"large":"(.*?)"', response.text).groups()[0]
-        rating = response.xpath('//*[@id="acrPopover"]/@title').extract_first()
-        number_of_reviews = response.xpath('//*[@id="acrCustomerReviewText"]/text()').extract_first()
-        price = response.xpath('//*[@id="priceblock_ourprice"]/text()').extract_first()
-
-        if not price:
-            price = response.xpath('//*[@data-asin-price]/@data-asin-price').extract_first() or \
-                    response.xpath('//*[@id="price_inside_buybox"]/text()').extract_first()
-
-        temp = response.xpath('//*[@id="twister"]')
-        sizes = []
-        colors = []
-        if temp:
-            s = re.search('"variationValues" : ({.*})', response.text).groups()[0]
-            json_acceptable = s.replace("'", "\"")
-            di = json.loads(json_acceptable)
-            sizes = di.get('size_name', [])
-            colors = di.get('color_name', [])
-
-        bullet_points = response.xpath('//*[@id="feature-bullets"]//li/span/text()').extract()
-        seller_rank = response.xpath(
-            '//*[text()="Amazon Best Sellers Rank:"]/parent::*//text()[not(parent::style)]').extract()
-        yield {'asin': asin, 'Title': title, 'MainImage': image, 'Rating': rating, 'NumberOfReviews': number_of_reviews,
-               'Price': price, 'AvailableSizes': sizes, 'AvailableColors': colors, 'BulletPoints': bullet_points,
-               'SellerRank': seller_rank}
diff --git a/Crawler/requirements.txt b/Crawler/requirements.txt
index ba99df1..a704f27 100644
--- a/Crawler/requirements.txt
+++ b/Crawler/requirements.txt
@@ -1,5 +1,7 @@
 pymysql
-flask
+flask==1.1.2
 flask-sqlalchemy
 flask_restful
-scrapy
\ No newline at end of file
+beautifulsoup4
+requests
+lxml
\ No newline at end of file
diff --git a/Crawler/sql.py b/Crawler/sql.py
index 1cf3a58..c1b2669 100644
--- a/Crawler/sql.py
+++ b/Crawler/sql.py
@@ -54,7 +54,6 @@ def getProductLinksForProduct(product_id: int) -> [dict]:
     cur = conn.cursor()
 
     query = 'SELECT vendor_id, url FROM product_links WHERE product_id = %s'
-
     cur.execute(query, (product_id,))
 
     products = list(map(lambda x: {'product_id': product_id, 'vendor_id': x[0], 'url': x[1]}, cur.fetchall()))
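getProductLinksForProduct maps each result row onto the dict that the vendor crawlers consume. A sketch of the shape the crawler loop expects, assuming one row per vendor that lists the product (IDs and URLs are illustrative):

    # one entry per vendor link for product 42
    product_links = [
        {'product_id': 42, 'vendor_id': 1, 'url': 'https://www.amazon.de/dp/B083DRCPJG'},
        {'product_id': 42, 'vendor_id': 2, 'url': 'https://www.apple.com/de/shop/buy-iphone'},
    ]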
diff --git a/Crawler/unused/scrapy/amazonspider.py b/Crawler/unused/scrapy/amazonspider.py
new file mode 100644
index 0000000..5f88e20
--- /dev/null
+++ b/Crawler/unused/scrapy/amazonspider.py
@@ -0,0 +1,33 @@
+import scrapy
+from scrapy.crawler import CrawlerProcess
+import re
+
+
+class AmazonSpider(scrapy.Spider):
+    name = 'amazon'
+    allowed_domains = ['amazon.de']
+    start_urls = ['https://amazon.de/dp/B083DRCPJG']
+
+    # def __init__(self, start_urls):
+    #     self.start_urls = start_urls
+
+    def parse(self, response):
+        price = response.xpath('//*[@id="priceblock_ourprice"]/text()').extract_first()
+        if not price:
+            price = response.xpath('//*[@data-asin-price]/@data-asin-price').extract_first() or \
+                    response.xpath('//*[@id="price_inside_buybox"]/text()').extract_first()
+
+        # Strip thousands separators so prices of 1.000 € and above also match
+        price = price.replace('.', '')
+        euros = re.match(r'(\d*),\d\d', price).group(1)
+        cents = re.match(r'\d*,(\d\d)', price).group(1)
+        priceincents = euros + cents
+
+        yield {'price': priceincents}
+
+
+def start_crawling():
+    process = CrawlerProcess(
+        settings={'COOKIES_ENABLED': 'False', 'CONCURRENT_REQUESTS_PER_IP': 1, 'ROBOTSTXT_OBEY': False,
+                  'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36',
+                  'DOWNLOAD_DELAY': 3},
+        install_root_handler=False)
+    process.crawl(AmazonSpider)  # the spider class must be passed; crawl() without it schedules nothing
+    process.start()
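A minimal usage sketch for this retired scrapy variant. Note that CrawlerProcess.start() blocks until the crawl finishes, and with these settings the yielded items only show up in scrapy's log output unless a feed export or item pipeline is configured:

    from amazonspider import start_crawling

    # runs AmazonSpider against its hard-coded start_urls with a 3s download delay
    start_crawling()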
response.xpath('//*[@id="price_inside_buybox"]/text()').extract_first() + + euros = re.match('(\d*),\d\d', price).group(1) + cents = re.match('\d*,(\d\d)', price).group(1) + priceincents = euros + cents + + yield {'price': priceincents} + + +def start_crawling(): + process = CrawlerProcess( + settings={'COOKIES_ENABLED': 'False', 'CONCURRENT_REQUESTS_PER_IP': 1, 'ROBOTSTXT_OBEY': False, + 'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36', + 'DOWNLOAD_DELAY': 3} + , install_root_handler=False) + process.crawl() + process.start() diff --git a/Crawler/crawler/__init__.py b/Crawler/unused/scrapy/crawler/__init__.py similarity index 100% rename from Crawler/crawler/__init__.py rename to Crawler/unused/scrapy/crawler/__init__.py diff --git a/Crawler/crawler/items.py b/Crawler/unused/scrapy/crawler/items.py similarity index 100% rename from Crawler/crawler/items.py rename to Crawler/unused/scrapy/crawler/items.py diff --git a/Crawler/crawler/middlewares.py b/Crawler/unused/scrapy/crawler/middlewares.py similarity index 100% rename from Crawler/crawler/middlewares.py rename to Crawler/unused/scrapy/crawler/middlewares.py diff --git a/Crawler/crawler/pipelines.py b/Crawler/unused/scrapy/crawler/pipelines.py similarity index 100% rename from Crawler/crawler/pipelines.py rename to Crawler/unused/scrapy/crawler/pipelines.py diff --git a/Crawler/crawler/settings.py b/Crawler/unused/scrapy/crawler/settings.py similarity index 100% rename from Crawler/crawler/settings.py rename to Crawler/unused/scrapy/crawler/settings.py diff --git a/Crawler/scrapy.cfg b/Crawler/unused/scrapy/scrapy.cfg similarity index 92% rename from Crawler/scrapy.cfg rename to Crawler/unused/scrapy/scrapy.cfg index 83a4eef..9c0c1bc 100644 --- a/Crawler/scrapy.cfg +++ b/Crawler/unused/scrapy/scrapy.cfg @@ -8,4 +8,4 @@ default = crawler.settings [deploy] #url = http://localhost:6800/ -project = crawler +project = crawler \ No newline at end of file diff --git a/Crawler/crawler/spiders/__init__.py b/Crawler/unused/scrapy/spiders/__init__.py similarity index 100% rename from Crawler/crawler/spiders/__init__.py rename to Crawler/unused/scrapy/spiders/__init__.py diff --git a/Crawler/unused/scrapy/spiders/amazon.py b/Crawler/unused/scrapy/spiders/amazon.py new file mode 100644 index 0000000..c74196b --- /dev/null +++ b/Crawler/unused/scrapy/spiders/amazon.py @@ -0,0 +1,25 @@ +import scrapy +import re + +class AmazonSpider(scrapy.Spider): + name = 'amazon' + allowed_domains = ['amazon.de'] + start_urls = ['https://amazon.de/dp/B083DRCPJG'] + + def parse(self, response): + price = response.xpath('//*[@id="priceblock_ourprice"]/text()').extract_first() + if not price: + price = response.xpath('//*[@data-asin-price]/@data-asin-price').extract_first() or \ + response.xpath('//*[@id="price_inside_buybox"]/text()').extract_first() + + euros = re.match('(\d*),\d\d', price).group(1) + cents = re.match('\d*,(\d\d)', price).group(1) + priceincents = euros + cents + + yield {'price': priceincents} + + + + + +