diff --git a/Crawler/crawler/amazonspider.py b/Crawler/crawler/amazonspider.py new file mode 100644 index 0000000..e69de29 diff --git a/Crawler/crawler/spiders/amazon.py b/Crawler/crawler/spiders/amazon.py deleted file mode 100644 index 12ea3d5..0000000 --- a/Crawler/crawler/spiders/amazon.py +++ /dev/null @@ -1,66 +0,0 @@ -# -*- coding: utf-8 -*- -import scrapy -from urllib.parse import urlencode -from urllib.parse import urljoin -import re -import json - -queries = ['iphone'] -API = '' - - -def get_url(url): - payload = {'api_key': API, 'url': url, 'country_code': 'us'} - proxy_url = 'http://api.scraperapi.com/?' + urlencode(payload) - return proxy_url - - -class AmazonSpider(scrapy.Spider): - name = 'amazon' - - def start_requests(self): - for query in queries: - url = 'https://www.amazon.de/s?' + urlencode({'k': query}) - yield scrapy.Request(url=url, callback=self.parse_keyword_response) - - def parse_keyword_response(self, response): - products = response.xpath('//*[@data-asin]') - - for product in products: - asin = product.xpath('@data-asin').extract_first() - product_url = f"https://www.amazon.de/dp/{asin}" - yield scrapy.Request(url=product_url, callback=self.parse_product_page, meta={'asin': asin}) - - next_page = response.xpath('//li[@class="a-last"]/a/@href').extract_first() - if next_page: - url = urljoin("https://www.amazon.de", next_page) - yield scrapy.Request(url=url, callback=self.parse_keyword_response) - - def parse_product_page(self, response): - asin = response.meta['asin'] - title = response.xpath('//*[@id="productTitle"]/text()').extract_first() - image = re.search('"large":"(.*?)"', response.text).groups()[0] - rating = response.xpath('//*[@id="acrPopover"]/@title').extract_first() - number_of_reviews = response.xpath('//*[@id="acrCustomerReviewText"]/text()').extract_first() - price = response.xpath('//*[@id="priceblock_ourprice"]/text()').extract_first() - - if not price: - price = response.xpath('//*[@data-asin-price]/@data-asin-price').extract_first() or \ - response.xpath('//*[@id="price_inside_buybox"]/text()').extract_first() - - temp = response.xpath('//*[@id="twister"]') - sizes = [] - colors = [] - if temp: - s = re.search('"variationValues" : ({.*})', response.text).groups()[0] - json_acceptable = s.replace("'", "\"") - di = json.loads(json_acceptable) - sizes = di.get('size_name', []) - colors = di.get('color_name', []) - - bullet_points = response.xpath('//*[@id="feature-bullets"]//li/span/text()').extract() - seller_rank = response.xpath( - '//*[text()="Amazon Best Sellers Rank:"]/parent::*//text()[not(parent::style)]').extract() - yield {'asin': asin, 'Title': title, 'MainImage': image, 'Rating': rating, 'NumberOfReviews': number_of_reviews, - 'Price': price, 'AvailableSizes': sizes, 'AvailableColors': colors, 'BulletPoints': bullet_points, - 'SellerRank': seller_rank}