# Crawler/amazonspider.py
# Reconstructed from a flattened git patch: commit 0a11b2b4 by henningxtro,
# 2021-05-16, "moved logic to amazon.py" (creates this file and deletes the
# empty Crawler/crawler/amazonspider.py).
"""Scrapy spider that reads the price of an amazon.de product page."""
import re

import scrapy
from scrapy.crawler import CrawlerProcess

# German notation: optional dot-grouped thousands, comma, two decimals
# ("29,99", "1.299,00", ",99").
_COMMA_PRICE = re.compile(r'(\d{1,3}(?:\.\d{3})+|\d*),(\d\d)')
# Dot-decimal notation ("29.99", "1299.0"). The look-ahead rejects values
# such as "1.299", where the dot may be a thousands separator.
_DOT_PRICE = re.compile(r'(\d+)\.(\d{1,2})(?!\d)')

# Candidate price locations, tried in this order; the first one that yields a
# parsable price wins.
_PRICE_XPATHS = (
    '//*[@id="priceblock_ourprice"]/text()',
    '//*[@data-asin-price]/@data-asin-price',
    '//*[@id="price_inside_buybox"]/text()',
)


def _price_to_cents(price):
    """Convert a scraped price string to a string of digits in cents.

    Args:
        price: Raw text such as ``"29,99 €"``, ``"1.299,00 €"`` or
            ``"29.99"``; may be ``None`` or empty.

    Returns:
        The euro digits followed by two cent digits (``"2999"``), or ``None``
        when no price can be recognised in ``price``.
    """
    if not price:
        return None

    found = _COMMA_PRICE.search(price)
    if found:
        # Drop thousands separators so "1.299" becomes "1299".
        return found.group(1).replace('.', '') + found.group(2)

    found = _DOT_PRICE.search(price)
    if found:
        # Pad a single decimal digit ("1299.0") to two cent digits.
        return found.group(1) + found.group(2).ljust(2, '0')

    return None


class AmazonSpider(scrapy.Spider):
    """Spider that yields the price found on each start URL.

    ``start_urls`` may be overridden per crawl by passing it as a spider
    argument (see ``start_crawling``).
    """

    name = 'amazon'
    allowed_domains = ['amazon.de']
    start_urls = ['https://amazon.de/dp/B083DRCPJG']

    def parse(self, response):
        """Yield ``{'price': <cents as digit string>}`` for a product page.

        Each XPath in ``_PRICE_XPATHS`` is tried in turn. If none of them
        produces a parsable price, a warning is logged and nothing is
        yielded.
        """
        for xpath in _PRICE_XPATHS:
            cents = _price_to_cents(response.xpath(xpath).extract_first())
            if cents is not None:
                yield {'price': cents}
                return

        self.logger.warning('No parsable price found on %s', response.url)


def start_crawling(start_urls=None):
    """Run ``AmazonSpider`` in a new ``CrawlerProcess`` until it finishes.

    Args:
        start_urls: Optional list of product URLs to crawl instead of the
            spider's default ``start_urls``.
    """
    process = CrawlerProcess(
        settings={
            'COOKIES_ENABLED': False,
            'CONCURRENT_REQUESTS_PER_IP': 1,
            'ROBOTSTXT_OBEY': False,
            'USER_AGENT': (
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                'AppleWebKit/537.36 (KHTML, like Gecko) '
                'Chrome/81.0.4044.138 Safari/537.36'
            ),
            'DOWNLOAD_DELAY': 3,
        },
        install_root_handler=False,
    )

    # Keyword arguments to crawl() are forwarded to the spider constructor,
    # which stores them as instance attributes.
    spider_kwargs = {}
    if start_urls is not None:
        spider_kwargs['start_urls'] = list(start_urls)

    # crawl() needs the spider class; calling it without one raises TypeError.
    process.crawl(AmazonSpider, **spider_kwargs)
    process.start()