mirror of
https://github.com/Mueller-Patrick/Betterzon.git
synced 2024-12-22 19:55:12 +00:00
moved logic to amazon.py
This commit is contained in:
parent
d2a4d93f54
commit
0a11b2b453
33
Crawler/amazonspider.py
Normal file
33
Crawler/amazonspider.py
Normal file
|
@ -0,0 +1,33 @@
|
||||||
|
import scrapy
|
||||||
|
from scrapy.crawler import CrawlerProcess
|
||||||
|
import re
|
||||||
|
|
||||||
|
class AmazonSpider(scrapy.Spider):
|
||||||
|
name = 'amazon'
|
||||||
|
allowed_domains = ['amazon.de']
|
||||||
|
start_urls = ['https://amazon.de/dp/B083DRCPJG']
|
||||||
|
|
||||||
|
# def __init__(self, start_urls):
|
||||||
|
# self.start_urls = start_urls
|
||||||
|
|
||||||
|
def parse(self, response):
|
||||||
|
price = response.xpath('//*[@id="priceblock_ourprice"]/text()').extract_first()
|
||||||
|
if not price:
|
||||||
|
price = response.xpath('//*[@data-asin-price]/@data-asin-price').extract_first() or \
|
||||||
|
response.xpath('//*[@id="price_inside_buybox"]/text()').extract_first()
|
||||||
|
|
||||||
|
euros = re.match('(\d*),\d\d', price).group(1)
|
||||||
|
cents = re.match('\d*,(\d\d)', price).group(1)
|
||||||
|
priceincents = euros + cents
|
||||||
|
|
||||||
|
yield {'price': priceincents}
|
||||||
|
|
||||||
|
|
||||||
|
def start_crawling():
|
||||||
|
process = CrawlerProcess(
|
||||||
|
settings={'COOKIES_ENABLED': 'False', 'CONCURRENT_REQUESTS_PER_IP': 1, 'ROBOTSTXT_OBEY': False,
|
||||||
|
'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36',
|
||||||
|
'DOWNLOAD_DELAY': 3}
|
||||||
|
, install_root_handler=False)
|
||||||
|
process.crawl()
|
||||||
|
process.start()
|
Loading…
Reference in New Issue
Block a user