Betterzon/Crawler/unused/scrapy/amazonspider.py

import scrapy
from scrapy.crawler import CrawlerProcess
import re

class AmazonSpider(scrapy.Spider):
    """Spider that extracts the price of an amazon.de product page.

    For every response, the price text is looked up with a list of XPath
    selectors and converted to a string of cents (e.g. ``'12,99 €'``
    becomes ``'1299'``), which is yielded as ``{'price': <cents>}``.
    """

    name = 'amazon'
    allowed_domains = ['amazon.de']
    start_urls = ['https://amazon.de/dp/B083DRCPJG']

    # def __init__(self, start_urls):
    #   self.start_urls = start_urls

    # XPath expressions tried in order; the first one that produces a
    # non-empty value wins.
    price_xpaths = (
        '//*[@id="priceblock_ourprice"]/text()',
        '//*[@data-asin-price]/@data-asin-price',
        '//*[@id="price_inside_buybox"]/text()',
    )

    # Price with a comma as decimal separator and exactly two cent digits.
    # The euro part may contain '.' as thousands separator ('1.299,00').
    _PRICE_RE = re.compile(r'([\d.]*),(\d\d)')

    @classmethod
    def _price_to_cents(cls, price):
        """Convert a price text to a string of cents.

        Args:
            price: Raw text taken from the page, e.g. ``'EUR 1.299,00'``.
                May be ``None`` or empty.

        Returns:
            The euro digits followed by the two cent digits (``'129900'``),
            or ``None`` if ``price`` is empty or contains no price of the
            form ``<euros>,<cents>``.
        """
        if not price:
            return None
        # search() instead of match(): the price may be preceded by
        # whitespace or a currency marker.
        found = cls._PRICE_RE.search(price)
        if found is None:
            return None
        euros, cents = found.groups()
        # Drop thousands separators so only digits remain.
        return euros.replace('.', '') + cents

    def parse(self, response):
        """Yield the product price found in ``response``.

        Args:
            response: The Scrapy response of a product page.

        Yields:
            A dict ``{'price': <price in cents as str>}``. Nothing is
            yielded (and a warning is logged) when no selector matches or
            the matched text cannot be interpreted as a price.
        """
        price = None
        for xpath in self.price_xpaths:
            price = response.xpath(xpath).extract_first()
            if price:
                break

        if not price:
            self.logger.warning('No price found on %s', response.url)
            return

        priceincents = self._price_to_cents(price)
        if priceincents is None:
            self.logger.warning(
                'Could not parse price %r on %s', price, response.url)
            return

        yield {'price': priceincents}


def start_crawling():
    """Run ``AmazonSpider`` in a Scrapy ``CrawlerProcess``.

    The process is configured with cookies disabled, at most one
    concurrent request per IP, robots.txt checks turned off, a desktop
    Chrome user agent and a download delay of 3 seconds. Scrapy's root
    log handler is not installed.

    ``process.start()`` starts the crawl; per the Scrapy documentation it
    blocks until all scheduled crawls have finished.
    """
    process = CrawlerProcess(
        settings={
            # A real boolean instead of the string 'False'.
            'COOKIES_ENABLED': False,
            'CONCURRENT_REQUESTS_PER_IP': 1,
            'ROBOTSTXT_OBEY': False,
            'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36',
            'DOWNLOAD_DELAY': 3,
        },
        install_root_handler=False,
    )
    # crawl() requires the spider class (or a Crawler) to run; calling it
    # without an argument raises TypeError.
    process.crawl(AmazonSpider)
    process.start()
BETTERZON-58 (#53) * BETTERZON-58: Basic Functionality with scrapy * Added independent crawler function, yielding price * moved logic to amazon.py * . * moved scrapy files to unused folder * Added basic amazon crawler using beautifulsoup4 * Connected Api to Crawler * Fixed string concatenation for sql statement in getProductLinksForProduct * BETTERZON-58: Fixing SQL insert * BETTERZON-58: Adding access key verification * BETTERZON-58: Fixing API endpoint of the crawler - The list of products in the API request was treated like a string and henceforth, only the first product has been crawled * Added another selector for price on amazon (does not work for books) Co-authored-by: root <root@DESKTOP-ARBPL82.localdomain> Co-authored-by: Patrick Müller <patrick@mueller-patrick.tech> Co-authored-by: Patrick <50352812+Mueller-Patrick@users.noreply.github.com> 2021-05-18 22:46:14 +00:00			`import scrapy`
			`from scrapy.crawler import CrawlerProcess`
			`import re`

			`class AmazonSpider(scrapy.Spider):`
			`name = 'amazon'`
			`allowed_domains = ['amazon.de']`
			`start_urls = ['https://amazon.de/dp/B083DRCPJG']`

			`# def __init__(self, start_urls):`
			`# self.start_urls = start_urls`

			`def parse(self, response):`
			`price = response.xpath('//*[@id="priceblock_ourprice"]/text()').extract_first()`
			`if not price:`
			`price = response.xpath('//*[@data-asin-price]/@data-asin-price').extract_first() or \`
			`response.xpath('//*[@id="price_inside_buybox"]/text()').extract_first()`

			`euros = re.match('(\d*),\d\d', price).group(1)`
			`cents = re.match('\d*,(\d\d)', price).group(1)`
			`priceincents = euros + cents`

			`yield {'price': priceincents}`


			`def start_crawling():`
			`process = CrawlerProcess(`
			`settings={'COOKIES_ENABLED': 'False', 'CONCURRENT_REQUESTS_PER_IP': 1, 'ROBOTSTXT_OBEY': False,`
			`'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36',`
			`'DOWNLOAD_DELAY': 3}`
			`, install_root_handler=False)`
			`process.crawl()`
			`process.start()`