mirror of
				https://github.com/Mueller-Patrick/Betterzon.git
				synced 2025-11-04 10:35:48 +00:00 
			
		
		
		
	* BETTERZON-58: Basic Functionality with scrapy * Added independent crawler function, yielding price * moved logic to amazon.py * . * moved scrapy files to unused folder * Added basic amazon crawler using beautifulsoup4 * Connected Api to Crawler * Fixed string concatenation for sql statement in getProductLinksForProduct * BETTERZON-58: Fixing SQL insert * BETTERZON-58: Adding access key verification * BETTERZON-58: Fixing API endpoint of the crawler - The list of products in the API request was treated like a string and henceforth, only the first product has been crawled * Added another selector for price on amazon (does not work for books) Co-authored-by: root <root@DESKTOP-ARBPL82.localdomain> Co-authored-by: Patrick Müller <patrick@mueller-patrick.tech> Co-authored-by: Patrick <50352812+Mueller-Patrick@users.noreply.github.com>
		
			
				
	
	
		
			34 lines
		
	
	
		
			1.2 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			34 lines
		
	
	
		
			1.2 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
import scrapy
 | 
						|
from scrapy.crawler import CrawlerProcess
 | 
						|
import re
 | 
						|
 | 
						|
class AmazonSpider(scrapy.Spider):
    """Spider that scrapes the price of a product from an amazon.de product page.

    Yields one item per page: ``{'price': <str>}`` where the value is the
    price in euro cents as a digit string (e.g. "29,99 €" -> "2999").
    """

    name = 'amazon'
    allowed_domains = ['amazon.de']
    # Hard-coded demo ASIN; the commented-out __init__ below shows the
    # intended way to pass product URLs in from the caller.
    start_urls = ['https://amazon.de/dp/B083DRCPJG']

    # def __init__(self, start_urls):
    #   self.start_urls = start_urls

    # Matches a German-formatted price, e.g. "29,99" or "1.299,99":
    # group 1 = euros (may contain '.' thousands separators), group 2 = cents.
    PRICE_RE = re.compile(r'(\d{1,3}(?:\.\d{3})*|\d+),(\d{2})')

    def parse(self, response):
        """Extract the product price from *response*.

        Tries three known Amazon price selectors in order. Yields nothing
        (instead of crashing) when no selector matches or the price text
        is in an unexpected format.
        """
        price = response.xpath('//*[@id="priceblock_ourprice"]/text()').extract_first()
        if not price:
            price = response.xpath('//*[@data-asin-price]/@data-asin-price').extract_first() or \
                    response.xpath('//*[@id="price_inside_buybox"]/text()').extract_first()

        # Bug fix: the original called re.match(...).group(1) directly, which
        # raised AttributeError when no selector matched (price is None) or
        # the text did not contain a price; it also mis-parsed prices with a
        # thousands separator ("1.299,99" -> euros "1").
        if not price:
            return

        match = self.PRICE_RE.search(price)
        if not match:
            return

        euros = match.group(1).replace('.', '')  # drop thousands separators
        cents = match.group(2)
        priceincents = euros + cents  # digit-string concat: "29" + "99" -> "2999"

        yield {'price': priceincents}
def start_crawling():
    """Run the Amazon spider in a blocking :class:`CrawlerProcess`.

    Blocks until crawling completes — ``process.start()`` runs the Twisted
    reactor, which can only be started once per OS process. ROBOTSTXT_OBEY
    is deliberately disabled and a browser User-Agent is spoofed; the
    3-second DOWNLOAD_DELAY throttles requests to avoid blocking.
    """
    process = CrawlerProcess(
        settings={'COOKIES_ENABLED': 'False', 'CONCURRENT_REQUESTS_PER_IP': 1, 'ROBOTSTXT_OBEY': False,
                  'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36',
                  'DOWNLOAD_DELAY': 3}
        , install_root_handler=False)
    # Bug fix: crawl() was called with no spider class, so the process had
    # nothing to schedule and exited without crawling anything.
    process.crawl(AmazonSpider)
    process.start()