mirror of
https://github.com/Mueller-Patrick/Betterzon.git
synced 2026-04-26 23:30:11 +00:00
BETTERZON-58 (#53)
* BETTERZON-58: Basic Functionality with scrapy * Added independent crawler function, yielding price * moved logic to amazon.py * . * moved scrapy files to unused folder * Added basic amazon crawler using beautifulsoup4 * Connected Api to Crawler * Fixed string concatenation for sql statement in getProductLinksForProduct * BETTERZON-58: Fixing SQL insert * BETTERZON-58: Adding access key verification * BETTERZON-58: Fixing API endpoint of the crawler - The list of products in the API request was treated like a string and henceforth, only the first product has been crawled * Added another selector for price on amazon (does not work for books) Co-authored-by: root <root@DESKTOP-ARBPL82.localdomain> Co-authored-by: Patrick Müller <patrick@mueller-patrick.tech> Co-authored-by: Patrick <50352812+Mueller-Patrick@users.noreply.github.com>
This commit is contained in:
@@ -0,0 +1,4 @@
|
||||
# This package will contain the spiders of your Scrapy project
|
||||
#
|
||||
# Please refer to the documentation for information on how to create and manage
|
||||
# your spiders.
|
||||
@@ -0,0 +1,25 @@
|
||||
import scrapy
|
||||
import re
|
||||
|
||||
class AmazonSpider(scrapy.Spider):
    """Spider that reads the price of a single amazon.de product page.

    For every crawled page, ``parse`` yields ``{'price': <str>}`` where the
    value is the price in cents, written as a string of digits (the euro
    digits immediately followed by the two cent digits).
    """

    name = 'amazon'
    allowed_domains = ['amazon.de']
    start_urls = ['https://amazon.de/dp/B083DRCPJG']

    # XPath queries tried in order; the first one that yields a non-empty
    # value wins.
    _PRICE_QUERIES = (
        '//*[@id="priceblock_ourprice"]/text()',
        '//*[@data-asin-price]/@data-asin-price',
        '//*[@id="price_inside_buybox"]/text()',
    )

    # A comma-decimal price such as "12,99" or "1.299,00". Dots in the euro
    # part are thousands separators and are stripped before use.
    _PRICE_RE = re.compile(r'([\d.]*),(\d\d)')

    def parse(self, response):
        """Extract the product price from *response*.

        Yields:
            dict: ``{'price': priceincents}`` with ``priceincents`` being the
            euro digits concatenated with the two cent digits. Nothing is
            yielded when no price can be located or parsed.
        """
        price = None
        for query in self._PRICE_QUERIES:
            price = response.xpath(query).extract_first()
            if price:
                break

        if not price:
            # None of the known selectors matched; skip the page instead of
            # crashing on a None value further down.
            self.logger.warning('No price found on %s', response.url)
            return

        # search() rather than match(): the text may start with whitespace
        # or a currency symbol before the digits.
        found = self._PRICE_RE.search(price)
        if found is None:
            self.logger.warning(
                'Unrecognised price format %r on %s', price, response.url
            )
            return

        euros = found.group(1).replace('.', '')
        cents = found.group(2)
        priceincents = euros + cents

        yield {'price': priceincents}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user