mirror of
				https://github.com/Mueller-Patrick/Betterzon.git
				synced 2025-10-26 06:15:48 +00:00 
			
		
		
		
	Compare commits
	
		
			No commits in common. "ce1c48d9f008d023850711c156528fe09d34c5c5" and "94b02d90d78739642477c2c230e529865a5ae724" have entirely different histories.
		
	
	
		
			ce1c48d9f0
			...
			94b02d90d7
		
	
		
|  | @ -82,25 +82,6 @@ pricesRouter.get('/bestDeals/:amount', async (req: Request, res: Response) => { | ||||||
|     } |     } | ||||||
| }); | }); | ||||||
| 
 | 
 | ||||||
| // GET prices/byProduct/list/[]
 |  | ||||||
| 
 |  | ||||||
| pricesRouter.get('/byProduct/list/:ids', async (req: Request, res: Response) => { |  | ||||||
|     const productIds: [number] = JSON.parse(req.params.ids); |  | ||||||
| 
 |  | ||||||
|     if (!productIds) { |  | ||||||
|         res.status(400).send('Missing parameters.'); |  | ||||||
|         return; |  | ||||||
|     } |  | ||||||
| 
 |  | ||||||
|     try { |  | ||||||
|         const prices: Prices = await PriceService.findListByProducts(productIds); |  | ||||||
| 
 |  | ||||||
|         res.status(200).send(prices); |  | ||||||
|     } catch (e) { |  | ||||||
|         res.status(404).send(e.message); |  | ||||||
|     } |  | ||||||
| }); |  | ||||||
| 
 |  | ||||||
| // POST items/
 | // POST items/
 | ||||||
| 
 | 
 | ||||||
| // pricesRouter.post('/', async (req: Request, res: Response) => {
 | // pricesRouter.post('/', async (req: Request, res: Response) => {
 | ||||||
|  |  | ||||||
|  | @ -195,6 +195,7 @@ export const getBestDeals = async (amount: number): Promise<Prices> => { | ||||||
|         let allPrices: Record<number, Price[]> = {}; |         let allPrices: Record<number, Price[]> = {}; | ||||||
| 
 | 
 | ||||||
|         // Get newest prices for every product at every vendor
 |         // Get newest prices for every product at every vendor
 | ||||||
|  | 
 | ||||||
|         const rows = await conn.query( |         const rows = await conn.query( | ||||||
|             'WITH summary AS (\n' + |             'WITH summary AS (\n' + | ||||||
|             '    SELECT p.product_id,\n' + |             '    SELECT p.product_id,\n' + | ||||||
|  | @ -221,11 +222,10 @@ export const getBestDeals = async (amount: number): Promise<Prices> => { | ||||||
|         } |         } | ||||||
| 
 | 
 | ||||||
|         // Iterate over all prices to find the products with the biggest difference between amazon and other vendor
 |         // Iterate over all prices to find the products with the biggest difference between amazon and other vendor
 | ||||||
|         let deals: Price[] = []; |         let deals = []; | ||||||
| 
 |         for (let productId in Object.keys(allPrices)) { | ||||||
|         Object.keys(allPrices).forEach(productId => { |             if (allPrices[productId]) { | ||||||
|             if (allPrices[parseInt(productId)]) { |                 let pricesForProd = allPrices[productId]; | ||||||
|                 let pricesForProd = allPrices[parseInt(productId)]; |  | ||||||
| 
 | 
 | ||||||
|                 // Get amazon price and lowest price from other vendor
 |                 // Get amazon price and lowest price from other vendor
 | ||||||
|                 let amazonPrice = {} as Price; |                 let amazonPrice = {} as Price; | ||||||
|  | @ -234,7 +234,6 @@ export const getBestDeals = async (amount: number): Promise<Prices> => { | ||||||
|                     if (price.vendor_id === 1) { |                     if (price.vendor_id === 1) { | ||||||
|                         amazonPrice = price; |                         amazonPrice = price; | ||||||
|                     } else { |                     } else { | ||||||
|                         // If there is no lowest price yet or the price of the current iteration is lower, set / replace it
 |  | ||||||
|                         if (!lowestPrice.price_in_cents || lowestPrice.price_in_cents > price.price_in_cents) { |                         if (!lowestPrice.price_in_cents || lowestPrice.price_in_cents > price.price_in_cents) { | ||||||
|                             lowestPrice = price; |                             lowestPrice = price; | ||||||
|                         } |                         } | ||||||
|  | @ -246,25 +245,25 @@ export const getBestDeals = async (amount: number): Promise<Prices> => { | ||||||
|                     'product_id': lowestPrice.product_id, |                     'product_id': lowestPrice.product_id, | ||||||
|                     'vendor_id': lowestPrice.vendor_id, |                     'vendor_id': lowestPrice.vendor_id, | ||||||
|                     'price_in_cents': lowestPrice.price_in_cents, |                     'price_in_cents': lowestPrice.price_in_cents, | ||||||
|                     'timestamp': lowestPrice.timestamp, |                     'timestamp' :lowestPrice.timestamp, | ||||||
|                     'amazonDifference': (amazonPrice.price_in_cents - lowestPrice.price_in_cents), |                     'amazonDifference': (amazonPrice.price_in_cents - lowestPrice.price_in_cents), | ||||||
|                     'amazonDifferencePercent': ((1 - (lowestPrice.price_in_cents / amazonPrice.price_in_cents)) * 100), |                     'amazonDifferencePercent': ((1 - (lowestPrice.price_in_cents / amazonPrice.price_in_cents)) * 100), | ||||||
|                 }; |                 }; | ||||||
| 
 | 
 | ||||||
|                 // Push only deals were the amazon price is actually higher
 |                 // Push only deals were the amazon price is actually higher
 | ||||||
|                 if (deal.amazonDifferencePercent > 0) { |                 if(deal.amazonDifferencePercent > 0) { | ||||||
|                     deals.push(deal as Price); |                     deals.push(deal); | ||||||
|  |                 } | ||||||
|             } |             } | ||||||
|         } |         } | ||||||
|         }); |  | ||||||
| 
 | 
 | ||||||
|         // Sort to have the best deals on the top
 |         // Sort to have the best deals on the top
 | ||||||
|         deals.sort((a, b) => a.amazonDifferencePercent! < b.amazonDifferencePercent! ? 1 : -1); |         deals.sort((a, b) => a.amazonDifferencePercent < b.amazonDifferencePercent ? 1 : -1); | ||||||
| 
 | 
 | ||||||
|         // Return only as many records as requested or the maximum amount of found deals, whatever is less
 |         // Return only as many records as requested or the maximum amount of found deals, whatever is less
 | ||||||
|         let maxAmt = Math.min(amount, deals.length); |         let maxAmt = Math.min(amount, deals.length); | ||||||
| 
 | 
 | ||||||
|         for (let dealIndex = 0; dealIndex < maxAmt; dealIndex++) { |         for (let dealIndex = 0; dealIndex < maxAmt; dealIndex++){ | ||||||
|             //console.log(deals[dealIndex]);
 |             //console.log(deals[dealIndex]);
 | ||||||
|             priceRows.push(deals[dealIndex] as Price); |             priceRows.push(deals[dealIndex] as Price); | ||||||
|         } |         } | ||||||
|  | @ -281,70 +280,6 @@ export const getBestDeals = async (amount: number): Promise<Prices> => { | ||||||
|     return priceRows; |     return priceRows; | ||||||
| }; | }; | ||||||
| 
 | 
 | ||||||
| /** |  | ||||||
|  * Get the lowest, latest, non-amazon price for each given product |  | ||||||
|  * @param ids the ids of the products |  | ||||||
|  */ |  | ||||||
| export const findListByProducts = async (productIds: [number]): Promise<Prices> => { |  | ||||||
|     let conn; |  | ||||||
|     let priceRows: Price[] = []; |  | ||||||
|     try { |  | ||||||
|         conn = await pool.getConnection(); |  | ||||||
| 
 |  | ||||||
|         let allPrices: Record<number, Price[]> = {}; |  | ||||||
| 
 |  | ||||||
|         // Get newest prices for every given product at every vendor
 |  | ||||||
|         const rows = await conn.query( |  | ||||||
|             'WITH summary AS (\n' + |  | ||||||
|             '    SELECT p.product_id,\n' + |  | ||||||
|             '           p.vendor_id,\n' + |  | ||||||
|             '           p.price_in_cents,\n' + |  | ||||||
|             '           p.timestamp,\n' + |  | ||||||
|             '           ROW_NUMBER() OVER(\n' + |  | ||||||
|             '               PARTITION BY p.product_id, p.vendor_id\n' + |  | ||||||
|             '               ORDER BY p.timestamp DESC) AS rk\n' + |  | ||||||
|             '    FROM prices p' + |  | ||||||
|             '    WHERE p.product_id IN (?)' + |  | ||||||
|             '    AND p.vendor_id != 1)\n' + |  | ||||||
|             'SELECT s.*\n' + |  | ||||||
|             'FROM summary s\n' + |  | ||||||
|             'WHERE s.rk = 1', [productIds]); |  | ||||||
| 
 |  | ||||||
|         // Write returned values to allPrices map with product id as key and a list of prices as value
 |  | ||||||
|         for (let row in rows) { |  | ||||||
|             if (row !== 'meta') { |  | ||||||
|                 if (!allPrices[parseInt(rows[row].product_id)]) { |  | ||||||
|                     allPrices[parseInt(rows[row].product_id)] = []; |  | ||||||
|                 } |  | ||||||
| 
 |  | ||||||
|                 allPrices[parseInt(rows[row].product_id)].push(rows[row]); |  | ||||||
|             } |  | ||||||
|         } |  | ||||||
| 
 |  | ||||||
|         // Iterate over all products to find lowest price
 |  | ||||||
|         Object.keys(allPrices).forEach(productId => { |  | ||||||
|             if (allPrices[parseInt(productId)]) { |  | ||||||
|                 let pricesForProd = allPrices[parseInt(productId)]; |  | ||||||
| 
 |  | ||||||
|                 // Sort ascending by price so index 0 has the lowest price
 |  | ||||||
|                 pricesForProd.sort((a, b) => a.price_in_cents > b.price_in_cents ? 1 : -1); |  | ||||||
| 
 |  | ||||||
|                 // Push the lowest price to the return list
 |  | ||||||
|                 priceRows.push(pricesForProd[0]); |  | ||||||
|             } |  | ||||||
|         }); |  | ||||||
| 
 |  | ||||||
|     } catch (err) { |  | ||||||
|         throw err; |  | ||||||
|     } finally { |  | ||||||
|         if (conn) { |  | ||||||
|             conn.end(); |  | ||||||
|         } |  | ||||||
|     } |  | ||||||
| 
 |  | ||||||
|     return priceRows; |  | ||||||
| }; |  | ||||||
| 
 |  | ||||||
| // export const create = async (newItem: Product): Promise<void> => {
 | // export const create = async (newItem: Product): Promise<void> => {
 | ||||||
| //     let conn;
 | //     let conn;
 | ||||||
| //     try {
 | //     try {
 | ||||||
|  |  | ||||||
|  | @ -1,12 +0,0 @@ | ||||||
| # Define here the models for your scraped items |  | ||||||
| # |  | ||||||
| # See documentation in: |  | ||||||
| # https://docs.scrapy.org/en/latest/topics/items.html |  | ||||||
| 
 |  | ||||||
| import scrapy |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| class CrawlerItem(scrapy.Item): |  | ||||||
|     # define the fields for your item here like: |  | ||||||
|     # name = scrapy.Field() |  | ||||||
|     pass |  | ||||||
|  | @ -1,103 +0,0 @@ | ||||||
| # Define here the models for your spider middleware |  | ||||||
| # |  | ||||||
| # See documentation in: |  | ||||||
| # https://docs.scrapy.org/en/latest/topics/spider-middleware.html |  | ||||||
| 
 |  | ||||||
| from scrapy import signals |  | ||||||
| 
 |  | ||||||
| # useful for handling different item types with a single interface |  | ||||||
| from itemadapter import is_item, ItemAdapter |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| class CrawlerSpiderMiddleware: |  | ||||||
|     # Not all methods need to be defined. If a method is not defined, |  | ||||||
|     # scrapy acts as if the spider middleware does not modify the |  | ||||||
|     # passed objects. |  | ||||||
| 
 |  | ||||||
|     @classmethod |  | ||||||
|     def from_crawler(cls, crawler): |  | ||||||
|         # This method is used by Scrapy to create your spiders. |  | ||||||
|         s = cls() |  | ||||||
|         crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) |  | ||||||
|         return s |  | ||||||
| 
 |  | ||||||
|     def process_spider_input(self, response, spider): |  | ||||||
|         # Called for each response that goes through the spider |  | ||||||
|         # middleware and into the spider. |  | ||||||
| 
 |  | ||||||
|         # Should return None or raise an exception. |  | ||||||
|         return None |  | ||||||
| 
 |  | ||||||
|     def process_spider_output(self, response, result, spider): |  | ||||||
|         # Called with the results returned from the Spider, after |  | ||||||
|         # it has processed the response. |  | ||||||
| 
 |  | ||||||
|         # Must return an iterable of Request, or item objects. |  | ||||||
|         for i in result: |  | ||||||
|             yield i |  | ||||||
| 
 |  | ||||||
|     def process_spider_exception(self, response, exception, spider): |  | ||||||
|         # Called when a spider or process_spider_input() method |  | ||||||
|         # (from other spider middleware) raises an exception. |  | ||||||
| 
 |  | ||||||
|         # Should return either None or an iterable of Request or item objects. |  | ||||||
|         pass |  | ||||||
| 
 |  | ||||||
|     def process_start_requests(self, start_requests, spider): |  | ||||||
|         # Called with the start requests of the spider, and works |  | ||||||
|         # similarly to the process_spider_output() method, except |  | ||||||
|         # that it doesn’t have a response associated. |  | ||||||
| 
 |  | ||||||
|         # Must return only requests (not items). |  | ||||||
|         for r in start_requests: |  | ||||||
|             yield r |  | ||||||
| 
 |  | ||||||
|     def spider_opened(self, spider): |  | ||||||
|         spider.logger.info('Spider opened: %s' % spider.name) |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| class CrawlerDownloaderMiddleware: |  | ||||||
|     # Not all methods need to be defined. If a method is not defined, |  | ||||||
|     # scrapy acts as if the downloader middleware does not modify the |  | ||||||
|     # passed objects. |  | ||||||
| 
 |  | ||||||
|     @classmethod |  | ||||||
|     def from_crawler(cls, crawler): |  | ||||||
|         # This method is used by Scrapy to create your spiders. |  | ||||||
|         s = cls() |  | ||||||
|         crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) |  | ||||||
|         return s |  | ||||||
| 
 |  | ||||||
|     def process_request(self, request, spider): |  | ||||||
|         # Called for each request that goes through the downloader |  | ||||||
|         # middleware. |  | ||||||
| 
 |  | ||||||
|         # Must either: |  | ||||||
|         # - return None: continue processing this request |  | ||||||
|         # - or return a Response object |  | ||||||
|         # - or return a Request object |  | ||||||
|         # - or raise IgnoreRequest: process_exception() methods of |  | ||||||
|         #   installed downloader middleware will be called |  | ||||||
|         return None |  | ||||||
| 
 |  | ||||||
|     def process_response(self, request, response, spider): |  | ||||||
|         # Called with the response returned from the downloader. |  | ||||||
| 
 |  | ||||||
|         # Must either; |  | ||||||
|         # - return a Response object |  | ||||||
|         # - return a Request object |  | ||||||
|         # - or raise IgnoreRequest |  | ||||||
|         return response |  | ||||||
| 
 |  | ||||||
|     def process_exception(self, request, exception, spider): |  | ||||||
|         # Called when a download handler or a process_request() |  | ||||||
|         # (from other downloader middleware) raises an exception. |  | ||||||
| 
 |  | ||||||
|         # Must either: |  | ||||||
|         # - return None: continue processing this exception |  | ||||||
|         # - return a Response object: stops process_exception() chain |  | ||||||
|         # - return a Request object: stops process_exception() chain |  | ||||||
|         pass |  | ||||||
| 
 |  | ||||||
|     def spider_opened(self, spider): |  | ||||||
|         spider.logger.info('Spider opened: %s' % spider.name) |  | ||||||
|  | @ -1,13 +0,0 @@ | ||||||
| # Define your item pipelines here |  | ||||||
| # |  | ||||||
| # Don't forget to add your pipeline to the ITEM_PIPELINES setting |  | ||||||
| # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| # useful for handling different item types with a single interface |  | ||||||
| from itemadapter import ItemAdapter |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| class CrawlerPipeline: |  | ||||||
|     def process_item(self, item, spider): |  | ||||||
|         return item |  | ||||||
|  | @ -1,88 +0,0 @@ | ||||||
| # Scrapy settings for crawler project |  | ||||||
| # |  | ||||||
| # For simplicity, this file contains only settings considered important or |  | ||||||
| # commonly used. You can find more settings consulting the documentation: |  | ||||||
| # |  | ||||||
| #     https://docs.scrapy.org/en/latest/topics/settings.html |  | ||||||
| #     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html |  | ||||||
| #     https://docs.scrapy.org/en/latest/topics/spider-middleware.html |  | ||||||
| 
 |  | ||||||
| BOT_NAME = 'crawler' |  | ||||||
| 
 |  | ||||||
| SPIDER_MODULES = ['crawler.spiders'] |  | ||||||
| NEWSPIDER_MODULE = 'crawler.spiders' |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| # Crawl responsibly by identifying yourself (and your website) on the user-agent |  | ||||||
| USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36' |  | ||||||
| 
 |  | ||||||
| # Obey robots.txt rules |  | ||||||
| ROBOTSTXT_OBEY = False |  | ||||||
| 
 |  | ||||||
| # Configure maximum concurrent requests performed by Scrapy (default: 16) |  | ||||||
| #CONCURRENT_REQUESTS = 32 |  | ||||||
| 
 |  | ||||||
| # Configure a delay for requests for the same website (default: 0) |  | ||||||
| # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay |  | ||||||
| # See also autothrottle settings and docs |  | ||||||
| DOWNLOAD_DELAY = 3 |  | ||||||
| # The download delay setting will honor only one of: |  | ||||||
| #CONCURRENT_REQUESTS_PER_DOMAIN = 16 |  | ||||||
| CONCURRENT_REQUESTS_PER_IP = 1 |  | ||||||
| 
 |  | ||||||
| # Disable cookies (enabled by default) |  | ||||||
| COOKIES_ENABLED = False |  | ||||||
| 
 |  | ||||||
| # Disable Telnet Console (enabled by default) |  | ||||||
| #TELNETCONSOLE_ENABLED = False |  | ||||||
| 
 |  | ||||||
| # Override the default request headers: |  | ||||||
| #DEFAULT_REQUEST_HEADERS = { |  | ||||||
| #   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', |  | ||||||
| #   'Accept-Language': 'en', |  | ||||||
| #} |  | ||||||
| 
 |  | ||||||
| # Enable or disable spider middlewares |  | ||||||
| # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html |  | ||||||
| #SPIDER_MIDDLEWARES = { |  | ||||||
| #    'crawler.middlewares.CrawlerSpiderMiddleware': 543, |  | ||||||
| #} |  | ||||||
| 
 |  | ||||||
| # Enable or disable downloader middlewares |  | ||||||
| # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html |  | ||||||
| #DOWNLOADER_MIDDLEWARES = { |  | ||||||
| #    'crawler.middlewares.CrawlerDownloaderMiddleware': 543, |  | ||||||
| #} |  | ||||||
| 
 |  | ||||||
| # Enable or disable extensions |  | ||||||
| # See https://docs.scrapy.org/en/latest/topics/extensions.html |  | ||||||
| #EXTENSIONS = { |  | ||||||
| #    'scrapy.extensions.telnet.TelnetConsole': None, |  | ||||||
| #} |  | ||||||
| 
 |  | ||||||
| # Configure item pipelines |  | ||||||
| # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html |  | ||||||
| #ITEM_PIPELINES = { |  | ||||||
| #    'crawler.pipelines.CrawlerPipeline': 300, |  | ||||||
| #} |  | ||||||
| 
 |  | ||||||
| # Enable and configure the AutoThrottle extension (disabled by default) |  | ||||||
| # See https://docs.scrapy.org/en/latest/topics/autothrottle.html |  | ||||||
| AUTOTHROTTLE_ENABLED = True |  | ||||||
| # The initial download delay |  | ||||||
| AUTOTHROTTLE_START_DELAY = 5 |  | ||||||
| # The maximum download delay to be set in case of high latencies |  | ||||||
| #AUTOTHROTTLE_MAX_DELAY = 60 |  | ||||||
| # The average number of requests Scrapy should be sending in parallel to |  | ||||||
| # each remote server |  | ||||||
| #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 |  | ||||||
| # Enable showing throttling stats for every response received: |  | ||||||
| #AUTOTHROTTLE_DEBUG = False |  | ||||||
| 
 |  | ||||||
| # Enable and configure HTTP caching (disabled by default) |  | ||||||
| # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings |  | ||||||
| #HTTPCACHE_ENABLED = True |  | ||||||
| #HTTPCACHE_EXPIRATION_SECS = 0 |  | ||||||
| #HTTPCACHE_DIR = 'httpcache' |  | ||||||
| #HTTPCACHE_IGNORE_HTTP_CODES = [] |  | ||||||
| #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' |  | ||||||
|  | @ -1,4 +0,0 @@ | ||||||
| # This package will contain the spiders of your Scrapy project |  | ||||||
| # |  | ||||||
| # Please refer to the documentation for information on how to create and manage |  | ||||||
| # your spiders. |  | ||||||
|  | @ -1,66 +0,0 @@ | ||||||
| # -*- coding: utf-8 -*- |  | ||||||
| import scrapy |  | ||||||
| from urllib.parse import urlencode |  | ||||||
| from urllib.parse import urljoin |  | ||||||
| import re |  | ||||||
| import json |  | ||||||
| 
 |  | ||||||
| queries = ['iphone'] |  | ||||||
| API = '' |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| def get_url(url): |  | ||||||
|     payload = {'api_key': API, 'url': url, 'country_code': 'us'} |  | ||||||
|     proxy_url = 'http://api.scraperapi.com/?' + urlencode(payload) |  | ||||||
|     return proxy_url |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| class AmazonSpider(scrapy.Spider): |  | ||||||
|     name = 'amazon' |  | ||||||
| 
 |  | ||||||
|     def start_requests(self): |  | ||||||
|         for query in queries: |  | ||||||
|             url = 'https://www.amazon.de/s?' + urlencode({'k': query}) |  | ||||||
|             yield scrapy.Request(url=url, callback=self.parse_keyword_response) |  | ||||||
| 
 |  | ||||||
|     def parse_keyword_response(self, response): |  | ||||||
|         products = response.xpath('//*[@data-asin]') |  | ||||||
| 
 |  | ||||||
|         for product in products: |  | ||||||
|             asin = product.xpath('@data-asin').extract_first() |  | ||||||
|             product_url = f"https://www.amazon.de/dp/{asin}" |  | ||||||
|             yield scrapy.Request(url=product_url, callback=self.parse_product_page, meta={'asin': asin}) |  | ||||||
| 
 |  | ||||||
|         next_page = response.xpath('//li[@class="a-last"]/a/@href').extract_first() |  | ||||||
|         if next_page: |  | ||||||
|             url = urljoin("https://www.amazon.de", next_page) |  | ||||||
|             yield scrapy.Request(url=url, callback=self.parse_keyword_response) |  | ||||||
| 
 |  | ||||||
|     def parse_product_page(self, response): |  | ||||||
|         asin = response.meta['asin'] |  | ||||||
|         title = response.xpath('//*[@id="productTitle"]/text()').extract_first() |  | ||||||
|         image = re.search('"large":"(.*?)"', response.text).groups()[0] |  | ||||||
|         rating = response.xpath('//*[@id="acrPopover"]/@title').extract_first() |  | ||||||
|         number_of_reviews = response.xpath('//*[@id="acrCustomerReviewText"]/text()').extract_first() |  | ||||||
|         price = response.xpath('//*[@id="priceblock_ourprice"]/text()').extract_first() |  | ||||||
| 
 |  | ||||||
|         if not price: |  | ||||||
|             price = response.xpath('//*[@data-asin-price]/@data-asin-price').extract_first() or \ |  | ||||||
|                     response.xpath('//*[@id="price_inside_buybox"]/text()').extract_first() |  | ||||||
| 
 |  | ||||||
|         temp = response.xpath('//*[@id="twister"]') |  | ||||||
|         sizes = [] |  | ||||||
|         colors = [] |  | ||||||
|         if temp: |  | ||||||
|             s = re.search('"variationValues" : ({.*})', response.text).groups()[0] |  | ||||||
|             json_acceptable = s.replace("'", "\"") |  | ||||||
|             di = json.loads(json_acceptable) |  | ||||||
|             sizes = di.get('size_name', []) |  | ||||||
|             colors = di.get('color_name', []) |  | ||||||
| 
 |  | ||||||
|         bullet_points = response.xpath('//*[@id="feature-bullets"]//li/span/text()').extract() |  | ||||||
|         seller_rank = response.xpath( |  | ||||||
|             '//*[text()="Amazon Best Sellers Rank:"]/parent::*//text()[not(parent::style)]').extract() |  | ||||||
|         yield {'asin': asin, 'Title': title, 'MainImage': image, 'Rating': rating, 'NumberOfReviews': number_of_reviews, |  | ||||||
|                'Price': price, 'AvailableSizes': sizes, 'AvailableColors': colors, 'BulletPoints': bullet_points, |  | ||||||
|                'SellerRank': seller_rank} |  | ||||||
|  | @ -2,4 +2,3 @@ pymysql | ||||||
| flask | flask | ||||||
| flask-sqlalchemy | flask-sqlalchemy | ||||||
| flask_restful | flask_restful | ||||||
| scrapy |  | ||||||
|  | @ -1,11 +0,0 @@ | ||||||
| # Automatically created by: scrapy startproject |  | ||||||
| # |  | ||||||
| # For more information about the [deploy] section see: |  | ||||||
| # https://scrapyd.readthedocs.io/en/latest/deploy.html |  | ||||||
| 
 |  | ||||||
| [settings] |  | ||||||
| default = crawler.settings |  | ||||||
| 
 |  | ||||||
| [deploy] |  | ||||||
| #url = http://localhost:6800/ |  | ||||||
| project = crawler |  | ||||||
		Loading…
	
		Reference in New Issue
	
	Block a user