Added basic Amazon crawler using beautifulsoup4

2025-12-08 01:55:48 +00:00 · 2021-05-16 22:05:32 +02:00 · 2021-05-16 22:05:32 +02:00 · 2067a47fb2
commit 2067a47fb2
parent dbc793cc08
2 changed files with 93 additions and 82 deletions
--- a/Crawler/crawler.py
+++ b/Crawler/crawler.py
@ -1,5 +1,10 @@
 import sql
-import amazonspider
+import requests
+from bs4 import BeautifulSoup
+
+HEADERS = ({'User-Agent':
+            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 '
+            'Safari/537.36'})


 def crawl(product_ids: [int]) -> dict:
@ -57,9 +62,14 @@ def __crawl_amazon__(product_info: dict) -> tuple:
    :param product_info: A dict with product info containing product_id, vendor_id, url
    :return: A tuple with the crawled data, containing (product_id, vendor_id, price_in_cents)
    """
+    page = requests.get(product_info['url'], headers= HEADERS)
+    soup = BeautifulSoup(page.content, features="lxml")
+    try:
+        price = int(soup.find(id='priceblock_ourprice').get_text().replace(".", "").replace(",", "").replace("€", "").strip())
+    except RuntimeError:
+        price = ''

-    amazonspider.start_crawling()
-    return (product_info['product_id'], product_info['vendor_id'], 123)
+    return (product_info['product_id'], product_info['vendor_id'], price)


 def __crawl_apple__(product_info: dict) -> tuple:
--- a/Crawler/requirements.txt
+++ b/Crawler/requirements.txt
@ -1,5 +1,6 @@
 pymysql
-flask
+flask==1.1.2
 flask-sqlalchemy
 flask_restful
-scrapy
+beautifulsoup4
+requests