From 38b9079592bd5f383316ca39db7293cf2c05e329 Mon Sep 17 00:00:00 2001
From: Patrick Mueller
Date: Sat, 2 Jul 2022 20:15:13 +0200
Subject: [PATCH] Initial commit

---
 .gitignore       |  3 ++
 article.py       |  6 ++++
 crawler.py       | 72 ++++++++++++++++++++++++++++++++++++++++++++++++
 firebonk_api.py  |  0
 main.py          |  7 +++++
 requirements.txt |  3 ++
 6 files changed, 91 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 article.py
 create mode 100644 crawler.py
 create mode 100644 firebonk_api.py
 create mode 100644 main.py
 create mode 100644 requirements.txt

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..de036b7
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+venv/
+*.iml
+.idea/
\ No newline at end of file
diff --git a/article.py b/article.py
new file mode 100644
index 0000000..228865a
--- /dev/null
+++ b/article.py
@@ -0,0 +1,6 @@
+class Article:
+    def __init__(self, url, image_url, title, summary):
+        self.url = url
+        self.image_url = image_url
+        self.title = title
+        self.summary = summary
diff --git a/crawler.py b/crawler.py
new file mode 100644
index 0000000..7b9f7ef
--- /dev/null
+++ b/crawler.py
@@ -0,0 +1,72 @@
+import logging
+from urllib.parse import urljoin
+import requests
+from bs4 import BeautifulSoup
+
+from article import Article
+
+logging.basicConfig(
+    format='%(asctime)s %(levelname)s:%(message)s',
+    level=logging.INFO
+)
+
+HEADERS = ({'User-Agent':
+            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 '
+            'Safari/537.36'})
+
+
+class Crawler:
+    def __init__(self, base_url):
+        self.base_url = base_url
+
+    def check_for_new_articles(self) -> [Article]:
+        source = self.__download_url__(self.base_url)
+        article_urls = self.__extract_article_urls__(source)
+
+        article_urls.append(
+            'https://www.ka-news.de/region/karlsruhe/falschparker-werden-in-karlsruhe-ab-juli-frueher-abgeschleppt-art-2813321')
+
+        articles = []
+
+        for url in article_urls:
+            article_source = self.__download_url__(url)
+            soup = BeautifulSoup(article_source, 'html.parser')
+
+            if self.__check_for_yaa__(soup):
+                articles.append(self.__extract_info_from_article__(soup, url))
+
+        return articles
+
+    def __download_url__(self, url):
+        return requests.get(url, headers=HEADERS).text
+
+    def __extract_article_urls__(self, source) -> []:
+        soup = BeautifulSoup(source, 'html.parser')
+
+        urls = []
+
+        for article in soup.find_all('article'):
+            url = article.find_next('a').attrs['href']
+            urls.append(self.base_url + url)
+
+        return urls
+
+    def __check_for_yaa__(self, soup) -> bool:
+        author = soup.find(class_='article-author').text
+
+        if 'yaa' in author:
+            return True
+
+        return False
+
+    def __extract_info_from_article__(self, soup, url) -> Article:
+        title = soup.find('h1').text
+        summary = soup.find(class_='article-summary').text
+        image_url = self.base_url + soup.find('picture').find('img').attrs['src']
+
+        return Article(
+            title=title,
+            summary=summary,
+            url=url,
+            image_url=image_url
+        )
diff --git a/firebonk_api.py b/firebonk_api.py
new file mode 100644
index 0000000..e69de29
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..c363b99
--- /dev/null
+++ b/main.py
@@ -0,0 +1,7 @@
+from crawler import Crawler
+
+if __name__ == '__main__':
+    crawl = Crawler('https://www.ka-news.de')
+
+    articles = crawl.check_for_new_articles()
+    print(articles)
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..c61ac5c
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,3 @@
+urllib3
+requests
+bs4
\ No newline at end of file