From 38b9079592bd5f383316ca39db7293cf2c05e329 Mon Sep 17 00:00:00 2001
From: Patrick Mueller
Date: Sat, 2 Jul 2022 20:15:13 +0200
Subject: [PATCH] Initial commit

---
 .gitignore       |  3 ++
 article.py       |  6 ++++
 crawler.py       | 72 ++++++++++++++++++++++++++++++++++++++++++++++++
 firebonk_api.py  |  0
 main.py          |  7 +++++
 requirements.txt |  3 ++
 6 files changed, 91 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 article.py
 create mode 100644 crawler.py
 create mode 100644 firebonk_api.py
 create mode 100644 main.py
 create mode 100644 requirements.txt

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..de036b7
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+venv/
+*.iml
+.idea/
\ No newline at end of file
diff --git a/article.py b/article.py
new file mode 100644
index 0000000..228865a
--- /dev/null
+++ b/article.py
@@ -0,0 +1,6 @@
+class Article:
+    def __init__(self, url, image_url, title, summary):
+        self.url = url
+        self.image_url = image_url
+        self.title = title
+        self.summary = summary
diff --git a/crawler.py b/crawler.py
new file mode 100644
index 0000000..7b9f7ef
--- /dev/null
+++ b/crawler.py
@@ -0,0 +1,72 @@
+import logging
+from urllib.parse import urljoin
+import requests
+from bs4 import BeautifulSoup
+
+from article import Article
+
+logging.basicConfig(
+    format='%(asctime)s %(levelname)s:%(message)s',
+    level=logging.INFO
+)
+
+HEADERS = ({'User-Agent':
+            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 '
+            'Safari/537.36'})
+
+
+class Crawler:
+    def __init__(self, base_url):
+        self.base_url = base_url
+
+    def check_for_new_articles(self) -> [Article]:
+        source = self.__download_url__(self.base_url)
+        article_urls = self.__extract_article_urls__(source)
+
+        article_urls.append(
+            'https://www.ka-news.de/region/karlsruhe/falschparker-werden-in-karlsruhe-ab-juli-frueher-abgeschleppt-art-2813321')
+
+        articles = []
+
+        for url in article_urls:
+            article_source = self.__download_url__(url)
+            soup = BeautifulSoup(article_source, 'html.parser')
+
+            if self.__check_for_yaa__(soup):
+                articles.append(self.__extract_info_from_article__(soup, url))
+
+        return articles
+
+    def __download_url__(self, url):
+        return requests.get(url, headers=HEADERS).text
+
+    def __extract_article_urls__(self, source) -> []:
+        soup = BeautifulSoup(source, 'html.parser')
+
+        urls = []
+
+        for article in soup.find_all('article'):
+            url = article.find_next('a').attrs['href']
+            urls.append(self.base_url + url)
+
+        return urls
+
+    def __check_for_yaa__(self, soup) -> bool:
+        author = soup.find(class_='article-author').text
+
+        if 'yaa' in author:
+            return True
+
+        return False
+
+    def __extract_info_from_article__(self, soup, url) -> Article:
+        title = soup.find('h1').text
+        summary = soup.find(class_='article-summary').text
+        image_url = self.base_url + soup.find('picture').find('img').attrs['src']
+
+        return Article(
+            title=title,
+            summary=summary,
+            url=url,
+            image_url=image_url
+        )
diff --git a/firebonk_api.py b/firebonk_api.py
new file mode 100644
index 0000000..e69de29
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..c363b99
--- /dev/null
+++ b/main.py
@@ -0,0 +1,7 @@
+from crawler import Crawler
+
+if __name__ == '__main__':
+    crawl = Crawler('https://www.ka-news.de')
+
+    articles = crawl.check_for_new_articles()
+    print(articles)
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..c61ac5c
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,3 @@
+urllib3
+requests
+bs4
\ No newline at end of file