Initial commit

This commit is contained in:
Patrick Müller 2022-07-02 20:15:13 +02:00
commit 38b9079592
Signed by: Paddy
GPG Key ID: 37ABC11275CAABCE
6 changed files with 91 additions and 0 deletions

3
.gitignore vendored Normal file
View File

@@ -0,0 +1,3 @@
venv/
*.iml
.idea/

6
article.py Normal file
View File

@@ -0,0 +1,6 @@
class Article:
    """Plain container for the scraped metadata of one news article.

    All four fields are stored verbatim as given by the caller; no
    normalization or validation is applied.
    """

    def __init__(self, url: str, image_url: str, title: str, summary: str):
        self.url = url
        self.image_url = image_url
        self.title = title
        self.summary = summary

    def __repr__(self) -> str:
        # A real repr makes printed lists of articles readable instead of
        # showing opaque '<Article object at 0x...>' entries.
        return (f"{type(self).__name__}(url={self.url!r}, "
                f"image_url={self.image_url!r}, title={self.title!r}, "
                f"summary={self.summary!r})")

72
crawler.py Normal file
View File

@@ -0,0 +1,72 @@
import logging
from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup
from article import Article
# One-time root-logger setup at import time: timestamped, INFO-level messages.
logging.basicConfig(
    format='%(asctime)s %(levelname)s:%(message)s',
    level=logging.INFO
)
# Desktop-Chrome User-Agent header sent with every request; presumably so the
# site serves regular pages instead of blocking the default python-requests
# agent — TODO(review): confirm the site actually requires this.
HEADERS = ({'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 '
            'Safari/537.36'})
class Crawler:
def __init__(self, base_url):
self.base_url = base_url
def check_for_new_articles(self) -> [Article]:
source = self.__download_url__(self.base_url)
article_urls = self.__extract_article_urls__(source)
article_urls.append(
'https://www.ka-news.de/region/karlsruhe/falschparker-werden-in-karlsruhe-ab-juli-frueher-abgeschleppt-art-2813321')
articles = []
for url in article_urls:
article_source = self.__download_url__(url)
soup = BeautifulSoup(article_source, 'html.parser')
if self.__check_for_yaa__(soup):
articles.append(self.__extract_info_from_article__(soup, url))
return articles
def __download_url__(self, url):
return requests.get(url, headers=HEADERS).text
def __extract_article_urls__(self, source) -> []:
soup = BeautifulSoup(source, 'html.parser')
urls = []
for article in soup.find_all('article'):
url = article.find_next('a').attrs['href']
urls.append(self.base_url + url)
return urls
def __check_for_yaa__(self, soup) -> bool:
author = soup.find(class_='article-author').text
if 'yaa' in author:
return True
return False
def __extract_info_from_article__(self, soup, url) -> Article:
title = soup.find('h1').text
summary = soup.find(class_='article-summary').text
image_url = self.base_url + soup.find('picture').find('img').attrs['src']
return Article(
title=title,
summary=summary,
url=url,
image_url=image_url
)

0
firebonk_api.py Normal file
View File

7
main.py Normal file
View File

@@ -0,0 +1,7 @@
from crawler import Crawler


def main():
    """Crawl the site once and print whatever articles were found."""
    site_crawler = Crawler('https://www.ka-news.de')
    print(site_crawler.check_for_new_articles())


if __name__ == '__main__':
    main()

3
requirements.txt Normal file
View File

@@ -0,0 +1,3 @@
urllib3
requests
beautifulsoup4