Initial commit
commit 38b9079592
.gitignore (vendored, new file, 3 lines)
@@ -0,0 +1,3 @@
+venv/
+*.iml
+.idea/
article.py (new file, 6 lines)
@@ -0,0 +1,6 @@
+class Article:
+    def __init__(self, url, image_url, title, summary):
+        self.url = url
+        self.image_url = image_url
+        self.title = title
+        self.summary = summary
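As an aside, on Python 3.7+ the same plain data holder could be written as a dataclass, which generates the __init__ (and a readable __repr__) automatically. A minimal sketch, not part of this commit:

from dataclasses import dataclass

# Hypothetical alternative to the hand-written Article above.
@dataclass
class Article:
    url: str
    image_url: str
    title: str
    summary: str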
crawler.py (new file, 72 lines)
@@ -0,0 +1,72 @@
+import logging
+from urllib.parse import urljoin
+import requests
+from bs4 import BeautifulSoup
+
+from article import Article
+
+logging.basicConfig(
+    format='%(asctime)s %(levelname)s:%(message)s',
+    level=logging.INFO
+)
+
+HEADERS = ({'User-Agent':
+            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 '
+            'Safari/537.36'})
+
+
+class Crawler:
+    def __init__(self, base_url):
+        self.base_url = base_url
+
+    def check_for_new_articles(self) -> [Article]:
+        source = self.__download_url__(self.base_url)
+        article_urls = self.__extract_article_urls__(source)
+
+        article_urls.append(
+            'https://www.ka-news.de/region/karlsruhe/falschparker-werden-in-karlsruhe-ab-juli-frueher-abgeschleppt-art-2813321')
+
+        articles = []
+
+        for url in article_urls:
+            article_source = self.__download_url__(url)
+            soup = BeautifulSoup(article_source, 'html.parser')
+
+            if self.__check_for_yaa__(soup):
+                articles.append(self.__extract_info_from_article__(soup, url))
+
+        return articles
+
+    def __download_url__(self, url):
+        return requests.get(url, headers=HEADERS).text
+
+    def __extract_article_urls__(self, source) -> []:
+        soup = BeautifulSoup(source, 'html.parser')
+
+        urls = []
+
+        for article in soup.find_all('article'):
+            url = article.find_next('a').attrs['href']
+            urls.append(self.base_url + url)
+
+        return urls
+
+    def __check_for_yaa__(self, soup) -> bool:
+        author = soup.find(class_='article-author').text
+
+        if 'yaa' in author:
+            return True
+
+        return False
+
+    def __extract_info_from_article__(self, soup, url) -> Article:
+        title = soup.find('h1').text
+        summary = soup.find(class_='article-summary').text
+        image_url = self.base_url + soup.find('picture').find('img').attrs['src']
+
+        return Article(
+            title=title,
+            summary=summary,
+            url=url,
+            image_url=image_url
+        )
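Because the selectors in crawler.py encode assumptions about the ka-news.de markup, here is a minimal, self-contained sketch of what __extract_article_urls__ and __check_for_yaa__ expect to find. The sample HTML is hypothetical; only the tag and class names (article, article-author) come from the code above:

from bs4 import BeautifulSoup

# Hypothetical markup shaped the way the selectors above assume.
sample = '''
<article><a href="/region/karlsruhe/test-art-1">Teaser</a></article>
<div class="article-author">yaa/dpa</div>
'''

soup = BeautifulSoup(sample, 'html.parser')

# Mirrors __extract_article_urls__: the first <a> following each <article>.
urls = ['https://www.ka-news.de' + art.find_next('a').attrs['href']
        for art in soup.find_all('article')]
print(urls)   # ['https://www.ka-news.de/region/karlsruhe/test-art-1']

# Mirrors __check_for_yaa__: the byline text must contain 'yaa'.
print('yaa' in soup.find(class_='article-author').text)   # True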
firebonk_api.py (new file, empty)
main.py (new file, 7 lines)
@@ -0,0 +1,7 @@
+from crawler import Crawler
+
+if __name__ == '__main__':
+    crawl = Crawler('https://www.ka-news.de')
+
+    articles = crawl.check_for_new_articles()
+    print(articles)
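Note that print(articles) will show default reprs such as <article.Article object at 0x...>, since Article defines no __repr__. A hypothetical one-method addition to article.py, not in this commit, that would make the output readable:

    # Hypothetical: inside class Article, for readable printing.
    def __repr__(self):
        return f'Article(title={self.title!r}, url={self.url!r})'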
requirements.txt (new file, 3 lines)
@@ -0,0 +1,3 @@
+urllib3
+requests
+bs4