# ka-news-yaa-crawler/crawler.py

import logging
from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup

from article import Article

# Configure the root logger: INFO and above, prefixed with timestamp and level.
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s:%(message)s')

# HTTP headers passed to every request; the User-Agent is a desktop Chrome string.
HEADERS = {
	'User-Agent': (
		'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
		'AppleWebKit/537.36 (KHTML, like Gecko) '
		'Chrome/81.0.4044.138 Safari/537.36'
	),
}


class Crawler:
	"""Find articles by a specific author on a news site.

	The crawler downloads the page at ``base_url``, follows the link that
	comes after every ``<article>`` element, and keeps those linked pages
	whose ``article-author`` element mentions 'yaa' or 'Yannick Antritter'.
	"""

	# Seconds to wait for a response. Without a timeout ``requests`` may block
	# forever on an unresponsive server.
	REQUEST_TIMEOUT = 10

	def __init__(self, base_url):
		"""Store the site root that is crawled and used to resolve relative links."""
		self.base_url = base_url

	def check_for_new_yaa_articles(self) -> "list[Article]":
		"""Return an ``Article`` for every linked page written by the author.

		Pages that cannot be downloaded, and author pages whose expected
		markup (headline, summary, picture) is missing, are logged and
		skipped so that one bad page does not abort the whole crawl.
		"""
		source = self.__download_url__(self.base_url)
		if source is None:
			return []

		articles = []

		for url in self.__extract_article_urls__(source):
			article_source = self.__download_url__(url)
			if article_source is None:
				continue

			soup = BeautifulSoup(article_source, 'html.parser')
			if not self.__check_for_yaa__(soup):
				continue

			try:
				articles.append(self.__extract_info_from_article__(soup, url))
			except (AttributeError, KeyError):
				# A missing <h1>, summary, <picture>/<img> or src attribute.
				logging.warning('Skipping %s: expected article markup is missing', url)

		return articles

	def _absolute_url(self, href) -> str:
		"""Resolve ``href`` against ``base_url``.

		Unlike plain string concatenation this leaves absolute URLs intact
		and inserts the separating slash for relative paths that lack one.
		"""
		return urljoin(self.base_url, href)

	def __download_url__(self, url) -> "str | None":
		"""Return the response body of ``url`` as text, or ``None`` on failure.

		Any ``requests`` error (connection problem, timeout, invalid URL, ...)
		is logged rather than raised.
		"""
		try:
			return requests.get(url, headers=HEADERS, timeout=self.REQUEST_TIMEOUT).text
		except requests.exceptions.RequestException as exc:
			logging.error('An error has occurred trying to download %s: %s', url, exc)
			return None

	def __extract_article_urls__(self, source) -> "list[str]":
		"""Return the absolute URL linked after each ``<article>`` in ``source``.

		Articles that are not followed by an ``<a>`` element, or whose link
		has no ``href``, are skipped.
		"""
		soup = BeautifulSoup(source, 'html.parser')

		urls = []

		for article in soup.find_all('article'):
			link = article.find_next('a')
			href = link.get('href') if link is not None else None
			if not href:
				continue
			urls.append(self._absolute_url(href))

		return urls

	def __check_for_yaa__(self, soup) -> bool:
		"""Return True if the page's ``article-author`` element names the author."""
		author_tag = soup.find(class_='article-author')
		if author_tag is None:
			return False

		author = author_tag.text
		return 'yaa' in author or 'Yannick Antritter' in author

	def __extract_info_from_article__(self, soup, url) -> "Article":
		"""Build an ``Article`` from a parsed article page.

		Raises:
			AttributeError: if the ``<h1>``, the ``article-summary`` element
				or the ``<picture>``/``<img>`` element is missing.
			KeyError: if the image has no ``src`` attribute.
		"""
		title = soup.find('h1').text
		summary = soup.find(class_='article-summary').text
		image_url = self._absolute_url(soup.find('picture').find('img').attrs['src'])

		return Article(
			title=title,
			summary=summary,
			url=url,
			image_url=image_url
		)