ka-news-yaa-crawler/crawler.py

import logging
from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup

from article import Article

# Configure root logging once at import time: records at INFO and above are
# emitted as "<timestamp> <LEVEL>:<message>".
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s:%(message)s')

# HTTP headers passed to every requests.get() call made by the crawler.
# The User-Agent is a desktop Chrome identification string; presumably this is
# here so the site serves its regular HTML to the crawler -- TODO confirm.
HEADERS = {
	'User-Agent': (
		'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 '
		'Safari/537.36'
	),
}


class Crawler:
	"""Scan a news site's front page for articles written by the author "yaa".

	The crawler downloads ``base_url``, follows the link found after each
	``<article>`` element, and returns an ``Article`` for every linked page
	whose ``article-author`` element names the author.

	NOTE: the ``__name__``-style method names are kept as they are so existing
	callers keep working, even though Python reserves that pattern for its own
	special methods.
	"""

	# Seconds to wait for the server before a request is abandoned.
	DEFAULT_TIMEOUT = 10

	_logger = logging.getLogger(__name__)

	def __init__(self, base_url, timeout=DEFAULT_TIMEOUT):
		"""Create a crawler.

		Args:
			base_url: URL of the page listing the articles; also the base
				against which links found in the HTML are resolved.
			timeout: Seconds passed to ``requests.get`` as its timeout.
		"""
		self.base_url = base_url
		self.timeout = timeout

	def check_for_new_yaa_articles(self) -> "list[Article]":
		"""Return an ``Article`` for every linked page written by the author.

		Pages that cannot be downloaded are skipped; if the front page itself
		cannot be downloaded an empty list is returned.
		"""
		source = self.__download_url__(self.base_url)
		if source is None:
			# Nothing to parse; the failure has already been logged.
			return []

		articles = []

		for url in self.__extract_article_urls__(source):
			article_source = self.__download_url__(url)
			if article_source is None:
				# One unreachable article must not abort the whole run.
				continue

			soup = BeautifulSoup(article_source, 'html.parser')

			if self.__check_for_yaa__(soup):
				articles.append(self.__extract_info_from_article__(soup, url))

		return articles

	def __download_url__(self, url):
		"""Fetch ``url`` and return the response body as text.

		Returns:
			The decoded body, or ``None`` when the connection fails or the
			request times out (the failure is logged).
		"""
		try:
			return requests.get(url, headers=HEADERS, timeout=self.timeout).text
		except (requests.exceptions.ConnectionError, requests.exceptions.Timeout) as error:
			self._logger.error('Could not download %s: %s', url, error)
			return None

	def __extract_article_urls__(self, source) -> "list[str]":
		"""Return the absolute URL linked after each ``<article>`` element.

		For every ``<article>`` the next ``<a>`` in document order is used.
		Articles without such an anchor, or whose anchor has no ``href``, are
		skipped.
		"""
		soup = BeautifulSoup(source, 'html.parser')

		urls = []

		for article in soup.find_all('article'):
			link = article.find_next('a')
			href = link.get('href') if link is not None else None
			if not href:
				continue
			urls.append(self._absolute_url(href))

		return urls

	def __check_for_yaa__(self, soup) -> bool:
		"""Tell whether the parsed page names the author.

		Returns ``False`` when the page has no ``article-author`` element;
		otherwise ``True`` if that element's text contains ``yaa`` or
		``Yannick Antritter``.
		"""
		author_tag = soup.find(class_='article-author')
		if author_tag is None:
			return False

		author = author_tag.text
		return 'yaa' in author or 'Yannick Antritter' in author

	def __extract_info_from_article__(self, soup, url) -> "Article":
		"""Build an ``Article`` from a parsed article page.

		Title comes from the first ``<h1>``, summary from the element with
		class ``article-summary``, and the image from the first ``<img>``
		inside the first ``<picture>``. A missing element yields an empty
		string for that field instead of raising.
		NOTE(review): confirm that ``Article`` consumers tolerate empty fields.
		"""
		title_tag = soup.find('h1')
		summary_tag = soup.find(class_='article-summary')
		picture_tag = soup.find('picture')
		image_tag = picture_tag.find('img') if picture_tag is not None else None
		image_src = image_tag.get('src') if image_tag is not None else None

		return Article(
			title=title_tag.text if title_tag is not None else '',
			summary=summary_tag.text if summary_tag is not None else '',
			url=url,
			image_url=self._absolute_url(image_src) if image_src else ''
		)

	def _absolute_url(self, href):
		"""Resolve ``href`` against ``base_url``.

		``urljoin`` leaves absolute URLs untouched and handles relative paths
		and a trailing slash on ``base_url``, which plain string concatenation
		does not.
		"""
		return urljoin(self.base_url, href)
Initial commit 2022-07-02 18:15:13 +00:00			`import logging`
			`from urllib.parse import urljoin`
			`import requests`
			`from bs4 import BeautifulSoup`

			`from article import Article`

			`logging.basicConfig(`
			`format='%(asctime)s %(levelname)s:%(message)s',`
			`level=logging.INFO`
			`)`

			`HEADERS = ({'User-Agent':`
			`'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 '`
			`'Safari/537.36'})`


			`class Crawler:`
			`def __init__(self, base_url):`
			`self.base_url = base_url`

Adding system to check if article is already known 2022-07-02 22:22:30 +00:00			`def check_for_new_yaa_articles(self) -> [Article]:`
Initial commit 2022-07-02 18:15:13 +00:00			`source = self.__download_url__(self.base_url)`
			`article_urls = self.__extract_article_urls__(source)`

			`articles = []`

			`for url in article_urls:`
			`article_source = self.__download_url__(url)`
			`soup = BeautifulSoup(article_source, 'html.parser')`

			`if self.__check_for_yaa__(soup):`
			`articles.append(self.__extract_info_from_article__(soup, url))`

			`return articles`

			`def __download_url__(self, url):`
Adding try except block in case of connection errors 2022-08-16 20:07:04 +00:00			`try:`
			`return requests.get(url, headers=HEADERS).text`
			`except requests.exceptions.ConnectionError:`
			`print('An error has occured trying to download the HTML')`
Initial commit 2022-07-02 18:15:13 +00:00
			`def __extract_article_urls__(self, source) -> []:`
			`soup = BeautifulSoup(source, 'html.parser')`

			`urls = []`

			`for article in soup.find_all('article'):`
			`url = article.find_next('a').attrs['href']`
			`urls.append(self.base_url + url)`

			`return urls`

			`def __check_for_yaa__(self, soup) -> bool:`
Fixing NoneType error when no author is given 2022-07-06 16:51:10 +00:00			`if not soup.find(class_='article-author'):`
			`return False`

Initial commit 2022-07-02 18:15:13 +00:00			`author = soup.find(class_='article-author').text`

Adding different author spelling 2022-07-18 15:43:25 +00:00			`if 'yaa' in author or 'Yannick Antritter' in author:`
Initial commit 2022-07-02 18:15:13 +00:00			`return True`

			`return False`

			`def __extract_info_from_article__(self, soup, url) -> Article:`
			`title = soup.find('h1').text`
			`summary = soup.find(class_='article-summary').text`
			`image_url = self.base_url + soup.find('picture').find('img').attrs['src']`

			`return Article(`
			`title=title,`
			`summary=summary,`
			`url=url,`
			`image_url=image_url`
			`)`