import logging
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

from article import Article

logging.basicConfig(
    format='%(asctime)s %(levelname)s:%(message)s',
    level=logging.INFO
)

# Present a desktop-browser User-Agent so the site serves the regular HTML page.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 '
                  'Safari/537.36'
}


class Crawler:
    def __init__(self, base_url):
        self.base_url = base_url

    def check_for_new_yaa_articles(self) -> list[Article]:
        """Fetch the overview page and return every article authored by 'yaa'."""
        source = self._download_url(self.base_url)
        article_urls = self._extract_article_urls(source)
        articles = []
        for url in article_urls:
            article_source = self._download_url(url)
            soup = BeautifulSoup(article_source, 'html.parser')
            if self._check_for_yaa(soup):
                articles.append(self._extract_info_from_article(soup, url))
        return articles

    def _download_url(self, url) -> str:
        return requests.get(url, headers=HEADERS).text

    def _extract_article_urls(self, source) -> list[str]:
        # Each <article> tag on the overview page is followed by an <a> tag
        # whose href is the article's relative URL.
        soup = BeautifulSoup(source, 'html.parser')
        urls = []
        for article in soup.find_all('article'):
            url = article.find_next('a').attrs['href']
            urls.append(urljoin(self.base_url, url))
        return urls

    def _check_for_yaa(self, soup) -> bool:
        # An article counts as a match when the author line contains 'yaa'.
        author = soup.find(class_='article-author').text
        return 'yaa' in author

    def _extract_info_from_article(self, soup, url) -> Article:
        title = soup.find('h1').text
        summary = soup.find(class_='article-summary').text
        image_url = urljoin(
            self.base_url,
            soup.find('picture').find('img').attrs['src']
        )
        return Article(
            title=title,
            summary=summary,
            url=url,
            image_url=image_url
        )
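
# Minimal usage sketch. The base URL below is a placeholder, not one taken
# from this project, and it assumes Article exposes the attributes it is
# constructed with (title, url).
if __name__ == '__main__':
    crawler = Crawler('https://example-news-site.com')  # hypothetical site
    for found in crawler.check_for_new_yaa_articles():
        logging.info('Found article: %s (%s)', found.title, found.url)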