From 318e9b1bb613b5cecacd88d3b0e5d95f7eb5eec9 Mon Sep 17 00:00:00 2001
From: Patrick Mueller
Date: Sun, 3 Jul 2022 00:22:30 +0200
Subject: [PATCH] Adding system to check if article is already known

---
 crawler.py       |  2 +-
 main.py          | 43 +++++++++++++++++++++++++++++++++++++++++--
 requirements.txt |  3 ++-
 sql.py           | 39 +++++++++++++++++++++++++++++++++++++++
 4 files changed, 83 insertions(+), 4 deletions(-)
 create mode 100644 sql.py

diff --git a/crawler.py b/crawler.py
index 7b9f7ef..dabcc98 100644
--- a/crawler.py
+++ b/crawler.py
@@ -19,7 +19,7 @@ class Crawler:
     def __init__(self, base_url):
         self.base_url = base_url
 
-    def check_for_new_articles(self) -> [Article]:
+    def check_for_new_yaa_articles(self) -> [Article]:
         source = self.__download_url__(self.base_url)
         article_urls = self.__extract_article_urls__(source)
 
diff --git a/main.py b/main.py
index c363b99..44f25e5 100644
--- a/main.py
+++ b/main.py
@@ -1,7 +1,46 @@
+import sql
 from crawler import Crawler
 
+
+def __check_and_insert_in_database__(conn, article) -> bool:
+    """
+    Check whether the article is already known. If not, insert it into the db.
+
+    :param conn: Open SQL connection
+    :param article: The article to check / insert
+    :return: True if the article was already known, False if it was newly inserted
+    """
+    # The context manager closes the cursor even if one of the queries raises
+    with conn.cursor() as cur:
+        # Query parameters are passed as a sequence, as the DB-API (PEP 249) specifies
+        cur.execute('SELECT article_id FROM yaa_articles WHERE url = %s', (article.url,))
+
+        if cur.fetchone() is not None:
+            return True
+
+        cur.execute('INSERT INTO yaa_articles (title, summary, url, image_url) VALUES (%s, %s, %s, %s)',
+                    (article.title, article.summary, article.url, article.image_url))
+
+    conn.commit()
+    return False
+
+
 if __name__ == '__main__':
     crawl = Crawler('https://www.ka-news.de')
+    conn = sql.get_connection()
 
-    articles = crawl.check_for_new_articles()
-    print(articles)
+    if conn is None:
+        # get_connection() has already logged the cause
+        raise SystemExit(1)
+
+    try:
+        articles = crawl.check_for_new_yaa_articles()
+
+        for article in articles:
+            if not __check_and_insert_in_database__(conn, article):
+                print('New!')
+            else:
+                print('Old!')
+    finally:
+        # Release the connection even if crawling or a query fails
+        conn.close()
diff --git a/requirements.txt b/requirements.txt
index c61ac5c..cb5922e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,4 @@
 urllib3
 requests
-bs4
\ No newline at end of file
+bs4
+pymysql
\ No newline at end of file
diff --git a/sql.py b/sql.py
new file mode 100644
index 0000000..41433ef
--- /dev/null
+++ b/sql.py
@@ -0,0 +1,39 @@
+import logging
+import os
+from typing import Optional
+
+import pymysql as pymysql
+
+
+def get_connection() -> Optional[pymysql.Connection]:
+    """
+    Get a connection to the SQL database.
+
+    If the environment variable IS_VSERVER is 'true', the vServer credentials are used
+    and the database on localhost is contacted; otherwise host and credentials are
+    read from DB_HOST, DB_USER and DB_PASSWORD.
+
+    :return: pymysql connection object, or None if connecting failed (the error is logged)
+    """
+    try:
+        if os.environ.get('IS_VSERVER') == 'true':
+            conn = pymysql.connect(
+                user=os.environ['vServer_SQL_User'],
+                password=os.environ['vServer_SQL_Password'],
+                host='localhost',
+                port=3306,
+                database='ka-news-yaa-crawler'
+            )
+        else:
+            conn = pymysql.connect(
+                user=os.environ['DB_USER'],
+                password=os.environ['DB_PASSWORD'],
+                host=os.environ['DB_HOST'],
+                port=3306,
+                database='ka-news-yaa-crawler'
+            )
+
+        return conn
+    except pymysql.Error as e:
+        logging.error('SQL Connection error: %s', e)
+        return None