Adding system to check if article is already known

This commit is contained in:
Patrick Müller 2022-07-03 00:22:30 +02:00
parent 38b9079592
commit 318e9b1bb6
Signed by: Paddy
GPG Key ID: 37ABC11275CAABCE
4 changed files with 72 additions and 4 deletions

View File

@ -19,7 +19,7 @@ class Crawler:
def __init__(self, base_url: str) -> None:
    """
    Creates a crawler for a single site.
    :param base_url: URL that gets downloaded when checking for new articles
    """
    self.base_url = base_url
def check_for_new_articles(self) -> [Article]:
def check_for_new_yaa_articles(self) -> [Article]:
source = self.__download_url__(self.base_url)
article_urls = self.__extract_article_urls__(source)

37
main.py
View File

@ -1,7 +1,40 @@
import sql
from crawler import Crawler
def __check_and_insert_in_database__(conn, article) -> bool:
    """
    Checks, if the article is already known. If not, inserts it into the db.

    An article counts as known when yaa_articles already holds a row with the same url.
    The cursor is always closed, even if one of the statements raises.
    :param conn: SQL connection (must provide cursor() and commit())
    :param article: The article to check / insert (title, summary, url and image_url are read)
    :return: True if the article is already known, False if it was inserted by this call
    """
    cur = conn.cursor()
    try:
        # Pass the parameters as a sequence so every DB-API driver binds them the same way.
        cur.execute('SELECT article_id FROM yaa_articles WHERE url = %s', (article.url,))
        # One row is enough to know the url is already stored.
        if cur.fetchone() is not None:
            return True

        cur.execute('INSERT INTO yaa_articles (title, summary, url, image_url) VALUES (%s, %s, %s, %s)',
                    (article.title, article.summary, article.url, article.image_url))
        conn.commit()
        return False
    finally:
        # Runs on every exit path, including exceptions raised by execute()/commit().
        cur.close()
if __name__ == '__main__':
    # Entry point: crawl the site once and report for every article whether it
    # was already stored in the database ('Old!') or has just been added ('New!').
    crawl = Crawler('https://www.ka-news.de')
    conn = sql.get_connection()
    # sql.get_connection() logs the error and returns None when connecting fails.
    if conn is None:
        raise SystemExit('Could not connect to the database')

    try:
        # Crawl exactly once through check_for_new_yaa_articles(); the earlier
        # check_for_new_articles() call and its debug print are dropped.
        articles = crawl.check_for_new_yaa_articles()
        for article in articles:
            if __check_and_insert_in_database__(conn, article):
                print('Old!')
            else:
                print('New!')
    finally:
        # Close the connection even if crawling or a database statement raises.
        conn.close()

View File

@ -1,3 +1,4 @@
urllib3
requests
bs4
bs4
pymysql

34
sql.py Normal file
View File

@ -0,0 +1,34 @@
import logging
import os
import pymysql as pymysql
def get_connection() -> pymysql.Connection:
    """
    Open a connection to the 'ka-news-yaa-crawler' database on port 3306.

    If the environment variable IS_VSERVER is 'true', the credentials come from
    vServer_SQL_User / vServer_SQL_Password and the host is 'localhost'.
    Otherwise DB_USER, DB_PASSWORD and DB_HOST are used.
    A missing credential variable raises KeyError; only pymysql errors are handled here.
    @return: pymysql connection object, or None if pymysql reported an error
    """
    on_vserver = os.environ.get('IS_VSERVER') == 'true'
    try:
        # Only the credentials and the host differ between the two environments.
        if on_vserver:
            credentials = {
                'user': os.environ['vServer_SQL_User'],
                'password': os.environ['vServer_SQL_Password'],
                'host': 'localhost',
            }
        else:
            credentials = {
                'user': os.environ['DB_USER'],
                'password': os.environ['DB_PASSWORD'],
                'host': os.environ['DB_HOST'],
            }
        return pymysql.connect(port=3306, database='ka-news-yaa-crawler', **credentials)
    except pymysql.Error as err:
        logging.error('SQL Connection error: %s', err)
        return None