Adding system to check if article is already known
This commit is contained in:
parent
38b9079592
commit
318e9b1bb6
|
@ -19,7 +19,7 @@ class Crawler:
|
||||||
def __init__(self, base_url):
|
def __init__(self, base_url):
|
||||||
self.base_url = base_url
|
self.base_url = base_url
|
||||||
|
|
||||||
def check_for_new_articles(self) -> [Article]:
|
def check_for_new_yaa_articles(self) -> [Article]:
|
||||||
source = self.__download_url__(self.base_url)
|
source = self.__download_url__(self.base_url)
|
||||||
article_urls = self.__extract_article_urls__(source)
|
article_urls = self.__extract_article_urls__(source)
|
||||||
|
|
||||||
|
|
37
main.py
37
main.py
|
@ -1,7 +1,40 @@
|
||||||
|
import sql
|
||||||
from crawler import Crawler
|
from crawler import Crawler
|
||||||
|
|
||||||
|
|
||||||
|
def __check_and_insert_in_database__(conn, article) -> bool:
    """
    Check whether the article is already known; if it is not, insert it into the db.

    The article is looked up by its URL in the ``yaa_articles`` table. When no row
    matches, its title, summary, url and image_url are inserted and the transaction
    is committed.

    :param conn: SQL connection (must provide ``cursor()`` and ``commit()``)
    :param article: The article to check / insert; its ``url``, ``title``,
        ``summary`` and ``image_url`` attributes are read
    :return: True if the article was already known, False if it was newly inserted
    """
    cur = conn.cursor()
    try:
        # Query parameters are always handed over as a sequence, as the DB-API expects.
        cur.execute('SELECT article_id FROM yaa_articles WHERE url = %s', (article.url,))

        # One matching row is enough to know the article has been seen before.
        if cur.fetchone() is not None:
            return True

        cur.execute('INSERT INTO yaa_articles (title, summary, url, image_url) VALUES (%s, %s, %s, %s)',
                    (article.title, article.summary, article.url, article.image_url))
        conn.commit()
        return False
    finally:
        # Release the cursor on every path, including when a query or the commit raises.
        cur.close()
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
    # Script entry point: crawl the site and report for each article whether it is new.
    crawl = Crawler('https://www.ka-news.de')
    conn = sql.get_connection()

    # get_connection() logs the error and returns None when connecting fails;
    # stop here instead of failing later with an AttributeError on None.
    if conn is None:
        raise SystemExit('Could not connect to the database')

    try:
        articles = crawl.check_for_new_yaa_articles()

        for article in articles:
            if not __check_and_insert_in_database__(conn, article):
                print('New!')
            else:
                print('Old!')
    finally:
        # Close the connection even if crawling or a query raised.
        conn.close()
|
||||||
|
|
|
@ -1,3 +1,4 @@
|
||||||
urllib3
|
urllib3
|
||||||
requests
|
requests
|
||||||
bs4
|
bs4
|
||||||
|
pymysql
|
34
sql.py
Normal file
34
sql.py
Normal file
|
@ -0,0 +1,34 @@
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
|
||||||
|
import pymysql as pymysql
|
||||||
|
|
||||||
|
|
||||||
|
def get_connection() -> pymysql.Connection:
    """
    Open a connection to the 'ka-news-yaa-crawler' MySQL database.

    When the environment variable IS_VSERVER equals 'true', the vServer credentials
    (vServer_SQL_User / vServer_SQL_Password) are used against localhost; otherwise
    DB_USER, DB_PASSWORD and DB_HOST are read from the environment.

    @return: pymysql connection object, or None if pymysql reported an error
             (the error is logged)
    """
    on_vserver = os.environ.get('IS_VSERVER') == 'true'

    try:
        # Only the credentials and the host differ between the two environments.
        if on_vserver:
            credentials = {
                'user': os.environ['vServer_SQL_User'],
                'password': os.environ['vServer_SQL_Password'],
                'host': 'localhost',
            }
        else:
            credentials = {
                'user': os.environ['DB_USER'],
                'password': os.environ['DB_PASSWORD'],
                'host': os.environ['DB_HOST'],
            }

        return pymysql.connect(
            port=3306,
            database='ka-news-yaa-crawler',
            **credentials
        )
    except pymysql.Error as e:
        logging.error('SQL Connection error: %s', e)
        return None
|
Loading…
Reference in New Issue
Block a user