diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..c0fee05 --- /dev/null +++ b/.gitignore @@ -0,0 +1,6 @@ +__pycache__/ +.vscode/ +final_task/rss_reader/parser.log +final_task/rss_reader/client/parser.log +final_task/rss_reader/client/__pycache__/ +final_task/env/ \ No newline at end of file diff --git a/README.md b/README.md index f3171ab..714e84c 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,4 @@ Congrats! You have successfully forked our repository. 2. Pull request name *MUST* be in format: `YourFirstName_YourLastName_EmailYouUsedWhileRegisteringOnThisCourse` 3. Pull request which have any other name format, or invalid e-mail *will be ignored completely until you fix it*. So make sure you specified correct e-mail. 4. In pull request description specify your current iteration. You also can add there any other info you want us to know before we start code review. -5. *Pull request must NOT contain any .pyc files, any virtual environment files/folders, any IDE technical files*. - - - +5. *Pull request must NOT contain any .pyc files, any virtual environment files/folders, any IDE technical files*. \ No newline at end of file diff --git a/final_task/README.md b/final_task/README.md index 7af281f..5ada52f 100644 --- a/final_task/README.md +++ b/final_task/README.md @@ -1,3 +1,97 @@ -# Your readme here -Some text. -Checkout how to write this file using *markdown*. +# RSS-READER + +## Command-line utility which receives RSS URL and prints results in human-readable format. + +### **Example:** +python rss_reader.py https://news.yahoo.com/rss --limit 1 + +### **Output**: + +**Feed: Yahoo News - Latest News & Headlines** + +**Title**: Families come from across U.S. 
to grieve relatives slain in Mexico + +**Date**: Thu, 07 Nov 2019 01:06:45 -0500 + +**Link**: https://news.yahoo.com/under-armed-escort-mourner-convoys-060645935.html + +**Description**: An American man whose grandchildren were slain in a massacre in Mexico demanded justice on Thursday for other victims of the country's drug war, as relatives gathered from +across the United States for a funeral guarded by heavily armed military. Kenneth Miller lost his daughter-in-law and four grandchildren, all dual citizens, in an ambush on Monday in +the northern border state of Sonora that killed three mothers and six children. The attack on members of breakaway Mormon communities who settled in Mexico decades ago prompted U.S. +President Donald Trump to urge Mexico and the United States to "wage war" together on drug cartels. + +**Links**: + +``` +[1]: https://news.yahoo.com/under-armed-escort-mourner-conv... (link) +[2]: http://l.yimg.com/uu/api/res/1.2/rRx_J3xHKYzIQ4EsiCPRT... +``` + +``` +positional arguments: + source RSS URL + +optional arguments: + -h, --help show this help message and exit + --version Print version info + --json Print result as JSON in stdout + --verbose Outputs verbose status messages + --limit LIMIT Limit news topics if this parameter provided +``` + +## Installation + +The recommended way to install rss-reader is with pip: + + +``` +pip install rssreaderih +``` + +or from source distribution: + +``` +python setup.py install +``` + +## Data caching + +I wrote a program that creates a database for convenient storage of news. The **postgresql** database is perfectly suited for this. Pictures are also stored in the database in binary format. + +## Converting + +To convert data to html format, I used the **dominate** library. 
+ +Example: +``` +html_document = dominate.document(title="HTML document") + +with html_document: + with div(): + h2("Title: " + news_title) + p("Link: " + news_link) + p("Description: " + news_description) +``` + +To convert data to pdf format from html document, I used the **xhtml2pdf** library. + +Example: +``` +from xhtml2pdf import pisa + +pdf_file = pisa.CreatePDF(sourceHtmlFile) +``` + +## Deploying + +The application has a **dockerfile** for creating an application image. And **docker-compose.yml** file for linking application and database images. + +To deploy application use this command: +``` +docker-compose up +``` + +If you made changes to the application then use command: +``` +docker-compose up --build +``` diff --git a/final_task/rss_reader/.dockerignore b/final_task/rss_reader/.dockerignore new file mode 100644 index 0000000..c9af2ac --- /dev/null +++ b/final_task/rss_reader/.dockerignore @@ -0,0 +1,3 @@ +__pycache__/ +.vscode/ +client/__pycache__/ diff --git a/final_task/rss_reader/.env.dev b/final_task/rss_reader/.env.dev new file mode 100644 index 0000000..b6619f5 --- /dev/null +++ b/final_task/rss_reader/.env.dev @@ -0,0 +1 @@ +FLASK_ENV=development \ No newline at end of file diff --git a/final_task/rss_reader/rss_reader.py b/final_task/rss_reader/__init__.py similarity index 100% rename from final_task/rss_reader/rss_reader.py rename to final_task/rss_reader/__init__.py diff --git a/final_task/rss_reader/app.py b/final_task/rss_reader/app.py new file mode 100644 index 0000000..7c9bacb --- /dev/null +++ b/final_task/rss_reader/app.py @@ -0,0 +1,50 @@ +from flask import Flask, request, Response, send_from_directory +import os +import sys + +from . 
import collect_news, version, get_cache, logg
+
+
+app = Flask(__name__)
+
+
+@app.route('/print/', methods=['GET', 'POST'])
+def getNews():
+    '''
+    1. read parameters from json body of the request
+    2. collect news from rss url
+    3. send news (or created pdf file) to the client
+    '''
+    req = request.get_json()
+    news = collect_news.collectNews(req['limit'], req['tojson'], req['tohtml'], req['topdf'], req['color'], req['url'])
+    return sendResponse(req, news)
+
+
+@app.route('/getcache/')
+def getCacheNews():
+    '''
+    1. read parameters from json body of the request
+    2. create html or pdf from cached news if there is 'tohtml' or 'topdf'
+    3. or collect cached news in json or normal format
+    4. send result to the client
+    '''
+    req = request.get_json()
+    if(req['tohtml'] or req['topdf']):
+        news = get_cache.createHtmlFromDB(req['limit'], req['tohtml'], req['topdf'], req['date'])
+    else:
+        news = get_cache.collectNewsFromDB(req['limit'], req['tojson'], req['color'], req['date'])
+    return sendResponse(req, news)
+
+
+def sendResponse(req, news):
+    '''
+    1. send pdf file as attachment if there is 'topdf' in request
+       (news is the file name, req['topdf'] is the folder)
+    2. or send news in response body
+    '''
+    # abort is imported here: it is used below but was never imported in this module
+    from flask import abort
+
+    if(req['topdf']):
+        try:
+            return send_from_directory(req['topdf'], filename=news, as_attachment=True)
+        except FileNotFoundError:
+            abort(404)
+    else:
+        return Response(news)
+
+
+@app.route('/version/', methods=['GET', 'POST'])
+def getVersion():
+    '''return version of the application'''
+    # request body is not needed here, so it is not parsed
+    return version.VERSION
+
+
+@app.route('/verbose/', methods=['GET', 'POST'])
+def setLogging():
+    '''
+    1. turn on verbose logging on server
+    2. return empty response
+    '''
+    logg.makeVerbose()
+    # view must return a response: flask raises an error if view returns None
+    return ''
+
+
+if __name__ == '__main__':
+    app.run()
diff --git a/final_task/rss_reader/client/__init__.py b/final_task/rss_reader/client/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/final_task/rss_reader/client/arg_parser.py b/final_task/rss_reader/client/arg_parser.py
new file mode 100644
index 0000000..47e4f9a
--- /dev/null
+++ b/final_task/rss_reader/client/arg_parser.py
@@ -0,0 +1,19 @@
+import argparse
+
+
+def createArgparser(vers):
+    '''Add argument commands'''
+    arguments = argparse.ArgumentParser(description='Pure Python command-line RSS reader')
+
+    arguments.add_argument('source', type=str, nargs='?', help='RSS URL')
+    arguments.add_argument('--version', action='version', version=f'{vers}',
+                           help='Print version info')
+    arguments.add_argument('--json', action='store_true', help='Print result as JSON in stdout')
+    arguments.add_argument('--verbose',
action='store_true', help='Outputs verbose') + arguments.add_argument('--limit', action='store', type=int, help='Limit news topics') + arguments.add_argument('--date', action='store', help='Print news from the specified day') + arguments.add_argument('--tohtml', action='store', help='Convert news in html format') + arguments.add_argument('--topdf', action='store', help='Convert news in pdf format') + arguments.add_argument('--colorize', action='store_true', help='print news in colorized mode') + + return arguments.parse_args() diff --git a/final_task/rss_reader/client/logg.py b/final_task/rss_reader/client/logg.py new file mode 100644 index 0000000..6671da3 --- /dev/null +++ b/final_task/rss_reader/client/logg.py @@ -0,0 +1,19 @@ +import logging +import sys + + +# Set basic configs for logging +stdoutHandler = logging.StreamHandler(sys.stdout) +fileHandler = logging.FileHandler("parser.log", "a", encoding="utf-8") +logging.basicConfig(format=u'%(levelname)-8s [%(asctime)s] %(message)s', + level=logging.DEBUG, + handlers=[fileHandler]) + + +def makeVerbose(): + ''' + 1. 
print logs in stdout if there is --verbose argument
+    '''
+    stderrLogger = logging.StreamHandler()
+    stderrLogger.setFormatter(logging.Formatter(logging.BASIC_FORMAT))
+    logging.getLogger().addHandler(stderrLogger)
diff --git a/final_task/rss_reader/client/rss_reader.py b/final_task/rss_reader/client/rss_reader.py
new file mode 100644
index 0000000..19f82f1
--- /dev/null
+++ b/final_task/rss_reader/client/rss_reader.py
@@ -0,0 +1,112 @@
+from urllib.request import Request, urlopen
+from datetime import datetime
+from colored import stylize
+import arg_parser
+import feedparser
+import requests
+import colored
+import html
+import sys
+import os
+
+import logg
+
+
+def _saveOrPrint(response, args):
+    '''
+    1. save response as html file if there is --tohtml argument
+    2. or save response as pdf file if there is --topdf argument
+    3. or print news in stdout
+    '''
+    if(args.tohtml):
+        saveHTML(response.text, args.tohtml)
+    elif(args.topdf):
+        # pdf is binary data: raw bytes of the response are written,
+        # decoding them to text and encoding back to utf-8 corrupts the file
+        savePDF(response.content, args.topdf)
+    else:
+        print(response.text)
+
+
+def main(version):
+    '''
+    1. parse command-line arguments
+    2. turn on verbose logging on client and server if there is --verbose argument
+    3. request cached news if there is --date argument, otherwise news from rss url
+    4. save or print the result
+    '''
+    args = arg_parser.createArgparser(version)
+
+    if (args.verbose):
+        logg.makeVerbose()
+        requests.get('http://127.0.0.1:5000/verbose/')
+
+    if(args.colorize):
+        color = [colored.fg(150), colored.fg(50), colored.fg(189)]
+    else:
+        color = [colored.attr('reset'), colored.attr('reset'), colored.attr('reset')]
+
+    params = {'limit':args.limit, 'tojson': args.json,
+              'tohtml':args.tohtml, 'topdf':args.topdf, 'color':color}
+
+    if (args.date):
+        params['date'] = args.date
+        r = requests.get('http://127.0.0.1:5000/getcache/', json=params)
+        _saveOrPrint(r, args)
+    else:
+        try:
+            checkConnection(args.source)
+            params['url'] = args.source
+            r = requests.get('http://127.0.0.1:5000/print/', json=params)
+            _saveOrPrint(r, args)
+        except Exception as e:
+            logg.logging.error("Connection error: " + str(e))
+            print("Connection error: ", e)
+
+
+def saveHTML(html_document, html_path):
+    '''
+    1. create folder with html file
+    2.
write html structure in file
+    '''
+    if not os.path.exists(html_path):
+        os.makedirs(html_path)
+    time_name = datetime.strftime(datetime.now(), "%H%M%S")
+    file_name = 'NewsFeed' + '-' + time_name + '.html'
+    html_file = os.path.join(html_path, file_name)
+
+    with open(html_file, 'w', encoding='utf-8') as f:
+        f.write(str(html_document))
+
+
+def savePDF(doc, pdf_path):
+    '''
+    1. create folder for pdf file if it does not exist
+    2. write pdf bytes in file NewsFeed-HHMMSS.pdf in this folder
+    '''
+    # exist_ok: no error if the folder is created between check and creation
+    os.makedirs(pdf_path, exist_ok=True)
+    time_name = datetime.strftime(datetime.now(), "%H%M%S")
+    file_name = 'NewsFeed' + '-' + time_name + '.pdf'
+    pdf_file = os.path.join(pdf_path, file_name)
+
+    with open(pdf_file, "w+b") as resultFile:
+        resultFile.write(doc)
+
+
+def checkConnection(source):
+    '''
+    1. open rss url to check that website is available
+    2. raise exception if url can not be opened
+    '''
+    # response is closed when the block ends; original exception is raised as is
+    # (it is still an Exception for the caller, but its type is not lost)
+    with urlopen(Request(source)):
+        logg.logging.info('Website is working')
+
+
+if __name__ == "__main__":
+    # Check connection to server
+    try:
+        version = (requests.get('http://127.0.0.1:5000/version/')).text
+        main(version)
+    except requests.exceptions.ConnectionError as error:
+        print("ConnectionError: " + str(error))
+        logg.logging.error("ConnectionError: " + str(error))
diff --git a/final_task/rss_reader/collect_news.py b/final_task/rss_reader/collect_news.py
new file mode 100644
index 0000000..c2fa853
--- /dev/null
+++ b/final_task/rss_reader/collect_news.py
@@ -0,0 +1,51 @@
+import feedparser
+import html
+
+from . import logg, converter, news_parser
+
+
+def collectNews(limit, tojson, tohtml, topdf, color, source):
+    '''
+    1. cache news
+    2. create html or pdf document
+    3.
or return news in json or normal format
+    '''
+    news = list()
+
+    channel = feedparser.parse(source)
+    news.append(color[0] + "Feed: " + channel.feed.title + '\n')
+    limit = limit or len(channel.entries)
+
+    news_parser.cacheNews(channel)
+
+    if (tohtml or topdf):
+        html_doc = converter.createHtmlStructure(channel, limit, tohtml, topdf)
+        return html_doc
+    else:
+        for index, item in enumerate(channel.entries):
+            if (index == limit):
+                break
+
+            if(index%2==0):
+                news.append(color[1])
+            else:
+                news.append(color[2])
+
+            logg.createLogs(item)
+
+            if (tojson):
+                news.append(news_parser.intoJson(item))
+            else:
+                news.append("\nTitle: " + html.unescape(item.title))
+                news.append("\nDate: " + item.published)
+                news.append("\nLink: " + item.link + '\n')
+                description = news_parser.getDescription(item.description)
+                if(description):
+                    news.append(color[0] + "Description: " + description + '\n')
+                news.append(color[1] + "Links:" + "\n[1]: " + item.link + "(link)")
+                media_content = news_parser.checkMediaContent(item)
+                if(media_content):
+                    news.append("\n[2]: " + media_content + '\n')
+
+        return news
+
diff --git a/final_task/rss_reader/config.py b/final_task/rss_reader/config.py
new file mode 100644
index 0000000..e8963fa
--- /dev/null
+++ b/final_task/rss_reader/config.py
@@ -0,0 +1,20 @@
+from configparser import ConfigParser
+
+
+def config(filename='database.ini', section='docker'):
+    '''
+    1. read config file
+    2. return parameters of the section as a dict (default section is docker)
+    3. raise Exception if there is no such section in the file
+    '''
+    parser = ConfigParser()
+    parser.read(filename)
+
+    if not parser.has_section(section):
+        raise Exception('Section {0} not found in the {1} file'.format(section, filename))
+
+    # parameters are not printed: they contain the database password
+    return dict(parser.items(section))
diff --git a/final_task/rss_reader/converter.py b/final_task/rss_reader/converter.py
new file mode 100644
index 0000000..1beee14
--- /dev/null
+++ b/final_task/rss_reader/converter.py
@@ -0,0 +1,33 @@
+from
dominate.tags import div, h2, img, p, link
+import dominate
+import html
+
+from . import news_parser, topdf
+
+
+def createHtmlStructure(channel, limit, html_path, pdf_path):
+    '''
+    1. add a block with title, link, image and description for every news (not more than limit)
+    2. return html document as a string if html_path is set
+    3. or convert document to pdf and return name of the pdf file if pdf_path is set
+    '''
+    document = dominate.document(title="HTML document")
+
+    for position, entry in enumerate(channel.entries):
+        if position == limit:
+            break
+
+        # values are prepared before the tags are created, order of the tags stays the same
+        image_url = news_parser.checkMediaContent(entry)
+        text = news_parser.getDescription(entry.description)
+
+        with document:
+            with div():
+                h2("Title: " + html.unescape(entry.title))
+                p("Link: " + entry.link)
+                if image_url:
+                    img(src=image_url)
+                if text:
+                    p("Description: " + text)
+
+    if html_path:
+        return str(document)
+    if pdf_path:
+        return topdf.convertHtmlToPdf(str(document), pdf_path)
diff --git a/final_task/rss_reader/database.ini b/final_task/rss_reader/database.ini
new file mode 100644
index 0000000..b6128d3
--- /dev/null
+++ b/final_task/rss_reader/database.ini
@@ -0,0 +1,12 @@
+[postgresql]
+database=postgres
+user=postgres
+password=rssreader
+host=localhost
+port=5432
+[docker]
+database=postgres
+user=postgres
+password=rssreader
+host=db
+port=5432
\ No newline at end of file
diff --git a/final_task/rss_reader/docker-compose.yml b/final_task/rss_reader/docker-compose.yml
new file mode 100644
index 0000000..7a3644e
--- /dev/null
+++ b/final_task/rss_reader/docker-compose.yml
@@ -0,0 +1,29 @@
+version: '3.7'
+
+services:
+
+  flask:
+    build: .
+    restart: always
+    container_name: flask_app
+    ports:
+      - 5000:5000
+    volumes:
+      - .:/rss_reader_app
+    # env_file:
+    #   - ./.env.dev
+    depends_on:
+      - db
+    links:
+      - db
+
+  db:
+    image: postgres:12.1-alpine
+    restart: always
+    ports:
+      - 5432:5432
+    volumes:
+      - 'pg_data:/var/lib/postgresql/data'
+
+volumes:
+  pg_data:
\ No newline at end of file
diff --git a/final_task/rss_reader/dockerfile b/final_task/rss_reader/dockerfile
new file mode 100644
index 0000000..0715940
--- /dev/null
+++ b/final_task/rss_reader/dockerfile
@@ -0,0 +1,19 @@
+FROM python:3.8.0-slim
+
+WORKDIR /rss_reader_app
+
+# set environment variables
+ENV PYTHONDONTWRITEBYTECODE 1
+ENV PYTHONUNBUFFERED 1
+
+# install psycopg2 dependencies
+RUN apt update \
+    && apt install -y libpq-dev gcc python3-dev
+
+COPY ./requirements.txt /rss_reader_app/requirements.txt
+RUN pip install -r requirements.txt
+
+# copy project
+COPY . /rss_reader_app
+
+CMD [ "flask", "run" , "--host=0.0.0.0"]
\ No newline at end of file
diff --git a/final_task/rss_reader/get_cache.py b/final_task/rss_reader/get_cache.py
new file mode 100644
index 0000000..1553d2d
--- /dev/null
+++ b/final_task/rss_reader/get_cache.py
@@ -0,0 +1,109 @@
+from dominate.tags import div, h2, img, p, link
+from datetime import datetime
+import dominate
+import psycopg2
+import base64
+import time
+import json
+
+from . import logg, converter, news_parser, topdf
+from . import config
+
+
+def dateToStamp(arg_date):
+    '''
+    1. convert --date into timestamp
+    '''
+    arg_date = str(arg_date)
+    arg_date = time.mktime(datetime.strptime(arg_date, '%Y%m%d').timetuple())
+    return arg_date
+
+
+def getCacheFromDB(arg_date):
+    '''
+    1. connect to database
+    2. select from table news with published date equals --date
+    3. return selected rows (title, link, image, description)
+    4. or return empty list if the query failed
+    '''
+    # timedelta is imported here: the module imports only datetime class
+    from datetime import timedelta
+
+    # next calendar day is the end of the interval; adding 1 to the number YYYYMMDD
+    # gives an invalid date on the last day of a month (20191130 + 1 = 20191131)
+    next_date = datetime.strptime(str(arg_date), '%Y%m%d') + timedelta(days=1)
+
+    con = None
+    try:
+        params = config.config()
+
+        with psycopg2.connect(**params) as con:
+            with con.cursor() as cur:
+                cur.execute('''SELECT title, link, image, description FROM news WHERE pub_date_stamp >= %s and pub_date_stamp < %s''',
+                            (dateToStamp(arg_date), dateToStamp(next_date.strftime('%Y%m%d'))))
+
+                records = cur.fetchall()
+                return records
+
+    except psycopg2.ProgrammingError as e:
+        print("psycopg2.ProgrammingError: " + str(e))
+        logg.logging.error(str(e))
+    finally:
+        if con is not None:
+            con.close()
+            logg.logging.info("Database connection closed")
+
+    # query failed: callers iterate over the result, so empty list is returned instead of None
+    return []
+
+
+def collectNewsFromDB(limit, tojson, color, arg_date):
+    '''
+    1. create list for news
+    2. collect all cache news from db in list
+    '''
+    records = getCacheFromDB(arg_date)
+
+    news = list()
+    news.append(color[0] + 'Cache News: ')
+
+    for index, row in enumerate(records):
+        if(limit and index == limit):
+            break
+
+        if(index%2==0):
+            news.append(color[1])
+        else:
+            news.append(color[2])
+
+        if (tojson):
+            json_news = {
+                'Title: ': row[0],
+                'Link: ': row[1],
+            }
+            if(row[3]):
+                json_news['Description'] = row[3]
+            news.append(json.dumps(json_news))
+        else:
+            news.append("\nTitle: " + row[0])
+            news.append("\nLink: " + row[1] + '\n')
+            if (row[3]):
+                news.append(color[0] + "Description: " + row[3] + '\n')
+    return news
+
+
+def createHtmlFromDB(limit, html_path, pdf_path, arg_date):
+    '''
+    1. in loop create html structure
+    2.
create html document or convert html structure into pdf
+    '''
+    records = getCacheFromDB(arg_date)
+
+    html_document = dominate.document(title="HTML document")
+
+    for index, row in enumerate(records):
+        if(limit and index == limit):
+            break
+        with html_document:
+            with div():
+                h2("Title: " + row[0])
+                p("Link: " + row[1])
+                if (row[2]):
+                    img(src="data:image/jpg;base64," + base64.b64encode(row[2]).decode('ascii'))
+                if (row[3]):
+                    p("Description: " + row[3])
+
+    if (html_path):
+        return str(html_document)
+    elif (pdf_path):
+        return topdf.convertHtmlToPdf(str(html_document), pdf_path)
diff --git a/final_task/rss_reader/logg.py b/final_task/rss_reader/logg.py
new file mode 100644
index 0000000..6a9bef1
--- /dev/null
+++ b/final_task/rss_reader/logg.py
@@ -0,0 +1,33 @@
+import logging
+import sys
+
+from . import news_parser
+
+
+# Set basic configs for logging
+stdoutHandler = logging.StreamHandler(sys.stdout)
+fileHandler = logging.FileHandler("parser.log", "a", encoding="utf-8")
+logging.basicConfig(format=u'%(levelname)-8s [%(asctime)s] %(message)s',
+                    level=logging.DEBUG,
+                    handlers=[fileHandler])
+
+
+def makeVerbose():
+    '''
+    1. print logs in console (stderr) if there is --verbose argument
+    2. do nothing if console handler was already added
+    '''
+    rootLogger = logging.getLogger()
+
+    # function is called on every request to /verbose/: without this check
+    # each call adds one more handler and every record is printed several times
+    if any(getattr(handler, 'rssVerbose', False) for handler in rootLogger.handlers):
+        return
+
+    stderrLogger = logging.StreamHandler()
+    # mark the handler to recognize it on the next call
+    stderrLogger.rssVerbose = True
+    stderrLogger.setFormatter(logging.Formatter(logging.BASIC_FORMAT))
+    rootLogger.addHandler(stderrLogger)
+
+
+def createLogs(item):
+    '''
+    1.
log news in log file
+    '''
+    logging.debug("Title: " + str(item.title))
+    logging.debug("Date: " + str(item.published))
+    logging.debug("Link: " + str(item.link))
+    logging.debug("Description: " + news_parser.getDescription(item.description))
+    logging.debug("Links:"+"\n[1]: " + str(item.link) +
+                  "(link)\n[2]: " + str(news_parser.checkMediaContent(item)) + '\n')
\ No newline at end of file
diff --git a/final_task/rss_reader/news_parser.py b/final_task/rss_reader/news_parser.py
new file mode 100644
index 0000000..ef96d4f
--- /dev/null
+++ b/final_task/rss_reader/news_parser.py
@@ -0,0 +1,108 @@
+from email.utils import parsedate_to_datetime
+from contextlib import closing
+from bs4 import BeautifulSoup
+from datetime import datetime
+import requests
+import sqlite3
+import psycopg2
+import json
+import time
+import html
+import os
+
+from . import logg
+from . import config
+
+
+def checkMediaContent(item):
+    '''
+    1. return url of the first media content of the news
+    2. or url of the first media thumbnail
+    3. or empty string if there is no media
+    '''
+    available = item.keys()
+    if 'media_content' in available:
+        return item.media_content[0]['url']
+    if 'media_thumbnail' in available:
+        return item.media_thumbnail[0]['url']
+    return ''
+
+
+def getDescription(description):
+    '''return text of description, html tags are removed'''
+    soup = BeautifulSoup(description, features="html.parser")
+    return soup.getText()
+
+
+def intoJson(item):
+    '''
+    1. collect title, date and link of the news in a dict
+    2. add description and media link if they are not empty
+    3. return the dict as a json string
+    '''
+    text = getDescription(item.description)
+    media_url = checkMediaContent(item)
+
+    fields = {
+        'Title: ': html.unescape(item.title),
+        'Date: ': item.published,
+        'Link: ': item.link
+    }
+    if text:
+        fields['Description: '] = text
+    if media_url:
+        fields['Media link: '] = media_url
+
+    return json.dumps(fields)
+
+
+def cacheNews(channel):
+    '''
+    1. connect to database
+    2. create table in database
+    3.
insert news into table
+    '''
+    con = None
+    try:
+        params = config.config()
+
+        with psycopg2.connect(**params) as con:
+            with con.cursor() as cur:
+                cur.execute("""CREATE TABLE IF NOT EXISTS news
+                            (title text, link text, image bytea,
+                            description text, pub_date_stamp real,
+                            UNIQUE (title, link, pub_date_stamp))
+                            """)
+
+                news = insertNewsIntoTable(channel)
+                cur.executemany("INSERT INTO news VALUES (%s,%s,%s,%s,%s) ON CONFLICT DO NOTHING", news)
+                con.commit()
+                logg.logging.info("News cached into database")
+
+    except (Exception, psycopg2.DatabaseError) as e:
+        logg.logging.error(str(e))
+    finally:
+        if con is not None:
+            con.close()
+            logg.logging.info("Database connection closed")
+
+
+def insertNewsIntoTable(channel):
+    '''
+    1. create list of rows (title, link, image, description, timestamp) for table news
+    2. convert published date into timestamp
+    3. skip news without valid published date
+    4. download image of the news if there is media content
+    '''
+    news = list()
+
+    for item in channel.entries:
+
+        try:
+            pub_date_stamp = time.mktime(parsedate_to_datetime(item.published).timetuple())
+        except (AttributeError, TypeError, ValueError) as error:
+            # invalid date raises TypeError before python 3.10 and ValueError since 3.10,
+            # AttributeError is for news without published date;
+            # cached news are selected by timestamp, so news without it is not cached
+            logg.logging.error("Invalid published date: " + str(error))
+            continue
+
+        description = getDescription(item.description)
+        media_content = checkMediaContent(item)
+        image = ''
+
+        if (media_content):
+            try:
+                # timeout: one unavailable image must not block caching forever
+                response = requests.get(media_content, timeout=10)
+                image = psycopg2.Binary(response.content)
+            except Exception as error:
+                logg.logging.error("Exception: " + str(error))
+
+        row = (html.unescape(item.title), item.link, image, description, pub_date_stamp)
+        news.append(row)
+    return news
+
diff --git a/final_task/rss_reader/requirements.txt b/final_task/rss_reader/requirements.txt
index e69de29..a8b2ffe 100644
--- a/final_task/rss_reader/requirements.txt
+++ b/final_task/rss_reader/requirements.txt
@@ -0,0 +1,10 @@
+beautifulsoup4==4.8.1
+colored==1.4.0
+dominate==2.4.0
+feedparser==5.2.1
+requests==2.22.0
+urllib3==1.25.6
+xhtml2pdf==0.2.3
+argparse==1.4.0
+psycopg2==2.8.4
+flask==1.1.1
diff --git a/final_task/rss_reader/topdf.py b/final_task/rss_reader/topdf.py
new file mode 100644
index 0000000..9e7518b
---
/dev/null
+++ b/final_task/rss_reader/topdf.py
@@ -0,0 +1,49 @@
+from xhtml2pdf import pisa
+from datetime import datetime
+import urllib.request
+import urllib.parse
+import urllib
+import html
+import cgi
+import sys
+import os
+
+
+cgi.escape = html.escape
+
+
+def splithost_polyfill(url):
+    '''This function replaces deprecated splithost function.
+    URL is split into components; host is returned as is (None if URL has no host),
+    path is returned together with query and fragment if they are present'''
+    parts = urllib.parse.urlsplit(url)
+    host = parts.netloc or None
+    path = parts.path
+    if parts.query:
+        path += '?' + parts.query
+    if parts.fragment:
+        path += '#' + parts.fragment
+    return host, path
+
+
+def convertHtmlToPdf(html_document, pdf_path):
+    '''
+    1. set custom function splithost_polyfill as urllib.splithost and urllib.request.splithost
+    (splithost is deprecated, see splithost_polyfill)
+    2. create folder pdf_path if it does not exist
+    3. convert HTML to PDF and write it in file NewsFeed-HHMMSS.pdf
+    4. return name of the pdf file (without folder)
+    '''
+    urllib.splithost = splithost_polyfill
+    urllib.request.splithost = splithost_polyfill
+
+    # exist_ok: no error if the folder is created between check and creation
+    os.makedirs(pdf_path, exist_ok=True)
+    time_name = datetime.strftime(datetime.now(), "%H%M%S")
+    file_name = 'NewsFeed' + '-' + time_name + '.pdf'
+    pdf_file = os.path.join(pdf_path, file_name)
+
+    with open(pdf_file, "w+b") as resultFile:
+        # NOTE(review): result of CreatePDF is not checked, file name is returned
+        # even if conversion reported errors - confirm that it is intended
+        pisa.CreatePDF(
+            html_document,
+            dest=resultFile)
+    return file_name
diff --git a/final_task/rss_reader/version.py b/final_task/rss_reader/version.py
new file mode 100644
index 0000000..ef730b2
--- /dev/null
+++ b/final_task/rss_reader/version.py
@@ -0,0 +1 @@
+VERSION = '0.6'
\ No newline at end of file
diff --git a/final_task/setup.py b/final_task/setup.py
index e69de29..fbeefeb 100644
--- a/final_task/setup.py
+++ b/final_task/setup.py
@@ -0,0 +1,28 @@
+from setuptools import setup, find_packages
+import os
+from rss_reader import version
+
+
+this_directory =
os.path.abspath(os.path.dirname(__file__))
+with open(os.path.join(this_directory, 'README.md'), encoding='utf-8') as fh:
+    long_description = fh.read()
+
+
+setup(
+    name="rss_reader",
+    version=version.VERSION,
+    packages=find_packages(),
+    install_requires=['argparse','beautifulsoup4','dominate','feedparser', 'urllib3', 'xhtml2pdf', 'colored', 'requests', 'psycopg2','flask'],
+    author="ilya khonenko",
+    author_email="honenkoi@gmail.com",
+    url="https://github.com/kingofmidas",
+    description="This is rss-reader",
+    long_description = long_description,
+    long_description_content_type="text/markdown",
+    classifiers=[
+        "Programming Language :: Python :: 3",
+        "Operating System :: OS Independent",
+    ],
+    keywords="rss reader",
+    python_requires='>=3.8',
+)
diff --git a/final_task/tests/collectnews_test.py b/final_task/tests/collectnews_test.py
new file mode 100644
index 0000000..b2ff47c
--- /dev/null
+++ b/final_task/tests/collectnews_test.py
@@ -0,0 +1,27 @@
+import unittest
+from unittest.mock import patch
+from feedparser import parse
+from rss_reader import collect_news
+from expected_results import normal_result, json_result, html_result
+
+
+class TestParser(unittest.TestCase):
+    '''Tests for collect_news.collectNews'''
+    def setUp(self):
+        # rss document with one news, it is used as a source for collectNews
+        # NOTE(review): there are no xml tags in the string in this copy of the patch,
+        # looks like the markup was lost - confirm against the real test file
+        self.item = '' \
+            'Yahoo News - Latest News & Headlines' \
+            'item_title' \
+            'item_description' \
+            'item_linkitem_date' \
+            'media_link'
+        self.parse_item = parse(self.item)
+
+
+    def testCollectNews(self):
+        '''check normal, json and html output of collectNews for one news'''
+        # NOTE(review): database is not mocked here, collectNews calls
+        # news_parser.cacheNews for every call - confirm it is intended
+        self.assertEqual(collect_news.collectNews(1, False, '', '', ['','',''], self.item), normal_result)
+        self.assertEqual(collect_news.collectNews(1, True, '', '', ['','',''], self.item)[2], json_result)
+        self.assertEqual(collect_news.collectNews(1, False, 'tohtml', '', ['','',''], self.item), html_result)
+
+
+if __name__=='__main__':
+
+    unittest.main()
\ No newline at end of file
diff --git a/final_task/tests/expected_results.py b/final_task/tests/expected_results.py
new file mode 100644
index 0000000..c66138b
--- /dev/null
+++
b/final_task/tests/expected_results.py @@ -0,0 +1,35 @@ +normal_result = ['Feed: Yahoo News - Latest News & Headlines\n', '', '\nTitle: item_title', '\nDate: item_date', '\nLink: item_link\n', 'Description: item_description\n', 'Links:\n[1]: item_link(link)', '\n[2]: media_link_url\n'] + + +json_result = '{"Title: ": "item_title", "Date: ": "item_date", "Link: ": "item_link", "Description: ": "item_description", "Media link: ": "media_link_url"}' + + +html_result = ''' + + + HTML document + + +
+

Title: item_title

+

Link: item_link

+ +

Description: item_description

+
+ +''' + + +html_result_from_db = ''' + + + HTML document + + +
+

Title: item_title

+

Link: item_link

+

Description: item_description

+
+
+'''
diff --git a/final_task/tests/getcache_test.py b/final_task/tests/getcache_test.py
new file mode 100644
index 0000000..7019316
--- /dev/null
+++ b/final_task/tests/getcache_test.py
@@ -0,0 +1,62 @@
+import unittest
+from unittest.mock import patch
+from rss_reader import get_cache
+from expected_results import html_result_from_db
+
+
+class TestParser(unittest.TestCase):
+    '''Tests for get_cache module, database is replaced with mocks'''
+
+    def testGetCacheFromDB(self):
+        '''check that rows returned by the cursor are returned by getCacheFromDB'''
+        with patch("psycopg2.connect") as mock_connect:
+            expected = ['title', 'link', 'image', 'description']
+
+            # connection and cursor are used as context managers in getCacheFromDB
+            mock_con_cm = mock_connect.return_value
+            mock_con = mock_con_cm.__enter__.return_value
+
+            mock_cur_cm = mock_con.cursor.return_value
+            mock_cur = mock_cur_cm.__enter__.return_value
+
+            mock_cur.fetchall.return_value = expected
+
+            result = get_cache.getCacheFromDB('20191125')
+            self.assertEqual(result, expected)
+
+            # NOTE(review): host='localhost' is in section [postgresql] of database.ini,
+            # but config.config() reads section 'docker' (host=db) by default - confirm
+            mock_connect.assert_called_with(database="postgres",user='postgres',password='rssreader',host='localhost',port='5432')
+
+
+    def testCollectNewsFromDB(self):
+        '''check normal and json output of collectNewsFromDB for one cached news'''
+        with patch('rss_reader.get_cache.getCacheFromDB') as mock_cache:
+            mock_cache.return_value = [('title', 'link', 'image', 'description')]
+
+            expected = ['Cache News: ', '', '\nTitle: title', '\nLink: link\n', 'Description: description\n']
+
+            result = get_cache.collectNewsFromDB(1, False, ['','',''], '20191125')
+            self.assertEqual(result, expected)
+
+            expectedJson = ['Cache News: ', '', '{"Title: ": "title", "Link: ": "link", "Description": "description"}']
+
+            result = get_cache.collectNewsFromDB(1, True, ['','',''], '20191125')
+            self.assertEqual(result, expectedJson)
+
+
+    def testCreateHtmlFromDB(self):
+        '''check html document created from one cached news without image'''
+        with patch('rss_reader.get_cache.getCacheFromDB') as mock_cache:
+            mock_cache.return_value = [('item_title', 'item_link', '', 'item_description')]
+
+            expected = html_result_from_db
+
+            result = get_cache.createHtmlFromDB(1, 'html_path', '', '20191125')
+            self.assertEqual(result, expected)
+
+
+    def testDateToStamp(self):
+        '''check conversion of --date value into timestamp'''
+        # NOTE(review): 1574629200.0 is 2019-11-25 00:00 in UTC+3, dateToStamp uses
+        # time.mktime (local time) - presumably the test passes only in this timezone
+        expected = 1574629200.0
+        result = get_cache.dateToStamp('20191125')
+
+        self.assertEqual(result, expected)
+
+
+if __name__=='__main__':
+
+    unittest.main()
+
diff --git a/final_task/tests/parser.log b/final_task/tests/parser.log
new file mode 100644
index 0000000..fcd70ed
--- /dev/null
+++ b/final_task/tests/parser.log
@@ -0,0 +1,20 @@
+ERROR [2019-11-27 01:01:27,452] cannot unpack non-iterable NoneType object
+DEBUG [2019-11-27 01:01:27,453] Title: item_title
+DEBUG [2019-11-27 01:01:27,453] Date: item_date
+DEBUG [2019-11-27 01:01:27,453] Link: item_link
+DEBUG [2019-11-27 01:01:27,454] Description: item_description
+DEBUG [2019-11-27 01:01:27,454] Links:
+[1]: item_link(link)
+[2]: media_link_url
+
+ERROR [2019-11-27 01:01:27,659] cannot unpack non-iterable NoneType object
+DEBUG [2019-11-27 01:01:27,660] Title: item_title
+DEBUG [2019-11-27 01:01:27,660] Date: item_date
+DEBUG [2019-11-27 01:01:27,661] Link: item_link
+DEBUG [2019-11-27 01:01:27,661] Description: item_description
+DEBUG [2019-11-27 01:01:27,662] Links:
+[1]: item_link(link)
+[2]: media_link_url
+
+ERROR [2019-11-27 01:01:27,884] cannot unpack non-iterable NoneType object
+ERROR [2019-11-27 01:03:26,385] cannot unpack non-iterable NoneType object
diff --git a/final_task/tests/parser_test.py b/final_task/tests/parser_test.py
new file mode 100644
index 0000000..10a4c16
--- /dev/null
+++ b/final_task/tests/parser_test.py
@@ -0,0 +1,63 @@
+import unittest
+from unittest.mock import patch
+from feedparser import parse
+from rss_reader import converter, news_parser
+from expected_results import json_result, html_result
+
+
+class TestParser(unittest.TestCase):
+    '''Tests for news_parser and converter modules'''
+
+    def setUp(self):
+        # rss document with one news
+        # NOTE(review): there are no xml tags in the string in this copy of the patch,
+        # looks like the markup was lost - confirm against the real test file
+        self.item = '' \
+            'Yahoo News - Latest News & Headlines' \
+            'item_title' \
+            'item_description' \
+            'item_linkitem_date' \
+            'media_link'
+        self.parse_item = parse(self.item)
+
+
+    def testGetDescription(self):
+        '''check that description is returned as plain text'''
+        self.assertEqual(news_parser.getDescription(self.parse_item.entries[0].description), 'item_description')
+
+
+    def testJson(self):
+        '''check json string created for one news'''
+        self.assertEqual(news_parser.intoJson(self.parse_item.entries[0]), json_result)
+
+
+    def testCheckMediaContent(self):
+        '''check that url of media content is found'''
+        self.assertEqual(news_parser.checkMediaContent(self.parse_item.entries[0]), 'media_link_url')
+
+
+    def testHtml(self):
+        '''check html document created for one news'''
+        html_doc = converter.createHtmlStructure(self.parse_item, 1, 'html_path', '')
+        self.assertEqual(html_doc, html_result)
+
+
+    def testCacheNews(self):
+        '''check that cacheNews connects to database with mocked psycopg2.connect'''
+        with patch("psycopg2.connect") as mock_connect:
+            mock_con_cm = mock_connect.return_value
+            mock_con = mock_con_cm.__enter__.return_value
+
+            mock_cur_cm = mock_con.cursor.return_value
+            mock_cur = mock_cur_cm.__enter__.return_value
+
+            news_parser.cacheNews(self.parse_item)
+            # NOTE(review): host='localhost' is in section [postgresql] of database.ini,
+            # but config.config() reads section 'docker' (host=db) by default - confirm
+            mock_connect.assert_called_with(database="postgres",user='postgres',password='rssreader',host='localhost',port='5432')
+            # NOTE(review): called_with is not an assertion method of Mock, this line
+            # creates a new mock attribute and checks nothing (assert_called_with does the check)
+            mock_cur.executemany.called_with("INSERT INTO news VALUES (%s,%s,%s,%s,%s) ON CONFLICT DO NOTHING")
+
+
+    def testInsertNewsIntoTable(self):
+        '''check row created for one news without media content'''
+        item = '' \
+            'Yahoo News - Latest News & Headlines' \
+            'item_title' \
+            'item_description' \
+            'item_linkMon, 25 Nov 2019 13:04:03'
+        parse_item = parse(item)
+        news = news_parser.insertNewsIntoTable(parse_item)
+        # NOTE(review): 1574676243.0 is 13:04:03 in UTC+3, timestamp is made with
+        # time.mktime (local time) - presumably the test passes only in this timezone
+        self.assertEqual(news, [('item_title', 'item_link', '', 'item_description', 1574676243.0)])
+
+
+if __name__=='__main__':
+
+    unittest.main()
\ No newline at end of file
diff --git a/final_task/tests/topdf_test.py b/final_task/tests/topdf_test.py
new file mode 100644
index 0000000..c700f54
--- /dev/null
+++ b/final_task/tests/topdf_test.py
@@ -0,0 +1,30 @@
+import unittest
+from unittest.mock import patch
+from rss_reader import topdf
+from expected_results import html_result
+
+
+class TestParser(unittest.TestCase):
+    '''Tests for topdf module'''
+
+    @patch('rss_reader.topdf.datetime')
+    @patch('builtins.open')
+    def testConvertToPdf(self, mock_file, mock_time):
+        '''check name of the pdf file, time and open are mocked'''
+        # convertHtmlToPdf takes the time part of the name from datetime.strftime
+        mock_time.strftime.return_value = '191749'
+
+        result = topdf.convertHtmlToPdf(html_result, 'pdf_path/')
+        expected = 'NewsFeed-191749.pdf'
+
+        mock_file.assert_called_with('pdf_path/NewsFeed-191749.pdf', "w+b")
+        self.assertEqual(result, expected)
+
+
+    def testSplitHost(self):
+        '''check that url is split into host and path'''
+        url = 'https://news.yahoo.com/rss'
+        expected = ('news.yahoo.com', '/rss')
+        result = topdf.splithost_polyfill(url)
+        self.assertEqual(result, expected)
+
+
+if __name__=='__main__':
+
+    unittest.main()