diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..1264297 --- /dev/null +++ b/.gitignore @@ -0,0 +1,17 @@ +.vscode/ +__pycache__/ +jsonout.txt +*.pyc +.mypy_cache +build/ +dist/ +rss_reader.egg-info/ +*.json +*.pkl +*.pdf +*.html +temp-img-2250147051588681835.jpg +final_task/tests/xml/no_items_feed.xml +final_task/tests/xml/no_items_fields.xml +.coverage +.coveragerc \ No newline at end of file diff --git a/final_task/MANIFEST.in b/final_task/MANIFEST.in new file mode 100644 index 0000000..69ce37c --- /dev/null +++ b/final_task/MANIFEST.in @@ -0,0 +1 @@ +include rss_reader/Arial-Unicode-Regular.ttf diff --git a/final_task/README.md b/final_task/README.md index 7af281f..6ea183d 100644 --- a/final_task/README.md +++ b/final_task/README.md @@ -1,3 +1,90 @@ -# Your readme here -Some text. -Checkout how to write this file using *markdown*. +# Python RSS parser +*** +Yet another RSS parser +*** +# Quick start + +## Usage + usage: rss_reader.py [-h] [--version] [--json] [--verbose] [--limit LIMIT] [--date DATE] [source] + + RRS feed receiver + + positional arguments: + source URL for RSS feed + + optional arguments: + -h, --help show this help message and exit + --version prints version + --json converts news to JSON + --verbose output verbose status messages + --limit LIMIT determines the number of showed news. + --date DATE shows cached news at given date + --to_pdf TO_PDF coverts news to PDF. + --to_html TO_HTML coverts news to HTML. + + TO_PDF/TO_HTML - path to directory for file + The file name has the format feed-*current datetime*.*extension* + +If no news is found while using `--date` together with `--to_pdf` or `--to_html`, the conversion does not happen + +## Installation +1. Install setuptools + + pip install setuptools +2. Download source code +3. Unpack downloaded *.zip +4. Go to `FinalTaskRssParser-master/final_task` +5. In terminal execute: + + python setup.py sdist +6. Go to `/dist` directory +7. 
Execute `pip install rss_reader-1.5.tar.gz` + +Done! +To see help use + + rss-reader --help + +## JSON format + { + "description": "description", + "link": "link", + "news_list": [ + news_item, + news_item, + news_item, + ... + ], + "title": "title" + } + +news_item is represented as: + + { + "date": "date", + "description": "description", + "img": "base64 image", + "link": "link", + "media": "media", + "published": "published", + "source": "source", + "title": "title" + } +The Base64 string is pretty long, so it is shortened to `"base64 image"` when printed, but it is stored as a valid string in memory and in the cache +## Caching +TinyDB is used for caching. + +Items are stored in JSON format. +News is stored in `db.json` +##### Database item format + "id": { + "date": "date", + "img": "base64_representation_of_an_image", + "description": "description", + "link": "link", + "media": "media", + "published": "published", + "source": "source", + "title": "title" + }, +`date` is stored in the format `yyyymmdd` (for example `20031231`) \ No newline at end of file diff --git a/final_task/rss_reader/Arial-Unicode-Regular.ttf b/final_task/rss_reader/Arial-Unicode-Regular.ttf new file mode 100644 index 0000000..79b31cd Binary files /dev/null and b/final_task/rss_reader/Arial-Unicode-Regular.ttf differ diff --git a/final_task/rss_reader/__init__.py b/final_task/rss_reader/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/final_task/rss_reader/converters.py b/final_task/rss_reader/converters.py new file mode 100644 index 0000000..329ecd4 --- /dev/null +++ b/final_task/rss_reader/converters.py @@ -0,0 +1,191 @@ +from pkg_resources import resource_filename +import datetime +import logging +import os +import warnings +import platform + +import requests +from dominate import document +from dominate.tags import h1, h3, h5, p, a, div, img +from fpdf import FPDF + +from exceptions_ import ConvertionError + +FONT_BLACK = resource_filename(__name__, 'Arial-Unicode-Regular.ttf') + +LOGGER = 
def path_validation(path, mode):
    '''
    Validates the target directory and builds the path of the output file

    path - directory the exported file should be placed in
    mode - boolean that determines the file extension
        True - .html
        False - .pdf

    Returns the absolute path "<path>/feed-<current datetime><extension>"
    Raises ConvertionError('Wrong path') if path is not an existing directory
    '''
    extension = '.html' if mode else '.pdf'
    directory = os.path.abspath(path)
    LOGGER.debug('CHECKING PATH...')
    # isdir instead of exists: a regular file can't contain the output file
    if not os.path.isdir(directory):
        raise ConvertionError('Wrong path')
    LOGGER.debug('PATH IS OK')
    # The timestamp avoids spaces and colons, so the name is valid on every
    # OS. Only the file name is formatted this way - the directory part is
    # never rewritten, so directories containing spaces, dashes or colons
    # keep working.
    timestamp = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S.%f')
    return os.path.join(directory, 'feed-' + timestamp + extension)
def _add_pdf_image(pdf, news_item):
    '''
    Adds the image of a single news item to the PDF

    The image is downloaded from news_item['media'] into a temporary file
    by get_image_path. If downloading or embedding fails, an error text is
    written to the PDF instead. The temporary file is always removed.
    '''
    img_path = None
    try:
        img_path = get_image_path(news_item['media'])
        pdf.image(img_path, x=20)
    except (requests.Timeout, requests.TooManyRedirects, requests.ConnectionError) as exc:
        print(str(exc))
        pdf.set_x(20)
        pdf.multi_cell(150, 6, str(exc))
    except Exception as exc:
        print(str(exc))
        pdf.set_x(20)
        pdf.multi_cell(150, 6, 'Image error')
    finally:
        # img_path stays None when the download itself failed, so there is
        # no temporary file to clean up in that case
        if img_path is not None and os.path.exists(img_path):
            os.remove(img_path)


def get_pdf_doc(news_list):
    '''
    Converts news to .pdf

    news_list - is a list of dicts with the keys
    title, img, media, description, link and source

    Returns the filled FPDF object (it is not written to disk here)
    '''
    LOGGER.debug('CONVERTING TO PDF')
    pdf = FPDF(format='A4')
    LOGGER.debug('SETIING FONTS')
    pdf.add_font("ArialUni", style="", fname=FONT_BLACK, uni=True)
    pdf.add_font("ArialUni", style='B', fname=FONT_BLACK, uni=True)
    pdf.set_font("ArialUni", 'B', size=24)
    pdf.add_page()
    pdf.set_xy(0, 0)
    pdf.cell(50, 30, txt='News Feed:', ln=1, align='L')
    for news_item in news_list:
        # The heading above uses the bold 24pt font, every item uses 12pt
        pdf.set_font("ArialUni", '', size=12)
        pdf.set_x(4)
        pdf.cell(20, 6, 'Title:', ln=1)
        pdf.set_x(20)
        pdf.multi_cell(150, 5, news_item['title'])
        pdf.set_x(4)
        pdf.cell(20, 6, 'Image:', ln=1)
        # Image adding
        LOGGER.debug('IMAGE ADDING')
        if news_item['img'] is None:
            LOGGER.debug('IMAGE IS NONE')
            pdf.set_x(20)
            pdf.cell(20, 6, 'No image', ln=1)
            pdf.set_x(4)
        else:
            _add_pdf_image(pdf, news_item)
        # End image adding
        pdf.set_x(4)
        pdf.cell(20, 6, 'Description:', ln=1)
        pdf.set_x(20)
        if not news_item['description']:
            pdf.multi_cell(150, 5, news_item['title'])
        else:
            pdf.multi_cell(150, 5, news_item['description'])
        pdf.set_x(4)
        pdf.cell(20, 6, 'LINK', link=news_item['link'], ln=1)
        pdf.set_x(4)
        pdf.cell(20, 6, 'Source: ' + news_item['source'], link=news_item['source'], ln=1)
        # Separator line between news items
        pdf.cell(0, 10, '='*85, align='C', ln=1)
    return pdf
class FeedError(Exception):
    '''
    Base error of the reader: raised when something is wrong with the feed
    '''


class InvalidArgs(FeedError):
    '''
    Raised when the command line arguments can not be used
    '''


class ConvertionError(FeedError):
    '''
    Raised when news can not be exported to a file
    '''
def to_json(self):
    '''
    Prints the feed as a JSON string

    jsonpickle is used because news_list is a list of class instances.
    The standard json lib is loaded as jsonpickle backend and configured
    to produce an indented string.
    '''
    LOGGER.debug('FORMATTING FEED TO JSON')
    jsonpickle.load_backend('json', 'dumps', 'loads')
    jsonpickle.set_preferred_backend('json')
    # ensure_ascii = False to solve encoding problems
    encoder_options = {'indent': 4, 'sort_keys': False, 'ensure_ascii': False}
    jsonpickle.set_encoder_options('json', **encoder_options)
    encoded_feed = jsonpickle.encode(self, make_refs=False, unpicklable=False)
    # The long base64 image string is replaced with a short placeholder
    shortened_feed = re.sub(r'(\"img\":\ )\"b\'.*?\'', r'\1"base64 image', encoded_feed)
    # Unescaping
    escaped_bytes = encode(shortened_feed, 'latin-1', 'backslashreplace')
    unescaped_feed = decode(escaped_bytes, 'unicode-escape')
    LOGGER.debug('PRINTING JSON')
    print(unescaped_feed)
def get_news_as_dicts(self, limit):
    '''
    Returns the first `limit` news items converted to dicts

    limit - maximum number of items. None, a negative value or a value
    larger than the number of news means "all news"

    self.news_list is not modified, so the feed can still be printed or
    cached completely after this call
    '''
    if limit is None or limit < 0:
        limit = len(self.news_list)
    # Slicing past the end of the list is safe and returns all items
    return [news_item.asdict() for news_item in self.news_list[:limit]]
\n\t|| LINK: {Fore.BLUE + self.link + Fore.RESET}\ + \n\t|| MEDIA: {Fore.YELLOW + self.media + Fore.RESET}' + + def to_json(self): + jsonpickle.load_backend('json', 'dumps', 'loads') + jsonpickle.set_preferred_backend('json') + # ensure_ascii = False to solve encoding problems + jsonpickle.set_encoder_options('json', indent=4, sort_keys=False, ensure_ascii=False) + json_string = jsonpickle.encode(self, make_refs=False, unpicklable=False) + # Regex finds base64 string and replaces it + json_string = re.sub(r'(\"img\":\ )\"b\'.*?\'', r'\1"base64 image', json_string) + # Unescaping + json_string = decode(encode(json_string, 'latin-1', 'backslashreplace'), 'unicode-escape') + print(json_string) + + def asdict(self): + return asdict(self) diff --git a/final_task/rss_reader/rss_reader.py b/final_task/rss_reader/rss_reader.py index e69de29..099adb2 100644 --- a/final_task/rss_reader/rss_reader.py +++ b/final_task/rss_reader/rss_reader.py @@ -0,0 +1,209 @@ +import argparse +import base64 +import html +import logging +import os +import sys + +import feedparser +import requests +from bs4 import BeautifulSoup +from tinydb import TinyDB, where + +current_dir = os.path.abspath(os.path.dirname(__file__)) +sys.path.append(current_dir) +from converters import to_html, to_pdf +from exceptions_ import FeedError, InvalidArgs, ConvertionError +from rss_feed import RssFeed +from rss_item import RssItem + +# common logger init +LOGGER = logging.getLogger('rss_logger') + + +def init_parser(): + # I really dunno how to test this + parser = argparse.ArgumentParser(description='RRS feed receiver') + parser.add_argument('source', help='URL for RSS feed', nargs='?') + parser.add_argument('--version', help='prints version', action='store_true') + parser.add_argument('--json', help='converts news to JSON', action='store_true') + parser.add_argument('--verbose', help='output verbose status messages', action='store_true') + parser.add_argument('--limit', help='determines the number of showed 
def init_news_dict(entry):
    '''
    Initializes single news item from given entry from feedparser

    entry: dict
    Returns a dict with the keys title, published, description, link,
    media, date and img
    Does not init source field

    date is the publication date as yyyymmdd (zero padded), or 'unknown'
    if the entry has no parsed publication date
    img is the base64 string of the first media image, or None if the
    entry has no http(s) media link
    '''
    title = entry.get('title', 'unknown')
    description = extract_text_from_html(entry.get('description', 'unknown'))
    published = entry.get('published', 'unknown')

    published_parsed = entry.get('published_parsed')
    if published_parsed:
        # Zero padding keeps the date unambiguous (2019-01-12 vs 2019-11-02)
        # and comparable with the yyyymmdd value passed to --date
        date_string = '{:04d}{:02d}{:02d}'.format(*published_parsed[:3])
    else:
        date_string = 'unknown'

    link = entry.get('link', 'no link')

    media_link = 'no media content'
    media_list = entry.get('media_content')
    # media_content may be missing or an empty list
    if isinstance(media_list, list) and media_list:
        # A missing or empty url is reported as 'no media'
        media_link = media_list[0].get('url') or 'no media'

    base64img = None
    if media_link.startswith('http'):
        base64img = get_img_base64(media_link)

    return {
        "title": title,
        "published": published,
        "description": description,
        "link": link,
        "media": media_link,
        "date": date_string,
        "img": base64img
    }
def init_feed(url, limit):
    '''
    Downloads and parses the feed and builds an RssFeed from it

    url - link (or path) passed to feedparser
    limit - maximum number of news items, passed on to init_news_list

    Returns RssFeed
    Raises FeedError if feedparser marks the feed as malformed (bozo)
    '''
    feed_dict = feedparser.parse(url)
    if feed_dict.bozo:
        raise FeedError('Invalid feed. Maybe there is / at end of the feed link?')

    LOGGER.debug('GOT FEED FROM SOURCE')
    LOGGER.debug('FEED INIT')
    news_list = init_news_list(feed_dict, limit, url)
    channel = feed_dict.feed
    # Channel elements are optional in real feeds. Attribute access would
    # raise AttributeError for a missing element, which is not a FeedError
    # and would end the program with a traceback, so defaults are used
    # (same defaults as for the news items).
    feed = RssFeed(
        channel.get('title', 'unknown'),
        channel.get('description', 'unknown'),
        channel.get('link', 'no link'),
        news_list
    )
    LOGGER.debug('DONE')

    return feed
logging.basicConfig( + format='[%(asctime)s] {%(filename)s} %(levelname)s - %(message)s' + ) + LOGGER.setLevel('DEBUG') + + try: + if args.version: + print('version 1.5') + # if get news from cache + elif args.date: + get_news_by_date(args) + # if get news from the source + elif args.source: + news_feed = init_feed(args.source, args.limit) + + if args.to_html: + to_html(args.to_html, news_feed.get_news_as_dicts(args.limit)) + elif args.to_pdf: + to_pdf(args.to_pdf, news_feed.get_news_as_dicts(args.limit)) + # print as json + elif args.json: + news_feed.to_json() + news_feed.cache('db.json') + # print as usual + else: + LOGGER.debug('PRINTING FEED') + news_feed.print_feed() + news_feed.cache('db.json') + else: + raise InvalidArgs('Positional argument "source" is required') + except FeedError as exc: + print('Error: ' + str(exc)) + + +if __name__ == "__main__": + main() diff --git a/final_task/setup.py b/final_task/setup.py index e69de29..b5557a1 100644 --- a/final_task/setup.py +++ b/final_task/setup.py @@ -0,0 +1,34 @@ +from setuptools import setup, find_packages + +try: + from pip._internal.req import parse_requirements +except ImportError: + from pip.req import parse_requirements + + +setup( + name='rss_reader', + version='1.5', + description='RSS reader', + author='Roman Shagun', + author_email='rshag17@gmail.com', + packages=find_packages(), + data_files=[('rss_reader', ['rss_reader/Arial-Unicode-Regular.ttf'])], + entry_points={ + 'console_scripts': ['rss-reader=rss_reader.rss_reader:main'], + }, + install_requires=[ + 'feedparser==5.2.1', + 'argparse==1.4.0', + 'jsonpickle==1.2', + 'tinydb==3.15.1', + 'requests==2.22.0', + 'dominate==2.4.0', + 'beautifulsoup4==4.8.1', + 'fpdf==1.7.2', + 'colorama==0.4.1'], + license="none", + platforms="Linux, Windows (not tested)", + long_description="Yet another RSS reader", + include_package_data=True +) \ No newline at end of file diff --git a/final_task/tests/__init__.py b/final_task/tests/__init__.py new file mode 
100644 index 0000000..e69de29 diff --git a/final_task/tests/readme.txt b/final_task/tests/readme.txt new file mode 100644 index 0000000..31a2f3e --- /dev/null +++ b/final_task/tests/readme.txt @@ -0,0 +1,3 @@ +Some shit happend :c with imports and I was had to run tests from /FinalTaskRssParser dir like this + +coverage run -a -m unittest discover \ No newline at end of file diff --git a/final_task/tests/test_converters.py b/final_task/tests/test_converters.py new file mode 100644 index 0000000..9545b39 --- /dev/null +++ b/final_task/tests/test_converters.py @@ -0,0 +1,63 @@ +import sys +import os +import unittest +from io import StringIO +from unittest.mock import patch + +import requests + +sys.path.insert(1, 'final_task/rss_reader') +from converters import get_image_path, get_html_doc, path_validation +from rss_feed import RssFeed +from exceptions_ import ConvertionError +from rss_reader import init_feed + + +class TestConverters(unittest.TestCase): + + def test_get_image_url(self): + url = 'https://www.google.com/images/branding/googlelogo/1x/googlelogo_color_272x92dp.png' + expected_result = 'temp-img' + str(hash(url)) + '.jpg' + result = get_image_path(url) + self.assertEqual(result, expected_result) + os.remove('temp-img' + str(hash(url)) + '.jpg') + + def test_get_html_doc(self): + feed = init_feed('final_task/tests/test_feed.xml', 2) + expected_result = '\n'\ + '\n'\ + ' \n'\ + ' RSS FEED\n'\ + ' \n'\ + ' \n'\ + '

News:

\n'\ + '
\n'\ + '

ITEM1 TITLE

\n'\ + '
IMAGE
\n'\ + ' \n'\ + '
DESCRIPTION:
\n'\ + '

unknown

\n'\ + '

2003-12-31

\n'\ + '

SOURCE: final_task/tests/test_feed.xml

\n'\ + ' LINK\n'\ + '
\n'\ + '
\n'\ + '

ITEM2 TITLE

\n'\ + '
IMAGE
\n'\ + ' \n'\ + '
DESCRIPTION:
\n'\ + '

unknown

\n'\ + '

2003-12-31

\n'\ + '

SOURCE: final_task/tests/test_feed.xml

\n'\ + ' LINK\n'\ + '
\n'\ + ' \n'\ + '' + result = get_html_doc(feed.get_news_as_dicts(2)) + self.assertEqual(result, expected_result) + + def test_path_validation(self): + with self.assertRaises(ConvertionError): + path_validation('asdasdas', True) + with self.assertRaises(ConvertionError): + path_validation('asdasdas', False) diff --git a/final_task/tests/test_feed.xml b/final_task/tests/test_feed.xml new file mode 100644 index 0000000..213ea79 --- /dev/null +++ b/final_task/tests/test_feed.xml @@ -0,0 +1,26 @@ + + + + CHANNEL TITLE + + + CHANNEL DESCRIPTION + + CHANNEL LINK + + + ITEM1 TITLE + ITEM1 LINK + 2003-12-31 + + + + ITEM2 TITLE + ITEM2 LINK + 2003-12-31 + + + + + diff --git a/final_task/tests/test_rss_feed.py b/final_task/tests/test_rss_feed.py new file mode 100644 index 0000000..38fa3c7 --- /dev/null +++ b/final_task/tests/test_rss_feed.py @@ -0,0 +1,77 @@ +import sys +import unittest +from colorama import Back, init +from io import StringIO +from unittest.mock import patch, mock_open + +sys.path.insert(1, 'final_task/rss_reader') +from rss_feed import RssFeed +from rss_reader import init_feed +init() + + +class TestRssFeed(unittest.TestCase): + + def setUp(self): + self.feed = init_feed('final_task/tests/test_feed.xml', 2) + + def test_printfeed(self): + result = '\n' + result += ' '*36 + self.feed.title + '\n' + result += ' '*36 + Back.BLACK + '='*len(self.feed.title) + Back.RESET + '\n' + result += ' '*int(abs((36 + len(self.feed.title)/2 - len(self.feed.description)/2))) +\ + self.feed.description + '\n\n' + result += Back.RED + '='*120 + Back.RESET + '\n' + for _, item in enumerate(self.feed.news_list): + result += str(item) + '\n' + result += Back.RED + '='*120 + Back.RESET + '\n' + result += '\n' + + with patch('sys.stdout', new=StringIO()) as fake_out: + self.feed.print_feed() + self.assertEqual(fake_out.getvalue(), result) + + def test_tojson(self): + feed = RssFeed('title', 'description', 'link', [1, 2, 3]) + result = '{\n '\ + '"description": "description",\n '\ 
+ '"link": "link",\n '\ + '"news_list": [\n 1,\n 2,\n 3\n ],'\ + '\n "title": "title"\n}' + result += '\n' + with patch('sys.stdout', new=StringIO()) as fake_out: + feed.to_json() + self.assertEqual(fake_out.getvalue(), result) + + def test_cache(self): + news_feed = init_feed('final_task/tests/test_feed.xml', 2) + expected_result = '{\n'\ + ' "_default": {\n'\ + ' "1": {\n'\ + ' "date": "20031231",\n'\ + ' "description": "unknown",\n'\ + ' "img": "b\'bm90IGZvdW5k\'",\n'\ + ' "link": "ITEM1 LINK",\n'\ + ' "media": "http://www.foo.com/bar.jpg",\n'\ + ' "published": "2003-12-31",\n'\ + ' "source": "final_task/tests/test_feed.xml",\n'\ + ' "title": "ITEM1 TITLE"\n'\ + ' },\n'\ + ' "2": {\n'\ + ' "date": "20031231",\n'\ + ' "description": "unknown",\n'\ + ' "img": "b\'bm90IGZvdW5k\'",\n'\ + ' "link": "ITEM2 LINK",\n'\ + ' "media": "http://www.foo.com/bar.jpg",\n'\ + ' "published": "2003-12-31",\n'\ + ' "source": "final_task/tests/test_feed.xml",\n'\ + ' "title": "ITEM2 TITLE"\n'\ + ' }\n'\ + ' }\n'\ + '}' + news_feed.cache('test_db.json') + + with open('test_db.json') as fp: + result = fp.read() + self.assertEqual(result, expected_result) + pass diff --git a/final_task/tests/test_rss_item.py b/final_task/tests/test_rss_item.py new file mode 100644 index 0000000..0e1f8c9 --- /dev/null +++ b/final_task/tests/test_rss_item.py @@ -0,0 +1,38 @@ +import sys +import unittest +from colorama import Fore, Back, Style, init +from io import StringIO +from unittest.mock import patch + +sys.path.insert(1, 'final_task/rss_reader') +from rss_item import RssItem +init() + + +class TestRssItem(unittest.TestCase): + + def setUp(self): + self.item = RssItem('title', 'date', 'description', 'link', 'media', 'source', 'date_parsed', 'base64 image') + + def test_string(self): + expected_result = 'TITLE: ' + Back.BLACK + Fore.WHITE + 'title' + Style.RESET_ALL\ + + ' \n\t|| DESCRIPTION: ' + Fore.MAGENTA + 'description' + Fore.RESET\ + + ' \n\t|| PUBLISHED: ' + Fore.GREEN + 'date' + Fore.RESET\ 
+ + ' \n\t|| LINK: ' + Fore.BLUE + 'link' + Fore.RESET\ + + ' \n\t|| MEDIA: ' + Fore.YELLOW + 'media' + Fore.RESET + self.assertEqual(self.item.__str__(), expected_result) + + def test_to_json(self): + expected_result = '{\n'\ + ' "date": "date_parsed",\n'\ + ' "description": "description",\n'\ + ' "img": "base64 image",\n'\ + ' "link": "link",\n'\ + ' "media": "media",\n'\ + ' "published": "date",\n'\ + ' "source": "source",\n'\ + ' "title": "title"\n'\ + '}\n' + with patch('sys.stdout', new=StringIO()) as fake_out: + self.item.to_json() + self.assertEqual(fake_out.getvalue(), expected_result) diff --git a/final_task/tests/test_rss_reader.py b/final_task/tests/test_rss_reader.py new file mode 100644 index 0000000..dfd2317 --- /dev/null +++ b/final_task/tests/test_rss_reader.py @@ -0,0 +1,71 @@ +import sys +import unittest +from io import StringIO +from unittest.mock import patch + +sys.path.insert(1, 'final_task/rss_reader') +from rss_reader import init_feed, init_news_list, extract_text_from_html, get_img_base64 +from rss_item import RssItem + + +class TestRssReader(unittest.TestCase): + + def test_init_feed(self): + news_feed = init_feed('final_task/tests/test_feed.xml', 2) + rss_items = [ + RssItem('ITEM1 TITLE', '2003-12-31', 'unknown', 'ITEM1 LINK', + 'http://www.foo.com/bar.jpg', 'final_task/tests/test_feed.xml', '20031231', 'b\'bm90IGZvdW5k\''), + + RssItem('ITEM2 TITLE', '2003-12-31', 'unknown', 'ITEM2 LINK', + 'http://www.foo.com/bar.jpg', 'final_task/tests/test_feed.xml', '20031231', 'b\'bm90IGZvdW5k\'') + ] + self.assertEqual(news_feed.title, 'CHANNEL TITLE') + self.assertEqual(news_feed.description, 'CHANNEL DESCRIPTION') + self.assertEqual(news_feed.link, 'CHANNEL LINK') + self.assertEqual(news_feed.news_list, rss_items) + + def test_extract_text_from_html(self): + input_html = '

Booker and Harri'\
+            's warn Dems: Electability doesn't just mean appealing to white v'\
+            'otersEl'\ + 'ectability is the biggest buzzword of the 2020 cycle. It’s what Dem'\ + 'ocrats say they prize above all else: a nominee who can defeat Dona'\ + 'ld Trump. But it's also a code word. It tends to mask a raciali'\ + 'zed assumption about which Americans a candidate needs to win over '\ + 'in order to qualify as “electable”: that is, white voters who don’t'\ + ' live in big coastal cities.


' + expected_result = 'Electability is the biggest buzzword of the 2020 cycl'\ + 'e. It’s what Democrats say they prize above all else: a nominee who '\ + 'can defeat Donald Trump. But it\'s also a code word. It tends to mas'\ + 'k a racialized assumption about which Americans a candidate needs to'\ + ' win over in order to qualify as “electable”: that is, white voters '\ + 'who don’t live in big coastal cities.' + result = extract_text_from_html(input_html) + self.assertEqual(result, expected_result) + + def test_get_img_base64(self): + url = 'https://docs.python.org/2/_static/py.png' + expected_result = 'b\'iVBORw0KGgoAAAANSUhEUgAAABAAAAAQCAYAAAAf8/9hAAAAAXNSR'\ + '0IArs4c6QAAAAZiS0dEAP8A/wD/oL2nkwAAAAlwSFlzAAALEwAACxMBAJqcGAAAAAd0'\ + 'SU1FB9gEGxE4IQYzJ14AAAI3SURBVDjLZZNPSFVBFIe/e9+zd3silBCl0SZoU4s2rVq'\ + '0EB5tQip4UNvATVGu3QRBiyAi2iltWkgbF5EgRhFFRpiWtrWIzDIV1Pzz7p15M2fmtv'\ + 'DevOqBw8DM9zvnN8ycgF3R/eDtM2mac96ZdrFNxBikqbRV+vHH/ut9gAZczoe7C3gnF'\ + '0f6au1OLM5avFi8d1Ea+JvAMSAq8nsKOGs5f2cYJ3Y7rc2PO4BqkS8DdD98f9tbe1ys'\ + 'CoxOBo1qlEXHJWcM4b5KPU19zleA0o4Clx99eO3EdqVewHsCoFRugUoVghJO7A6H6Vx'\ + '9wdtYi27cr5x6dy/03nVtWTU7bWeZh6jNUcAiCaFTURl9A+gs56AviHzh3mnqtdPxm6'\ + 'knfQPLU7UaokASQq/agY7yDrG16Mba6Pz48NP56VdrgAApYObGaicPtkovToFLQBKA/'\ + 'WUxTe3FRk4san15aGKgd3Dj560rrdGJS6FT0X9YYvLuiMKL1kAQOpHZ3PqfyZfP41+9'\ + 'PW1VfzX0RXFSECfgNEmSTgImdDruF2O0E8vvqZG1auQubAsKooIYYHpGvwA2g+xndQB'\ + 'HgWa6cG0ih5cW/w6VvEq3nChwCoBvs+bL2Z7VceBHGTDAIrABpMVuhw+4OiLgLIglOL'\ + 'PYBTQAlfErIeCzjRVg1dtEb1kt5Omv+DTV2YssAN+zNdkzC42N9brV8WdvYp07seOdM'\ + '2Of1F3AAknW0AJpwN6IgEPAEaANaMlcbmZdl7KRBuAfAb+v//yMAJoAAAAASUVORK5CYII=\'' + result = get_img_base64(url) + self.assertEqual(result, expected_result) + wrong_url_result = get_img_base64('blablabla') + self.assertEqual(wrong_url_result, 'no image')