From 8d1bcec5f88bc8566521356d2db9923805008490 Mon Sep 17 00:00:00 2001 From: Vitali Kozlou Date: Fri, 8 Nov 2019 16:06:46 +0300 Subject: [PATCH 1/2] Iterations 1 and 2 are ready. --- .gitignore | 2 + .../rss_reader/Tests/rss_reader_tests.py | 23 +++ final_task/rss_reader/__init__.py | 0 final_task/rss_reader/requirements.txt | 7 + final_task/rss_reader/rss_reader.py | 150 ++++++++++++++++++ final_task/setup.py | 12 ++ 6 files changed, 194 insertions(+) create mode 100644 .gitignore create mode 100644 final_task/rss_reader/Tests/rss_reader_tests.py create mode 100644 final_task/rss_reader/__init__.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..226f9a6 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +/venv/ +/.idea/ diff --git a/final_task/rss_reader/Tests/rss_reader_tests.py b/final_task/rss_reader/Tests/rss_reader_tests.py new file mode 100644 index 0000000..de935d8 --- /dev/null +++ b/final_task/rss_reader/Tests/rss_reader_tests.py @@ -0,0 +1,23 @@ +import unittest +import final_task.rss_reader.rss_reader as reader + + +class MyTestCase(unittest.TestCase): + def test_totext(self): + self.assertEqual(reader.to_text('
Test
'), 'Test') + + def test_checkurl(self): + self.assertTrue(reader.check_url(url='https://news.yahoo.com/rss/', verbose=False), True) + self.assertFalse(reader.check_url(url='https://google.com/', verbose=False), False) + + def test_getfeed(self): + self.assertIsNotNone(reader.get_feed(url='https://news.yahoo.com/rss/', verbose=False)) + + def test_formatfeed(self): + self.assertIsInstance(reader.format_feed + (reader.get_feed(url='https://news.yahoo.com/rss/', verbose=False), + verbose=False, limit=1), cls=tuple) + + +if __name__ == '__main__': + unittest.main() diff --git a/final_task/rss_reader/__init__.py b/final_task/rss_reader/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/final_task/rss_reader/requirements.txt b/final_task/rss_reader/requirements.txt index e69de29..e599c58 100644 --- a/final_task/rss_reader/requirements.txt +++ b/final_task/rss_reader/requirements.txt @@ -0,0 +1,7 @@ +certifi==2019.9.11 +chardet==3.0.4 +feedparser==5.2.1 +html2text==2019.9.26 +idna==2.8 +requests==2.22.0 +urllib3==1.25.6 diff --git a/final_task/rss_reader/rss_reader.py b/final_task/rss_reader/rss_reader.py index e69de29..5752b3c 100644 --- a/final_task/rss_reader/rss_reader.py +++ b/final_task/rss_reader/rss_reader.py @@ -0,0 +1,150 @@ +import feedparser +import argparse +import html2text +from requests import get +from datetime import datetime + + +parser = argparse.ArgumentParser() +parser.add_argument('--url', type=str, help='Please enter a valid URL for RSS Feed') +parser.add_argument('--limit', type=int, default=1, help='You can set a limit for news') +parser.add_argument('--verbose', type=bool, default=False, help='If you want to know ' + 'what\'s happening, set this to True') +parser.add_argument('--version', type=str, default=False, help='Print version of program') +args = parser.parse_args() +url = args.url +verbose = args.verbose +version = '[Reader version: 0.01]' + + +def to_text(html, rehtml=False): + """ + This function creates instance of html2text.HTML2text + and configuring it. + We will use this to cleanup all HTML markup in our feed + :param html: Text to format + :param rehtml: True or False + :return: formatted text + """ + formatter = html2text.HTML2Text() + formatter.wrap_links = False + formatter.skip_internal_links = True + formatter.inline_links = True + formatter.ignore_anchors = True + formatter.ignore_images = True + formatter.ignore_emphasis = True + formatter.ignore_links = True + text = formatter.handle(html) + text = text.strip(' \t\n\r') + if rehtml: + text = text.replace('\n', '
') + text = text.replace('\\', '') + return text + + +def check_url(url: str, verbose: bool): + """ + This function checks url provided by user + Function will check if Content-Type of document on url + is xml + :param url: url provided by user + :param verbose: True or False + :return: True in case of xml, + False and closing program in all other cases + """ + if verbose: + print('Wait a second, we will check if there is any RSS in URL you have given') + print('-' * 50) + response = get(url) + r = response.headers['Content-Type'] # We will check only headers + if 'xml' not in r: + print('Something went wrong. \n' + 'Looks like URL you\'ve entered doesn\'t content any RSS feed') + return False + else: + return True + + +def get_feed(url: str, verbose: bool): + """ + Caching feed by creating a feedparser instance. + Returns non-formatted feed + :param url: url provided by user + :param verbose: True of False + :return: non-formatted feed + """ + if verbose: + print('Establishing connection') + print('-' * 50) + feed = feedparser.parse(url) + if feed is None: + print('STOP') + if verbose: + print('Connection established') + print('-' * 50) + return feed + + +def format_feed(feed, verbose: bool, limit: int): + """ + This function recieves feed and format all entries + by using them as to_text() arguments to remove + all HTML markup + :param feed: Cached feed returned by get_feed() + :param verbose: True or False + :param limit: Standart limit is 1 article + :return: title, entry_title(Article title), description(Text of article), date + """ + global title, entry_title, description, date + if verbose: + print('Formatting your feed') + print('-' * 50) + try: + title = to_text(feed.feed.title) + entry_title = to_text(feed.entries[limit].title) + description = to_text(feed.entries[limit].description) + date = to_text(feed.entries[limit].published) + except AttributeError as error: # in case of AttributeError exception + with open('log' + str(datetime.now()) + '.txt', 'w') as log: # we will create a .txt log-file + log.write('[URL]: ' + str(url) + '\n') # providing info about feed that raised an exception + log.write('[ERROR]: ' + str(error) + '\n') # and text of error + print('An error has occured \n' + 'Log file was created in program folder \n' + 'To help us debug a program, please send this file to h4j0rx@gmail.com') + return title, entry_title, description, date + + +def main(title, entry_title, description, date): + """ + This function just prints all parameters given by format_feed() + :param title: Title of whole feed returned by format_feed() + :param entry_title: Title of one article returned by format_feed() + :param description: Description of one article returned by format_feed() + :param date: Publishing date of one article returned by format_feed() + :return: None + """ + print('[URL]: ' + args.url) + print('[Feed]: ' + title) + print('-' * 50) + print('[Title]: ' + entry_title) + print('[Text]: ' + description) + print('[Date]: ' + date) + return None + + +if __name__ == '__main__': + if args.url is None: + print('Please enter a valid URL for a feed ' + 'and run program like "python3 main.py ' + '--url %YOUR FEED URL%') + if args.limit == 0: + print('You have entered zero values to print, please specify another limit value') + else: + feed = get_feed(url, verbose) + limit = [i for i in range(0, args.limit+1)] + del limit[-1] + for i in limit: + format_feed(feed, verbose, i) + main(title, entry_title, description, date) + if args.version: + print(version) \ No newline at end of file diff --git a/final_task/setup.py b/final_task/setup.py index e69de29..406aef5 100644 --- a/final_task/setup.py +++ b/final_task/setup.py @@ -0,0 +1,12 @@ +from setuptools import setup + +setup( + name='rss-reader', + version='0.01', + packages=['final_task', 'final_task.rss_reader'], + url='', + license='', + author='Vitali Kozlou', + author_email='h4j0rx@gmail.com', + description='One shot command-line RSS Parser and reader [Iteration 2]' +) From 1e71d76f32eb3f9495edb2ef8454d994d1e3e5b8 Mon Sep 17 00:00:00 2001 From: Vitali Kozlou Date: Sat, 9 Nov 2019 16:41:28 +0300 Subject: [PATCH 2/2] Fixed all, according to your comments. Please review --- final_task/rss_reader/rss_reader.py | 82 ++++++++++++++++++----------- 1 file changed, 50 insertions(+), 32 deletions(-) diff --git a/final_task/rss_reader/rss_reader.py b/final_task/rss_reader/rss_reader.py index 5752b3c..0cadf56 100644 --- a/final_task/rss_reader/rss_reader.py +++ b/final_task/rss_reader/rss_reader.py @@ -1,20 +1,12 @@ import feedparser import argparse import html2text +import json from requests import get from datetime import datetime -parser = argparse.ArgumentParser() -parser.add_argument('--url', type=str, help='Please enter a valid URL for RSS Feed') -parser.add_argument('--limit', type=int, default=1, help='You can set a limit for news') -parser.add_argument('--verbose', type=bool, default=False, help='If you want to know ' - 'what\'s happening, set this to True') -parser.add_argument('--version', type=str, default=False, help='Print version of program') -args = parser.parse_args() -url = args.url -verbose = args.verbose -version = '[Reader version: 0.01]' +VERSION = '[RSS Reader v0.02]' def to_text(html, rehtml=False): @@ -85,6 +77,10 @@ def get_feed(url: str, verbose: bool): return feed +def to_json(feed): + jsoned = json.dumps(feed, ensure_ascii=False, indent=4) + return jsoned + def format_feed(feed, verbose: bool, limit: int): """ This function recieves feed and format all entries @@ -95,26 +91,34 @@ def format_feed(feed, verbose: bool, limit: int): :param limit: Standart limit is 1 article :return: title, entry_title(Article title), description(Text of article), date """ - global title, entry_title, description, date - if verbose: - print('Formatting your feed') - print('-' * 50) try: - title = to_text(feed.feed.title) entry_title = to_text(feed.entries[limit].title) description = to_text(feed.entries[limit].description) + link = feed.entries[limit].link date = to_text(feed.entries[limit].published) + return entry_title, description, link, date except AttributeError as error: # in case of AttributeError exception with open('log' + str(datetime.now()) + '.txt', 'w') as log: # we will create a .txt log-file - log.write('[URL]: ' + str(url) + '\n') # providing info about feed that raised an exception + log.write('[URL]: ' + str(args.source) + '\n') # providing info about feed that raised an exception log.write('[ERROR]: ' + str(error) + '\n') # and text of error print('An error has occured \n' 'Log file was created in program folder \n' 'To help us debug a program, please send this file to h4j0rx@gmail.com') - return title, entry_title, description, date + except IndexError as indexerror: + print("You've specified too many articles to print\n" + "Feed doesn't have specified number of articles\n") + print('Number of articles in feed: ' + str(len(feed.entries))) + print('Number of articles printed: ' + str(len(feed.entries))) + exit() + + +def feed_info(): + print('[URL]: ' + args.source) + print('[Feed]: ' + feed.feed.title) + return None -def main(title, entry_title, description, date): +def main(entry_title, description, link, date): """ This function just prints all parameters given by format_feed() :param title: Title of whole feed returned by format_feed() @@ -123,28 +127,42 @@ def main(title, entry_title, description, date): :param date: Publishing date of one article returned by format_feed() :return: None """ - print('[URL]: ' + args.url) - print('[Feed]: ' + title) print('-' * 50) print('[Title]: ' + entry_title) print('[Text]: ' + description) + print('[Link]: ' + link) print('[Date]: ' + date) return None -if __name__ == '__main__': - if args.url is None: - print('Please enter a valid URL for a feed ' - 'and run program like "python3 main.py ' - '--url %YOUR FEED URL%') +def arg_parse(): + parser = argparse.ArgumentParser() + parser.add_argument('source', type=str, help='Please enter a valid URL for RSS Feed') + parser.add_argument('--limit', type=int, default=50, help='You can set a limit for news') + parser.add_argument('--verbose', action='store_true', help='If you want to know ' + 'what\'s happening, set this to True') + parser.add_argument('--version', action='store_true', help='Print version of program') + parser.add_argument('--json', action='store_true', help='Prints feed in JSON format') + args = parser.parse_args() if args.limit == 0: print('You have entered zero values to print, please specify another limit value') + if args.version: + print(VERSION) + return args + + +if __name__ == '__main__': + args = arg_parse() + feed = get_feed(args.source, args.verbose) + feed_info() + if args.json: + print(to_json(feed)) else: - feed = get_feed(url, verbose) - limit = [i for i in range(0, args.limit+1)] + if args.verbose: + print('Formatting your feed') + print('-' * 50) + limit = [index for index in range(0, args.limit+1)] del limit[-1] - for i in limit: - format_feed(feed, verbose, i) - main(title, entry_title, description, date) - if args.version: - print(version) \ No newline at end of file + for limit in limit: + entry_title, description, link, date = format_feed(feed, args.verbose, limit) + main(entry_title, description, link, date)