diff --git a/final_task/README.md b/final_task/README.md old mode 100644 new mode 100755 index 7af281f..2995297 --- a/final_task/README.md +++ b/final_task/README.md @@ -1,3 +1,63 @@ -# Your readme here -Some text. -Checkout how to write this file using *markdown*. +# Python RSS reader + +## How to install: +**1st way** +* You need to have git installed. Run: +> $ git clone https://github.com/kirill-stp/FinalTaskRssParser.git +* when you are in your workspace folder. Then run: +> $ python setup.py install +* when you are in **final task** folder +**2nd way:** +* To install this package, you must have Python added to your user environment. +* Download the distribution archive +* run $ pip install ./python_rss_reader-1.0.tar.gz + +## Usage: +This app provides the following interface: +```shell +usage: rss_reader.py [-h] [--version] [--json] [--verbose] [--limit LIMIT] + [--date DATE] + source + +Pure Python command-line RSS reader + +positional arguments: + source RSS URL + +optional arguments: + -h, --help show this help message and exit + --version Print version info + --json Print result as JSON in stdout + --verbose Outputs verbose + --limit LIMIT Limit news topics + --date DATE Read news from given date (YMD) +``` +for example: +> $ python3 rss_reader.py https://news.yahoo.com/rss --date 20191120 --limit 2 --verbose --json + +## JSON structure: +```shell +{'Article 1': {'date':time.struct_time, + 'images': {'image description': 'url'}, + 'link': '', + 'summary': '', + 'title': ''}, + 'Article 2': {'date': time.struct_time, + 'images': {'image description': 'url'}, + 'link': '', + 'summary': '', + 'title': ''}, + 'Feed': 'Feeds from 'url'', + 'Link': 'rss link'} + +``` +## Local news storage: +When the **--date** argument is not provided, the news that you received will be saved to the database, if it wasn’t there yet. +Cached data is stored in the rss_reader/cashed_feeds.db file using **shelve**. 
The database stores a dictionary-like object, where the key is the publication date and the value is an instance of the **Article** class. + +## HTML and PDF converting: +You can use **--to-html** and **--to-pdf** to save the feed in the given format. If there is no internet connection, it will paste image links (clickable in pdf). If there is an internet connection, the program will download the images and paste them into the file. Titles in the PDF are also clickable. + +## Colorizing +This program can colorize normal and json output, using **termcolor**. To add some color to your life, use the **--colorize** argument + \ No newline at end of file diff --git a/final_task/rss_reader/DejaVuSansCondensed-Bold.ttf b/final_task/rss_reader/DejaVuSansCondensed-Bold.ttf new file mode 100644 index 0000000..de020ab Binary files /dev/null and b/final_task/rss_reader/DejaVuSansCondensed-Bold.ttf differ diff --git a/final_task/rss_reader/DejaVuSansCondensed-Oblique.ttf b/final_task/rss_reader/DejaVuSansCondensed-Oblique.ttf new file mode 100644 index 0000000..7ea654a Binary files /dev/null and b/final_task/rss_reader/DejaVuSansCondensed-Oblique.ttf differ diff --git a/final_task/rss_reader/DejaVuSansCondensed.ttf b/final_task/rss_reader/DejaVuSansCondensed.ttf new file mode 100644 index 0000000..5bd8b3e Binary files /dev/null and b/final_task/rss_reader/DejaVuSansCondensed.ttf differ diff --git a/final_task/rss_reader/__init__.py b/final_task/rss_reader/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/final_task/rss_reader/args_creater.py b/final_task/rss_reader/args_creater.py new file mode 100644 index 0000000..2ee455a --- /dev/null +++ b/final_task/rss_reader/args_creater.py @@ -0,0 +1,17 @@ +import argparse + + +def arguments(): + """create command line arguments""" + + parser = argparse.ArgumentParser(description='Pure Python command-line RSS reader') + parser.add_argument('source', type=str, help='RSS URL') + parser.add_argument('--version', action='version', version='%(prog)s 1.0', 
help='Print version info') + parser.add_argument('--json', action='store_true', help='Print result as JSON in stdout') + parser.add_argument('--verbose', action='store_true', help='Outputs verbose') + parser.add_argument('--limit', type=int, help='Limit news topics') + parser.add_argument('--date', type=str, help='Read news from given date') + parser.add_argument('--to-html', action='store_true', help='Save feed in html format') + parser.add_argument('--to-pdf', action='store_true', help='Save feed in pdf format') + parser.add_argument('--colorize', action='store_true', help='Print result in colorize mode') + return parser.parse_args() diff --git a/final_task/rss_reader/article.py b/final_task/rss_reader/article.py new file mode 100644 index 0000000..f20efe2 --- /dev/null +++ b/final_task/rss_reader/article.py @@ -0,0 +1,87 @@ +from string_operations import * +import logging +import datetime +from termcolor import colored + + + +class Article: + """single article class""" + + def __init__(self, parsed, source): + """receive parsed article and extracts data from it""" + self.title = make_string_readable(parsed.title) + self.link = parsed.link + self.feed_link = source + self.published = extract_date(parsed) + summary_ = extract_topic_info_from_summary(parsed.summary) + self.summary = make_string_readable(summary_) + self.media = parsed.media_content + + description_ = extract_image_info_from_summary(parsed.summary) + self.media_description = make_string_readable(description_) + + def print_readable_article(self, is_colored): + """print article to stdout in human-readable format""" + print( "_" * 79) + date = self.published + date_for_print = f'{date.tm_year}/{date.tm_mon}/{date.tm_mday}, {date.tm_hour}:{date.tm_min}:{date.tm_sec}\n' + if is_colored: + print(colored(date_for_print, 'red')) + else: + print(date_for_print) + + cutted_title = cut_string_to_length_with_space(self.title, 77) + for str_number, string in enumerate(cutted_title): + if str_number + 1 == 
len(cutted_title): + if is_colored: + print(colored(string + '[1]', 'red')) + else: + print(string + '[1]') + else: + if is_colored: + print(colored(string, 'red')) + else: + print(string) + + # images description and their links numbers (like [2] - [5]) + str_number_of_img = ' ' + if len(self.media) > 1: + str_number_of_img = f' - [{len(self.media) + 1}]' + images_and_link_numbers = f'\n\nImages:\n{self.media_description} [2] - {str_number_of_img}\n' + if is_colored: + print(colored(images_and_link_numbers, 'blue')) + else: + print(images_and_link_numbers) + + + cutted_summary = cut_string_to_length_with_space(self.summary, 79) + for string in cutted_summary: + if is_colored: + print(colored(string, 'cyan')) + else: + print(string) + + # Links of article and images + if is_colored: + print(colored('\n\nLinks:\n[1]' + self.link, 'green')) + else: + print('\n\nLinks:\n[1]', self.link) + for number, img in enumerate(self.media): + if is_colored: + print(colored(f'[{number+2}] ' + img['url'], 'green')) + else: + print(f'[{number+2}]', img['url']) + + print("_" * 79) + + def make_article_json(self): + """convert article data in json format""" + json = { + 'images': {self.media_description: img['url'] for img in self.media}, + 'link': self.link, + 'summary': self.summary, + 'date': self.published, + 'title': self.title, + } + return json diff --git a/final_task/rss_reader/cashed_feeds.db b/final_task/rss_reader/cashed_feeds.db new file mode 100644 index 0000000..0219254 Binary files /dev/null and b/final_task/rss_reader/cashed_feeds.db differ diff --git a/final_task/rss_reader/check_func.py b/final_task/rss_reader/check_func.py new file mode 100644 index 0000000..ba3be37 --- /dev/null +++ b/final_task/rss_reader/check_func.py @@ -0,0 +1,14 @@ +import requests +import exceptions as ex + + +def internet_connection_check(): + url = 'http://www.google.com/' + timeout = 5 + is_internet = True + try: + requests.get(url, timeout=timeout) + except requests.ConnectionError: + 
is_internet = False + finally: + return is_internet diff --git a/final_task/rss_reader/exceptions.py b/final_task/rss_reader/exceptions.py new file mode 100644 index 0000000..c2e37cb --- /dev/null +++ b/final_task/rss_reader/exceptions.py @@ -0,0 +1,17 @@ +class InvalidURLAddress(Exception): + pass + + +class NoInternetConnection(Exception): + pass + + +class EmptyDataBase(Exception): + pass + + +class DateNotInDatabase(Exception): + pass + +class PathError(Exception): + pass diff --git a/final_task/rss_reader/feed.py b/final_task/rss_reader/feed.py new file mode 100644 index 0000000..07025b0 --- /dev/null +++ b/final_task/rss_reader/feed.py @@ -0,0 +1,203 @@ +import logging +import requests as r +import os +from string_operations import make_string_readable +import exceptions as ex +from pprint import pprint +import article +import shelve +import check_func as check +import datetime +import dominate +import dominate.tags as dtags +from fpdf import FPDF +from termcolor import colored + + + +class Feed: + """Feed class, contain feed info and list of articles """ + def __init__(self, parsed, args): + """create feed with fixed number of articles """ + logging.info('Started creting feed') + + articles_list = [] + cashed_news_number = 0 + if args.date: + logging.info('Started extracting data from cache') + self.link = args.source + self.feed_name = f'Feeds from {args.source}' + with shelve.open('cashed_feeds') as database: + if not database: + raise ex.EmptyDataBase('Local feed storage is empty') + for date in database: + if args.date in date and database[date].feed_link == args.source: + articles_list.append(database[date]) + cashed_news_number += 1 + if cashed_news_number == 0: + raise ex.DateNotInDatabase('There is no feeds with this date and source in local storage') + logging.info('Finished extracting data from cache') + + if args.limit: + if args.limit > cashed_news_number and args.date: + print(f'Only {cashed_news_number} feeds cashed') + number_of_articles = 
cashed_news_number + elif not args.date and args.limit > len(parsed.entries): + print(f'Only {len(parsed.entries)} feeds cashed') + number_of_articles = len(parsed.entries) + else: + number_of_articles = args.limit + articles_list = articles_list[:number_of_articles] + else: + if args.limit: + if args.limit > len(parsed.entries): + print(f'Only {len(parsed.entries)} feeds avaliable') + number_of_articles = len(parsed.entries) + else: + number_of_articles = args.limit + else: + number_of_articles = len(parsed.entries) + for i in range(number_of_articles): + articles_list.append(article.Article(parsed.entries[i], args.source)) + + self.feed_name = make_string_readable(parsed.feed.title) + self.link = parsed.feed.link + self.articles = articles_list + + def print_readable_feed(self, is_colored): + """print feed to stdout in readable format""" + logging.info('Started printing feed') + print('.' * 79) + print(f'\n\n{self.feed_name}\n\n') + print(self.link) + for article_ in self.articles: + article_.print_readable_article(is_colored) + logging.info('Finished printing feed') + + def print_json_feed(self, is_colored): + """print feed to stdout in json""" + json = {} + for i, article_ in enumerate(self.articles): + name = f"Article {i + 1}" + json[name] = article_.make_article_json() + json['Feed'] = self.feed_name + json['Link'] = self.link + if is_colored: print('\033[96m') + pprint(json) + if is_colored: print('\033[0m') + + def save_feed_to_database(self): + """ + saving to shelve database instances of article class, + that curent feed contains + using article published date as a key + """ + logging.info('Saving feed to database') + with shelve.open('cashed_feeds') as database: + for article in self.articles: + year = article.published.tm_year + mon = article.published.tm_mon + day = article.published.tm_mday + hour = article.published.tm_hour + minute = article.published.tm_min + sec = article.published.tm_sec + date = f'{year}{mon}{day} {hour}:{minute}:{sec}' + if 
date not in database: + database[date] = article + logging.info('feed saved') + + def save_feed_to_html(self): + """Creating an html file, using curent datetime as a filename""" + logging.info('Started saving feed to html file') + time_now = str(datetime.datetime.now()) + time_for_path = time_now[:-16] + '_' + time_now[-15:-13] + '-' + time_now[-12:-10] + '-' + time_now[-9:-7] + html = dominate.document(title="HTML RSS feed") + with html.head: + dtags.meta(charset='utf-8') + html += dtags.h1(self.feed_name) + for article_number, article in enumerate(self.articles): + html += dtags.br() + + date = article.published + str_date = f'{date.tm_year}/{date.tm_mon}/{date.tm_mday} {date.tm_hour}:{date.tm_min}' + + html += dtags.h2(f'{article_number + 1}. {article.title}') + html += dtags.h3(f' {str_date}') + + html += dtags.a(f'Link: {article.link}') + html += dtags.br() + + with html: + if check.internet_connection_check(): + # if have internet access, downloading images and pasting in a html file + for link in article.media: + dtags.img(src=link['url']) + else: + # if no, paste links of these images + dtags.a('Image links:') + for link_number, link in enumerate(article.media): + img_url = link['url'] + dtags.a(f'{link_number + 1}. 
{img_url}', href=link) + html += dtags.p(article.summary) + html += dtags.br() + with open('html_feeds/' + time_for_path + ' RSS_feeds.html', 'w') as html_file: + html_file.write(str(html)) + logging.info('Finished saving feed to html file') + + def save_feed_to_pdf(self): + """Creating an pdf file, using curent datetime as a filename""" + logging.info('Started saving feed to pdf file') + time_now = str(datetime.datetime.now()) + time_for_path = time_now[:-16] + '_' + time_now[-15:-13] + '-' + time_now[-12:-10] + '-' + time_now[-9:-7] + pdf_path = 'pdf_feeds/' + time_for_path + ' RSS_feeds.pdf' + pdf = FPDF() + # fonts + pdf.add_font('DejaVu', '', 'DejaVuSansCondensed.ttf', uni=True) + pdf.add_font('DejaVu', 'B', 'DejaVuSansCondensed-Bold.ttf', uni=True) + pdf.add_font('DejaVu', 'I', 'DejaVuSansCondensed-Oblique.ttf', uni=True) + pdf.set_font('DejaVu', 'B', 25) + + pdf.add_page() + pdf.set_text_color(0,0,228) + # feed source + pdf.cell(0, 0, f'Feeds from {self.link}', 0, 1,align='C', link = self.link) + pdf.set_text_color(0,0,0) + pdf.set_font('DejaVu', '', 14) + # list of articles that current feed contains (clickable) + for article_number, article in enumerate(self.articles): + pdf.write(5,'\n\n' + str(article_number + 1) + '. ' + article.title, link=article.link) + # articles + for article_number, article in enumerate(self.articles): + pdf.add_page() + pdf.set_font('DejaVu', 'B', 18) + pdf.write(5,'\n' + str(article_number + 1) + '. 
' + article.title + '\n', link=article.link) + pdf.set_font('DejaVu', 'I', 18) + date = article.published + str_date = f'{date.tm_year}/{date.tm_mon}/{date.tm_mday} {date.tm_hour}:{date.tm_min}' + pdf.write(5,'\n' + str_date + '\n\n\n') + pdf.set_font('DejaVu','',12) + if check.internet_connection_check(): + # if have internet access, downloading images and pasting in a pdf file + for img_number, img in enumerate(article.media): + img_url = img['url'] + if img_url != '': + image = r.get(img_url) + img_path = str(article_number + 1) + '_' + str(img_number + 1) + '_img.jpg' + with open(img_path, 'wb') as file: + file.write(image.content) + try: + pdf.image(img_path,w = 50, h = 50) + except (SyntaxError, RuntimeError): + pdf.write(15, 'Media link(clickable)' + '\n\n', link=img['url']) + os.remove(img_path) + else: + # if no, paste links of these images (clickable) + pdf.write(15, "Image links (clickable):\n") + for img_number, img in enumerate(article.media): + pdf.write(15,str(img_number + 1) + '. 
' + article.media_description, link=img['url']) + pdf.write(15,str(img_number + 1) + '.\n', link=img['url']) + pdf.write(10, article.summary) + pdf.output(pdf_path) + logging.info('Finished saving feed to pdf file') + + diff --git a/final_task/rss_reader/html_feeds/.keep b/final_task/rss_reader/html_feeds/.keep new file mode 100644 index 0000000..e69de29 diff --git a/final_task/rss_reader/pdf_feeds/.keep b/final_task/rss_reader/pdf_feeds/.keep new file mode 100644 index 0000000..e69de29 diff --git a/final_task/rss_reader/requirements.txt b/final_task/rss_reader/requirements.txt index e69de29..fa223bf 100644 --- a/final_task/rss_reader/requirements.txt +++ b/final_task/rss_reader/requirements.txt @@ -0,0 +1,5 @@ +feedparser==5.2.1 +requests==2.22.0 +dominate==2.4.0 +fpdf==1.7.2 +termcolor==1.1.0 diff --git a/final_task/rss_reader/rss_reader.py b/final_task/rss_reader/rss_reader.py index e69de29..11932c6 100644 --- a/final_task/rss_reader/rss_reader.py +++ b/final_task/rss_reader/rss_reader.py @@ -0,0 +1,65 @@ +import feed +import feedparser +import ssl +import logging +import exceptions as ex +import check_func as check +from args_creater import arguments + +if hasattr(ssl, '_create_unverified_context'): + ssl._create_default_https_context = ssl._create_unverified_context + + +def main(): + """main function""" + try: + args = arguments() + rss = args.source + + logging_level = 'WARNING' + if args.verbose: + logging_level = 'INFO' + logging.basicConfig(level=logging_level,) + logging.info('Started') + + if not args.date: + if not check.internet_connection_check(): + raise ex.NoInternetConnection("No internet connection") + parsed = feedparser.parse(rss) + if parsed.bozo > 0: + raise ex.InvalidURLAddress("Invalid RSS URL address") + logging.info('parsed url: %s', rss) + else: + parsed = {} + feed_obj = feed.Feed(parsed, args) + + if not args.date: + feed_obj.save_feed_to_database() + + to_print = True + + if args.to_html: + feed_obj.save_feed_to_html() + to_print = 
False + if args.to_pdf: + feed_obj.save_feed_to_pdf() + to_print = False + + if to_print: + if args.json: + feed_obj.print_json_feed(args.colorize) + else: + feed_obj.print_readable_feed(args.colorize) + + except ( + ex.InvalidURLAddress, + ex.NoInternetConnection, + ex.EmptyDataBase, + ex.DateNotInDatabase + ) as E: + print(E) + + +if __name__ == "__main__": + main() + logging.info('Finished') \ No newline at end of file diff --git a/final_task/rss_reader/string_operations.py b/final_task/rss_reader/string_operations.py new file mode 100644 index 0000000..9fd94cc --- /dev/null +++ b/final_task/rss_reader/string_operations.py @@ -0,0 +1,63 @@ +from html.parser import HTMLParser + + +def cut_string_to_length_with_space(basic_str, length): + """function that cut given string to the list of strings with a given max length""" + strings_list = [] + string = '' + for word_number, word in enumerate(basic_str.split()): + string += word + ' ' + if word_number + 1 < len(basic_str.split()): + next_word = basic_str.split()[word_number + 1] + else: + next_word = '' + strings_list.append(string) + if len(string + next_word) >= length: + strings_list.append(string) + string = '' + return strings_list + + +def make_string_readable(basic_str): + html_parser = HTMLParser() + return html_parser.unescape(basic_str) + + +def extract_topic_info_from_summary(basic_str): + start = basic_str.find('>') + len_ = 5 + if start == -1: + start = basic_str.find('/>') + len_ = 2 + if start == -1: + start = 0 + len_ = 0 + end = basic_str.find('
>>>>>> ee7ddc5f3897764b3da12847a678a2487a76a762
+)