diff --git a/final_task/README.md b/final_task/README.md index 7af281f..ce111e7 100644 --- a/final_task/README.md +++ b/final_task/README.md @@ -1,3 +1,68 @@ -# Your readme here -Some text. -Checkout how to write this file using *markdown*. +RSS_READER +RSS reader is a command-line utility. + +Usage +usage: rss_reader.py [-h] [--source SOURCE] [--version] [--json] [--verbose] +[--limit LIMIT] [--date DATE] + +Pure Python command-line RSS reader. + +optional arguments: + +-h, --help show this help message and exit +--source SOURCE RSS URL +--version Print version info +--json Print result as JSON in stdout +--verbose Outputs verbose status messages +--limit LIMIT Limit news topics if this parameter provided +--date DATE News from the specified day will be printed out. Format: YYYYMMDD +It is mandatory to specify a date and/or a source. +If both are specified, then news will be searched by date and by source. + +JSON structure +[ + { + "feed": [feed], + "title": [title], + "date": [date], + "link": [link], + "text": [text], + "image links": [ + [link1] + [link2] + ... + ] + }, + ... +] + +Local storage +All news that has been read is saved in a MySQL database. You need a database named final_task_database with a table named news_cache. +news_cache structure: + feed:longtext + title:longtext + date:date + link:longtext + image_description:longtext + new_description:longtext + image_links:longtext +When using the --date argument, news is searched by date in the database. + +Saving in format feature +You can save the retrieved news in two formats: HTML and FB2. +If the news is fetched from the Internet and the Internet connection is on, news images are downloaded from the website +and converted to base64 strings. The saved HTML or FB2 files can then show them without connecting +to the Internet. If the Internet connection is off, images aren't downloaded; in HTML the utility writes the image links instead of the images. +When using the --date argument, news is read from the database. 
Images are downloaded the same way, depending on whether +the Internet is on. + +Colorize mode +When using the --colorize argument, the news printed to the console will be colorized. If --json is used at the same time, +the news will be printed in colorized JSON format. + +How to install application +To install the application you need setuptools. Open cmd and enter 'pip install -U setuptools'. +Run 'python setup.py install' in cmd to install the application. +Install the requirements with 'pip install -r requirements.txt'. +You are now ready to run the application. Use 'rss-reader [arguments]' to run it. +Warning: If the path to rss-reader is not in the Path variable, use the full path to the file when running it. \ No newline at end of file diff --git a/final_task/rss_reader/arguments_functions.py b/final_task/rss_reader/arguments_functions.py new file mode 100644 index 0000000..86639b1 --- /dev/null +++ b/final_task/rss_reader/arguments_functions.py @@ -0,0 +1,63 @@ +import argparse +import os +import re +import logging +from custom_exceptions import IncorrectFilePath + + +def get_arguments(): + """ + :return: Arguments of application + Read and returns arguments of application + """ + argument_parser = argparse.ArgumentParser() + argument_parser.add_argument('-v', '--verbose', action='store_true', help='increase output verbosity') + argument_parser.add_argument('--json', action='store_true', help='print result as JSON in stdout') + argument_parser.add_argument('--version', action='store_true', help='print version info') + argument_parser.add_argument('--limit', help='limit news topics if this parameter provided') + argument_parser.add_argument('--date', help='represent news from local storage by date') + argument_parser.add_argument('--to-html', help='save news in html format') + argument_parser.add_argument('--to-fb2', help='save news in fb2 format') + argument_parser.add_argument('--colorize', action='store_true', help='print news in colorized mode') + argument_parser.add_argument('source', 
nargs='?') + return argument_parser.parse_args() + + +def check_html_argument(html_argument): + """ + :param html_argument: html directory path + If argument wrong raises exception + """ + if not os.path.exists(html_argument): + logging.error('Inrorrect html filepath') + raise IncorrectFilePath('Inrorrect html filepath') + + +def check_fb2_argument(fb2_argument): + """ + :param fb2_argument: fb2 directory path + If argument wrong raises exception + """ + if not os.path.exists(fb2_argument): + logging.error('Inrorrect fb2 filepath') + raise IncorrectFilePath('Inrorrect fb2 filepath') + + +def check_limit_argument(limit_argument): + """ + :param limit_argument: limit of news + If argument wrong raises exception + """ + if not re.match('\\d+', limit_argument): + logging.error('Input value of --limit is incorrect') + raise ValueError('Input value of --limit is incorrect') + + +def check_date_argument(date_argument): + """ + :param date_argument: Date of news in database + If argument wrong raises exception + """ + if not re.match('\\d+', date_argument) or len(date_argument) != 8: + logging.error('Input value of --date is incorrect') + raise ValueError('Input value of --date is incorrect') diff --git a/final_task/rss_reader/custom_exceptions.py b/final_task/rss_reader/custom_exceptions.py new file mode 100644 index 0000000..50c1cf9 --- /dev/null +++ b/final_task/rss_reader/custom_exceptions.py @@ -0,0 +1,14 @@ +class IncorrectURL(Exception): + pass + + +class NoInternet(Exception): + pass + + +class IncorrectFilePath(Exception): + pass + + +class DatabaseConnectionError(Exception): + pass diff --git a/final_task/rss_reader/database_functions.py b/final_task/rss_reader/database_functions.py new file mode 100644 index 0000000..c06416a --- /dev/null +++ b/final_task/rss_reader/database_functions.py @@ -0,0 +1,74 @@ +from contextlib import closing +import logging +import pymysql +from custom_exceptions import DatabaseConnectionError + + +def get_news_list_by_date(date, 
limit): + """ + :param date: Date of publication of news + :return: : List with news publicated by date + Returns list of news by date from database + """ + logging.info('Connecting to database') + try: + with closing(pymysql.connect(host='localhost', user='root', password='Password12345', + database='final_task_database')) as connection: + with closing(connection.cursor()) as cursor: + logging.info('Connected to database') + logging.info('Giving request') + try: + cursor.execute(f'select * from news_cache where date="{date}"') + except pymysql.err.InternalError: + logging.error('Input value of --date is incorrect') + raise ValueError('Input value of --date is incorrect') + logging.info('Getting response') + database_response = cursor.fetchall() + if limit: + limit = min(len(database_response), limit) + else: + limit = len(database_response) + logging.info('Response was got') + news_list = [] + for index in range(limit): + news_list.append({'Feed': database_response[index][0], + 'Title': database_response[index][1], + 'Date': str(database_response[index][2]), + 'Link': database_response[index][3], + 'Image description': database_response[index][4], + 'New description': database_response[index][5], + 'Image links': database_response[index][6].split('|||')}) + return news_list + except pymysql.err.OperationalError: + logging.error('Not connected to database') + raise DatabaseConnectionError("Can't connect to database, check if you have installed Mysql, and necessary" + "database with table described in README") + + +def write_news_to_database(news_list): + """ + :param news_list: List of news + Writes news to database + """ + logging.info('Connecting to database') + try: + with closing(pymysql.connect(host='localhost', user='root', password='Password12345', + database='final_task_database')) as connection: + with closing(connection.cursor()) as cursor: + logging.info('Connected to database') + for new in news_list: + # Try to find new in database by link, if exists + 
cursor.execute(f'select * from news_cache where link = "{new["Link"]}"') + if cursor.fetchall(): + continue + insert_values = [value for value in new.values()] + #converting list of image links into string to store in database + insert_values[6] = '|||'.join(insert_values[6]) + insert_values = [tuple(insert_values), ] + cursor.executemany('Insert into news_cache values(%s,%s,%s,%s,%s,%s,%s)', insert_values) + connection.commit() + logging.info('Data write successful') + except pymysql.err.OperationalError: + logging.error('Not connected to database') + raise DatabaseConnectionError("Can't connect to database, check if you have installed Mysql, and necessary" + "database with table described in README") diff --git a/final_task/rss_reader/parse_rss_functions.py b/final_task/rss_reader/parse_rss_functions.py new file mode 100644 index 0000000..72a793c --- /dev/null +++ b/final_task/rss_reader/parse_rss_functions.py @@ -0,0 +1,81 @@ +import re +import feedparser +import socket +import logging +from dateutil import parser as date_parser +import html +from custom_exceptions import IncorrectURL, NoInternet + + +def ckeck_internet(): + """ + Checks Internet connetction + """ + try: + logging.info("checking Internet connection") + socket.setdefaulttimeout(5) + host = socket.gethostbyname("www.google.com") + sock = socket.create_connection((host, 80), 2) + sock.close() + logging.info('Internet on.') + return True + except Exception as e: + logging.error("Internet off.") + return False + + +def get_new_description(summary_str): + """ + :param summary_str: Summary string from parsing RSS + :return: New description + Extract new description from summary string + """ + pattern = re.compile(r'<.*?>') + return pattern.sub('', summary_str) + + +def get_image_description(summary_str): + """ + :param summary_str: Summary string from parsing RSS + :return: Image description + Extract image description from summary string + """ + return summary_str[summary_str.find('alt') + 
5::].split('"')[0] + + +def get_news_list(source, limit): + """ + :param source - RSS URL: + :param limit - Limit of viewing news: + :return - RSS display list: + Function parsing the rss received from source + into a list of news which will then be used for printing or parsing into JSON + """ + logging.info('Creating news list') + if not ckeck_internet(): + raise NoInternet("Internet off, please check your connection") + logging.info('Getting and parsing RSS') + parsed_rss = feedparser.parse(source) + if parsed_rss['bozo']: + raise IncorrectURL('The entered URL is incorrect') + if limit: + limit = min(limit, len(parsed_rss['entries'])) + else: + limit = len(parsed_rss['entries']) + news_list = [] + for index in range(limit): + news_list.append({'Feed': + html.unescape(parsed_rss['feed']['title']), + 'Title': + html.unescape(parsed_rss['entries'][index]['title']), + 'Date': + str(date_parser.parse(parsed_rss['entries'][index]['published'])).split(" ")[0], + 'Link': + parsed_rss['entries'][index]['link'], + 'Image description': + html.unescape(get_image_description(parsed_rss['entries'][index]['summary'])), + 'New description': + html.unescape(get_new_description(parsed_rss['entries'][index]['summary'])), + 'Image links': + [content['url'] for content in parsed_rss['entries'][index]['media_content']]}) + return news_list diff --git a/final_task/rss_reader/print_functions.py b/final_task/rss_reader/print_functions.py new file mode 100644 index 0000000..7fdd059 --- /dev/null +++ b/final_task/rss_reader/print_functions.py @@ -0,0 +1,84 @@ +import logging +import json +from colorama import Fore, Back, Style + + +def print_news(news_list): + """ + :param news_list: The list of news + Prints news in readable format + """ + logging.info('Printing news') + for index, new in enumerate(news_list): + print(f'New {index + 1}\n') + print(f'Feed:\n\t{new["Feed"]}') + print('Title:') + print(f'\t{new["Title"]}') + print(f'Date:\n\t{new["Date"]}') + print(f'Link:\n\t{new["Link"]}') 
+ print(f'Image description:\n\t{new["Image description"]}') + print(f'New description:\n\t{new["New description"]}') + print('Image links:') + for image_link in new['Image links']: + print(f'\t{image_link}') + print('\n') + + +def print_news_colorize(news_list): + """ + :param news_list: The list of news + Prints news in readable colorize format + """ + logging.info('Printing news colorize') + for index, new in enumerate(news_list): + print(Style.RESET_ALL + Fore.WHITE + Back.MAGENTA + f'New {index + 1}\n') + print(Style.RESET_ALL + Fore.WHITE + Back.BLUE + 'Feed:\n' + Style.RESET_ALL + Fore.BLUE + f'\t{new["Feed"]}') + print( + Style.RESET_ALL + Fore.WHITE + Back.GREEN + 'Title:\n' + Style.RESET_ALL + Fore.GREEN + f'\t{new["Title"]}') + print(Style.RESET_ALL + Fore.WHITE + Back.CYAN + 'Date:\n' + Style.RESET_ALL + Fore.CYAN + f'\t{new["Date"]}') + print(Style.RESET_ALL + Fore.WHITE + Back.RED + 'Link:\n' + Style.RESET_ALL + Fore.RED + f'\t{new["Link"]}') + print( + Style.RESET_ALL + Fore.WHITE + Back.YELLOW + 'Image description:\n' + Style.RESET_ALL + \ + Fore.YELLOW + f'\t{new["Image description"]}') + print( + Style.RESET_ALL + Fore.WHITE + Back.LIGHTBLUE_EX + 'New description:\n' + Style.RESET_ALL + \ + Fore.LIGHTBLUE_EX + f'\t{new["New description"]}') + print(Style.RESET_ALL + Fore.BLACK + Back.LIGHTGREEN_EX + 'Image links:') + for image_link in new['Image links']: + print(Style.RESET_ALL + Fore.LIGHTGREEN_EX + f'\t{image_link}') + print('\n') + + +def print_news_JSON(news_list): + """ + :param news_list: The list of news + Prints news in readable JSON format + """ + logging.info('Printing news as JSON') + print(json.dumps(news_list, ensure_ascii=False, indent=4)) + + +def print_news_JSON_colorize(news_list): + """ + :param news_list: The list of news + Prints news in readable colorize JSON format + """ + logging.info('Printing news as JSON') + result_str = "[" + for new_index, new in enumerate(news_list): + result_str += f"\n\t\n\t\t\"\033[41mFeed\033[0m\": 
\"\033[31m{new['Feed']}\033[0m\"," \ + f"\n\t\t\"\033[42mTitle\033[0m\": \"\033[32m{new['Title']}\033[0m\"," \ + f"\n\t\t\"\033[43mDate\033[0m\": \"\033[33m{new['Date']}\033[0m\"," \ + f"\n\t\t\"\033[44mLink\033[0m\": \"\033[34m{new['Link']}\033[0m\"," \ + f"\n\t\t\"\033[45mImage description\033[0m\": \"\033[35m{new['Image description']}\033[0m\"," \ + f"\n\t\t\"\033[46mNew description\033[0m\": \"\033[36m{new['New description']}\033[0m\"," \ + f"\n\t\t\"\033[44mImage links\033[0m\": [" + for link_index, link in enumerate(new['Image links']): + result_str += f"\n\t\t\t\"\033[34m{link}\033[0m\"" + if link_index + 1 != len(new['Image links']): + result_str += ',' + result_str += "\n\t\t]\n\t}" + if new_index + 1 != len(news_list): + result_str += ',' + result_str += '\n]' + print(result_str) diff --git a/final_task/rss_reader/requirements.txt b/final_task/rss_reader/requirements.txt index e69de29..5bab75d 100644 --- a/final_task/rss_reader/requirements.txt +++ b/final_task/rss_reader/requirements.txt @@ -0,0 +1,8 @@ +feedparser +termcolor +pymysql +colorama +mysql-connector-python +py-dateutil +requests +Pillow \ No newline at end of file diff --git a/final_task/rss_reader/rss_reader.py b/final_task/rss_reader/rss_reader.py index e69de29..cca9100 100644 --- a/final_task/rss_reader/rss_reader.py +++ b/final_task/rss_reader/rss_reader.py @@ -0,0 +1,101 @@ +import argparse +import re +from termcolor import colored +import os +import colorama +import logging +from database_functions import get_news_list_by_date, write_news_to_database +from parse_rss_functions import get_news_list +from custom_exceptions import NoInternet, IncorrectURL, IncorrectFilePath, DatabaseConnectionError +from print_functions import print_news_colorize, print_news_JSON_colorize, print_news, print_news_JSON +from save_in_format_functions import save_in_fb2, save_in_html +from arguments_functions import check_date_argument, check_fb2_argument, check_html_argument, \ + check_limit_argument, 
get_arguments + +VERSION = 5 + + +def main(): + """ + The main entry point of the application + """ + colorama.init() + arguments = get_arguments() + if arguments.version: + print(f'Program version - {VERSION}') + return + if arguments.verbose: + logging.basicConfig(level=logging.INFO) + else: + logging.basicConfig(filename='sample.log', filemode='w', level=logging.INFO) + logging.info('Program started') + if arguments.to_html: + check_html_argument(arguments.to_html) + if arguments.to_fb2: + check_fb2_argument(arguments.to_fb2) + if arguments.limit: + check_limit_argument(arguments.limit) + arguments.limit = int(arguments.limit) + if arguments.date: + check_date_argument(arguments.date) + news_list = get_news_list_by_date(arguments.date, arguments.limit) + if arguments.to_html or arguments.to_fb2: + if arguments.to_html: + save_in_html(arguments.to_html, news_list, f"news_by_date-{arguments.date}.html") + if arguments.to_fb2: + save_in_fb2(arguments.to_fb2, news_list, f"news_by_date-{arguments.date}.fb2") + else: + if news_list: + if arguments.colorize: + if arguments.json: + print_news_JSON_colorize(news_list) + else: + print_news_colorize(news_list) + else: + if arguments.json: + print_news_JSON(news_list) + else: + print_news(news_list) + else: + print('No news by this date') + return + news_list = get_news_list(arguments.source, arguments.limit) + if arguments.to_html or arguments.to_fb2: + if arguments.to_html: + save_in_html(arguments.to_html, news_list, f"news_from-{arguments.source[8:-4]}.html") + if arguments.to_fb2: + save_in_fb2(arguments.to_fb2, news_list, f"news_from-{arguments.source[8:-4]}.fb2") + else: + if arguments.colorize: + if arguments.json: + print_news_JSON_colorize(news_list) + else: + print_news_colorize(news_list) + else: + if arguments.json: + print_news_JSON(news_list) + else: + print_news(news_list) + write_news_to_database(news_list) + + +if __name__ == '__main__': + try: + main() + except IncorrectURL as e: + print(colored(e, 
'red')) + logging.error(e) + except NoInternet as e: + print(colored(e, 'red')) + logging.error(e) + except ValueError as e: + print(colored(e, 'red')) + logging.error(e) + except IncorrectFilePath as e: + print(colored(e, 'red')) + logging.error(e) + except DatabaseConnectionError as e: + print(colored(e, 'red')) + logging.error(e) + finally: + logging.info('Program ended') diff --git a/final_task/rss_reader/save_in_format_functions.py b/final_task/rss_reader/save_in_format_functions.py new file mode 100644 index 0000000..7d8a62b --- /dev/null +++ b/final_task/rss_reader/save_in_format_functions.py @@ -0,0 +1,117 @@ +from PIL import Image +import requests +import base64 +from io import BytesIO +import os +import logging +from parse_rss_functions import ckeck_internet + + +def get_new_content_html(new): + """ + :param new: The new + :return: string representation of new in html + Converts new into string which will be used in html format + """ + images_content = "" + if not ckeck_internet(): + for image_link in new['Image links']: + images_content += f"{image_link}" + else: + for image_link in new['Image links']: + if image_link == "": + continue + response = requests.get(image_link) + encoded_string = str(base64.b64encode(response.content)) + images_content += "\n" + return f""" +

{new['Feed']}

+

{new['Title']}

+

{new['Date']}

+

{new['Link']}

+

{images_content}

+

{new['New description']}

+

+ """ + + +def save_in_html(path, news_list, filename): + """ + :param path: The path of html format file + :param news_list: The list of news + Saves news in html format by path + """ + logging.info('Creating html format file') + html_content = "\n\n" + for new in news_list: + html_content += get_new_content_html(new) + html_content += "\n" + with open(os.path.join(path, filename), 'w', encoding="utf-8") as html_file: + html_file.write(html_content) + logging.info('Html format file created') + + +def get_new_content_fb2(new): + """ + :param new: The new + :return: string representation of new in fb2 + Converts new into string which will be used in fb2 format + """ + images_content = "" + for image_link in new['Image links']: + images_content += f"" + return f""" +
+

{new['Feed'].replace('&', 'and')}

+

{new['Title'].replace('&', 'and')}

+

{new['Date']}

+

{images_content}

+

{new['New description'].replace('&', 'and')}

+
+ """ + + +def get_images_content(news_list): + """ + :param news_list: The list of news + :return: string representation of images + Transforms images into string by using base64 + """ + if not ckeck_internet(): + return "" + images_content = "" + for new in news_list: + for image_link in new['Image links']: + if image_link == "": + continue + response = requests.get(image_link) + img = Image.open(BytesIO(response.content)) + img = img.resize((100, 100)) + img = img.convert('RGB') + img.save('tmp.jpg', 'JPEG') + with open('tmp.jpg', 'rb') as f: + encoded_string = str(base64.b64encode(f.read())) + images_content += f"\n" + encoded_string[2:len( + encoded_string) - 1] + "\n\n" + os.remove('tmp.jpg') + return images_content + + +def save_in_fb2(path, news_list, filename): + """ + :param path: The path of fb2 format file + :param news_list: The list of news + Saves news in fb2 format by path + """ + logging.info('Creating fb2 format file') + fb2_content = """ + + """ + for new in news_list: + fb2_content += get_new_content_fb2(new) + fb2_content += "\n\n" + fb2_content += get_images_content(news_list) + "" + with open(os.path.join(path, filename), 'w', encoding="utf-8") as fb2_file: + fb2_file.write(fb2_content) + logging.info('Fb2 format file created') diff --git a/final_task/setup.py b/final_task/setup.py index e69de29..2f9e7f6 100644 --- a/final_task/setup.py +++ b/final_task/setup.py @@ -0,0 +1,37 @@ +from setuptools import setup +from os import path + +here = path.abspath(path.dirname(__file__)) + +with open(path.join(here, 'README.md'), encoding='utf-8') as f: + long_description = f.read() + +setup( + name='rss-reader', + version='5.0', + description='Pure Python command-line RSS reader', + long_description=long_description, + long_description_content_type='text/markdown', + url='https://github.com/paxalos/FinalTaskRssParser/tree/master/final_task', + author='Pavel Los', + author_email='Lospawel@yandex.ru', + license='MIT', + zip_safe=False, + 
scripts=['rss_reader/custom_exceptions.py', + 'rss_reader/database_functions.py', + 'rss_reader/parse_rss_functions.py', + 'rss_reader/print_functions.py', + 'rss_reader/rss_reader.py', + 'rss_reader/save_in_format_functions.py', + 'rss_reader/arguments_functions.py'], + install_requires=['feedparser', + 'termcolor', + 'pymysql', + 'colorama', + 'mysql-connector-python', + 'py-dateutil', + 'requests'], + entry_points={ + 'console_scripts': ['rss-reader=rss_reader:main'], + } +) \ No newline at end of file diff --git a/final_task/tests/test.py b/final_task/tests/test.py new file mode 100644 index 0000000..6b841c7 --- /dev/null +++ b/final_task/tests/test.py @@ -0,0 +1,34 @@ +import unittest +from final_task.rss_reader.parse_rss_functions import get_new_description, get_image_description +class TestMethods(unittest.TestCase): + def test_get_new_description(self): + summary_str='

Ocasio-Cortez: Trump was 'clearly engaged in extortion and bribery\';Ocasio-Cortez discussed the issue with Yahoo News on ' \ + 'Capitol Hill on Tuesday as the third day of public hearings was being conducted in ' \ + 'the Democrats’ ongoing impeachment inquiry.


' + correct_result='Ocasio-Cortez discussed the issue with Yahoo News on Capitol Hill on Tuesday as ' \ + 'the third day of public hearings was being conducted in the Democrats’ ' \ + 'ongoing impeachment inquiry.' + self.assertEqual(get_new_description(summary_str),correct_result) + + def test_get_image_description(self): + summary_str = '

Ocasio-Cortez: Trump was 'clearly engaged in extortion and bribery\';Ocasio-Cortez discussed the issue with Yahoo News on ' \ + 'Capitol Hill on Tuesday as the third day of public hearings was being conducted in ' \ + 'the Democrats’ ongoing impeachment inquiry.


' + correct_result="Ocasio-Cortez: Trump was 'clearly engaged in extortion and bribery';" + self.assertEqual(get_image_description(summary_str), correct_result) + + +if __name__ == '__main__': + unittest.main() \ No newline at end of file