diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..b46c6e4 --- /dev/null +++ b/.gitignore @@ -0,0 +1,10 @@ +*.pyc +*.idea'.idea' +*.db +*.pkl +*.cw127.pkl +*.egg-info +final_task/FinalTaskRssParser.egg-info +final_task/dist +*.log + diff --git a/Final_Task/FinalTask.md b/Final_Task/FinalTask.md deleted file mode 100644 index 1515169..0000000 --- a/Final_Task/FinalTask.md +++ /dev/null @@ -1,136 +0,0 @@ -# Introduction to Python. Hometask -You are proposed to implement Python RSS-reader using **python 3.8**. - -The task consists of few iterations. Do not start new iteration if the previous one is not implemented yet. - -## Common requirements -* It is mandatory to use `argparse` module. -* Codebase must be covered with unit tests with at least 50% coverage. -* In case of any mistakes utility should print human-readable -error explanation. Exception tracebacks in stdout are prohibited in final version of application. -* Docstrings are mandatory for all methods, classes, functions and modules. -* Code must correspond to `pep8` (use `pycodestyle` utility for self-check). - * You can set line length up to 120 symbols. -* Commit messages should provide correct and helpful information about changes in commit. Messages like `Fix bug`, -`Tried to make workable`, `Temp commit` and `Finally works` are prohibited. -* Usage of external APIs is prohibited (except of APIs for receiving RSS) - -## [Iteration 1] One-shot command-line RSS reader. -RSS reader should be a command-line utility which receives [RSS](wikipedia.org/wiki/RSS) URL and prints results in human-readable format. - -You are free to choose format of the news console output. 
The textbox below provides an example of how it can be implemented: - -```shell -$ rss_reader.py "https://news.yahoo.com/rss/" --limit 1 - -Feed: Yahoo News - Latest News & Headlines - -Title: Nestor heads into Georgia after tornados damage Florida -Date: Sun, 20 Oct 2019 04:21:44 +0300 -Link: https://news.yahoo.com/wet-weekend-tropical-storm-warnings-131131925.html - -[image 2: Nestor heads into Georgia after tornados damage Florida][2]Nestor raced across Georgia as a post-tropical cyclone late Saturday, hours after the former tropical storm spawned a tornado that damaged -homes and a school in central Florida while sparing areas of the Florida Panhandle devastated one year earlier by Hurricane Michael. The storm made landfall Saturday on St. Vincent Island, a nature preserve -off Florida's northern Gulf Coast in a lightly populated area of the state, the National Hurricane Center said. Nestor was expected to bring 1 to 3 inches of rain to drought-stricken inland areas on its -march across a swath of the U.S. Southeast. - - -Links: -[1]: https://news.yahoo.com/wet-weekend-tropical-storm-warnings-131131925.html (link) -[2]: http://l2.yimg.com/uu/api/res/1.2/Liyq2kH4HqlYHaS5BmZWpw--/YXBwaWQ9eXRhY2h5b247aD04Njt3PTEzMDs-/https://media.zenfs.com/en/ap.org/5ecc06358726cabef94585f99050f4f0 (image) - -``` - -Utility should provide the following interface: -```shell -usage: rss_reader.py [-h] [--version] [--json] [--verbose] [--limit LIMIT] - source - -Pure Python command-line RSS reader. - -positional arguments: - source RSS URL - -optional arguments: - -h, --help show this help message and exit - --version Print version info - --json Print result as JSON in stdout - --verbose Outputs verbose status messages - --limit LIMIT Limit news topics if this parameter provided - -``` - -In case of using `--json` argument your utility should convert the news into [JSON](https://en.wikipedia.org/wiki/JSON) format. 
-You should come up with the JSON structure on you own and describe it in the README.md file for your repository or in a separate documentation file. - -The `--limit` argument should also affect JSON generation. - -With the argument `--verbose` your program should print all logs in stdout. - -Withe the argument `--version` your program should print in stdout it's current version and complete it's work. The version supposed to change with every iteration. - - -## [Iteration 2] Distribution - -* Utility should be wrapped into distribution package with `setuptools`. -* This package should export CLI utility named `rss-reader`. - -> Note: Double-check, that your utility works correctly after its new package was installed on a clean machine. - -## [Iteration 3] News caching -The RSS news should be stored in a local storage while reading. The way and format of this storage you can choose yourself. -Please describe it in a separate section of README.md or in the documentation. - -New optional argument `--date` must be added to your utility. It should take a date in `%Y%m%d` format. -For example: `--date 20191020` - -The cashed news can be read with it. The new from the specified day will be printed out. -If the news are not found return an error. - -If the `--date` argument is not provided, the utility should work like in the previous iterations. - -## [Iteration 4] Format converter - -You should implement the conversion of news in at least two of the suggested format: `.mobi`, `.epub`, `.fb2`, `.html`, `.pdf` - -New optional argument must be added to your utility. This argument receives the path where new file will be saved. The arguments should represents which format will be generated. - -For example: `--to-mobi` or `--to-fb2` or `--to-epub` - - -You can choose yourself the way in which the news will be displayed, but the final text result should contain pictures and links, if they exist in the original article and if the format permits to store this type of data. 
- -## * [Iteration 5] Output colorization -> Note: An optional iteration, it is not necessary to implement it. You can move on with it only if all the previous iterations (from 1 to 4) are completely implemented. - -You should add new optional argument `--colorize`, that will print the result of the utility in colorized mode. - -If the argument is not provided, the utility should work like in the previous iterations. - -> Note: Take a look at the [colorize](https://pypi.org/project/colorize/) library - -## * [Iteration 6] Web-server -> Note: An optional iteration, it is not necessary to implement it. You can move on with it only if all the previous iterations (from 1 to 4) are completely implemented. Introduction to Python course does not cover the topics that are needed for the implementation of this part. - -There are several mandatory requirements in this iteration: -* `Docker` + `docker-compose` usage (at least 2 containers: one for web-application, one for DB) -* Web application should provide all the implemented in the previous parts of the task functionality, using the REST API: - - One-shot conversion from RSS to Human readable format - - Server-side news caching - - Conversion in epub, mobi, fb2 or other formats - -Feel free to choose the way of implementation, libraries and frameworks. (We suggest you `Django Rest Framework` + `PostgreSQL` combination) - -You can implement any functionality that you want. The only requirement is to add the description into README file or update project documentation, for example: -* authorization/authentication -* automatic scheduled news update -* adding new RSS sources using API - - - ---- -Implementations will be checked with the latest cPython interpreter of 3.8 branch. ---- - - -> Always code as if the guy who ends up maintaining your code will be a violent psychopath who knows where you live. Code for readability. **John F. 
Woods** diff --git a/Final_Task/README.md b/Final_Task/README.md deleted file mode 100644 index 7af281f..0000000 --- a/Final_Task/README.md +++ /dev/null @@ -1,3 +0,0 @@ -# Your readme here -Some text. -Checkout how to write this file using *markdown*. diff --git a/README.md b/README.md index 228b9ad..8fb4fc4 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,32 @@ -# FinalTaskRssParser -For final task pull requests. +# How it works + +* Create an RSSParser object (defined in rss_read.py) +* Use feedparser's parse function to fetch the page +* Then use the output functions to extract info from the page +* Info (source link, image link, etc.) for every novelty is packed into a Novelty object +* Create a pack of news filled with Novelty objects +* When the pack of news is ready, return to rss_reader.py +* There we prepare the output according to the console arguments and write the information into the DB +* If '--to-pdf' or '--to-html' (or both) is given in the console, we use functions +from pdf_and_html_converting to: + 1. Get the images (to avoid piling up copies of pictures we first delete the images + folder if it exists) + 2. Add them to the PDF and/or HTML file + 3. Add all other information + +* If '--date %Y%m%d' is also given together with '--to-pdf' and/or '--to-html', we write the news for +that date into the PDF and/or HTML file(s). +* If '--date %Y%m%d' is given, we take the news with that date from our DB. If '--limit N' is also +given, we take N news items from our DB. +* If, in addition to '--to-pdf' and/or '--to-html' and '--date %Y%m%d', '--limit N' is given, we write +N news items with that date to the PDF and/or HTML file(s) +* If '--colorize' is among the console args, we colorize our news in random colors. Without '--colorize' +we use the usual color (grey-white) +## Important! 
+When converting to pdf or html, enter your path in one of these forms: "C:\\Test\\" or "C:\\Test" + +When passing arguments to parse a page, put the link first, EXAMPLE: +python rss_reader.py https://bla-bla-bla.by --limit 1 + +If you don't want to pass a link and only want the news stored in local storage, run for EXAMPLE + like this: python rss_reader.py --colorize --limit 15 diff --git a/rss_task/__init__.py b/rss_task/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/rss_task/rss_reader/Classes/novelty.py b/rss_task/rss_reader/Classes/novelty.py new file mode 100644 index 0000000..9171e67 --- /dev/null +++ b/rss_task/rss_reader/Classes/novelty.py @@ -0,0 +1,14 @@ +from dataclasses import dataclass + + +@dataclass +class Novelty: + number_of_novelty: int + title_of_novelty: str + time_of_novelty: str + source_link: str + description: str + images_links: str + alt_text: str + date_corrected: str + main_source: str diff --git a/rss_task/rss_reader/Classes/rss_read.py b/rss_task/rss_reader/Classes/rss_read.py new file mode 100644 index 0000000..01cb1ad --- /dev/null +++ b/rss_task/rss_reader/Classes/rss_read.py @@ -0,0 +1,163 @@ +import feedparser +from output_functions import getting_full_info, getting_pack_of_news, converting_to_json, \ + writing_to_cache, getting_from_database_to_pack +from pdf_and_html_converting import converting_to_pdf, converting_to_html, pdf_path, html_path +import logging +import re + + +class RSSParser: + """ + class RSSParser has 3 parameters and it calls function parse when created + """ + + def __init__(self, param_url, num_of_news=None, list_of_args=None): + self.feed_url = param_url + self.number = num_of_news + self.list_of_args = list_of_args + + def parse(self): + """ + 1. Use feedparser to get the page + 2. If we have some problems with connection - raise ConnectionError + 3. Handle Exception without showing a traceback + 4. Do parse method + 5. 
If there are some arguments from console - work with them + """ + try: + logging.info("Trying to get page from feedparser!") + the_feed = feedparser.parse(self.feed_url) + logging.info("Got it (the page)!") + if the_feed.get('bozo'): + if '--date' in self.list_of_args: + if '--to-pdf' in self.list_of_args: + path_pdf = pdf_path(self.list_of_args) + pack_news = self.news_for_date() + converting_to_pdf(path_pdf, pack_news) + elif '--to-html' in self.list_of_args: + path_html = html_path(self.list_of_args) + pack_news = self.news_for_date() + converting_to_pdf(path_html, pack_news) + else: + logging.info("Getting news for date!") + news = self.news_for_date() + if '--json' not in self.list_of_args: + getting_full_info(the_feed, news, self.list_of_args) + print("\nJSON VIEW OF NEWS:", converting_to_json(news, the_feed)) + logging.info("Got news for date!") + else: + logging.info("Got some problems due to connection!") + except ConnectionError: + logging.critical("CONNECTION ERROR, HELP!") + print("You have some connection problems!") + if '--date' in self.list_of_args: + if '--to-pdf' in self.list_of_args: + path_pdf = pdf_path(self.list_of_args) + pack_news = self.news_for_date() + converting_to_pdf(path_pdf, pack_news) + elif '--to-html' in self.list_of_args: + path_html = html_path(self.list_of_args) + pack_news = self.news_for_date() + converting_to_html(path_html, pack_news) + else: + logging.info("Getting news for date!") + news = self.news_for_date() + if '--json' not in self.list_of_args: + getting_full_info(the_feed, news, self.list_of_args) + print("\nJSON VIEW OF NEWS:", converting_to_json(news, the_feed)) + logging.info("Got news for date!") + + logging.info("Getting pack of news!") + pack_of_news, pack_of_news_for_db = getting_pack_of_news(the_feed, self.feed_url, + self.list_of_args, self.number) + logging.info("Got pack of news!") + logging.info("Writing news from source and DB to file!") + writing_to_cache(pack_of_news, pack_of_news_for_db, 
'news_cache.txt') + logging.info("News are in the file!") + if '--to-html' in self.list_of_args: + path_html = html_path(self.list_of_args) + if '--date' in self.list_of_args: + pack = self.news_for_date() + converting_to_html(path_html, pack) + else: + converting_to_html(path_html, pack_of_news) + if '--to-pdf' in self.list_of_args: + path_pdf = pdf_path(self.list_of_args) + if '--date' in self.list_of_args: + pack = self.news_for_date() + converting_to_pdf(path_pdf, pack) + else: + converting_to_pdf(path_pdf, pack_of_news) + if '--to-pdf' not in self.list_of_args and '--to-html' not in self.list_of_args: + if '--date' in self.list_of_args and '--json' not in self.list_of_args: + logging.info("Getting full info!") + getting_full_info(the_feed, self.news_for_date(), self.list_of_args) + logging.info("Got full info!") + else: + logging.info("Getting full info!") + if not the_feed.get('bozo') and '--json' not in self.list_of_args: + getting_full_info(the_feed, pack_of_news, self.list_of_args) + logging.info("Got full info!") + + if '--json' in self.list_of_args and '--date' not in self.list_of_args: + print("\nJSON VIEW OF NEWS:", converting_to_json(pack_of_news, the_feed)) + elif '--json' in self.list_of_args and '--date' in self.list_of_args: + print("\nJSON VIEW OF NEWS:", converting_to_json(self.news_for_date(), the_feed)) + + def news_if_not_source(self, the_feed): + # Looking for url address: if it is => doing all the thing; if it is not => printing all the news + chk_pat = '(?:{})'.format('|'.join(self.list_of_args)) + s = 'http' + if not bool(re.search(s, chk_pat, flags=re.I)): + pack_of, pack_db = getting_pack_of_news(the_feed, self.feed_url, self.list_of_args, self.number) + getting_full_info(the_feed, pack_db, self.list_of_args) + + def news_for_date(self): + """ + Finding news by date and rss + If your rss and date are correct we append the novelty to the pack_of_news_needed + If not we continue our searching + """ + try: + news_for_date_needed = [] + 
date_needed = self.list_of_args[self.list_of_args.index('--date') + 1] + pack_of_db_news = getting_from_database_to_pack() + if '--limit' in self.list_of_args: + cycle_counter = 0 + number_of_news_found = 0 + while cycle_counter != len(pack_of_db_news): + if str(pack_of_db_news[cycle_counter].date_corrected) == date_needed and \ + self.feed_url == pack_of_db_news[cycle_counter].main_source: + news_for_date_needed.append(pack_of_db_news[cycle_counter]) + number_of_news_found += 1 + if number_of_news_found == self.number: + break + cycle_counter += 1 + else: + for item in pack_of_db_news: + if str(item.date_corrected) == date_needed and \ + self.feed_url == item.main_source: + news_for_date_needed.append(item) + if self.feed_url is None: + counter = 0 + number_of_news_f = 0 + while counter != len(pack_of_db_news): + if str(pack_of_db_news[counter].date_corrected) == date_needed: + pack_of_db_news[counter].number_of_novelty = number_of_news_f + 1 + news_for_date_needed.append(pack_of_db_news[counter]) + number_of_news_f += 1 + counter += 1 + if '--limit' in self.list_of_args: + if number_of_news_f == self.number: + break + if not news_for_date_needed: + if '--limit' in self.list_of_args: + print("No news have been found for this date with your limits!") + elif 'source' in self.list_of_args: + print("No news have been found for your source") + else: + print("No news have been found for this date!") + return news_for_date_needed + except IndexError: + print("You forgot to enter date in format %Y%m%d") + diff --git a/rss_task/rss_reader/DejaVuSans.ttf b/rss_task/rss_reader/DejaVuSans.ttf new file mode 100644 index 0000000..e5f7eec Binary files /dev/null and b/rss_task/rss_reader/DejaVuSans.ttf differ diff --git a/rss_task/rss_reader/output_functions.py b/rss_task/rss_reader/output_functions.py new file mode 100644 index 0000000..9431b4d --- /dev/null +++ b/rss_task/rss_reader/output_functions.py @@ -0,0 +1,354 @@ +import dataclasses +import json +import logging +import 
re +import pprint +import sqlite3 +from datetime import datetime +from itertools import groupby +from html import unescape +import colorama + +from bs4 import BeautifulSoup + +from Classes.novelty import Novelty + + +def clean_html(raw_html): + """ + Formatting our String excluding all html-tags making it easier to read our description + """ + logging.info("Cleaning something from htlm-tags.") + cleaner = re.compile('<.*?>') + clean_text = re.sub(cleaner, '', raw_html) + logging.info("Cleaned from html-tags!") + return clean_text + + +def printing_title(the_feed) -> str: + logging.info("Getting title.") + source_title = the_feed.get('feed', '').get('title') + logging.info("Got title.") + return source_title + + +def getting_num_of_news(the_feed, num_of_news): + if num_of_news is None: + return len(the_feed.entries) + else: + return num_of_news + + +def getting_images_links(the_feed, num_of_news) -> list: + """ + due to variety of web sites templates how to carry link for an image I needed variety of solutions + to get links + 1. Step into entries of feed + 2. Use soup for each item of entries + 3. 
Compare item with different forms of where links can be + """ + logging.info("Now getting images from the page.") + pack_of_img = [] + pack_of_images_links = [] + for num, item in enumerate(the_feed.entries[:num_of_news]): + soup = BeautifulSoup(str(item), "lxml") + if item.get('media_content', '') != '' and not soup.find_all('img') != []: # if soup.find_all == []: + for image in range(getting_num_of_news(the_feed, num_of_news)): + pack_of_images_links.append(item.get('media_content', '')[0].get('url', '')) + else: + try: + pack_of_img.append(soup.find('img')) + if soup.find('img') is not None: + pack_of_images_links.append(pack_of_img[num]['src']) + else: + pack_of_images_links.append('No image') + except IndexError: + logging.warning("Some problems appeared, solving them!") + for img in soup.find_all('img'): + if img.get('src') != '': + pack_of_images_links.append(img.get('src')) + else: + pack_of_images_links.append(str(num)) + logging.info("Got some images!") + return pack_of_images_links + + +def getting_alt_text(the_feed, num_of_news): + """ + getting alternative text for images: + """ + pack_of_alts = [] + logging.info("Getting alternative text for images!") + try: + for num, item in enumerate(the_feed.entries[:num_of_news]): + soup = BeautifulSoup(str(item), "lxml") + for img in soup.find_all('img'): + if img is None: + pack_of_alts.append(str(num)) + else: + try: + pack_of_alts.append(img['alt']) + except KeyError: + logging.warning("Solving problems with alternative text!") + pack_of_alts.append(str(num)) + except IndexError: + pass + logging.info("Got some alternative text.") + return pack_of_alts + + +def getting_pack_of_news(the_feed, main_source, list_of_args, num_of_news=None): + """ + Creating full novelty + 1. Adding list of images, correct it if some duplicates are there + 2. Adding list of alternative texts, correct it if some duplicates are there + 3. 
Some problems can occur if there are no alternative text, solving them with changing list of alts + """ + if getting_num_of_news(the_feed, num_of_news) > len(the_feed.entries): + print("You want to get more news than we can get!") + print("Printing all news from the storage!") + pack = getting_from_database_to_pack() + getting_full_info(the_feed, pack[:int(list_of_args[list_of_args.index('--limit') + 1])], + list_of_args) + + pack_of_news = [] + pack_of_news_for_db = [] + pack_of_images_links = getting_images_links(the_feed, num_of_news) + pack_of_alts = getting_alt_text(the_feed, num_of_news) + corrected_pack_of_alts = [el for el, _ in groupby(pack_of_alts)] + + for num, item in enumerate(the_feed.entries[:num_of_news]): + novelty, novelty_for_database = getting_novelty(item, num, pack_of_images_links, corrected_pack_of_alts, + main_source) + pack_of_news.append(novelty) + pack_of_news_for_db.append(novelty_for_database) + return pack_of_news, pack_of_news_for_db + + +def getting_novelty(item, number, corrected_pack_of_images_links, corrected_pack_of_alts, main_source): + """ + It was really difficult to get images because of variety of templates how sites leave link for image + Here we create object of Novelty class and fill it with our title, description and etc. + Then if some problems with images or alt.text occur we use + """ + + try: + alt_text = corrected_pack_of_alts[number] + except IndexError: + alt_text = 'No alternative text.' 
+ # Making a readable 'table' of class Novelty parameters + number_of_novelty = number + 1 + title_of_novelty = unescape(item.get('title', '')) + time_of_publishing = item.get('published', '') + source_link = item.get('link', '') + description = unescape(clean_html(item.get('description', ''))) + image_link = corrected_pack_of_images_links[number] + alternative_text = unescape(alt_text) + corrected_time = getting_corrected_time(item) + main_link = main_source + # creating Novelty class object + novelty = Novelty(number_of_novelty, + title_of_novelty, + time_of_publishing, + source_link, + description, + image_link, + alternative_text, + corrected_time, + main_link) + # creating a tuple from Novelty class object to put it in the database + novelty_for_database = (novelty.number_of_novelty, novelty.title_of_novelty, novelty.time_of_novelty, + novelty.source_link, novelty.description, novelty.images_links, + novelty.alt_text, novelty.date_corrected, novelty.main_source) + return novelty, novelty_for_database + + +def getting_full_info(the_feed, pack_of_news, list_of_args): + """ + Getting full info from news + try-except for printing links and alternative text + """ + logging.info("Getting news to output!") + print("------------------------") + if the_feed.get('feed', '').get('title') is not None: + source_title = the_feed.get('feed', '').get('title') + else: + source_title = "No title." + print(f"Source: {source_title}") + for novelty in pack_of_news: + logging.info("Getting novelty to output!") + if '--colorize' in list_of_args: + colorama.init() + print(colorama.Fore.BLUE + f"\n{novelty.number_of_novelty}. Title: {novelty.title_of_novelty}") + else: + print(f"\n{novelty.number_of_novelty}. 
Title: {novelty.title_of_novelty}") + print(f"Published: {novelty.time_of_novelty}") + print(f"Link: {novelty.source_link}") + print("Description: ") + if '--colorize' in list_of_args: + colorama.init() + print(colorama.Fore.YELLOW + pprint.pformat(novelty.description, width=115)) + else: + print(pprint.pformat(novelty.description, width=115)) + try: + print(f"\n[{1}] {novelty.source_link}") + if novelty.images_links != novelty.number_of_novelty - 1: + print(f"[{2}] {novelty.images_links}") + else: + print(f"[{2}] no image") + if novelty.alt_text != str(novelty.number_of_novelty - 1): + print(f"Alternative text: {novelty.alt_text}") + else: + print("Alternative text: no alternative text") + except AttributeError: + print(f"\n[{1}] {novelty.source_link}") + print(f"[{2}] no image") + print("Alternative text: no alt") + logging.info("Got novelty!") + + +def converting_to_json(pack_of_news, the_feed=''): + logging.info("Converting to json view!") + try: + source = the_feed.get('feed', '').get('title') + except AttributeError: + source = '' + news_dict = { + "Source": source, + "News": [dataclasses.asdict(item) for item in pack_of_news] + } + logging.info("Converted to json view!") + return json.dumps(news_dict) + + +def getting_info_into_file(item): + """ + Preparing information to be written into the file in more readable way + """ + logging.info("Getting novelty into file!") + number = item.number_of_novelty + title = pprint.pformat(item.title_of_novelty, width=115) + time = item.time_of_novelty + source_link = pprint.pformat(item.source_link, width=115) + description = pprint.pformat(item.description.replace("\xa0", " "), width=115) + images_links = pprint.pformat(item.images_links) + alt_text = pprint.pformat(item.alt_text, width=115) + main_source = item.main_source + novelty = f"\n{number}.\nTitle: {title}\nDate: {time}\nLink: {source_link}\nDescription:\n {description}" \ + f"\nImages links:{images_links}\nAlternative text:{alt_text}\nMain source: 
{main_source}" + logging.info("Got novelty into file!") + return novelty + + +def getting_corrected_time(item): + """ + Getting time in view %Y%m%d + """ + logging.info("Getting corrected date!") + corrected_date = datetime.strptime(item.get('published', ''), '%a, %d %b %Y %X %z') + logging.info("Got corrected date!") + return corrected_date.strftime('%Y%m%d') + + +def reading_file(name_of_file): + with open(name_of_file, 'r', encoding='utf-8') as news_cache: + return news_cache.read() + + +def writing_to_cache(pack_of_news, pack_of_news_for_db, filename): + """ + Writing information into 2 files: news_cache.txt and News_cache_json.json + Creating 2 files because it's easier to read information into computer from JSON file than another file + 1. Open file + 2. Check if file is empty or not + 3. If empty - append whole information + If Not empty: + 1. Check if the novelty in file + 2. If in file: continue + If Not in the file: Find out length of file (amount of news), put that number to incoming novelty + If that novelty exists it will go to another novelty in limit + It means that if you enter --limit 15 and 10 news are already in list it will add only 5 news to list + """ + conn = sqlite3.connect("database.db") + cursor = conn.cursor() + cursor.execute('create table if not exists projects(num integer, title text, time text, source_link text, ' + 'description text, images_links text, alt_tx text, date_corrected integer, main_source text)') + conn.commit() + logging.info("Opening file News_cache.") + with open(filename, 'a', encoding='utf-8') as news_cache: + logging.info("Reading from News_cache.") + content = reading_file('news_cache.txt') + if not content: + logging.info("Writing news to an empty file!") + writing_if_file_empty(news_cache, pack_of_news, pack_of_news_for_db) + logging.info("Wrote news to an empty file!") + else: + logging.info("Writing news to NOT empty file!") + writing_if_something_in_file(news_cache, pack_of_news, pack_of_news_for_db, content) 
+ logging.info("Wrote news to NOT empty file!") + conn.close() + + +def writing_if_something_in_file(news_cache, pack_of_news, pack_of_news_for_db, content): + """ + checking if something is in file. If it is we continue to write into it and database. If it is Not + we clear database to synchronise txt file and DB + """ + conn = sqlite3.connect("database.db") + cursor = conn.cursor() + for number, item in enumerate(pack_of_news): + if item.source_link in content: + continue + else: + length = sum(1 for line in cursor.execute("SELECT * FROM projects")) + 1 + logging.info("Counting lines.") + logging.info("Counted lines.") + item.number_of_novelty = length + logging.info("Writing into file if it was not empty.") + news_cache.write(getting_info_into_file(item)) + news_cache.write("\n_ _ _ _ _ _ _") + cursor.execute('insert into projects values (?,?,?,?,?,?,?,?,?)', pack_of_news_for_db[number]) + conn.commit() + conn.close() + + +def writing_if_file_empty(news_cache, pack_of_news, pack_of_news_for_db): + """ + Writing news when file is empty to DB and txt file + """ + conn = sqlite3.connect("database.db") + cursor = conn.cursor() + if cursor.execute("SELECT * FROM projects"): + cursor.execute("DELETE FROM projects") + conn.commit() + for num, item in enumerate(pack_of_news): + logging.info("Writing into file if it was empty.") + news_cache.write(getting_info_into_file(item)) + news_cache.write("\n_ _ _ _ _ _ _") + cursor.execute('insert into projects values (?,?,?,?,?,?,?,?,?)', pack_of_news_for_db[num]) + conn.commit() + conn.close() + + +def getting_from_database_to_pack(): + """ + Getting news from database back to the pack to use them for our needs + """ + logging.info("Getting news from DB!") + pack_of_news = [] + with sqlite3.connect("database.db") as conn: + cursor = conn.cursor() + try: + for item in cursor.execute("SELECT * FROM projects"): + (number, title, date, source, description, im_links, alt, date_corr, main_source) = item + number = len(pack_of_news) + 
1 + novelty = Novelty(number, title, date, source, description, im_links, alt, date_corr, main_source) + pack_of_news.append(novelty) + logging.info("Got news from DB!") + except sqlite3.OperationalError: + print("Get some news first!") + return pack_of_news + + + + diff --git a/rss_task/rss_reader/pdf_and_html_converting.py b/rss_task/rss_reader/pdf_and_html_converting.py new file mode 100644 index 0000000..79c2b57 --- /dev/null +++ b/rss_task/rss_reader/pdf_and_html_converting.py @@ -0,0 +1,136 @@ +import urllib +from fpdf import FPDF +import os +import shutil +from pathlib import Path + + +def drawing_image(file_name): + from PIL import Image + from PIL import ImageFont + from PIL import ImageDraw + + img = Image.new("RGB", (480, 360)) + draw = ImageDraw.Draw(img) + font = ImageFont.truetype('arial', 60) + draw.text((120, 120), "No image", (255, 255, 255), font=font) + img.save('{0}'.format(file_name)) + + +def checking_path(): + return "Imag\\" + + +def getting_images(path, pack_of_news): + deleting_images(path) + symbol = ':' + try: + os.mkdir(os.path.abspath(path.joinpath(Path(checking_path())))) + except OSError: + print("Creating directory %s hasn't been complete!" 
% path) + for item in pack_of_news: + name_by_date = item.time_of_novelty.replace(symbol, '') + url = item.images_links + file_name = path.joinpath(Path(checking_path()).joinpath(Path(name_by_date + '.jpg'))) + try: + img = urllib.request.urlopen(url).read() + out = open(file_name, "wb") + out.write(img) + out.close() + except FileNotFoundError: + print("Wrong path for images.") + break + except ValueError: + drawing_image(file_name) + except TypeError: + drawing_image(file_name) + except urllib.error.URLError: + drawing_image(file_name) + + +def deleting_images(path): + directory = checking_path() + try: + shutil.rmtree(path.joinpath(Path(directory))) + except OSError: + print("Nothing to delete.") + + +def converting_to_pdf(path, pack_of_news): + symbol = ':' + pdf = FPDF() + pdf.add_font('DejaVuSans', '', 'DejaVuSans.ttf', uni=True) + pdf.set_font("DejaVuSans") + pdf.add_page() + if not os.path.exists(path): + os.makedirs(path) + getting_images(path, pack_of_news) + for item in pack_of_news: + name_by_date = item.time_of_novelty.replace(symbol, '') + directory = checking_path() + file_name = path.joinpath(Path(directory + name_by_date + '.jpg')) + + pdf.set_text_color(0, 0, 0) + pdf.write(10, str(item.number_of_novelty) + ". 
" + item.title_of_novelty + "\n") + try: + pdf.image(str(file_name), w=140, h=100) + except RuntimeError: + pdf.write("NO IMAGE, BUT LINK: " + item.images_links + "\n") + pdf.write(5, "ALT TEXT: " + item.alt_text + "\n") + pdf.write(10, "PUBLISHED: " + item.time_of_novelty + "\n") + pdf.write(7, "SOURCE LINK: " + item.source_link + "\n\n") + pdf.set_text_color(0, 0, 255) + pdf.write(6, "DESCRIPTION: " + item.description + "\n") + pdf.set_text_color(0, 0, 0) + pdf.write(7, "\nIMAGES LINKS: " + item.images_links + "\n") + pdf.write(6, "MAIN SOURCE LINK: " + item.main_source + "\n") + try: + pdf.output((path.joinpath(Path('News.pdf')))) # Create pdf in path + except PermissionError: + print("Something wrong with your path in pdf!") + except FileNotFoundError: + print("You entered wrong path!") + + +def converting_to_html(path, pack_of_news): + symbol = ':' + if not os.path.exists(path): + os.makedirs(path) + getting_images(path, pack_of_news) + path_to_file = path.joinpath(Path("Html_news.html")) + try: + with open(path_to_file, 'w', encoding='utf-8') as html_news: + for item in pack_of_news: + name_by_date = item.time_of_novelty.replace(symbol, '') + directory = checking_path() + file_name = path.joinpath(Path(directory + name_by_date + '.jpg')) + html_news.write(f"
{item.number_of_novelty}.
Title: {item.title_of_novelty}
")
+ html_news.write("Date: {0}
Link: {1}
".format(item.time_of_novelty,
+ item.source_link))
+ html_news.write(f"Description:
{item.description}
")
+ try:
+ html_news.write(f"Image: ")
+ html_news.write("
Image links: {0}
Alternative text: {1}
".
+ format(item.images_links,
+ item.alt_text))
+ except RuntimeError:
+ html_news.write(f"Image links: {item.images_links}
Alternative text: {item.alt_text}
")
+ html_news.write("Main source: {0}