From 3f9190c64de58219f6be86228aaac3cf3815f253 Mon Sep 17 00:00:00 2001 From: scarzdz Date: Sat, 23 Nov 2019 20:17:16 +0300 Subject: [PATCH 01/31] Create basic CLI with arguments parsing Created CLI using argparse and handle all arguments from the first iteration. --- final_task/rss_reader/rss_reader.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/final_task/rss_reader/rss_reader.py b/final_task/rss_reader/rss_reader.py index e69de29..8e9b6ac 100644 --- a/final_task/rss_reader/rss_reader.py +++ b/final_task/rss_reader/rss_reader.py @@ -0,0 +1,24 @@ +import argparse + + +# temp +__version__ = "0.1" + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description="Pure Python command-line RSS reader.") + + parser.add_argument("source", help="RSS URL") + parser.add_argument("--version", action="version", version=f"{parser.prog}s {__version__}", + help="Print version info") + parser.add_argument("--json", action="store_true", help="Print result as JSON in stdout") + parser.add_argument("-v", "--verbose", action="store_true", help="Outputs verbose status messages") + parser.add_argument("--limit", type=int, help="Limit news topics if this parameter provided") + + args = parser.parse_args() + + rss_url = args.source + + if args.verbose: + print("verbosity turned on") + + limit = args.limit if args.limit is not None else 0 From 6cbe140b136b9971da8bd6d0410f09dc1f8346cd Mon Sep 17 00:00:00 2001 From: scarzdz Date: Sat, 23 Nov 2019 20:36:30 +0300 Subject: [PATCH 02/31] Create class for RSS feed with basic parsing functionality Created Feed class for parsed RSS. For now, it only stores necessary fields from object returned by feedparser library. 
--- final_task/rss_reader/feed.py | 83 +++++++++++++++++++++++++++++++++++ 1 file changed, 83 insertions(+) create mode 100644 final_task/rss_reader/feed.py diff --git a/final_task/rss_reader/feed.py b/final_task/rss_reader/feed.py new file mode 100644 index 0000000..8b55f0d --- /dev/null +++ b/final_task/rss_reader/feed.py @@ -0,0 +1,83 @@ +import urllib.parse +import html + +import feedparser + + +class URLFormatError(ValueError): + pass + + +class FeedNotFoundError(Exception): + pass + + +class IncorrectRSSError(Exception): + pass + + +class Feed: + + def __init__(self, link, limit=0, *, date=None, to_json=False): + self.link = self._try_fix_url(link) + self.title = None + self.items = [] + + self.limit = int(limit) + self.parse_remote() + + def parse_remote(self): + parsed_rss = feedparser.parse(self.link) + # Did Feedparser access feed as remote and is the HTTP status ok + if "status" not in parsed_rss or parsed_rss.status >= 400: + raise FeedNotFoundError("Could not connect or find RSS feed") + # Checks does parsed_feed object contain at least one main element of feed + if "title" not in parsed_rss.feed and "link" not in parsed_rss and len(parsed_rss.entries) < 0: + raise IncorrectRSSError("URL is not a correct RSS feed") + + self.title = parsed_rss.feed.get("title") or parsed_rss.feed.get("link") + + if self.limit < 1 or self.limit > len(parsed_rss.entries): + limit = len(parsed_rss.entries) + else: + limit = self.limit + self.items = [] + for entry in parsed_rss.entries[:limit]: + item = self.parse_remote_item(entry) + if item is not None: + self.items.append(item) + + def parse_remote_item(self, entry): + item = dict() + if "published_parsed" not in entry: + return None + item["date"] = entry.published + # item["date_str"] = self._create_date_str(entry.published_parsed) + + if "title" in entry: + item["title"] = html.unescape(entry.title) + else: + item["title"] = None + + item["link"] = entry.get("link") + + if len(entry.enclosures) > 0: + 
item["enclosure"] = entry.enclosures[0] + + item["description"] = entry.get("description") + + return item + + @staticmethod + def _try_fix_url(url): + """ + Attempts to fix and uniform url + :type url: str + """ + try: + parsed_url = urllib.parse.urlsplit(url, "https") + except ValueError: + raise URLFormatError("Error in url format") + else: + result_url = urllib.parse.urlunsplit(parsed_url) + return result_url if not result_url.endswith('/') else result_url[:-1] From 7adeb983d14ea845dd096732a3201c506407850d Mon Sep 17 00:00:00 2001 From: scarzdz Date: Sat, 23 Nov 2019 20:44:52 +0300 Subject: [PATCH 03/31] Create module for html parsing Created module with function for parsing html, which can be in "description" field of RSS item. --- final_task/rss_reader/html_to_text.py | 43 +++++++++++++++++++++++ final_task/rss_reader/links_collection.py | 27 ++++++++++++++ 2 files changed, 70 insertions(+) create mode 100644 final_task/rss_reader/html_to_text.py create mode 100644 final_task/rss_reader/links_collection.py diff --git a/final_task/rss_reader/html_to_text.py b/final_task/rss_reader/html_to_text.py new file mode 100644 index 0000000..09dc1a1 --- /dev/null +++ b/final_task/rss_reader/html_to_text.py @@ -0,0 +1,43 @@ +from bs4 import BeautifulSoup +from .links_collection import LinksCollection + + +def _handle_image(tag, refs): + if "src" in tag.attrs and tag.attrs["src"]: + index = refs.add(tag.attrs["src"]) + alt_text = tag.attrs.get("alt", "no description") + tag.replace_with(f"[image {index}: {alt_text}][{index}]") + + +def _handle_link(tag, link, refs): + if ( + len(tag.get_text()) > 0 and + "href" in tag.attrs and + len(tag.attrs["href"]) > 0 and + tag.href != link + ): + index = refs.add(tag.attrs["href"]) + tag.insert_after(f"[{index}]") + + +def _create_refs_text(refs): + s = ["Links:\n"] + for i, r in enumerate(refs): + s.append(f"[{(i+1)}]: {r}\n") + s = "".join(s) + return s + + +def parse(html, skip_link=None): + refs = LinksCollection() + if 
skip_link is not None: + refs.add(skip_link) + soup = BeautifulSoup(html, features="lxml") + for tag in soup.find_all(name=["img", "a"]): + if tag.name == 'img': + _handle_image(tag, refs) + else: + _handle_link(tag, skip_link, refs) + + text = soup.get_text() + "\n\n" + _create_refs_text(refs) + return text diff --git a/final_task/rss_reader/links_collection.py b/final_task/rss_reader/links_collection.py new file mode 100644 index 0000000..0061e8c --- /dev/null +++ b/final_task/rss_reader/links_collection.py @@ -0,0 +1,27 @@ +from collections.abc import Collection + + +class LinksCollection(Collection): + + def __len__(self) -> int: + return len(self.items_dict) + + def __contains__(self, __x: object) -> bool: + return __x in self.items_dict + + def __iter__(self): + return self.items_dict.__iter__() + + def __init__(self, initial=None): + self.items_dict = dict() + self._next_index_ = 1 + + if initial is not None: + for element in initial: + self.add(element) + + def add(self, item): + if item is not None and item not in self.items_dict: + self.items_dict[item] = self._next_index_ + self._next_index_ += 1 + return self.items_dict[item] From 8df0ad317e22cbda10afff5fd3c8a8b67ba69cff Mon Sep 17 00:00:00 2001 From: scarzdz Date: Sat, 23 Nov 2019 20:51:43 +0300 Subject: [PATCH 04/31] Add html parsing to Feed class --- final_task/rss_reader/feed.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/final_task/rss_reader/feed.py b/final_task/rss_reader/feed.py index 8b55f0d..d7b3b96 100644 --- a/final_task/rss_reader/feed.py +++ b/final_task/rss_reader/feed.py @@ -3,6 +3,8 @@ import feedparser +from . 
import html_to_text + class URLFormatError(ValueError): pass @@ -64,8 +66,12 @@ def parse_remote_item(self, entry): if len(entry.enclosures) > 0: item["enclosure"] = entry.enclosures[0] - item["description"] = entry.get("description") - + if "description" in entry: + item["description"] = entry.description + if self._is_html(entry.description_detail.type): + item["description_parsed"] = html_to_text.parse(item["description"], skip_link=item.get("link")) + else: + item["description"] = None return item @staticmethod @@ -81,3 +87,7 @@ def _try_fix_url(url): else: result_url = urllib.parse.urlunsplit(parsed_url) return result_url if not result_url.endswith('/') else result_url[:-1] + + @staticmethod + def _is_html(element_type): + return element_type in ["text/html", "application/xhtml+xml"] From 9ab49d1666c6ef7d50fc886a082b000d6f3fb398 Mon Sep 17 00:00:00 2001 From: scarzdz Date: Sat, 23 Nov 2019 21:03:19 +0300 Subject: [PATCH 05/31] Add methods for generating plain text and json from RSS Added methods for generating json and plain text with links list to Feed class --- final_task/rss_reader/feed.py | 40 ++++++++++++++++++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) diff --git a/final_task/rss_reader/feed.py b/final_task/rss_reader/feed.py index d7b3b96..cd305b0 100644 --- a/final_task/rss_reader/feed.py +++ b/final_task/rss_reader/feed.py @@ -1,5 +1,6 @@ import urllib.parse import html +import json import feedparser @@ -54,7 +55,6 @@ def parse_remote_item(self, entry): if "published_parsed" not in entry: return None item["date"] = entry.published - # item["date_str"] = self._create_date_str(entry.published_parsed) if "title" in entry: item["title"] = html.unescape(entry.title) @@ -74,6 +74,44 @@ def parse_remote_item(self, entry): item["description"] = None return item + def render_text(self): + s = [] + + title = self.title or "no title" + s.append(title) + s.append("\n") + + for item in self.items: + s.append("\n\n") + + item_title = 
item["title"] or "no title" + s.append("Title: " + item_title + "\n") + + s.append("Date: " + item["date"] + "\n") + + item_link = item["link"] or "no link" + s.append("Link: " + item_link + "\n") + + s.append("\n") + if "enclosure" in item: + s.append("Enclosure: " + item["enclosure"]+"\n") + s.append("\n") + + s.append("Description: ") + if "description_parsed" in item: + s.append(item["description_parsed"]) + else: + description = item["description"] or "no description" + s.append(description) + + s = "".join(s) + return s + + def render_json(self): + feed_dict = {"title": self.title, "items": self.items} + s = json.dumps(feed_dict, indent="\t") + return s + @staticmethod def _try_fix_url(url): """ From 498b4f820d286dd3c6b164020bd2584911278424 Mon Sep 17 00:00:00 2001 From: scarzdz Date: Sat, 23 Nov 2019 21:06:13 +0300 Subject: [PATCH 06/31] Remove unnecessary parameter from Feed constructor --- final_task/rss_reader/feed.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/final_task/rss_reader/feed.py b/final_task/rss_reader/feed.py index cd305b0..9884077 100644 --- a/final_task/rss_reader/feed.py +++ b/final_task/rss_reader/feed.py @@ -21,7 +21,7 @@ class IncorrectRSSError(Exception): class Feed: - def __init__(self, link, limit=0, *, date=None, to_json=False): + def __init__(self, link, limit=0, *, date=None): self.link = self._try_fix_url(link) self.title = None self.items = [] From 67ed39ce674a99054e59f5aaa7550a42aed42dae Mon Sep 17 00:00:00 2001 From: scarzdz Date: Sat, 23 Nov 2019 21:18:09 +0300 Subject: [PATCH 07/31] Fix relative module import error --- final_task/rss_reader/feed.py | 2 +- final_task/rss_reader/html_to_text.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/final_task/rss_reader/feed.py b/final_task/rss_reader/feed.py index 9884077..cc9c1a2 100644 --- a/final_task/rss_reader/feed.py +++ b/final_task/rss_reader/feed.py @@ -4,7 +4,7 @@ import feedparser -from . 
import html_to_text +import html_to_text class URLFormatError(ValueError): diff --git a/final_task/rss_reader/html_to_text.py b/final_task/rss_reader/html_to_text.py index 09dc1a1..3524575 100644 --- a/final_task/rss_reader/html_to_text.py +++ b/final_task/rss_reader/html_to_text.py @@ -1,5 +1,5 @@ from bs4 import BeautifulSoup -from .links_collection import LinksCollection +from links_collection import LinksCollection def _handle_image(tag, refs): From 5acb43da53f9ee9fed9f400c734bc93452ddcd50 Mon Sep 17 00:00:00 2001 From: scarzdz Date: Sat, 23 Nov 2019 21:36:26 +0300 Subject: [PATCH 08/31] Add RSS parsing and printing and handle custom exceptions --- final_task/rss_reader/rss_reader.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/final_task/rss_reader/rss_reader.py b/final_task/rss_reader/rss_reader.py index 8e9b6ac..f47b4b7 100644 --- a/final_task/rss_reader/rss_reader.py +++ b/final_task/rss_reader/rss_reader.py @@ -1,5 +1,6 @@ import argparse +from feed import Feed, URLFormatError, FeedNotFoundError, IncorrectRSSError # temp __version__ = "0.1" @@ -22,3 +23,13 @@ print("verbosity turned on") limit = args.limit if args.limit is not None else 0 + + try: + feed = Feed(rss_url, limit) + except (URLFormatError, FeedNotFoundError, IncorrectRSSError) as e: + print("Error: " + str(e)) + else: + if not args.json: + print(feed.render_text()) + else: + print(feed.render_json()) From b93e90bcfa5cee616eee842658a1d78698deeeef Mon Sep 17 00:00:00 2001 From: scarzdz Date: Sun, 24 Nov 2019 01:09:21 +0300 Subject: [PATCH 09/31] Add logging and implement --verbose argument Added logging with ERROR level to display handled custom exceptions; and INFO level to display verbose status messages if corresponding argument is provided. 
--- final_task/rss_reader/feed.py | 15 +++++++++++++-- final_task/rss_reader/rss_reader.py | 12 ++++++++++-- 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/final_task/rss_reader/feed.py b/final_task/rss_reader/feed.py index cc9c1a2..281582a 100644 --- a/final_task/rss_reader/feed.py +++ b/final_task/rss_reader/feed.py @@ -1,7 +1,7 @@ import urllib.parse import html import json - +import logging import feedparser import html_to_text @@ -23,6 +23,7 @@ class Feed: def __init__(self, link, limit=0, *, date=None): self.link = self._try_fix_url(link) + logging.info(f'The link to the rss feed is "{self.link}"') self.title = None self.items = [] @@ -38,17 +39,23 @@ def parse_remote(self): if "title" not in parsed_rss.feed and "link" not in parsed_rss and len(parsed_rss.entries) < 0: raise IncorrectRSSError("URL is not a correct RSS feed") + logging.info("Feed successfully received") self.title = parsed_rss.feed.get("title") or parsed_rss.feed.get("link") if self.limit < 1 or self.limit > len(parsed_rss.entries): limit = len(parsed_rss.entries) + logging.info("Parsing items without limit") else: limit = self.limit + logging.info(f"Items limit is {limit}") self.items = [] - for entry in parsed_rss.entries[:limit]: + for i, entry in enumerate(parsed_rss.entries[:limit]): + logging.info(f"Parsing item {(i+1)}") item = self.parse_remote_item(entry) if item is not None: self.items.append(item) + else: + logging.info("Skipping invalid item") def parse_remote_item(self, entry): item = dict() @@ -70,11 +77,14 @@ def parse_remote_item(self, entry): item["description"] = entry.description if self._is_html(entry.description_detail.type): item["description_parsed"] = html_to_text.parse(item["description"], skip_link=item.get("link")) + logging.info("Item description is html and therefore converted to plain text") else: item["description"] = None return item def render_text(self): + logging.info("Generating plain text representation of feed") + s = [] title = self.title 
or "no title" @@ -108,6 +118,7 @@ def render_text(self): return s def render_json(self): + logging.info("Generating json representation of feed") feed_dict = {"title": self.title, "items": self.items} s = json.dumps(feed_dict, indent="\t") return s diff --git a/final_task/rss_reader/rss_reader.py b/final_task/rss_reader/rss_reader.py index f47b4b7..47f724c 100644 --- a/final_task/rss_reader/rss_reader.py +++ b/final_task/rss_reader/rss_reader.py @@ -1,4 +1,5 @@ import argparse +import logging from feed import Feed, URLFormatError, FeedNotFoundError, IncorrectRSSError @@ -20,16 +21,23 @@ rss_url = args.source if args.verbose: - print("verbosity turned on") + log_level = logging.INFO + else: + log_level = logging.WARNING limit = args.limit if args.limit is not None else 0 + logging.basicConfig(format="%(levelname)s:%(message)s", level=log_level) + + logging.info("Program starts") try: feed = Feed(rss_url, limit) except (URLFormatError, FeedNotFoundError, IncorrectRSSError) as e: - print("Error: " + str(e)) + logging.error(str(e)) else: if not args.json: print(feed.render_text()) else: print(feed.render_json()) + + logging.info("Program finishes") From fcbf10fe1526edc07f90965582528ed671bc4e42 Mon Sep 17 00:00:00 2001 From: scarzdz Date: Sun, 24 Nov 2019 18:57:52 +0300 Subject: [PATCH 10/31] Rename internal methods and remove incorrect docstring --- final_task/rss_reader/feed.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/final_task/rss_reader/feed.py b/final_task/rss_reader/feed.py index 281582a..46ec6ee 100644 --- a/final_task/rss_reader/feed.py +++ b/final_task/rss_reader/feed.py @@ -28,9 +28,9 @@ def __init__(self, link, limit=0, *, date=None): self.items = [] self.limit = int(limit) - self.parse_remote() + self._parse_remote() - def parse_remote(self): + def _parse_remote(self): parsed_rss = feedparser.parse(self.link) # Did Feedparser access feed as remote and is the HTTP status ok if "status" not in parsed_rss or 
parsed_rss.status >= 400: @@ -51,13 +51,13 @@ def parse_remote(self): self.items = [] for i, entry in enumerate(parsed_rss.entries[:limit]): logging.info(f"Parsing item {(i+1)}") - item = self.parse_remote_item(entry) + item = self._parse_remote_item(entry) if item is not None: self.items.append(item) else: logging.info("Skipping invalid item") - def parse_remote_item(self, entry): + def _parse_remote_item(self, entry): item = dict() if "published_parsed" not in entry: return None @@ -125,10 +125,6 @@ def render_json(self): @staticmethod def _try_fix_url(url): - """ - Attempts to fix and uniform url - :type url: str - """ try: parsed_url = urllib.parse.urlsplit(url, "https") except ValueError: From 914c17b34f85a26afc1e558ae37f38931de1a04e Mon Sep 17 00:00:00 2001 From: scarzdz Date: Sun, 24 Nov 2019 20:01:16 +0300 Subject: [PATCH 11/31] Update README --- final_task/README.md | 33 ++++++++++++++++++++++++++++++--- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/final_task/README.md b/final_task/README.md index 7af281f..c17f2c6 100644 --- a/final_task/README.md +++ b/final_task/README.md @@ -1,3 +1,30 @@ -# Your readme here -Some text. -Checkout how to write this file using *markdown*. +# Console RSS Reader +Cli-based RSS Reader built with Python 3.8. +Supports all RSS standards, can handle incorrect RSS. +Also partially supports Atom feeds. + +## Getting Started +### Prerequisites +- Python 3.8 +- feedparser, lxml, beautifulsoup4 + +``` +pip install feedparser lxml beautifulsoup4 +``` +### Usage +The program runs from file `rss_parser.py` +``` +usage: rss_reader.py [-h] [--version] [--json] [-v] [--limit LIMIT] source + +Pure Python command-line RSS reader. 
+ +positional arguments: + source RSS URL + +optional arguments: + -h, --help show this help message and exit + --version Print version info + --json Print result as JSON in stdout + -v, --verbose Outputs verbose status messages + --limit LIMIT Limit news topics if this parameter provided +``` From 2bc74105b372499f38c3098d769a8d67409c5f05 Mon Sep 17 00:00:00 2001 From: scarzdz Date: Sun, 24 Nov 2019 21:30:24 +0300 Subject: [PATCH 12/31] Add LICENSE and update README --- final_task/LICENSE | 21 +++++++++++++++++++++ final_task/README.md | 2 ++ 2 files changed, 23 insertions(+) create mode 100644 final_task/LICENSE diff --git a/final_task/LICENSE b/final_task/LICENSE new file mode 100644 index 0000000..5450455 --- /dev/null +++ b/final_task/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2019 Denis Marfonov + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/final_task/README.md b/final_task/README.md index c17f2c6..0f7f447 100644 --- a/final_task/README.md +++ b/final_task/README.md @@ -28,3 +28,5 @@ optional arguments: -v, --verbose Outputs verbose status messages --limit LIMIT Limit news topics if this parameter provided ``` +### Licence +This project is licensed under the MIT License - see the LICENSE file for details. From 24c0cc615be1d7ca71ea9c4de796388931ed8132 Mon Sep 17 00:00:00 2001 From: scarzdz Date: Sun, 24 Nov 2019 22:28:45 +0300 Subject: [PATCH 13/31] Move code to separate function This is done in order to add main() function to 'console scripts' in setup.py --- final_task/rss_reader/rss_reader.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/final_task/rss_reader/rss_reader.py b/final_task/rss_reader/rss_reader.py index 47f724c..f5e095c 100644 --- a/final_task/rss_reader/rss_reader.py +++ b/final_task/rss_reader/rss_reader.py @@ -6,7 +6,8 @@ # temp __version__ = "0.1" -if __name__ == '__main__': + +def main(): parser = argparse.ArgumentParser(description="Pure Python command-line RSS reader.") parser.add_argument("source", help="RSS URL") @@ -41,3 +42,7 @@ print(feed.render_json()) logging.info("Program finishes") + + +if __name__ == '__main__': + main() From 9f214b50230c4235661f800b6b2f5dfc6d379fde Mon Sep 17 00:00:00 2001 From: scarzdz Date: Sun, 24 Nov 2019 22:36:36 +0300 Subject: [PATCH 14/31] Make correct package from code folder and fill setup.py Added __init__ to make correct python package. Created __main__ to allow starting package as program Added content to setup.py. 
--- final_task/{ => rss_reader}/__init__.py | 0 final_task/rss_reader/__main__.py | 3 +++ final_task/setup.py | 29 +++++++++++++++++++++++++ 3 files changed, 32 insertions(+) rename final_task/{ => rss_reader}/__init__.py (100%) create mode 100644 final_task/rss_reader/__main__.py diff --git a/final_task/__init__.py b/final_task/rss_reader/__init__.py similarity index 100% rename from final_task/__init__.py rename to final_task/rss_reader/__init__.py diff --git a/final_task/rss_reader/__main__.py b/final_task/rss_reader/__main__.py new file mode 100644 index 0000000..9223eaf --- /dev/null +++ b/final_task/rss_reader/__main__.py @@ -0,0 +1,3 @@ +from rss_reader import main + +main() \ No newline at end of file diff --git a/final_task/setup.py b/final_task/setup.py index e69de29..48ee6b5 100644 --- a/final_task/setup.py +++ b/final_task/setup.py @@ -0,0 +1,29 @@ +from setuptools import setup, find_packages + +with open("README.md", "r") as fh: + long_description = fh.read() + +setup( + name='rss-reader_scarzdz', + version='0.1', + url='https://github.com/scarzdz/FinalTaskRssParser', + license='MIT', + author='Denis Marfonov', + author_email='marfonovdenis@gmail.com', + description='Cli-based RSS Reader', + long_description=long_description, + long_description_content_type="text/markdown", + packages=['rss_reader'], + classifiers=[ + "Programming Language :: Python :: 3" + "Programming Language :: Python :: 3.8", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", + ], + python_requires='>=3.8', + entry_points={ + 'console_scripts': [ + 'rss-reader=rss_reader.rss_reader:main', + ], + }, +) From cb62489ce21b8526fd6ab31522deaf6dabf662dd Mon Sep 17 00:00:00 2001 From: scarzdz Date: Sun, 24 Nov 2019 22:50:31 +0300 Subject: [PATCH 15/31] Add dependencies list and reformat file --- final_task/setup.py | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/final_task/setup.py b/final_task/setup.py 
index 48ee6b5..5d9dd6e 100644 --- a/final_task/setup.py +++ b/final_task/setup.py @@ -1,29 +1,34 @@ -from setuptools import setup, find_packages +from setuptools import setup with open("README.md", "r") as fh: long_description = fh.read() setup( - name='rss-reader_scarzdz', - version='0.1', - url='https://github.com/scarzdz/FinalTaskRssParser', - license='MIT', - author='Denis Marfonov', - author_email='marfonovdenis@gmail.com', - description='Cli-based RSS Reader', + name="rss-reader_scarzdz", + version="0.1", + url="https://github.com/scarzdz/FinalTaskRssParser", + license="MIT", + author="Denis Marfonov", + author_email="marfonovdenis@gmail.com", + description="Cli-based RSS Reader", long_description=long_description, long_description_content_type="text/markdown", - packages=['rss_reader'], + packages=["rss_reader"], classifiers=[ "Programming Language :: Python :: 3" "Programming Language :: Python :: 3.8", "License :: OSI Approved :: MIT License", "Operating System :: OS Independent", ], - python_requires='>=3.8', + python_requires=">=3.8", + install_requires=[ + "beautifulsoup4", + "feedparser", + "lxml" + ], entry_points={ - 'console_scripts': [ - 'rss-reader=rss_reader.rss_reader:main', + "console_scripts": [ + "rss-reader=rss_reader.rss_reader:main", ], }, ) From 0d63bc4145cb8572fb7d8cccdbc9a4b6472e8ff6 Mon Sep 17 00:00:00 2001 From: scarzdz Date: Mon, 25 Nov 2019 01:00:54 +0300 Subject: [PATCH 16/31] Fix typo --- final_task/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/final_task/setup.py b/final_task/setup.py index 5d9dd6e..aa62797 100644 --- a/final_task/setup.py +++ b/final_task/setup.py @@ -15,7 +15,7 @@ long_description_content_type="text/markdown", packages=["rss_reader"], classifiers=[ - "Programming Language :: Python :: 3" + "Programming Language :: Python :: 3", "Programming Language :: Python :: 3.8", "License :: OSI Approved :: MIT License", "Operating System :: OS Independent", From 
7c50244bc7357b600c77466b3d474a0c72f6c1d5 Mon Sep 17 00:00:00 2001 From: scarzdz Date: Mon, 25 Nov 2019 01:20:08 +0300 Subject: [PATCH 17/31] Change imports to relative --- final_task/rss_reader/__main__.py | 4 ++-- final_task/rss_reader/feed.py | 2 +- final_task/rss_reader/html_to_text.py | 2 +- final_task/rss_reader/rss_reader.py | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/final_task/rss_reader/__main__.py b/final_task/rss_reader/__main__.py index 9223eaf..f8e010c 100644 --- a/final_task/rss_reader/__main__.py +++ b/final_task/rss_reader/__main__.py @@ -1,3 +1,3 @@ -from rss_reader import main +from .rss_reader import main -main() \ No newline at end of file +main() diff --git a/final_task/rss_reader/feed.py b/final_task/rss_reader/feed.py index 46ec6ee..476f630 100644 --- a/final_task/rss_reader/feed.py +++ b/final_task/rss_reader/feed.py @@ -4,7 +4,7 @@ import logging import feedparser -import html_to_text +from . import html_to_text class URLFormatError(ValueError): diff --git a/final_task/rss_reader/html_to_text.py b/final_task/rss_reader/html_to_text.py index 3524575..09dc1a1 100644 --- a/final_task/rss_reader/html_to_text.py +++ b/final_task/rss_reader/html_to_text.py @@ -1,5 +1,5 @@ from bs4 import BeautifulSoup -from links_collection import LinksCollection +from .links_collection import LinksCollection def _handle_image(tag, refs): diff --git a/final_task/rss_reader/rss_reader.py b/final_task/rss_reader/rss_reader.py index f5e095c..0cbadf8 100644 --- a/final_task/rss_reader/rss_reader.py +++ b/final_task/rss_reader/rss_reader.py @@ -1,7 +1,7 @@ import argparse import logging -from feed import Feed, URLFormatError, FeedNotFoundError, IncorrectRSSError +from .feed import Feed, URLFormatError, FeedNotFoundError, IncorrectRSSError # temp __version__ = "0.1" From 9ec3adec203657946f38ad02040d1c742855a082 Mon Sep 17 00:00:00 2001 From: scarzdz Date: Mon, 25 Nov 2019 01:21:56 +0300 Subject: [PATCH 18/31] Update README --- 
final_task/README.md | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/final_task/README.md b/final_task/README.md index 0f7f447..dbb2189 100644 --- a/final_task/README.md +++ b/final_task/README.md @@ -1,5 +1,5 @@ -# Console RSS Reader -Cli-based RSS Reader built with Python 3.8. +# RSS Reader +Cli-based RSS reader built with Python 3.8. Supports all RSS standards, can handle incorrect RSS. Also partially supports Atom feeds. @@ -11,10 +11,25 @@ Also partially supports Atom feeds. ``` pip install feedparser lxml beautifulsoup4 ``` +### Installation +``` +pip install -i https://test.pypi.org/simple/ rss-reader-scarzdz +``` +Also you can just download source code and install using: +``` +$ python final_task/setup.py install +``` +### Running +After installation, `rss-reader` command is added to PATH. + +Alternatively, the application can be run from the source file: +``` +$ cd final_task +$ python -m rss_reader ... +``` ### Usage -The program runs from file `rss_parser.py` ``` -usage: rss_reader.py [-h] [--version] [--json] [-v] [--limit LIMIT] source +usage: rss-reader [-h] [--version] [--json] [-v] [--limit LIMIT] source Pure Python command-line RSS reader. 
From 53e0a60377500686d651a0763d07947b13d2d42a Mon Sep 17 00:00:00 2001 From: scarzdz Date: Mon, 25 Nov 2019 01:25:21 +0300 Subject: [PATCH 19/31] Make CLI always show correct program name --- final_task/rss_reader/rss_reader.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/final_task/rss_reader/rss_reader.py b/final_task/rss_reader/rss_reader.py index 0cbadf8..12351af 100644 --- a/final_task/rss_reader/rss_reader.py +++ b/final_task/rss_reader/rss_reader.py @@ -5,10 +5,11 @@ # temp __version__ = "0.1" +PROG = "rss-reader" def main(): - parser = argparse.ArgumentParser(description="Pure Python command-line RSS reader.") + parser = argparse.ArgumentParser(description="Pure Python command-line RSS reader.", prog=PROG) parser.add_argument("source", help="RSS URL") parser.add_argument("--version", action="version", version=f"{parser.prog}s {__version__}", From 26cf900dac80c8d4e28e20d773ff3d895915d512 Mon Sep 17 00:00:00 2001 From: scarzdz Date: Mon, 25 Nov 2019 01:36:42 +0300 Subject: [PATCH 20/31] Fill requirements.txt --- final_task/rss_reader/requirements.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/final_task/rss_reader/requirements.txt b/final_task/rss_reader/requirements.txt index e69de29..5c90ddd 100644 --- a/final_task/rss_reader/requirements.txt +++ b/final_task/rss_reader/requirements.txt @@ -0,0 +1,3 @@ +lxml~=4.4.1 +feedparser~=5.2.1 +beautifulsoup4~=4.8.1 \ No newline at end of file From ca5ae76db61139522ecd1e4642421e5263449370 Mon Sep 17 00:00:00 2001 From: scarzdz Date: Mon, 25 Nov 2019 01:38:47 +0300 Subject: [PATCH 21/31] Bump version number to 0.2 --- final_task/rss_reader/rss_reader.py | 2 +- final_task/setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/final_task/rss_reader/rss_reader.py b/final_task/rss_reader/rss_reader.py index 12351af..50a047a 100644 --- a/final_task/rss_reader/rss_reader.py +++ b/final_task/rss_reader/rss_reader.py @@ -4,7 +4,7 @@ from .feed import Feed, 
URLFormatError, FeedNotFoundError, IncorrectRSSError # temp -__version__ = "0.1" +__version__ = "0.2" PROG = "rss-reader" diff --git a/final_task/setup.py b/final_task/setup.py index aa62797..f037459 100644 --- a/final_task/setup.py +++ b/final_task/setup.py @@ -5,7 +5,7 @@ setup( name="rss-reader_scarzdz", - version="0.1", + version="0.2", url="https://github.com/scarzdz/FinalTaskRssParser", license="MIT", author="Denis Marfonov", From 80e321b37819238bd310db9ce28f99afb61b0631 Mon Sep 17 00:00:00 2001 From: scarzdz Date: Mon, 25 Nov 2019 02:19:06 +0300 Subject: [PATCH 22/31] Add --date argument to CLI --- final_task/rss_reader/feed.py | 8 +++++++- final_task/rss_reader/rss_reader.py | 25 ++++++++++++++++++++----- 2 files changed, 27 insertions(+), 6 deletions(-) diff --git a/final_task/rss_reader/feed.py b/final_task/rss_reader/feed.py index 476f630..31142f5 100644 --- a/final_task/rss_reader/feed.py +++ b/final_task/rss_reader/feed.py @@ -6,6 +6,8 @@ from . import html_to_text +DATE_FORMAT = "%Y%m%d" + class URLFormatError(ValueError): pass @@ -28,7 +30,11 @@ def __init__(self, link, limit=0, *, date=None): self.items = [] self.limit = int(limit) - self._parse_remote() + if date is None: + self._parse_remote() + else: + pass + def _parse_remote(self): parsed_rss = feedparser.parse(self.link) diff --git a/final_task/rss_reader/rss_reader.py b/final_task/rss_reader/rss_reader.py index 50a047a..4228076 100644 --- a/final_task/rss_reader/rss_reader.py +++ b/final_task/rss_reader/rss_reader.py @@ -1,13 +1,23 @@ import argparse import logging +import time -from .feed import Feed, URLFormatError, FeedNotFoundError, IncorrectRSSError +from . 
import feed # temp __version__ = "0.2" PROG = "rss-reader" +def date_str(string): + logging.info("Checking date argument") + try: + time.strptime(string, format=feed.DATE_FORMAT) + except ValueError: + raise argparse.ArgumentTypeError(f"Incorrect date format: {string}") + return string + + def main(): parser = argparse.ArgumentParser(description="Pure Python command-line RSS reader.", prog=PROG) @@ -18,6 +28,11 @@ def main(): parser.add_argument("-v", "--verbose", action="store_true", help="Outputs verbose status messages") parser.add_argument("--limit", type=int, help="Limit news topics if this parameter provided") + # It is done because argparse treat '%' in parameters as old-style formatting + date_format_escaped = feed.DATE_FORMAT.replace("%", "%%") + parser.add_argument("--date", type=date_str, + help=f"Load news with date ({date_format_escaped}) from cache, if this parameter provided") + args = parser.parse_args() rss_url = args.source @@ -33,14 +48,14 @@ def main(): logging.info("Program starts") try: - feed = Feed(rss_url, limit) - except (URLFormatError, FeedNotFoundError, IncorrectRSSError) as e: + rss_feed = feed.Feed(rss_url, limit) + except (feed.URLFormatError, feed.FeedNotFoundError, feed.IncorrectRSSError) as e: logging.error(str(e)) else: if not args.json: - print(feed.render_text()) + print(rss_feed.render_text()) else: - print(feed.render_json()) + print(rss_feed.render_json()) logging.info("Program finishes") From 7944473d01f2d7004ff862ca581b33d3ffa6096b Mon Sep 17 00:00:00 2001 From: scarzdz Date: Mon, 25 Nov 2019 13:00:37 +0300 Subject: [PATCH 23/31] Implement local caching using sqlite database --- final_task/rss_reader/database.py | 118 ++++++++++++++++++++++++++++ final_task/rss_reader/feed.py | 79 ++++++++++++++----- final_task/rss_reader/rss_reader.py | 19 ++--- 3 files changed, 186 insertions(+), 30 deletions(-) create mode 100644 final_task/rss_reader/database.py diff --git a/final_task/rss_reader/database.py 
b/final_task/rss_reader/database.py new file mode 100644 index 0000000..070eb9a --- /dev/null +++ b/final_task/rss_reader/database.py @@ -0,0 +1,118 @@ +import sqlite3 +import time +import pkg_resources + +DATE_FORMAT = "%Y-%m-%d" +DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S" + +DATA_FILE = "data/rss.sqlite" + + +class DBError(Exception): + pass + + +class DB: + def __init__(self): + path = pkg_resources.resource_filename(__name__, DATA_FILE) + try: + self.conn = sqlite3.connect(path) + except sqlite3.OperationalError as e: + raise DBError(str(e)) + self.cursor = self.conn.cursor() + self.feed_id = None + + def _create_db(self): + self.cursor.execute("DROP TABLE IF EXISTS feeds") + self.cursor.execute("DROP TABLE IF EXISTS items") + self.cursor.execute(''' + CREATE TABLE feeds( + id INTEGER PRIMARY KEY, + title TEXT, + link TEXT UNIQUE NOT NULL + ) + ''') + self.cursor.execute(''' + CREATE TABLE items( + id INTEGER PRIMARY KEY, + feed_id INTEGER NOT NULL, + published TEXT NOT NULL, + title TEXT, + link TEXT, + enclosure TEXT, + description TEXT, + description_parsed TEXT, + FOREIGN KEY (feed_id) REFERENCES feeds(id) + ) + ''') + self.cursor.execute(''' + CREATE UNIQUE INDEX un_items ON items(published, title, link) + ''') + self.conn.commit() + + def get_feed(self, feed_link, req_date, limit=-1): + feed_info = self._get_feed_info_if_exists(feed_link) + if feed_info is not None: + req_date = time.strftime(DATE_FORMAT, req_date) + + self.cursor.execute(''' + select i.title, i.published, i.link, i.enclosure, i.description, i.description_parsed + from items i join feeds f on i.feed_id = f.id + where f.id=(?) and date(i.published)=date(?) limit (?) 
+ ''', (feed_info[0], req_date, limit) + ) + + items = [] + for i in self.cursor.fetchall(): + item = dict(title=i[0], date=time.strptime(i[1], DATETIME_FORMAT), link=i[2], enclosure=i[3], + description=i[4], description_parsed=i[5]) + items.append(item) + return dict(title=feed_info[1], items=items) + else: + + return None + + def store_feed(self, link, title, items): + feed_id = self._get_feed_id_if_exists(link) + if feed_id is None: + self.cursor.execute("insert into feeds(title, link) values (?, ?)", (title, link)) + feed_id = self.cursor.lastrowid + self.conn.commit() + self.feed_id = feed_id + self._store_items(items) + + def _store_items(self, items): + for i in items: + item_dict = dict( + feed_id=self.feed_id, + published=time.strftime(DATETIME_FORMAT, i["date"]), + title=i["title"], + link=i["link"], + enclosure=i["enclosure"], + description=i["description"], + description_parsed=i["description_parsed"] + ) + self.cursor.execute(''' + insert or ignore into items(feed_id, published, title, link, enclosure, description, description_parsed) + values (:feed_id, :published, :title, :link, :enclosure, :description, :description_parsed) + ''', item_dict) + self.conn.commit() + + def _get_feed_id_if_exists(self, feed_link): + feed_info = self._get_feed_info_if_exists(feed_link) + if feed_info is not None: + return feed_info[0] + else: + return None + + def _get_feed_info_if_exists(self, feed_link): + self.cursor.execute("select feeds.id, feeds.title from feeds where feeds.link=?", (feed_link,)) + feed_info = self.cursor.fetchone() + if feed_info is not None: + return feed_info[0], feed_info[1] + return None + + +if __name__ == '__main__': + db = DB() + db._create_db() diff --git a/final_task/rss_reader/feed.py b/final_task/rss_reader/feed.py index 31142f5..14ab064 100644 --- a/final_task/rss_reader/feed.py +++ b/final_task/rss_reader/feed.py @@ -2,11 +2,11 @@ import html import json import logging +import time import feedparser from . 
import html_to_text - -DATE_FORMAT = "%Y%m%d" +from .database import DB, DBError class URLFormatError(ValueError): @@ -21,6 +21,10 @@ class IncorrectRSSError(Exception): pass +class LocalCacheError(Exception): + pass + + class Feed: def __init__(self, link, limit=0, *, date=None): @@ -30,13 +34,25 @@ def __init__(self, link, limit=0, *, date=None): self.items = [] self.limit = int(limit) + self.date = date + try: + self.db = DB() + except DBError: + if self.date is not None: + raise LocalCacheError("Unable to open local cache") + else: + self.db = None + logging.warning("Unable to open local cache. Feed items will not be saved.") + if date is None: self._parse_remote() + self._save_to_cache() else: - pass - + self.date = date + self._load_cache() def _parse_remote(self): + logging.info("Parsing feed from remote source") parsed_rss = feedparser.parse(self.link) # Did Feedparser access feed as remote and is the HTTP status ok if "status" not in parsed_rss or parsed_rss.status >= 400: @@ -67,7 +83,7 @@ def _parse_remote_item(self, entry): item = dict() if "published_parsed" not in entry: return None - item["date"] = entry.published + item["date"] = entry.published_parsed if "title" in entry: item["title"] = html.unescape(entry.title) @@ -78,7 +94,10 @@ def _parse_remote_item(self, entry): if len(entry.enclosures) > 0: item["enclosure"] = entry.enclosures[0] + else: + item["enclosure"] = None + item["description_parsed"] = None if "description" in entry: item["description"] = entry.description if self._is_html(entry.description_detail.type): @@ -88,6 +107,24 @@ def _parse_remote_item(self, entry): item["description"] = None return item + def _load_cache(self): + logging.info("Loading feed from cache") + limit = -1 + if self.limit > 0: + limit = self.limit + db_feed = self.db.get_feed(self.link, self.date, limit) + if db_feed is not None: + logging.info("Feed successfully loaded") + self.title = db_feed["title"] + self.items = db_feed["items"] + else: + 
logging.warning("Feed with the specified link is not found in cache") + + def _save_to_cache(self): + if self.db is not None: + self.db.store_feed(self.link, self.title, self.items) + logging.info("Feed saved to cache") + def render_text(self): logging.info("Generating plain text representation of feed") @@ -96,28 +133,28 @@ def render_text(self): title = self.title or "no title" s.append(title) s.append("\n") + if len(self.items) == 0: + s.append("\nno items to show\n") + else: + for item in self.items: + s.append("\n\n") - for item in self.items: - s.append("\n\n") - - item_title = item["title"] or "no title" - s.append("Title: " + item_title + "\n") + item_title = item["title"] or "no title" + s.append("Title: " + item_title + "\n") - s.append("Date: " + item["date"] + "\n") + item_date = time.strftime("%a, %d %b %Y %H:%M:%S +0000", item["date"]) + s.append("Date: " + item_date + "\n") - item_link = item["link"] or "no link" - s.append("Link: " + item_link + "\n") + item_link = item["link"] or "no link" + s.append("Link: " + item_link + "\n") - s.append("\n") - if "enclosure" in item: - s.append("Enclosure: " + item["enclosure"]+"\n") s.append("\n") + if item["enclosure"] is not None: + s.append("Enclosure: " + item["enclosure"]+"\n") + s.append("\n") - s.append("Description: ") - if "description_parsed" in item: - s.append(item["description_parsed"]) - else: - description = item["description"] or "no description" + s.append("Description: ") + description = item["description_parsed"] or item["description"] or "no description" s.append(description) s = "".join(s) diff --git a/final_task/rss_reader/rss_reader.py b/final_task/rss_reader/rss_reader.py index 4228076..73c8391 100644 --- a/final_task/rss_reader/rss_reader.py +++ b/final_task/rss_reader/rss_reader.py @@ -7,15 +7,16 @@ # temp __version__ = "0.2" PROG = "rss-reader" +DATE_FORMAT = "%Y%m%d" def date_str(string): logging.info("Checking date argument") try: - time.strptime(string, format=feed.DATE_FORMAT) + 
date = time.strptime(string, DATE_FORMAT) except ValueError: raise argparse.ArgumentTypeError(f"Incorrect date format: {string}") - return string + return date def main(): @@ -29,7 +30,7 @@ def main(): parser.add_argument("--limit", type=int, help="Limit news topics if this parameter provided") # It is done because argparse treat '%' in parameters as old-style formatting - date_format_escaped = feed.DATE_FORMAT.replace("%", "%%") + date_format_escaped = DATE_FORMAT.replace("%", "%%") parser.add_argument("--date", type=date_str, help=f"Load news with date ({date_format_escaped}) from cache, if this parameter provided") @@ -48,9 +49,13 @@ def main(): logging.info("Program starts") try: - rss_feed = feed.Feed(rss_url, limit) - except (feed.URLFormatError, feed.FeedNotFoundError, feed.IncorrectRSSError) as e: + if args.date is None: + rss_feed = feed.Feed(rss_url, limit) + else: + rss_feed = feed.Feed(rss_url, limit, date=args.date) + except (feed.URLFormatError, feed.FeedNotFoundError, feed.IncorrectRSSError, feed.LocalCacheError) as e: logging.error(str(e)) + else: if not args.json: print(rss_feed.render_text()) @@ -58,7 +63,3 @@ def main(): print(rss_feed.render_json()) logging.info("Program finishes") - - -if __name__ == '__main__': - main() From aa24f8b8d0eff2b940fb5cb8be6eeab8bb5828e8 Mon Sep 17 00:00:00 2001 From: scarzdz Date: Mon, 25 Nov 2019 18:22:59 +0300 Subject: [PATCH 24/31] Add check for db, turn main class into context manager If db is not exist or invalid, it is (re-)created. Feed class become context manager, it no longer contains heavy methods in init. Also, it helps to close db connector properly. Finally, slightly change plain text conversion. 
--- final_task/rss_reader/database.py | 59 ++++++++++++++++++++++------- final_task/rss_reader/feed.py | 33 +++++++++++----- final_task/rss_reader/rss_reader.py | 17 +++------ 3 files changed, 74 insertions(+), 35 deletions(-) diff --git a/final_task/rss_reader/database.py b/final_task/rss_reader/database.py index 070eb9a..2982337 100644 --- a/final_task/rss_reader/database.py +++ b/final_task/rss_reader/database.py @@ -1,6 +1,7 @@ import sqlite3 import time import pkg_resources +import logging DATE_FORMAT = "%Y-%m-%d" DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S" @@ -19,10 +20,24 @@ def __init__(self): self.conn = sqlite3.connect(path) except sqlite3.OperationalError as e: raise DBError(str(e)) + self.cursor = self.conn.cursor() self.feed_id = None + if not self._is_db_exist(): + self._create_db() + + def _is_db_exist(self): + try: + self.cursor.execute('''SELECT count(name) FROM sqlite_master + WHERE type='table' AND (name='items' or name='feeds')''') + except sqlite3.DatabaseError: + return False + # if the count is 2, then tables exists + return self.cursor.fetchone()[0] == 2 + def _create_db(self): + logging.info("Creating DB for local cache") self.cursor.execute("DROP TABLE IF EXISTS feeds") self.cursor.execute("DROP TABLE IF EXISTS items") self.cursor.execute(''' @@ -54,13 +69,15 @@ def get_feed(self, feed_link, req_date, limit=-1): feed_info = self._get_feed_info_if_exists(feed_link) if feed_info is not None: req_date = time.strftime(DATE_FORMAT, req_date) - - self.cursor.execute(''' - select i.title, i.published, i.link, i.enclosure, i.description, i.description_parsed - from items i join feeds f on i.feed_id = f.id - where f.id=(?) and date(i.published)=date(?) limit (?) - ''', (feed_info[0], req_date, limit) - ) + try: + self.cursor.execute(''' + select i.title, i.published, i.link, i.enclosure, i.description, i.description_parsed + from items i join feeds f on i.feed_id = f.id + where f.id=(?) and date(i.published)=date(?) limit (?) 
+ ''', (feed_info[0], req_date, limit) + ) + except sqlite3.Error: + raise DBError("Error getting items from db") items = [] for i in self.cursor.fetchall(): @@ -75,12 +92,19 @@ def get_feed(self, feed_link, req_date, limit=-1): def store_feed(self, link, title, items): feed_id = self._get_feed_id_if_exists(link) if feed_id is None: - self.cursor.execute("insert into feeds(title, link) values (?, ?)", (title, link)) - feed_id = self.cursor.lastrowid + try: + self.cursor.execute("insert into feeds(title, link) values (?, ?)", (title, link)) + feed_id = self.cursor.lastrowid + except sqlite3.Error: + self.conn.rollback() + raise DBError("Error adding feed to db") self.conn.commit() self.feed_id = feed_id self._store_items(items) + def close(self): + self.conn.close() + def _store_items(self, items): for i in items: item_dict = dict( @@ -92,10 +116,14 @@ def _store_items(self, items): description=i["description"], description_parsed=i["description_parsed"] ) - self.cursor.execute(''' - insert or ignore into items(feed_id, published, title, link, enclosure, description, description_parsed) - values (:feed_id, :published, :title, :link, :enclosure, :description, :description_parsed) - ''', item_dict) + try: + self.cursor.execute(''' + insert or ignore into items(feed_id, published, title, link, enclosure, description, description_parsed) + values (:feed_id, :published, :title, :link, :enclosure, :description, :description_parsed) + ''', item_dict) + except sqlite3.Error: + self.conn.rollback() + raise DBError("Error adding item to db") self.conn.commit() def _get_feed_id_if_exists(self, feed_link): @@ -106,7 +134,10 @@ def _get_feed_id_if_exists(self, feed_link): return None def _get_feed_info_if_exists(self, feed_link): - self.cursor.execute("select feeds.id, feeds.title from feeds where feeds.link=?", (feed_link,)) + try: + self.cursor.execute("select feeds.id, feeds.title from feeds where feeds.link=?", (feed_link,)) + except sqlite3.Error: + raise DBError("Error 
checking Feed in db") feed_info = self.cursor.fetchone() if feed_info is not None: return feed_info[0], feed_info[1] diff --git a/final_task/rss_reader/feed.py b/final_task/rss_reader/feed.py index 14ab064..8dea1c7 100644 --- a/final_task/rss_reader/feed.py +++ b/final_task/rss_reader/feed.py @@ -9,19 +9,23 @@ from .database import DB, DBError -class URLFormatError(ValueError): +class FeedError(Exception): pass -class FeedNotFoundError(Exception): +class URLFormatError(FeedError, ValueError): pass -class IncorrectRSSError(Exception): +class FeedNotFoundError(FeedError): pass -class LocalCacheError(Exception): +class IncorrectRSSError(FeedError): + pass + + +class LocalCacheError(FeedError): pass @@ -32,9 +36,18 @@ def __init__(self, link, limit=0, *, date=None): logging.info(f'The link to the rss feed is "{self.link}"') self.title = None self.items = [] - self.limit = int(limit) self.date = date + self.db = None + + def __enter__(self): + self.load() + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.close() + + def load(self): try: self.db = DB() except DBError: @@ -44,13 +57,15 @@ def __init__(self, link, limit=0, *, date=None): self.db = None logging.warning("Unable to open local cache. 
Feed items will not be saved.") - if date is None: + if self.date is None: self._parse_remote() self._save_to_cache() else: - self.date = date self._load_cache() + def close(self): + self.db.close() + def _parse_remote(self): logging.info("Parsing feed from remote source") parsed_rss = feedparser.parse(self.link) @@ -128,8 +143,7 @@ def _save_to_cache(self): def render_text(self): logging.info("Generating plain text representation of feed") - s = [] - + s = ["\nFeed: "] title = self.title or "no title" s.append(title) s.append("\n") @@ -153,7 +167,6 @@ def render_text(self): s.append("Enclosure: " + item["enclosure"]+"\n") s.append("\n") - s.append("Description: ") description = item["description_parsed"] or item["description"] or "no description" s.append(description) diff --git a/final_task/rss_reader/rss_reader.py b/final_task/rss_reader/rss_reader.py index 73c8391..926f585 100644 --- a/final_task/rss_reader/rss_reader.py +++ b/final_task/rss_reader/rss_reader.py @@ -49,17 +49,12 @@ def main(): logging.info("Program starts") try: - if args.date is None: - rss_feed = feed.Feed(rss_url, limit) - else: - rss_feed = feed.Feed(rss_url, limit, date=args.date) - except (feed.URLFormatError, feed.FeedNotFoundError, feed.IncorrectRSSError, feed.LocalCacheError) as e: + with feed.Feed(rss_url, limit, date=args.date) as rss_feed: + if not args.json: + print(rss_feed.render_text()) + else: + print(rss_feed.render_json()) + except feed.FeedError as e: logging.error(str(e)) - else: - if not args.json: - print(rss_feed.render_text()) - else: - print(rss_feed.render_json()) - logging.info("Program finishes") From 962802243221d6bf79e5283932dc5d7a171ae076 Mon Sep 17 00:00:00 2001 From: scarzdz Date: Mon, 25 Nov 2019 18:28:36 +0300 Subject: [PATCH 25/31] Add DB exceptions handle(re-raising) to main class --- final_task/rss_reader/feed.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/final_task/rss_reader/feed.py b/final_task/rss_reader/feed.py 
index 8dea1c7..b4520a1 100644 --- a/final_task/rss_reader/feed.py +++ b/final_task/rss_reader/feed.py @@ -13,7 +13,7 @@ class FeedError(Exception): pass -class URLFormatError(FeedError, ValueError): +class URLFormatError(FeedError): pass @@ -127,7 +127,10 @@ def _load_cache(self): limit = -1 if self.limit > 0: limit = self.limit - db_feed = self.db.get_feed(self.link, self.date, limit) + try: + db_feed = self.db.get_feed(self.link, self.date, limit) + except DBError: + raise LocalCacheError("Unable to load items from cache") if db_feed is not None: logging.info("Feed successfully loaded") self.title = db_feed["title"] @@ -137,7 +140,10 @@ def _load_cache(self): def _save_to_cache(self): if self.db is not None: - self.db.store_feed(self.link, self.title, self.items) + try: + self.db.store_feed(self.link, self.title, self.items) + except DBError: + raise LocalCacheError("Unable to save feed to cache") logging.info("Feed saved to cache") def render_text(self): From 8ae5afdfffc9ae0041967352721dcc4786cfb409 Mon Sep 17 00:00:00 2001 From: scarzdz Date: Mon, 25 Nov 2019 19:41:48 +0300 Subject: [PATCH 26/31] Update README and slightly change json output --- final_task/README.md | 55 ++++++++++++++++++++++++++++++++++- final_task/rss_reader/feed.py | 4 +-- 2 files changed, 56 insertions(+), 3 deletions(-) diff --git a/final_task/README.md b/final_task/README.md index dbb2189..60adc59 100644 --- a/final_task/README.md +++ b/final_task/README.md @@ -42,6 +42,59 @@ optional arguments: --json Print result as JSON in stdout -v, --verbose Outputs verbose status messages --limit LIMIT Limit news topics if this parameter provided + --date DATE Load news with date (%Y%m%d) from cache, if this parameter + provided ``` -### Licence +## Behavior +RSS Reader can work in online or offline mode. + +In **online** mode, when `--date` argument is not provided, the application loads and parses rss feed from `source` argument. +It is done using `feedparser` library. 
+Parsed news is saved in a **_sqlite database_**, which is located in `rss_reader/data/rss.sqlite`. +If an item contains _html_ markup, it is converted to plain text. + +In **offline** mode, when `--date` argument is provided, +the application loads news with the specified feed link and date from the database. + +News is printed to stdout in the following format: + +``` +Feed: *RSS feed title* + + +Title: *item 1 title* +Date: *%a, %d %b %Y %H:%M:%S +0000* +Link: https://example.com/link_to_item + +*Item description* + +Links: +[1]: *first link is always link to item* +[2]: Others can be links parsed from or tags + + +Title: *item 2 title* +Date: ... +``` + +News is converted to json like this: +``` +{ + "title": "*Feed title*", + "link": "*link to feed*", + "items": [ + { + "title": "*item 1 title*", + "date": *time.struct_time tuple*, + "link": "*link to item*", + "enclosure": *null* or *link to enclosure*, + "description": "*item description*", + "description_parsed": "*description parsed to plain text*" or *null* if description is text + }, + ... + ] +} +``` + +## Licence This project is licensed under the MIT License - see the LICENSE file for details. 
diff --git a/final_task/rss_reader/feed.py b/final_task/rss_reader/feed.py index b4520a1..eb29605 100644 --- a/final_task/rss_reader/feed.py +++ b/final_task/rss_reader/feed.py @@ -181,8 +181,8 @@ def render_text(self): def render_json(self): logging.info("Generating json representation of feed") - feed_dict = {"title": self.title, "items": self.items} - s = json.dumps(feed_dict, indent="\t") + feed_dict = {"title": self.title, "link": self.link, "items": self.items} + s = json.dumps(feed_dict, indent=2) return s @staticmethod From ef7f12a3a6b88fa3fe9bc2cea643dd03000128a2 Mon Sep 17 00:00:00 2001 From: scarzdz Date: Mon, 25 Nov 2019 19:42:56 +0300 Subject: [PATCH 27/31] Bumps version number --- final_task/rss_reader/rss_reader.py | 2 +- final_task/setup.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/final_task/rss_reader/rss_reader.py b/final_task/rss_reader/rss_reader.py index 926f585..a7e2dec 100644 --- a/final_task/rss_reader/rss_reader.py +++ b/final_task/rss_reader/rss_reader.py @@ -5,7 +5,7 @@ from . 
import feed # temp -__version__ = "0.2" +__version__ = "0.3" PROG = "rss-reader" DATE_FORMAT = "%Y%m%d" diff --git a/final_task/setup.py b/final_task/setup.py index f037459..2b3336b 100644 --- a/final_task/setup.py +++ b/final_task/setup.py @@ -5,7 +5,7 @@ setup( name="rss-reader_scarzdz", - version="0.2", + version="0.3", url="https://github.com/scarzdz/FinalTaskRssParser", license="MIT", author="Denis Marfonov", @@ -30,5 +30,5 @@ "console_scripts": [ "rss-reader=rss_reader.rss_reader:main", ], - }, + } ) From 7f63ba87db2517835418c0207ad4e184d232a1bc Mon Sep 17 00:00:00 2001 From: scarzdz Date: Mon, 25 Nov 2019 19:55:30 +0300 Subject: [PATCH 28/31] Make post-release to correct mistake --- final_task/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/final_task/setup.py b/final_task/setup.py index 2b3336b..0341368 100644 --- a/final_task/setup.py +++ b/final_task/setup.py @@ -5,7 +5,7 @@ setup( name="rss-reader_scarzdz", - version="0.3", + version="0.3.post1", url="https://github.com/scarzdz/FinalTaskRssParser", license="MIT", author="Denis Marfonov", From 4eda8c6fd11775ac16b2f6e2e53d0b5ecdc36b6b Mon Sep 17 00:00:00 2001 From: scarzdz Date: Mon, 25 Nov 2019 22:47:29 +0300 Subject: [PATCH 29/31] Add epub and html creating --- final_task/rss_reader/book_gen.py | 110 ++++++++++++++++++++++++++++ final_task/rss_reader/feed.py | 7 ++ final_task/rss_reader/rss_reader.py | 17 ++++- 3 files changed, 130 insertions(+), 4 deletions(-) create mode 100644 final_task/rss_reader/book_gen.py diff --git a/final_task/rss_reader/book_gen.py b/final_task/rss_reader/book_gen.py new file mode 100644 index 0000000..605c5e6 --- /dev/null +++ b/final_task/rss_reader/book_gen.py @@ -0,0 +1,110 @@ +import time +import os.path +import html + +from ebooklib import epub + +DATE_FORMAT = "%Y%m%d" + + +def _render_document(title, items, date_str): + html = [""] + h_title = f"{date_str}" + html.append(h_title) + html.append("") + + html.append("

") + html.append(title) + html.append("

") + + for i in items: + html.append(_render_item(i)) + + html = "".join(html) + return html + + +def _render_item(item): + html = ["
"] + title = item["title"] or "No Headline" + html.append(f"

{title}

") + date = time.strftime("%a, %d %b %Y %H:%M:%S", item["date"]) + html.append(f"

{date}

") + if item["link"] is not None: + link = item["link"] + html.append(f"

{link}

") + else: + html.append(f"

no link

") + description = item["description"] or "No description" + html.append(f"

{description}

") + html = "".join(html) + return html + + +def _gen_id(title: str, date:str) -> str: + """ + generate string for book id and file name + + :return: generated string + """ + string = title + "_" + date + string = string.lower() + for c in r'\|/:*?"<>': + string.replace(c, '_') + if len(string) > 122: + string = string[:121] + "..." + return string + + +def _create_html(book_id, bookpath, text): + # text = html.escape(text) + file = None + if os.path.isdir(bookpath): + file = open(os.path.join(bookpath, book_id + ".html"), "w", encoding="utf-8") + elif not os.path.exists(bookpath) or os.path.isfile(bookpath) or os.path.islink(bookpath): + file = open(bookpath, "w", encoding="utf-8") + try: + file.write(text) + file.close() + except (AttributeError, OSError): + pass + + +def _create_epub(book_title, book_id, bookpath, text): + + book = epub.EpubBook() + book.set_language('en') + book.set_identifier(book_id) + book.set_title(book_title) + + content = epub.EpubHtml(title='News', file_name='content.xhtml') + + content.set_content(text) + book.add_item(content) + + book.toc = (content,) + book.spine = [content] + book.add_item(epub.EpubNcx()) + book.add_item(epub.EpubNav()) + if os.path.isdir(bookpath): + epub.write_epub(os.path.join(bookpath, book_id+".epub"), book) + elif not os.path.exists(bookpath) or os.path.isfile(bookpath) or os.path.islink(bookpath): + epub.write_epub(bookpath, book) + + +def create_book(title, items, bookpath, date=None, *, html=False): + if date is not None: + date = time.strftime(DATE_FORMAT, date) + else: + date = "Latest" + + if title is None: + title = "No Title" + book_title = title + " - " + date + text = _render_document(book_title, items, date) + book_id = _gen_id(title, date) + + if html: + _create_html(book_id, bookpath, text) + else: + _create_epub(book_title, book_id, bookpath, text) diff --git a/final_task/rss_reader/feed.py b/final_task/rss_reader/feed.py index eb29605..99852e8 100644 --- a/final_task/rss_reader/feed.py +++ 
b/final_task/rss_reader/feed.py @@ -7,6 +7,7 @@ from . import html_to_text from .database import DB, DBError +from . import book_gen class FeedError(Exception): @@ -185,6 +186,12 @@ def render_json(self): s = json.dumps(feed_dict, indent=2) return s + def create_html(self, path): + book_gen.create_book(self.title, self.items, path, self.date, html=True) + + def create_epub(self, path): + book_gen.create_book(self.title, self.items, path, self.date) + @staticmethod def _try_fix_url(url): try: diff --git a/final_task/rss_reader/rss_reader.py b/final_task/rss_reader/rss_reader.py index a7e2dec..c0fa3c6 100644 --- a/final_task/rss_reader/rss_reader.py +++ b/final_task/rss_reader/rss_reader.py @@ -1,6 +1,7 @@ import argparse import logging import time +import os.path from . import feed @@ -21,11 +22,15 @@ def date_str(string): def main(): parser = argparse.ArgumentParser(description="Pure Python command-line RSS reader.", prog=PROG) + group = parser.add_mutually_exclusive_group() + group.add_argument("--json", action="store_true", help="Print result as JSON in stdout") + group.add_argument("--html", type=str, help="Generate html book on path", metavar='PATH') + group.add_argument("--epub", type=str, help="Generate epub book an path", metavar='PATH') parser.add_argument("source", help="RSS URL") parser.add_argument("--version", action="version", version=f"{parser.prog}s {__version__}", help="Print version info") - parser.add_argument("--json", action="store_true", help="Print result as JSON in stdout") + # parser.add_argument("--json", action="store_true", help="Print result as JSON in stdout") parser.add_argument("-v", "--verbose", action="store_true", help="Outputs verbose status messages") parser.add_argument("--limit", type=int, help="Limit news topics if this parameter provided") @@ -50,10 +55,14 @@ def main(): try: with feed.Feed(rss_url, limit, date=args.date) as rss_feed: - if not args.json: - print(rss_feed.render_text()) - else: + if args.json: 
print(rss_feed.render_json()) + elif args.epub is not None: + rss_feed.create_epub(args.epub) + elif args.html is not None: + rss_feed.create_html(args.html) + else: + print(rss_feed.render_text()) except feed.FeedError as e: logging.error(str(e)) From 665e65bdc1c80b4a5c965e6537e375d0454a17b7 Mon Sep 17 00:00:00 2001 From: scarzdz Date: Mon, 25 Nov 2019 22:49:00 +0300 Subject: [PATCH 30/31] Update README --- final_task/README.md | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/final_task/README.md b/final_task/README.md index 60adc59..1808c1b 100644 --- a/final_task/README.md +++ b/final_task/README.md @@ -29,7 +29,9 @@ $ python -m rss_reader ... ``` ### Usage ``` -usage: rss-reader [-h] [--version] [--json] [-v] [--limit LIMIT] source +usage: rss-reader [-h] [--json | --html PATH | --epub PATH] [--version] [-v] + [--limit LIMIT] [--date DATE] + source Pure Python command-line RSS reader. @@ -38,8 +40,10 @@ positional arguments: optional arguments: -h, --help show this help message and exit - --version Print version info --json Print result as JSON in stdout + --html PATH Generate html book on path + --epub PATH Generate epub book an path + --version Print version info -v, --verbose Outputs verbose status messages --limit LIMIT Limit news topics if this parameter provided --date DATE Load news with date (%Y%m%d) from cache, if this parameter From 8553cf45c22440c35c2b5b0d60db03f2ea86b260 Mon Sep 17 00:00:00 2001 From: scarzdz Date: Mon, 25 Nov 2019 22:53:29 +0300 Subject: [PATCH 31/31] Bump version number and add new requirement --- final_task/rss_reader/requirements.txt | 3 ++- final_task/rss_reader/rss_reader.py | 2 +- final_task/setup.py | 5 +++-- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/final_task/rss_reader/requirements.txt b/final_task/rss_reader/requirements.txt index 5c90ddd..ccccda4 100644 --- a/final_task/rss_reader/requirements.txt +++ b/final_task/rss_reader/requirements.txt @@ -1,3 +1,4 @@ lxml~=4.4.1 
feedparser~=5.2.1 -beautifulsoup4~=4.8.1 \ No newline at end of file +beautifulsoup4~=4.8.1 +ebooklib~=0.17.1 \ No newline at end of file diff --git a/final_task/rss_reader/rss_reader.py b/final_task/rss_reader/rss_reader.py index c0fa3c6..0d652fe 100644 --- a/final_task/rss_reader/rss_reader.py +++ b/final_task/rss_reader/rss_reader.py @@ -6,7 +6,7 @@ from . import feed # temp -__version__ = "0.3" +__version__ = "0.4" PROG = "rss-reader" DATE_FORMAT = "%Y%m%d" diff --git a/final_task/setup.py b/final_task/setup.py index 0341368..7ab1d7f 100644 --- a/final_task/setup.py +++ b/final_task/setup.py @@ -5,7 +5,7 @@ setup( name="rss-reader_scarzdz", - version="0.3.post1", + version="0.4", url="https://github.com/scarzdz/FinalTaskRssParser", license="MIT", author="Denis Marfonov", @@ -24,7 +24,8 @@ install_requires=[ "beautifulsoup4", "feedparser", - "lxml" + "lxml", + "ebooklib" ], entry_points={ "console_scripts": [