diff --git a/final_task/LICENSE b/final_task/LICENSE new file mode 100644 index 0000000..5450455 --- /dev/null +++ b/final_task/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2019 Denis Marfonov + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/final_task/README.md b/final_task/README.md index 7af281f..1808c1b 100644 --- a/final_task/README.md +++ b/final_task/README.md @@ -1,3 +1,104 @@ -# Your readme here -Some text. -Checkout how to write this file using *markdown*. +# RSS Reader +Cli-based RSS reader built with Python 3.8. +Supports all RSS standards, can handle incorrect RSS. +Also partially supports Atom feeds. 
+ +## Getting Started +### Prerequisites +- Python 3.8 +- feedparser, lxml, beautifulsoup4, ebooklib + +``` +pip install feedparser lxml beautifulsoup4 ebooklib +``` +### Installation +``` +pip install -i https://test.pypi.org/simple/ rss-reader-scarzdz +``` +Alternatively, download the source code and install it with: +``` +$ cd final_task +$ python setup.py install +``` +### Running +After installation, the `rss-reader` command is added to PATH. + +Alternatively, the application can be run from the source directory: +``` +$ cd final_task +$ python -m rss_reader ... +``` +### Usage +``` +usage: rss-reader [-h] [--json | --html PATH | --epub PATH] [--version] [-v] + [--limit LIMIT] [--date DATE] + source + +Pure Python command-line RSS reader. + +positional arguments: + source RSS URL + +optional arguments: + -h, --help show this help message and exit + --json Print result as JSON in stdout + --html PATH Generate html book on path + --epub PATH Generate epub book an path + --version Print version info + -v, --verbose Outputs verbose status messages + --limit LIMIT Limit news topics if this parameter provided + --date DATE Load news with date (%Y%m%d) from cache, if this parameter + provided +``` +## Behavior +RSS Reader can work in online or offline mode. + +In **online** mode, when the `--date` argument is not provided, the application loads and parses the RSS feed from the `source` argument. +This is done using the `feedparser` library. +Parsed news is saved in an **_sqlite database_**, which is located at `rss_reader/data/rss.sqlite`. +If an item contains _html_ markup, it is converted to plain text. + +In **offline** mode, when the `--date` argument is provided, +the application loads news with the specified feed link and date from the database. 
+ +News is printed to stdout in the following format: + +``` +Feed: *RSS feed title* + + +Title: *item 1 title* +Date: *%a, %d %b %Y %H:%M:%S +0000* +Link: https://example.com/link_to_item + +*Item description* + +Links: +[1]: *first link is always link to item* +[2]: Others can be links parsed from <a> or <img> tags + + +Title: *item 2 title* +Date: ... +``` + +News is converted to json like this: +``` +{ + "title": "*Feed title*", + "link": "*link to feed*", + "items": [ + { + "title": "*item 1 title*", + "date": *time.struct_time tuple*, + "link": "*link to item*", + "enclosure": *null* or *link to enclosure*, + "description": "*item description*", + "description_parsed": "*description parsed to plain text*" or *null* if description is text + }, + ... + ] +} +``` + +## License +This project is licensed under the MIT License - see the LICENSE file for details. diff --git a/final_task/__init__.py b/final_task/rss_reader/__init__.py similarity index 100% rename from final_task/__init__.py rename to final_task/rss_reader/__init__.py diff --git a/final_task/rss_reader/__main__.py b/final_task/rss_reader/__main__.py new file mode 100644 index 0000000..f8e010c --- /dev/null +++ b/final_task/rss_reader/__main__.py @@ -0,0 +1,3 @@ +from .rss_reader import main + +main() diff --git a/final_task/rss_reader/book_gen.py b/final_task/rss_reader/book_gen.py new file mode 100644 index 0000000..605c5e6 --- /dev/null +++ b/final_task/rss_reader/book_gen.py @@ -0,0 +1,110 @@ +import time +import os.path +import html + +from ebooklib import epub + +DATE_FORMAT = "%Y%m%d" + + +def _render_document(title, items, date_str): + html = [""] + h_title = f"{date_str}" + html.append(h_title) + html.append("") + + html.append("

") + html.append(title) + html.append("

") + + for i in items: + html.append(_render_item(i)) + + html = "".join(html) + return html + + +def _render_item(item): + html = ["
"] + title = item["title"] or "No Headline" + html.append(f"

{title}

") + date = time.strftime("%a, %d %b %Y %H:%M:%S", item["date"]) + html.append(f"

{date}

") + if item["link"] is not None: + link = item["link"] + html.append(f"

{link}

") + else: + html.append(f"

no link

") + description = item["description"] or "No description" + html.append(f"

# Characters replaced in generated ids because they are not valid in file names.
_FORBIDDEN_ID_CHARS = r'\|/:*?"<>'
# Upper bound for the generated id, including the trailing "..." marker.
_MAX_ID_LENGTH = 122


def _gen_id(title: str, date: str) -> str:
    """Generate the string used as the book id and default file name.

    The id is ``<title>_<date>`` in lower case with every character from
    ``_FORBIDDEN_ID_CHARS`` replaced by ``_``.  Ids longer than
    ``_MAX_ID_LENGTH`` are cut and terminated with ``...`` so that the
    result never exceeds the limit.

    :param title: feed title
    :param date: already formatted date (or a label such as ``Latest``)
    :return: generated string
    """
    string = (title + "_" + date).lower()
    for char in _FORBIDDEN_ID_CHARS:
        # str.replace returns a new string; the result has to be kept.
        string = string.replace(char, "_")
    if len(string) > _MAX_ID_LENGTH:
        string = string[:_MAX_ID_LENGTH - 3] + "..."
    return string


def _target_path(bookpath, book_id, extension):
    """Return the file to write for *bookpath*, or ``None`` if it is unusable.

    A directory receives a file named ``book_id + extension``; a missing
    path, a regular file or a symlink is used as the file name itself.
    """
    if os.path.isdir(bookpath):
        return os.path.join(bookpath, book_id + extension)
    if not os.path.exists(bookpath) or os.path.isfile(bookpath) or os.path.islink(bookpath):
        return bookpath
    return None


def _create_html(book_id, bookpath, text):
    """Write *text* as an html book; I/O problems are logged, not raised.

    :param book_id: file name (without extension) used when *bookpath* is a directory
    :param bookpath: target directory or file path
    :param text: complete html document
    """
    import logging  # local import: this module has no file-level logging import

    target = _target_path(bookpath, book_id, ".html")
    if target is None:
        logging.warning("Unable to write html book: %s is not a file or directory", bookpath)
        return
    try:
        # The context manager closes the file even when the write fails.
        with open(target, "w", encoding="utf-8") as file:
            file.write(text)
    except OSError as err:
        logging.warning("Unable to write html book to %s: %s", target, err)


def _create_epub(book_title, book_id, bookpath, text):
    """Build a single-chapter epub book from *text* and write it to *bookpath*.

    :param book_title: title stored in the book metadata
    :param book_id: book identifier, also the file name when *bookpath* is a directory
    :param bookpath: target directory or file path
    :param text: html content of the only chapter
    """
    import logging  # local import: this module has no file-level logging import

    book = epub.EpubBook()
    book.set_language('en')
    book.set_identifier(book_id)
    book.set_title(book_title)

    content = epub.EpubHtml(title='News', file_name='content.xhtml')
    content.set_content(text)
    book.add_item(content)

    book.toc = (content,)
    book.spine = [content]
    book.add_item(epub.EpubNcx())
    book.add_item(epub.EpubNav())

    target = _target_path(bookpath, book_id, ".epub")
    if target is None:
        logging.warning("Unable to write epub book: %s is not a file or directory", bookpath)
        return
    epub.write_epub(target, book)


def create_book(title, items, bookpath, date=None, *, html=False):
    """Render *items* into a book and store it at *bookpath*.

    :param title: feed title; ``None`` is replaced by ``"No Title"``
    :param items: feed items passed to the document renderer
    :param bookpath: target directory or file path
    :param date: ``time.struct_time`` of the requested day, or ``None`` for the latest news
    :param html: write an html file when true, an epub book otherwise
    """
    date = time.strftime(DATE_FORMAT, date) if date is not None else "Latest"
    if title is None:
        title = "No Title"

    book_title = title + " - " + date
    text = _render_document(book_title, items, date)
    book_id = _gen_id(title, date)

    if html:
        _create_html(book_id, bookpath, text)
    else:
        _create_epub(book_title, book_id, bookpath, text)
DATE_FORMAT = "%Y-%m-%d"
DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S"

DATA_FILE = "data/rss.sqlite"


class DBError(Exception):
    """Raised for every failure of the local sqlite cache."""


class DB:
    """Sqlite-backed cache of feeds and their items.

    The database file is ``DATA_FILE`` resolved relative to this package.
    The ``feeds`` and ``items`` tables are created on first use.
    """

    def __init__(self):
        """Open the cache and create the schema when it is missing.

        :raises DBError: if the database cannot be opened or initialised
        """
        path = pkg_resources.resource_filename(__name__, DATA_FILE)
        try:
            self.conn = sqlite3.connect(path)
        except sqlite3.Error as err:
            raise DBError(str(err)) from err

        self.cursor = self.conn.cursor()
        self.feed_id = None

        try:
            if not self._is_db_exist():
                self._create_db()
        except sqlite3.Error as err:
            # Callers only handle DBError; do not leak the raw sqlite error
            # or the half-initialised connection.
            self.conn.close()
            raise DBError(str(err)) from err

    def _is_db_exist(self):
        """Return True when both the ``feeds`` and ``items`` tables exist."""
        try:
            self.cursor.execute('''SELECT count(name) FROM sqlite_master
                WHERE type='table' AND (name='items' or name='feeds')''')
        except sqlite3.DatabaseError:
            return False
        # if the count is 2, then tables exists
        return self.cursor.fetchone()[0] == 2

    def _create_db(self):
        """Drop any existing cache tables and create an empty schema."""
        logging.info("Creating DB for local cache")
        self.cursor.execute("DROP TABLE IF EXISTS feeds")
        self.cursor.execute("DROP TABLE IF EXISTS items")
        self.cursor.execute('''
            CREATE TABLE feeds(
                id INTEGER PRIMARY KEY,
                title TEXT,
                link TEXT UNIQUE NOT NULL
            )
        ''')
        self.cursor.execute('''
            CREATE TABLE items(
                id INTEGER PRIMARY KEY,
                feed_id INTEGER NOT NULL,
                published TEXT NOT NULL,
                title TEXT,
                link TEXT,
                enclosure TEXT,
                description TEXT,
                description_parsed TEXT,
                FOREIGN KEY (feed_id) REFERENCES feeds(id)
            )
        ''')
        # The unique index lets "insert or ignore" skip items already cached.
        self.cursor.execute('''
            CREATE UNIQUE INDEX un_items ON items(published, title, link)
        ''')
        self.conn.commit()

    def get_feed(self, feed_link, req_date, limit=-1):
        """Load the cached items of a feed published on a given day.

        :param feed_link: link the feed was stored under
        :param req_date: ``time.struct_time`` of the requested day
        :param limit: maximum number of items; ``-1`` means no limit
        :return: ``dict(title=..., items=[...])`` or ``None`` if the feed is not cached
        :raises DBError: if a query fails
        """
        feed_info = self._get_feed_info_if_exists(feed_link)
        if feed_info is None:
            return None

        feed_id, feed_title = feed_info
        day = time.strftime(DATE_FORMAT, req_date)
        try:
            self.cursor.execute('''
                select i.title, i.published, i.link, i.enclosure, i.description, i.description_parsed
                from items i join feeds f on i.feed_id = f.id
                where f.id=(?) and date(i.published)=date(?) limit (?)
                ''', (feed_id, day, limit)
            )
        except sqlite3.Error as err:
            raise DBError("Error getting items from db") from err

        items = [
            dict(title=row[0], date=time.strptime(row[1], DATETIME_FORMAT), link=row[2],
                 enclosure=row[3], description=row[4], description_parsed=row[5])
            for row in self.cursor.fetchall()
        ]
        return dict(title=feed_title, items=items)

    def store_feed(self, link, title, items):
        """Store a feed (created on first sight) together with its items.

        :param link: feed link, used as the unique key of the feed
        :param title: feed title
        :param items: iterable of item dicts as produced by the feed parser
        :raises DBError: if the feed or one of the items cannot be written
        """
        feed_id = self._get_feed_id_if_exists(link)
        if feed_id is None:
            try:
                self.cursor.execute("insert into feeds(title, link) values (?, ?)", (title, link))
                feed_id = self.cursor.lastrowid
            except sqlite3.Error as err:
                self.conn.rollback()
                raise DBError("Error adding feed to db") from err
            self.conn.commit()
        self.feed_id = feed_id
        self._store_items(items)

    def close(self):
        """Close the underlying sqlite connection."""
        self.conn.close()

    def _store_items(self, items):
        """Insert *items* for ``self.feed_id``; already cached items are ignored."""
        for i in items:
            item_dict = dict(
                feed_id=self.feed_id,
                published=time.strftime(DATETIME_FORMAT, i["date"]),
                title=i["title"],
                link=i["link"],
                enclosure=i["enclosure"],
                description=i["description"],
                description_parsed=i["description_parsed"]
            )
            try:
                self.cursor.execute('''
                insert or ignore into items(feed_id, published, title, link, enclosure, description, description_parsed)
                values (:feed_id, :published, :title, :link, :enclosure, :description, :description_parsed)
                ''', item_dict)
            except sqlite3.Error as err:
                self.conn.rollback()
                raise DBError("Error adding item to db") from err
        self.conn.commit()

    def _get_feed_id_if_exists(self, feed_link):
        """Return the id of the feed stored under *feed_link*, or ``None``."""
        feed_info = self._get_feed_info_if_exists(feed_link)
        return feed_info[0] if feed_info is not None else None

    def _get_feed_info_if_exists(self, feed_link):
        """Return ``(id, title)`` of the feed stored under *feed_link*, or ``None``.

        :raises DBError: if the lookup query fails
        """
        try:
            self.cursor.execute("select feeds.id, feeds.title from feeds where feeds.link=?", (feed_link,))
        except sqlite3.Error as err:
            raise DBError("Error checking Feed in db") from err
        feed_info = self.cursor.fetchone()
        if feed_info is not None:
            return feed_info[0], feed_info[1]
        return None
class FeedError(Exception):
    """Base class for every error the feed layer reports to its caller."""


class URLFormatError(FeedError):
    """The source link could not be parsed as a URL."""


class FeedNotFoundError(FeedError):
    """The remote feed could not be reached or answered with an HTTP error."""


class IncorrectRSSError(FeedError):
    """A response was received but it does not look like a feed."""


class LocalCacheError(FeedError):
    """The local cache could not be opened, read or written."""


class Feed:
    """A feed loaded either from its remote source or from the local cache.

    Without ``date`` the feed is downloaded, parsed and saved to the cache.
    With ``date`` the items of that day are read from the cache instead.
    Instances can be used as context managers: the feed is loaded on entry
    and the cache connection is closed on exit.
    """

    def __init__(self, link, limit=0, *, date=None):
        """
        :param link: feed URL; a missing scheme defaults to ``https``
        :param limit: maximum number of items, values below 1 mean no limit
        :param date: ``time.struct_time`` selecting a cached day, or ``None``
        :raises URLFormatError: if *link* cannot be parsed
        """
        self.link = self._try_fix_url(link)
        logging.info(f'The link to the rss feed is "{self.link}"')
        self.title = None
        self.items = []
        self.limit = int(limit)
        self.date = date
        self.db = None

    def __enter__(self):
        try:
            self.load()
        except BaseException:
            # __exit__ is not called when __enter__ raises, so release the
            # cache connection here.
            self.close()
            raise
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def load(self):
        """Open the cache and load the feed from the remote source or the cache.

        :raises LocalCacheError: if the cache is required but unavailable
        :raises FeedError: for download and parsing problems
        """
        try:
            self.db = DB()
        except DBError as err:
            self.db = None
            if self.date is not None:
                raise LocalCacheError("Unable to open local cache") from err
            logging.warning("Unable to open local cache. Feed items will not be saved.")

        if self.date is None:
            self._parse_remote()
            self._save_to_cache()
        else:
            self._load_cache()

    def close(self):
        """Close the cache connection, if one was opened."""
        if self.db is not None:
            self.db.close()
            self.db = None

    def _parse_remote(self):
        """Download the feed with feedparser and fill ``title`` and ``items``.

        :raises FeedNotFoundError: if there is no HTTP status or it is >= 400
        :raises IncorrectRSSError: if the result has no title, link or entries
        """
        logging.info("Parsing feed from remote source")
        parsed_rss = feedparser.parse(self.link)
        # Did Feedparser access feed as remote and is the HTTP status ok
        if "status" not in parsed_rss or parsed_rss.status >= 400:
            raise FeedNotFoundError("Could not connect or find RSS feed")
        # The result must contain at least one main element of a feed.
        if (
            "title" not in parsed_rss.feed
            and "link" not in parsed_rss.feed
            and not parsed_rss.entries
        ):
            raise IncorrectRSSError("URL is not a correct RSS feed")

        logging.info("Feed successfully received")
        self.title = parsed_rss.feed.get("title") or parsed_rss.feed.get("link")

        if self.limit < 1 or self.limit > len(parsed_rss.entries):
            limit = len(parsed_rss.entries)
            logging.info("Parsing items without limit")
        else:
            limit = self.limit
            logging.info(f"Items limit is {limit}")

        self.items = []
        for i, entry in enumerate(parsed_rss.entries[:limit]):
            logging.info(f"Parsing item {(i+1)}")
            item = self._parse_remote_item(entry)
            if item is not None:
                self.items.append(item)
            else:
                logging.info("Skipping invalid item")

    def _parse_remote_item(self, entry):
        """Convert a feedparser entry into an item dict.

        :return: dict with ``date``, ``title``, ``link``, ``enclosure``,
            ``description`` and ``description_parsed`` keys, or ``None``
            when the entry has no parsed publication date
        """
        if "published_parsed" not in entry:
            return None

        item = dict()
        item["date"] = entry.published_parsed
        item["title"] = html.unescape(entry.title) if "title" in entry else None
        item["link"] = entry.get("link")

        # Keep only the enclosure URL: the item is later concatenated into
        # text and stored in a TEXT column, so it must be a plain string.
        enclosures = entry.enclosures
        item["enclosure"] = enclosures[0].get("href") if enclosures else None

        item["description_parsed"] = None
        if "description" in entry:
            item["description"] = entry.description
            if self._is_html(entry.description_detail.type):
                item["description_parsed"] = html_to_text.parse(item["description"], skip_link=item.get("link"))
                logging.info("Item description is html and therefore converted to plain text")
        else:
            item["description"] = None
        return item

    def _load_cache(self):
        """Fill ``title`` and ``items`` from the cache for ``self.date``.

        :raises LocalCacheError: if the cache query fails
        """
        logging.info("Loading feed from cache")
        limit = self.limit if self.limit > 0 else -1
        try:
            db_feed = self.db.get_feed(self.link, self.date, limit)
        except DBError as err:
            raise LocalCacheError("Unable to load items from cache") from err

        if db_feed is None:
            logging.warning("Feed with the specified link is not found in cache")
            return
        logging.info("Feed successfully loaded")
        self.title = db_feed["title"]
        self.items = db_feed["items"]

    def _save_to_cache(self):
        """Store the loaded feed in the cache; a no-op without a cache.

        :raises LocalCacheError: if the feed cannot be written
        """
        if self.db is None:
            return
        try:
            self.db.store_feed(self.link, self.title, self.items)
        except DBError as err:
            raise LocalCacheError("Unable to save feed to cache") from err
        logging.info("Feed saved to cache")

    def render_text(self):
        """Return the feed as plain text for printing to stdout."""
        logging.info("Generating plain text representation of feed")

        s = ["\nFeed: ", self.title or "no title", "\n"]
        if not self.items:
            s.append("\nno items to show\n")
        for item in self.items:
            s.append("\n\n")

            item_title = item["title"] or "no title"
            s.append("Title: " + item_title + "\n")

            item_date = time.strftime("%a, %d %b %Y %H:%M:%S +0000", item["date"])
            s.append("Date: " + item_date + "\n")

            item_link = item["link"] or "no link"
            s.append("Link: " + item_link + "\n")

            s.append("\n")
            if item["enclosure"] is not None:
                s.append("Enclosure: " + item["enclosure"] + "\n")
                s.append("\n")

            description = item["description_parsed"] or item["description"] or "no description"
            s.append(description)

        return "".join(s)

    def render_json(self):
        """Return the feed (title, link and items) as an indented JSON string."""
        logging.info("Generating json representation of feed")
        feed_dict = {"title": self.title, "link": self.link, "items": self.items}
        return json.dumps(feed_dict, indent=2)

    def create_html(self, path):
        """Write the feed as an html book to *path*."""
        book_gen.create_book(self.title, self.items, path, self.date, html=True)

    def create_epub(self, path):
        """Write the feed as an epub book to *path*."""
        book_gen.create_book(self.title, self.items, path, self.date)

    @staticmethod
    def _try_fix_url(url):
        """Normalise *url*: default to ``https`` and drop one trailing slash.

        A link without a scheme (``example.com/rss``) is parsed by
        ``urlsplit`` as a bare path; it is re-read as a network location so
        the result is ``https://example.com/rss`` rather than
        ``https:///example.com/rss``.

        :raises URLFormatError: if the link cannot be parsed
        """
        try:
            parts = urllib.parse.urlsplit(url, "https")
            if not parts.netloc and parts.path and parts.scheme in ("http", "https"):
                rest = urllib.parse.urlunsplit(
                    ("", "", parts.path.lstrip("/"), parts.query, parts.fragment)
                )
                parts = urllib.parse.urlsplit("//" + rest, parts.scheme)
        except ValueError as err:
            raise URLFormatError("Error in url format") from err

        result_url = urllib.parse.urlunsplit(parts)
        return result_url if not result_url.endswith('/') else result_url[:-1]

    @staticmethod
    def _is_html(element_type):
        """Return True if *element_type* is an html or xhtml content type."""
        return element_type in ["text/html", "application/xhtml+xml"]
def _handle_image(tag, refs):
    """Replace an ``img`` tag that has a source with a text placeholder.

    The image source is registered in *refs* and the tag becomes
    ``[image N: alt][N]``, where ``N`` is the reference index.  Tags
    without a ``src`` value are left untouched.
    """
    source = tag.attrs.get("src")
    if source:
        index = refs.add(source)
        alt_text = tag.attrs.get("alt", "no description")
        tag.replace_with(f"[image {index}: {alt_text}][{index}]")


def _handle_link(tag, link, refs):
    """Append a ``[N]`` reference marker after an ``a`` tag.

    Links without text or without an ``href`` are ignored, as are links
    that point to *link* (the item's own link).

    :param tag: the ``a`` tag being processed
    :param link: link that must not be referenced again, or ``None``
    :param refs: collection assigning indices to referenced links
    """
    # Read the attribute from ``attrs``: ``tag.href`` would look for a child
    # tag named "href" and never equal the link.
    href = tag.attrs.get("href")
    if not href or href == link or not tag.get_text():
        return
    index = refs.add(href)
    tag.insert_after(f"[{index}]")


def _create_refs_text(refs):
    """Return the ``Links:`` section listing *refs* numbered from 1."""
    lines = ["Links:\n"]
    lines.extend(f"[{number}]: {ref}\n" for number, ref in enumerate(refs, start=1))
    return "".join(lines)


def parse(html, skip_link=None):
    """Convert an html fragment to plain text followed by a list of links.

    Images become ``[image N: alt][N]`` placeholders and links receive a
    ``[N]`` marker; the referenced URLs are listed after the text.

    :param html: html markup to convert
    :param skip_link: link that always takes the first reference slot and
        is not marked again inside the text
    :return: plain text with a trailing ``Links:`` section
    """
    refs = LinksCollection()
    if skip_link is not None:
        refs.add(skip_link)

    soup = BeautifulSoup(html, features="lxml")
    for tag in soup.find_all(name=["img", "a"]):
        if tag.name == 'img':
            _handle_image(tag, refs)
        else:
            _handle_link(tag, skip_link, refs)

    return soup.get_text() + "\n\n" + _create_refs_text(refs)
class LinksCollection(Collection):
    """Insertion-ordered set of links, each with a 1-based index.

    Iteration yields the links in the order they were first added.
    """

    def __init__(self, initial=None):
        """
        :param initial: optional iterable of links to add right away
        """
        self.items_dict = dict()
        self._next_index_ = 1

        if initial is not None:
            for element in initial:
                self.add(element)

    def __len__(self) -> int:
        return len(self.items_dict)

    def __contains__(self, item: object) -> bool:
        return item in self.items_dict

    def __iter__(self):
        return iter(self.items_dict)

    def add(self, item):
        """Register *item* and return its index.

        A link that is already present keeps its original index.
        ``None`` is not stored.

        :return: index of *item*, or ``None`` when *item* is ``None``
        """
        if item is None:
            # Nothing is stored for None, so there is no index to look up.
            return None
        if item not in self.items_dict:
            self.items_dict[item] = self._next_index_
            self._next_index_ += 1
        return self.items_dict[item]
# temp
__version__ = "0.4"
PROG = "rss-reader"
DATE_FORMAT = "%Y%m%d"


def date_str(string):
    """Argparse type converter for the ``--date`` argument.

    :param string: date in ``DATE_FORMAT``
    :return: the parsed ``time.struct_time``
    :raises argparse.ArgumentTypeError: if *string* does not match the format
    """
    logging.info("Checking date argument")
    try:
        return time.strptime(string, DATE_FORMAT)
    except ValueError:
        raise argparse.ArgumentTypeError(f"Incorrect date format: {string}")


def main(argv=None):
    """Run the command-line RSS reader.

    :param argv: argument list to parse; ``None`` means ``sys.argv[1:]``
    """
    parser = argparse.ArgumentParser(description="Pure Python command-line RSS reader.", prog=PROG)
    group = parser.add_mutually_exclusive_group()
    group.add_argument("--json", action="store_true", help="Print result as JSON in stdout")
    group.add_argument("--html", type=str, help="Generate html book on path", metavar='PATH')
    group.add_argument("--epub", type=str, help="Generate epub book an path", metavar='PATH')

    parser.add_argument("source", help="RSS URL")
    parser.add_argument("--version", action="version", version=f"{parser.prog} {__version__}",
                        help="Print version info")
    parser.add_argument("-v", "--verbose", action="store_true", help="Outputs verbose status messages")
    parser.add_argument("--limit", type=int, help="Limit news topics if this parameter provided")

    # It is done because argparse treat '%' in parameters as old-style formatting
    date_format_escaped = DATE_FORMAT.replace("%", "%%")
    parser.add_argument("--date", type=date_str,
                        help=f"Load news with date ({date_format_escaped}) from cache, if this parameter provided")

    args = parser.parse_args(argv)

    rss_url = args.source
    log_level = logging.INFO if args.verbose else logging.WARNING
    limit = args.limit if args.limit is not None else 0

    # date_str() logs while arguments are parsed, which implicitly installs a
    # default root handler; force=True replaces it so that the format and
    # the --verbose level below actually take effect.
    logging.basicConfig(format="%(levelname)s:%(message)s", level=log_level, force=True)

    logging.info("Program starts")

    try:
        with feed.Feed(rss_url, limit, date=args.date) as rss_feed:
            if args.json:
                print(rss_feed.render_json())
            elif args.epub is not None:
                rss_feed.create_epub(args.epub)
            elif args.html is not None:
                rss_feed.create_html(args.html)
            else:
                print(rss_feed.render_text())
    except feed.FeedError as e:
        logging.error(str(e))

    logging.info("Program finishes")