Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
.vscode/
__pycache__/
jsonout.txt
*.pyc
.mypy_cache
build/
dist/
rss_reader.egg-info/
*.json
*.pkl
*.pdf
*.html
temp-img-2250147051588681835.jpg
final_task/tests/xml/no_items_feed.xml
final_task/tests/xml/no_items_fields.xml
.coverage
.coveragerc
1 change: 1 addition & 0 deletions final_task/MANIFEST.in
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
include rss_reader/Arial-Unicode-Regular.ttf
93 changes: 90 additions & 3 deletions final_task/README.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,90 @@
# Your readme here
Some text.
Checkout how to write this file using *markdown*.
# Python RSS parser
***
Yet another RSS parser
***
# Quick start

## Usage
usage: rss_reader.py [-h] [--version] [--json] [--verbose] [--limit LIMIT] [--date DATE] [source]

RRS feed receiver

positional arguments:
source URL for RSS feed

optional arguments:
-h, --help show this help message and exit
--version prints version
--json converts news to JSON
--verbose output verbose status messages
--limit LIMIT determines the number of showed news.
--date DATE shows cached news at given date
--to_pdf TO_PDF converts news to PDF.
--to_html TO_HTML converts news to HTML.

TO_PDF/TO_HTML - path to directory for file
The file name has the format feed-*current datetime*.*extension*
Comment on lines +8 to +26

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems you forgot to add --colorize here.


If no news is found when `--date` is used together with `--to_pdf` or `--to_html`, the conversion does not happen.

## Installation
1. Install setuptools

pip install setuptools
2. Download source code
3. Unpack downloaded *.zip
4. Go to `FinalTaskRssParser-master/final_task`
5. In terminal execute:

python setup.py sdist
6. Go to `/dist` directory
7. Execute `pip install rss_reader-1.4.tar.gz`

Done!
To see help use

rss-reader --help

## JSON format
{
"description": "description",
"link": "link",
"news_list": [
news_item,
news_item,
news_item,
...
],
"title": "title"
}

news_item is represented as:

{
"date": "date",
"description": "description",
"img": "base64",
"link": "link",
"media": "media",
"published": "published",
"source": "source",
"title": "title"
}
The Base64 string is quite long, so it is shortened to `"base64"` when printed, but it is stored as a valid string in memory and in the cache.
## Caching
TinyDB is used for caching.

Items are stored in json format.
News are stored in db.json
##### Database item format
"id": {
"date": "date",
"img": "base64_representation_of_an_image",
"description": "description",
"link": "link",
"media": "media",
"published": "published",
"source": "source",
"title": "title"
},
`date` is stored in format `yyyy%mm%dd`
Binary file added final_task/rss_reader/Arial-Unicode-Regular.ttf
Binary file not shown.
Empty file.
191 changes: 191 additions & 0 deletions final_task/rss_reader/converters.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,191 @@
from pkg_resources import resource_filename
import datetime
import logging
import os
import warnings
import platform

import requests
from dominate import document
from dominate.tags import h1, h3, h5, p, a, div, img
from fpdf import FPDF

from exceptions_ import ConvertionError

FONT_BLACK = resource_filename(__name__, 'Arial-Unicode-Regular.ttf')

LOGGER = logging.getLogger('rss_logger')


def path_validation(path, mode):
    '''
    Builds the full path of the file to export news to

    path - directory the file is going to be created in
    mode - boolean to determine file extension
        True - .html
        False - .pdf

    Returns absolute path of the directory joined with the file name
    feed-<current datetime><extension>
    Raises ConvertionError if the path is not an existing directory
    or if the OS is not Linux, macOS or Windows
    '''
    extension = '.html' if mode else '.pdf'
    directory = os.path.abspath(path)
    LOGGER.debug('CHECKING PATH...')
    # The file is created inside of the path, so an existing regular
    # file is as wrong here as a missing directory
    if not os.path.isdir(directory):
        raise ConvertionError('Wrong path')
    LOGGER.debug('PATH IS OK')

    system = platform.system()
    file_name = 'feed-' + str(datetime.datetime.now()) + extension
    if system == 'Windows':
        # ':' is not allowed in Windows file names. Only the file name
        # is sanitized, so the drive letter and directory names that
        # contain spaces stay untouched
        file_name = file_name.replace(' ', '_').replace(':', '-')
    elif system not in ('Linux', 'Darwin'):
        raise ConvertionError('Unknown OS. Try Windows or UNIX/XNU')
    return os.path.join(directory, file_name)


def _base64_text(raw_image):
    '''
    Returns base64 payload of an image as plain text

    Bytes (and strings that hold a bytes repr like b'payload') lose
    the b'...' wrapper; a string that already is a plain payload
    is returned unchanged instead of being cut
    '''
    text = raw_image if isinstance(raw_image, str) else str(raw_image)
    if text.startswith(("b'", 'b"')) and text[-1] == text[1]:
        return text[2:-1]
    return text


def get_html_doc(news_list):
    '''
    Converts news to .html

    news_list - is a list of dicts. Keys read from every dict:
    title, img, description, published, source, link

    Returns the html document as a string
    '''
    LOGGER.debug('CONVERTING TO HTML')
    with document(title='RSS FEED') as doc:
        h1('News:')
        for news_item in news_list:
            with div():
                h3(news_item['title'])
                h5('IMAGE')
                LOGGER.debug('PROCESSING IMAGE')
                if news_item['img'] is None:
                    p('NO IMAGE')
                else:
                    # Image is embedded into the page as a base64 data URI
                    img(src='data:image/png;base64, ' + _base64_text(news_item['img']))
                LOGGER.debug('DONE')
                h5('DESCRIPTION: ')
                if not news_item['description']:
                    p('NO DESCRIPTION')
                else:
                    p(news_item['description'])
                p(news_item['published'])
                p('SOURCE: ' + news_item['source'])
                a('LINK', href=news_item['link'])
    return str(doc)


def to_html(path, item_list):
    '''
    Exports news to an .html file

    path - directory the file is going to be created in
    item_list - list of news dicts, passed to get_html_doc

    ConvertionError raised by path_validation is passed to the caller
    '''
    # No try/except here: catching ConvertionError just to raise it
    # again does nothing
    file_path = path_validation(path, True)
    # Not named "document" to keep dominate's document unshadowed
    html_text = get_html_doc(item_list)
    LOGGER.debug('WRITING .html')
    with open(file_path, 'w', encoding='utf-8') as html_file:
        html_file.write(str(html_text))


def get_image_path(url, timeout=10):
    '''
    Downloads an image to a temp file and returns path to the file

    FPDF can't handle image in base64, so the function takes
    the image from the source

    url - address of the image
    timeout - seconds to wait for the server before giving up

    The temp-img*.jpg file is created in the current working directory
    and has to be removed by the caller
    If the image can't be downloaded requests raises its exception
    (requests.ConnectionError, requests.Timeout, ...) which is handled
    in Image adding section
    '''
    LOGGER.debug('GETTING IMAGE FROM URL...')
    temp_img = 'temp-img' + str(hash(url)) + '.jpg'
    # requests waits forever by default, so a silent server would
    # freeze the whole export without the timeout
    content = requests.get(url, timeout=timeout).content
    with open(temp_img, 'wb') as img_out:
        img_out.write(content)
    LOGGER.debug('DONE')
    return temp_img


def _add_pdf_image(pdf, news_item):
    '''
    Writes the image of a news item (or a message instead of it) to pdf
    '''
    LOGGER.debug('IMAGE ADDING')
    if news_item['img'] is None:
        LOGGER.debug('IMAGE IS NONE')
        pdf.set_x(20)
        pdf.cell(20, 6, 'No image', ln=1)
        pdf.set_x(4)
        return

    # Stays None when downloading fails, so cleanup knows there is
    # no temp file to remove
    img_path = None
    try:
        img_path = get_image_path(news_item['media'])
        pdf.image(img_path, x=20)
    except (requests.Timeout, requests.TooManyRedirects, requests.ConnectionError) as exc:
        print(str(exc))
        pdf.set_x(20)
        pdf.multi_cell(150, 6, str(exc))
    except Exception as exc:
        print(str(exc))
        pdf.set_x(20)
        pdf.multi_cell(150, 6, 'Image error')
    finally:
        # Temp file is removed on success and on failure
        if img_path is not None and os.path.exists(img_path):
            os.remove(img_path)


def _add_pdf_item(pdf, news_item):
    '''
    Writes one news item (title, image, description, link, source) to pdf
    '''
    pdf.set_font("ArialUni", '', size=12)
    pdf.set_x(4)
    pdf.cell(20, 6, 'Title:', ln=1)
    pdf.set_x(20)
    pdf.multi_cell(150, 5, news_item['title'])
    pdf.set_x(4)
    pdf.cell(20, 6, 'Image:', ln=1)
    _add_pdf_image(pdf, news_item)
    pdf.set_x(4)
    pdf.cell(20, 6, 'Description:', ln=1)
    pdf.set_x(20)
    # Title is printed when there is no description
    if not news_item['description']:
        pdf.multi_cell(150, 5, news_item['title'])
    else:
        pdf.multi_cell(150, 5, news_item['description'])
    pdf.set_x(4)
    pdf.cell(20, 6, 'LINK', link=news_item['link'], ln=1)
    pdf.set_x(4)
    pdf.cell(20, 6, 'Source: ' + news_item['source'], link=news_item['source'], ln=1)
    # Separator between news items
    pdf.cell(0, 10, '='*85, align='C', ln=1)


def get_pdf_doc(news_list):
    '''
    Converts news to .pdf

    news_list - is a list of dicts. Keys read from every dict:
    title, img, media, description, link, source

    Returns FPDF object with all the news written to it
    The document is not saved here
    '''
    LOGGER.debug('CONVERTING TO PDF')
    pdf = FPDF(format='A4')
    LOGGER.debug('SETIING FONTS')
    pdf.add_font("ArialUni", style="", fname=FONT_BLACK, uni=True)
    pdf.add_font("ArialUni", style='B', fname=FONT_BLACK, uni=True)
    pdf.set_font("ArialUni", 'B', size=24)
    pdf.add_page()
    pdf.set_xy(0, 0)
    pdf.cell(50, 30, txt='News Feed:', ln=1, align='L')
    for news_item in news_list:
        _add_pdf_item(pdf, news_item)
    return pdf


def to_pdf(path, news_list):
    '''
    Exports news to a .pdf file and prints path to the file

    path - directory the file is going to be created in
    news_list - list of news dicts, passed to get_pdf_doc

    Raises ConvertionError if the path is not valid or if the
    document can't be written
    '''
    # ConvertionError raised by path_validation is passed to the
    # caller as is
    file_path = path_validation(path, False)
    pdf = get_pdf_doc(news_list)
    LOGGER.debug('SAVING .pdf')
    print(file_path)
    try:
        # Warnings raised while writing the document are hidden
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            pdf.output(file_path)
    except OSError as exc:
        raise ConvertionError('Wrong path') from exc
    except Exception as exc:
        # Original error is kept as the cause to make debugging possible
        raise ConvertionError('News contain unsupported characters. Stop exporting') from exc
    LOGGER.debug('DONE!')
13 changes: 13 additions & 0 deletions final_task/rss_reader/exceptions_.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
class FeedError(Exception):
    '''
    Base error of the package, raised when something is wrong with feed

    InvalidArgs and ConvertionError are derived from it
    '''


class InvalidArgs(FeedError):
    '''
    FeedError subclass

    NOTE(review): presumably raised for invalid command line
    arguments - no usage is visible here, confirm against the callers
    '''


class ConvertionError(FeedError):
    '''
    Raise when news can't be exported to a file
    (wrong path, unknown OS, document can't be written)
    '''
9 changes: 9 additions & 0 deletions final_task/rss_reader/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
feedparser==5.2.1
argparse==1.4.0
jsonpickle==1.2
tinydb==3.15.1
requests==2.22.0
dominate==2.4.0
beautifulsoup4==4.8.1
fpdf==1.7.2
colorama==0.4.1
Loading