Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
*.pyc
*.idea'.idea'
Comment thread
HenadziStantchik marked this conversation as resolved.
*.db
*.pkl
*.cw127.pkl
*.egg-info
final_task/FinalTaskRssParser.egg-info
final_task/dist
*.log

136 changes: 0 additions & 136 deletions Final_Task/FinalTask.md

This file was deleted.

3 changes: 0 additions & 3 deletions Final_Task/README.md

This file was deleted.

34 changes: 32 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,2 +1,32 @@
# FinalTaskRssParser
For final task pull requests.
# That's how it works

* Creating an RSSParser object (the class defined in rss_read.py)
* Using feedparser to get a page with function parse
* Then using output functions get info from the page
* The info (source link, image link, etc.) of every novelty is packed into a Novelty class object
* Create a pack of news filled with novelty class objects
* When a pack of news is done come back to rss_reader.py
* Here we prepare to output info according to arguments from console and write down information into DB
* If there is a '--to-pdf' or '--to-html' argument (or both) in console we use functions
from pdf_and_html_converting to:
1. Get some images (to avoid many copies of pictures we first of all delete images
folder if it exists)
2. Add them into PDF or/and html file
3. Add all other information

* If there is also '--date %Y%m%d' in console with '--to-pdf' or/and '--to-html' we write down into the

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would be nice to have some info about how to install your app, and also more info about the way you store news entries locally.

pdf or/and html file(s) news for that date.
* If there is '--date %Y%m%d' in console we take news with that date from our DB. If there is also
a '--limit N' argument, we take only N of those news from our DB.
* If in addition to '--to-pdf' or/and '--to-html' and '--date %Y%m%d' there is '--limit N' we write down
N news with that date to the pdf or/and html file(s)
* If '--colorize' is in console args then we colorize our news in random colors. If there is no '--colorize'
we use usual color (grey-white)
## Important!
When converting to pdf or html, enter your path like this: "C:\\Test\\" or "C:\\Test"

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What about Linux OS?


When passing arguments to parse a page, put the link first, EXAMPLE:
python rss_reader.py https://bla-bla-bla.by --limit 1

If you don't want to enter a link and want to get the news stored in local storage, enter for EXAMPLE
something like this: python rss_reader.py --colorize --limit 15
Empty file added rss_task/__init__.py
Empty file.
14 changes: 14 additions & 0 deletions rss_task/rss_reader/Classes/novelty.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
from dataclasses import dataclass


@dataclass
class Novelty:
    """
    One news item ("novelty") with everything the reader keeps about it.

    Field order is part of the interface: instances are built positionally.
    """

    number_of_novelty: int  # ordinal of the item inside the pack it belongs to
    title_of_novelty: str
    time_of_novelty: str
    source_link: str
    description: str
    images_links: str
    alt_text: str
    date_corrected: str  # compared as text with the value given after --date
    main_source: str  # compared with the feed URL when stored news are filtered
163 changes: 163 additions & 0 deletions rss_task/rss_reader/Classes/rss_read.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
import feedparser
from output_functions import getting_full_info, getting_pack_of_news, converting_to_json, \
writing_to_cache, getting_from_database_to_pack
from pdf_and_html_converting import converting_to_pdf, converting_to_html, pdf_path, html_path
import logging
import re


class RSSParser:
    """
    Fetch an RSS feed and send its news to the console, JSON, PDF or HTML.

    The constructor only stores its three parameters; call ``parse`` to do the work.

    Attributes:
        feed_url: address of the RSS feed; ``None`` is treated by ``news_for_date``
            as "any source".
        number: the ``--limit`` value (how many news to take), or ``None``.
        list_of_args: raw console arguments; flags such as ``--date`` or
            ``--to-pdf`` are looked up in this list.
    """

    def __init__(self, param_url, num_of_news=None, list_of_args=None):
        self.feed_url = param_url
        self.number = num_of_news
        # Every method tests "'--flag' in self.list_of_args"; that raises TypeError on None.
        self.list_of_args = [] if list_of_args is None else list_of_args

    def parse(self):
        """
        Fetch the feed and produce the output requested by the console arguments.

        1. Use feedparser to get the page.
        2. If feedparser marks the result with ``bozo`` or a ConnectionError is raised,
           fall back to the news stored for ``--date`` (when that flag is given).
        3. Build the pack of news and write it to ``news_cache.txt``.
        4. Export to HTML and/or PDF when ``--to-html`` / ``--to-pdf`` are given,
           otherwise print the news to the console (plain or JSON).
        """
        args = self.list_of_args
        # Stays None only when feedparser.parse() itself raises ConnectionError.
        the_feed = None
        try:
            logging.info("Trying to get page from feedparser!")
            the_feed = feedparser.parse(self.feed_url)
            logging.info("Got it (the page)!")
            if the_feed.get('bozo'):
                if '--date' in args:
                    self._output_news_for_date(the_feed)
                else:
                    logging.info("Got some problems due to connection!")
        except ConnectionError:
            logging.critical("CONNECTION ERROR, HELP!")
            print("You have some connection problems!")
            if '--date' in args:
                # NOTE(review): the_feed may be None here; the console helpers
                # must tolerate a missing feed - confirm.
                self._output_news_for_date(the_feed)

        if the_feed is None:
            # No feed was fetched, so there is nothing to build a pack of news from.
            # (Previously this path always ended in UnboundLocalError.)
            return

        logging.info("Getting pack of news!")
        pack_of_news, pack_of_news_for_db = getting_pack_of_news(the_feed, self.feed_url,
                                                                 args, self.number)
        logging.info("Got pack of news!")
        logging.info("Writing news from source and DB to file!")
        writing_to_cache(pack_of_news, pack_of_news_for_db, 'news_cache.txt')
        logging.info("News are in the file!")

        if '--to-html' in args:
            path_html = html_path(args)
            converting_to_html(path_html, self.news_for_date() if '--date' in args else pack_of_news)
        if '--to-pdf' in args:
            path_pdf = pdf_path(args)
            converting_to_pdf(path_pdf, self.news_for_date() if '--date' in args else pack_of_news)

        if '--to-pdf' not in args and '--to-html' not in args:
            if '--date' in args and '--json' not in args:
                logging.info("Getting full info!")
                getting_full_info(the_feed, self.news_for_date(), args)
                logging.info("Got full info!")
            else:
                logging.info("Getting full info!")
                if not the_feed.get('bozo') and '--json' not in args:
                    getting_full_info(the_feed, pack_of_news, args)
                logging.info("Got full info!")

            # NOTE(review): JSON output is kept inside the console-only branch;
            # confirm it is not meant to run together with --to-pdf / --to-html.
            if '--json' in args:
                news = self.news_for_date() if '--date' in args else pack_of_news
                print("\nJSON VIEW OF NEWS:", converting_to_json(news, the_feed))

    def _output_news_for_date(self, the_feed):
        """Send the stored news for ``--date`` to PDF, else HTML, else the console."""
        args = self.list_of_args
        if '--to-pdf' in args:
            path_pdf = pdf_path(args)
            converting_to_pdf(path_pdf, self.news_for_date())
        elif '--to-html' in args:
            path_html = html_path(args)
            # Fixed: one copy of this branch called converting_to_pdf with the HTML path.
            converting_to_html(path_html, self.news_for_date())
        else:
            logging.info("Getting news for date!")
            news = self.news_for_date()
            if '--json' not in args:
                getting_full_info(the_feed, news, args)
            # NOTE(review): the JSON view is printed regardless of --json - confirm intent.
            print("\nJSON VIEW OF NEWS:", converting_to_json(news, the_feed))
            logging.info("Got news for date!")

    def news_if_not_source(self, the_feed):
        """Print the news built for the database when no console argument contains 'http'."""
        # Case-insensitive search over all arguments at once; '|' only separates them.
        if not re.search(r'http', '|'.join(self.list_of_args), flags=re.I):
            _, pack_db = getting_pack_of_news(the_feed, self.feed_url, self.list_of_args, self.number)
            getting_full_info(the_feed, pack_db, self.list_of_args)

    def news_for_date(self):
        """
        Collect stored news whose date equals the value given after ``--date``.

        A novelty is taken when its ``date_corrected`` matches the date and its
        ``main_source`` equals ``feed_url``. When ``feed_url`` is None, every novelty
        with that date is taken and renumbered from 1. With ``--limit`` the search
        stops once ``self.number`` news have been found.

        Returns:
            list: the matching novelties; an empty list when nothing matched or when
            no value follows ``--date`` (a hint is printed in both cases).

        Raises:
            ValueError: if ``--date`` is not among the arguments at all.
        """
        args = self.list_of_args
        # Only the date lookup is guarded, so an unrelated IndexError is not mislabelled.
        try:
            date_needed = args[args.index('--date') + 1]
        except IndexError:
            print("You forgot to enter date in format %Y%m%d")
            return []

        limited = '--limit' in args
        stored_news = getting_from_database_to_pack()
        news_for_date_needed = []

        for item in stored_news:
            if str(item.date_corrected) == date_needed and self.feed_url == item.main_source:
                news_for_date_needed.append(item)
                if limited and len(news_for_date_needed) == self.number:
                    break

        if self.feed_url is None:
            found = 0
            for item in stored_news:
                if str(item.date_corrected) == date_needed:
                    found += 1
                    item.number_of_novelty = found
                    news_for_date_needed.append(item)
                # Checked on every iteration, exactly as the original loop did.
                if limited and found == self.number:
                    break

        if not news_for_date_needed:
            if limited:
                print("No news have been found for this date with your limits!")
            elif 'source' in args:
                print("No news have been found for your source")
            else:
                print("No news have been found for this date!")
        return news_for_date_needed

Binary file added rss_task/rss_reader/DejaVuSans.ttf
Binary file not shown.
Loading