-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathwall_fall.py
More file actions
executable file
·52 lines (43 loc) · 1.61 KB
/
wall_fall.py
File metadata and controls
executable file
·52 lines (43 loc) · 1.61 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
#!/usr/bin/env python3
import argparse
import json
import re

import requests
from bs4 import BeautifulSoup as bs
# Takes the URL of a paywalled SZ article and prints the text behind the paywall.
def sz_extractor(url):
    """Print the text of a paywalled SZ (saechsische.de) article.

    Downloads ``url``, looks up the element matched by
    ``class_='article-content paywall'`` and prints the whitespace-stripped
    text of every ``<p>`` inside it, one paragraph per line.

    Args:
        url: Address of the article page to download.

    Raises:
        ValueError: If the page contains no ``article-content paywall``
            element (e.g. the site layout changed or the URL is not an
            article).

    Errors raised by ``requests`` while downloading propagate unchanged.
    """
    # Without a timeout requests waits indefinitely on a stalled server.
    response = requests.get(url, timeout=30)
    soup = bs(response.text, 'html.parser')

    container = soup.find(class_='article-content paywall')
    if container is None:
        # find() returns None when nothing matches; fail with a message that
        # names the problem instead of an AttributeError on NoneType.
        raise ValueError(
            "no 'article-content paywall' element found at {}".format(url))

    # Paragraphs are stripped and printed one by one; per the original
    # author's note the article markup is formatted too inconsistently for a
    # simpler extraction.
    for paragraph in container.find_all('p'):
        print(paragraph.getText(strip=True))
# Takes the URL of a paywalled DNN article and prints the text behind the paywall.
def dnn_extractor(url):
    """Print the headline and body of a paywalled DNN (dnn.de) article.

    Downloads ``url`` and scans its ``<script type="application/ld+json">``
    tags for the first one whose content holds an ``"articleBody"`` string.
    The first ``"headline"`` string of that same script (if any) is printed,
    followed by the article body.

    Args:
        url: Address of the article page to download.

    Raises:
        ValueError: If no JSON-LD script on the page contains an
            ``articleBody`` string.

    Errors raised by ``requests`` while downloading propagate unchanged.
    """
    # The value patterns are escape-aware: they consume ``\"`` (and any other
    # backslash escape) as part of the string instead of stopping at the
    # first quote character, and they tolerate whitespace around the colon.
    headline_pattern = re.compile(
        r'"headline"\s*:\s*"((?:[^"\\]|\\.)*)"', re.DOTALL)
    article_pattern = re.compile(
        r'"articleBody"\s*:\s*"((?:[^"\\]|\\.)*)"', re.DOTALL)

    # Without a timeout requests waits indefinitely on a stalled server.
    response = requests.get(url, timeout=30)
    soup = bs(response.text, 'html.parser')

    for script in soup.find_all('script', type="application/ld+json"):
        payload = script.text
        article_match = article_pattern.search(payload)
        if article_match is None:
            continue

        headline_match = headline_pattern.search(payload)
        if headline_match is not None:
            print(_decode_json_string(headline_match.group(1)))
        print(_decode_json_string(article_match.group(1)))
        return

    # Reaching this point means no script matched; report that explicitly
    # rather than failing later on an unbound variable.
    raise ValueError(
        "no JSON-LD script with an 'articleBody' found at {}".format(url))


def _decode_json_string(raw):
    """Decode the inside of a JSON string literal (quotes not included).

    Turns JSON escapes such as ``\\"``, ``\\n``, ``\\/`` and ``\\uXXXX`` into
    the characters they stand for. If ``raw`` is not a valid JSON string body
    it is returned unchanged, so malformed page data is still printed.
    """
    try:
        # strict=False accepts raw control characters (e.g. literal newlines)
        # inside the string, which strict JSON parsing would reject.
        return json.loads('"' + raw + '"', strict=False)
    except ValueError:
        return raw
if __name__ == '__main__':
    # Command-line entry point: take one article URL and hand it to the
    # extractor that belongs to the site named in the URL.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        'url',
        metavar='URL',
        help='url of a sz or dnn article behind an paywall',
        type=str)
    args = parser.parse_args()

    # Raw strings: in a plain literal "\." is an invalid escape sequence,
    # which newer Python versions report with a warning at compile time.
    dnn_pat = re.compile(r"(dnn\.de)")
    sz_pat = re.compile(r"(saechsische\.de)")

    # search() is enough for a yes/no test; the URL is matched anywhere, not
    # only in the host part.
    if dnn_pat.search(args.url):
        dnn_extractor(args.url)
    elif sz_pat.search(args.url):
        sz_extractor(args.url)
    else:
        print('Neither a sz, or a dnn link')