wall_fall/wall_fall.py at master · futureshit/wall_fall · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
#!/usr/bin/env python3

import argparse
import re
import requests
from bs4 import BeautifulSoup as bs

def sz_extractor(url):
    """Print the text of a paywalled sz (saechsische.de) article.

    Downloads ``url``, looks up the element whose ``class`` attribute is
    exactly ``article-content paywall`` and prints the whitespace-stripped
    text of every ``<p>`` found inside it, one paragraph per line.

    Args:
        url: Address of the article page to download.

    Raises:
        requests.RequestException: If the download fails, including when
            the server does not answer within the timeout.
    """
    import sys

    # Without a timeout, requests waits indefinitely on a stalled server.
    html_article = requests.get(url, timeout=30)
    bs_article = bs(html_article.text, 'html.parser')

    # find() returns None when the container is absent; chaining
    # .find_all() on that would die with an opaque AttributeError.
    content = bs_article.find(class_='article-content paywall')
    if content is None:
        print('No paywalled article content found on this page',
              file=sys.stderr)
        return

    # Original author's note: this could be simpler, but the article
    # markup is formatted inconsistently, so print paragraph by paragraph.
    for paragraph in content.find_all('p'):
        print(paragraph.getText(strip=True))


def _ld_json_article(raw):
    """Return ``(headline, article_body)`` from one JSON-LD text, or None.

    ``raw`` is the text content of a ``<script type="application/ld+json">``
    element. The decoded document is searched (through nested objects and
    arrays) for the first object that has a string ``articleBody``; its
    ``headline`` value is returned alongside (``None`` if it has none).
    Returns ``None`` when ``raw`` is not valid JSON or holds no such object.
    """
    import json

    try:
        # strict=False tolerates raw control characters (e.g. literal
        # newlines) inside string values.
        document = json.loads(raw, strict=False)
    except ValueError:
        return None

    pending = [document]
    while pending:
        node = pending.pop()
        if isinstance(node, dict):
            body = node.get('articleBody')
            if isinstance(body, str):
                return node.get('headline'), body
            pending.extend(node.values())
        elif isinstance(node, list):
            pending.extend(node)
    return None


def dnn_extractor(url):
    """Print the headline and text of a paywalled dnn (dnn.de) article.

    Downloads ``url`` and inspects every
    ``<script type="application/ld+json">`` element. The first one whose
    JSON contains an ``articleBody`` is used: its ``headline`` (if present)
    and ``articleBody`` are printed. Decoding the JSON, rather than cutting
    the value out with a regular expression, keeps articles that contain
    escaped quotes intact and turns escapes such as ``\\n`` or ``\\uXXXX``
    into the characters they stand for.

    Args:
        url: Address of the article page to download.

    Raises:
        requests.RequestException: If the download fails, including when
            the server does not answer within the timeout.
    """
    import sys

    # Without a timeout, requests waits indefinitely on a stalled server.
    html_article = requests.get(url, timeout=30)
    bs_article = bs(html_article.text, 'html.parser')

    for script in bs_article.find_all('script', type="application/ld+json"):
        found = _ld_json_article(script.text)
        if found is None:
            continue
        headline, article = found
        if headline is not None:
            print(headline)
        print(article)
        return

    # No script carried an articleBody: report it instead of failing on an
    # unbound variable.
    print('No article text found in the page metadata', file=sys.stderr)
    return

if __name__ == '__main__':
    # Command-line entry point: take one article URL and hand it to the
    # extractor that matches the site named in the URL.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        'url',
        metavar='URL',
        help='url of a sz or dnn article behind an paywall',
        type=str)

    args = parser.parse_args()

    # Raw strings: "\." in a normal string literal is an invalid escape
    # sequence, which newer Python versions warn about.
    dnn_pat = re.compile(r"dnn\.de")
    sz_pat = re.compile(r"saechsische\.de")

    # search() is enough here; only whether the site name occurs matters.
    if dnn_pat.search(args.url):
        dnn_extractor(args.url)
    elif sz_pat.search(args.url):
        sz_extractor(args.url)
    else:
        print('Neither a sz, or a dnn link')