python-experiments/crawl.py at master · nabaz/python-experiments · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import requests
import bs4
from bs4 import BeautifulSoup
from urlparse import urljoin
from urlparse import urlparse
from collections import Counter

def find_philosophy(url, visited_url=None, path_length=0):
    """(str, list, int) -> int
    Return path length or -1 if the `url` does not lead to
    philosophy page.

    Starting at `url`, repeatedly fetch the page and follow the link
    chosen by `find_link` until the Philosophy article is reached.

    url -- address of the page to start from.
    visited_url -- final urls of pages already followed; a fresh list is
        created when omitted. The list is appended to in place.
    path_length -- number of links already followed before `url`.

    -1 is returned when a page is reached a second time (loop), when a
    page has no usable link, or when the chosen link stays on the same
    page path.
    """
    if visited_url is None:
        visited_url = []
    philosophy_url = 'https://en.wikipedia.org/wiki/Philosophy'
    # Walk the chain iteratively: same steps as following one link per
    # recursive call, but without growing the call stack on long chains.
    while True:
        r = requests.get(url)
        # Compare against r.url (the address after redirects), not `url`.
        if r.url == philosophy_url:
            return path_length
        if r.url in visited_url:
            return -1
        link = find_link(r.text)
        # Check if the page contains link that meets the requirements.
        # `in` replaces dict.has_key(), which no longer exists in Python 3.
        if link is None or 'href' not in link.attrs:
            return -1
        new_url = urljoin(r.url, link.attrs['href'])
        if not differrent_path(r.url, new_url):
            return -1
        path_length += 1
        visited_url.append(r.url)
        url = new_url

def find_link(html_content):
    """(str) -> bs4.element.Tag or None
    Return the first `a` tag that is a direct child of a `p` tag and is
    not preceded, within that paragraph, by an unclosed parenthesis.

    Links nested inside other tags (for example italics) are never
    returned; only their text is taken into account for the parenthesis
    check. None is returned when no paragraph holds such a link.
    """
    document = BeautifulSoup(html_content, "html.parser")
    for paragraph in document.find_all('p'):
        # Text seen so far in this paragraph, used to decide whether
        # the next link sits inside parentheses.
        preceding_text = ''
        for node in paragraph:
            node_type = type(node)
            if node_type is bs4.element.Tag and node.name == 'a':
                if balanced_parenths(preceding_text):
                    return node
                # A link inside parentheses is skipped, text included.
                continue
            if node_type is bs4.element.NavigableString:
                preceding_text += node
            else:
                preceding_text += node.get_text()
    return None

def find_percentage(urls):
    """(list) -> int
    Return the percentage of pages that lead to philosophy.

    Every url in `urls` is followed with `find_philosophy`, each time
    starting from an empty visited list; any result other than -1 counts
    as a page that leads to philosophy. The percentage is rounded down
    to a whole number. An empty `urls` yields 0.
    """
    if not urls:
        # Nothing to sample; avoid dividing by zero below.
        return 0
    # n is the number of pages that lead to philosophy
    n = sum(1 for url in urls if find_philosophy(url, [], 0) != -1)
    # Floor division keeps the result an int on Python 2 and Python 3.
    return n * 100 // len(urls)

def random_percentage(m):
    """(int) -> int
    Return the percentage of pages that lead to `philosophy` for
    `m` pages, each starting from the Special:Random url."""
    random_page = 'https://en.wikipedia.org/wiki/Special:Random'
    # The same url is repeated `m` times.
    starting_points = [random_page for _ in range(m)]
    return find_percentage(starting_points)


def distribution(urls):
    """(list) -> list
    Return the path lengths needed to reach the `philosophy` page.

    Elements of `urls` are the starting urls. Each one is followed with
    `find_philosophy` from an empty visited list; starts that give -1
    are left out, so the result may be shorter than `urls`.
    """
    path_lengths = (find_philosophy(start, [], 0) for start in urls)
    return [length for length in path_lengths if length != -1]

def random_distribution(m):
    """(int) -> Counter
    Return path lengths mapped to their number of occurrences for `m`
    pages, each starting from the Special:Random url."""
    random_page = 'https://en.wikipedia.org/wiki/Special:Random'
    starting_points = [random_page for _ in range(m)]
    path_lengths = distribution(starting_points)
    return Counter(path_lengths)

def balanced_parenths(string):
    """(str) -> bool
    Return True if `string` holds as many '(' as ')' characters,
    otherwise False. Only the counts are compared, not the order."""
    opening = string.count('(')
    closing = string.count(')')
    return opening == closing

def differrent_path(old_url, new_url):
    """(str, str) -> bool
    Return True if the two urls have different path components.
    Used to make sure we are not staying on the same page."""
    old_path = urlparse(old_url).path
    new_path = urlparse(new_url).path
    return old_path != new_path

if __name__ == '__main__':
    # Number of random articles to sample.
    m = 5
    # Use random_percentage(m) here to print the success rate instead.
    # The call form of print runs on both Python 2 and Python 3; the
    # statement form is a syntax error on Python 3.
    print(random_distribution(m))