HTMLParser/parser.py at master · onyazuka/HTMLParser · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
from html.parser import HTMLParser
import http.client
from collections import namedtuple
from dom import *
from connection import *


# Input modes accepted by HTMLDomParser:
#   RAW - the content argument is the HTML text itself
#   URL - the content argument is a URL whose contents are fetched first
PARSER_MODE = dict(RAW=0, URL=1)


class HTMLDomParser(HTMLParser):
    """Build a DOM tree out of HTML using the ``HTMLParser`` callbacks.

    The parser keeps a stack of the nodes that are currently open.
    ``stack[0]`` is the ``HTMLDocument`` created in ``__init__``; each start
    tag creates an ``HTMLDomElement`` that is appended to the node on top of
    the stack and (unless the tag is a void element) pushed onto it, so that
    the content that follows becomes its children.  Once the input has been
    consumed the sibling links of the whole tree are filled in.
    """

    # Tags that are always empty (void elements).  They never get children,
    # so they are attached to their parent but never left on the stack.
    EMPTY_TAGS = [
        "area",
        "base",
        "basefont",
        "br",
        "col",
        "embed",
        "frame",
        "hr",
        "img",
        "input",
        "isindex",
        "link",
        "meta",
        "param",
        "source",
        "track",
        "wbr",
    ]

    def __init__(self, mode, content, connection=None):
        """Parse ``content`` and build the document tree.

        mode       -- one of the PARSER_MODE values
        content    -- the HTML text when mode is PARSER_MODE["RAW"],
                      otherwise a URL that is handed to _fromUrl()
        connection -- optional Connection instance, only used when the
                      content is fetched by URL
        """
        assert mode in PARSER_MODE.values(), \
               "HTMLDomParser mode invalid"
        super().__init__()
        if mode == PARSER_MODE["RAW"]:
            rawHtml = content
        else:
            rawHtml = self._fromUrl(content, connection)
        # stack[0] is always a document element
        self.stack = [HTMLDocument()]
        self.feed(rawHtml)
        # feed() may keep the tail of the input buffered (an unfinished tag,
        # or text ending in an unterminated character reference); close()
        # forces it to be processed as if end-of-file had been reached.
        self.close()
        document = self.getDocument()
        self._siblingify(document)
        self._siblingifyElements(document)

    def getDocument(self):
        """Return the document node, i.e. the bottom of the stack."""
        assert len(self.stack) > 0, "HTMLDomParser: invalid DOM, stack is empty"
        return self.stack[0]

    def handle_starttag(self, tag, attrs):
        """Attach a new element to the current node and open it.

        Void elements (see EMPTY_TAGS) are attached but not opened, because
        no end tag is expected for them.
        """
        document = self.getDocument()
        parent = self.stack[-1]
        newElem = HTMLDomElement(document, parent, tag, attrs)
        parent.appendChild(newElem)
        if tag not in HTMLDomParser.EMPTY_TAGS:
            self.stack.append(newElem)

    def handle_startendtag(self, tag, attrs):
        """Attach a self-closing element (``<tag/>``) to the current node.

        The element is closed immediately, so it is never put on the stack.
        """
        document = self.getDocument()
        parent = self.stack[-1]
        parent.appendChild(HTMLDomElement(document, parent, tag, attrs))

    def handle_endtag(self, tag):
        """Close the element on top of the stack.

        End tags of void elements are ignored: those elements were closed
        when their start tag was seen.  The document node at the bottom of
        the stack is never removed, so surplus end tags cannot empty it.
        """
        if tag in HTMLDomParser.EMPTY_TAGS:
            return
        if len(self.stack) > 1:
            self.stack.pop()

    def handle_data(self, data):
        """Append a text node holding ``data`` to the current node.

        Surrounding whitespace is stripped first; whitespace-only data
        produces no node.
        """
        # stripping whitespaces, tabulation, new line chars (including the
        # carriage return of CRLF line endings)
        strippedData = data.strip("\r\n\t ")
        # nothing to do here
        if not strippedData or not self.stack:
            return
        parent = self.stack[-1]
        parent.appendChild(HTMLDomNode(self.getDocument(), parent=parent, text=strippedData))

    def _fromUrl(self, url, connection=None):
        """Fetch the HTML found at ``url``.

        Two modes are supported: without a connection the text comes from
        Connection.getUrlContentsAsUtf8(url); with one it comes from
        connection.getFromConnection(url).  ``connection`` must be a
        Connection instance.
        """
        if connection is None:
            return Connection.getUrlContentsAsUtf8(url)
        assert isinstance(connection, Connection), 'HTMLDomParser::_fromUrl() - invalid connection passed'
        return connection.getFromConnection(url)

    def _siblingify(self, startElem):
        """Link the nodes below ``startElem`` to their neighbours.

        Must be called when the tree is completed.  Recursively turns every
        list returned by childNodes() into a doubly linked list: each child
        gets its left and right sibling set.
        """
        assert isinstance(startElem, HTMLDomNode), "HTMLDomParser::_siblingify() - 'startElem' is not HTMLDomNode"
        children = startElem.childNodes()
        lastIndex = len(children) - 1
        for index, child in enumerate(children):
            if index > 0:
                child._setLeftSibling(children[index - 1])
            self._siblingify(child)
            if index < lastIndex:
                child._setRightSibling(children[index + 1])

    def _siblingifyElements(self, startElem):
        """Link the elements below ``startElem`` to their neighbours.

        Must be called when the tree is completed.  Same as _siblingify(),
        but it walks children() and sets the element sibling links, i.e. it
        does the siblingification for elements only.
        """
        assert isinstance(startElem, HTMLDomElement), "HTMLDomParser::_siblingifyElements() - 'startElem' is not HTMLDomElement"
        children = startElem.children()
        lastIndex = len(children) - 1
        for index, child in enumerate(children):
            if index > 0:
                child._setLeftElementSibling(children[index - 1])
            self._siblingifyElements(child)
            if index < lastIndex:
                child._setRightElementSibling(children[index + 1])