-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathparser.py
More file actions
116 lines (100 loc) · 3.81 KB
/
parser.py
File metadata and controls
116 lines (100 loc) · 3.81 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
from html.parser import HTMLParser
import http.client
from collections import namedtuple
from dom import *
from connection import *
# Input modes accepted by HTMLDomParser: with "RAW" the content argument is the
# HTML markup itself, with "URL" it is an address the markup is fetched from.
PARSER_MODE = {"RAW": 0, "URL": 1}
class HTMLDomParser(HTMLParser):
    """Build a DOM tree from HTML markup on top of the standard ``HTMLParser``.

    The constructor parses the whole input immediately; afterwards the tree is
    reachable through ``getDocument()``.  While parsing, ``self.stack`` holds
    the chain of currently open nodes: ``stack[0]`` is always the document
    node and ``stack[-1]`` is the node that receives new children.
    """

    # Tags that are always empty (void elements).  They never have content or
    # a matching end tag, so they are attached to the tree but never kept on
    # the stack of open elements.
    EMPTY_TAGS = [
        "area",
        "base",
        "basefont",
        "br",
        "col",
        "embed",
        "frame",
        "hr",
        "img",
        "input",
        "isindex",
        "link",
        "meta",
        "param",
        "source",
        "track",
        "wbr",
    ]

    def __init__(self, mode, content, connection=None):
        """Parse ``content`` and build the DOM tree.

        Args:
            mode: One of the ``PARSER_MODE`` values.
            content: The HTML markup itself in ``RAW`` mode; in any other
                mode, the URL the markup is obtained from.
            connection: Optional ``Connection`` instance used to fetch the
                URL; only consulted when ``content`` is a URL.

        Raises:
            AssertionError: If ``mode`` is not a ``PARSER_MODE`` value.
        """
        assert mode in PARSER_MODE.values(), \
            "HTMLDomParser mode invalid"
        super().__init__()
        if mode == PARSER_MODE["RAW"]:
            rawHtml = content
        else:
            rawHtml = self._fromUrl(content, connection)
        # stack[0] is always the document element
        self.stack = [HTMLDocument()]
        self.feed(rawHtml)
        # feed() buffers incomplete trailing input (an unterminated tag or
        # character reference); close() forces it through the handlers.
        self.close()
        self._siblingify(self.getDocument())
        self._siblingifyElements(self.getDocument())

    def getDocument(self):
        """Return the document node, i.e. the bottom of the parsing stack."""
        assert len(self.stack) > 0, "HTMLDomParser: invalid DOM, stack is empty"
        return self.stack[0]

    def handle_starttag(self, tag, attrs):
        """Attach a new element to the innermost open node and open it.

        Elements listed in ``EMPTY_TAGS`` are attached but not opened, since
        nothing can be nested inside them.
        """
        parent = self.stack[-1]
        newElem = HTMLDomElement(self.getDocument(), parent, tag, attrs)
        parent.appendChild(newElem)
        if tag not in self.EMPTY_TAGS:
            self.stack.append(newElem)

    def handle_startendtag(self, tag, attrs):
        """Attach a self-closing element (``<tag ... />``) to the open node.

        The element is complete as soon as it is seen, so it is appended to
        its parent and never pushed onto the stack.
        """
        parent = self.stack[-1]
        parent.appendChild(HTMLDomElement(self.getDocument(), parent, tag, attrs))

    def handle_endtag(self, tag):
        """Close the innermost open element.

        End tags of ``EMPTY_TAGS`` are ignored because those elements are
        never left open, and the document node at ``stack[0]`` is never
        removed, so stray end tags cannot empty the stack.  The end tag name
        is not compared with the element being closed.
        """
        if tag in self.EMPTY_TAGS:
            return
        if len(self.stack) > 1:
            self.stack.pop()

    def handle_data(self, data):
        """Attach ``data`` as a text node to the innermost open node.

        Leading and trailing spaces, tabs and newlines are removed first;
        text that is empty after stripping is dropped.
        """
        strippedData = data.strip("\n\t ")
        if not strippedData:
            # nothing to do here
            return
        if self.stack:
            parent = self.stack[-1]
            parent.appendChild(
                HTMLDomNode(self.getDocument(), parent=parent, text=strippedData))

    def _fromUrl(self, url, connection=None):
        """Return the markup found at ``url``.

        Two modes are supported: without a connection the markup is obtained
        through ``Connection.getUrlContentsAsUtf8``; with one, through that
        connection's ``getFromConnection``.

        Raises:
            AssertionError: If ``connection`` is given but is not a
                ``Connection`` instance.
        """
        if connection is None:
            return Connection.getUrlContentsAsUtf8(url)
        assert isinstance(connection, Connection), \
            'HTMLDomParser::_fromUrl() - invalid connection passed'
        return connection.getFromConnection(url)

    def _siblingify(self, startElem):
        """Link the nodes of every sibling level into a doubly linked list.

        Must be called when the tree is completed.  Works recursively on
        everything returned by ``childNodes()`` below ``startElem``.
        """
        assert isinstance(startElem, HTMLDomNode), \
            "HTMLDomParser::_siblingify() - 'startElem' is not HTMLDomNode"
        previous = None
        for child in startElem.childNodes():
            if previous is not None:
                previous._setRightSibling(child)
                child._setLeftSibling(previous)
            self._siblingify(child)
            previous = child

    def _siblingifyElements(self, startElem):
        """Link the elements of every sibling level into a doubly linked list.

        Must be called when the tree is completed.  Same as ``_siblingify``
        but walks ``children()`` and sets the element-sibling links, i.e. it
        does the siblingification for ELEMENTS only.
        """
        assert isinstance(startElem, HTMLDomElement), \
            "HTMLDomParser::_siblingifyElements() - 'startElem' is not HTMLDomElement"
        previous = None
        for child in startElem.children():
            if previous is not None:
                previous._setRightElementSibling(child)
                child._setLeftElementSibling(previous)
            self._siblingifyElements(child)
            previous = child