-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathutils.py
More file actions
192 lines (144 loc) · 6.87 KB
/
utils.py
File metadata and controls
192 lines (144 loc) · 6.87 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
import re
from bs4 import BeautifulSoup
from collections import defaultdict
def xml_to_soup(xml_string):
    '''
    Build a BeautifulSoup parse tree from a MetaMap XML output string.

    Any leading non-XML noise before the '<?xml' declaration is stripped
    so the parser starts at the document itself.

    Arguments:
        xml_string (str): Raw MetaMap output containing an XML document.

    Returns:
        BeautifulSoup: Parse tree built with the 'lxml' parser.
    '''
    xml_start = xml_string.find('<?xml')
    return BeautifulSoup(xml_string[xml_start:], 'lxml')
def parse_candidate(candidate, candidatematch, map_dict):
    '''
    Store the metainformation of a MetaMap candidate under its match string.

    Pulls the CUI, semantic type, negation flag and preferred term out of
    the candidate's XML subtree and records them in map_dict so later
    syntax-unit processing can look them up by the matched text.

    Arguments:
        candidate (beautifulsoup object): A BS object containing
                                          the candidate metainformation.
        candidatematch (str): The substring of the utterance that
                              triggered a MetaMap candidate match.
        map_dict (defaultdict): A dictionary providing mappings
                                from candidate matches to their
                                metainformation.

    Returns:
        map_dict (defaultdict): An updated version of the input.
    '''
    # Output key -> XML tag holding its value.
    fields = (('CUI', 'candidatecui'),
              ('SemanticType', 'semtype'),
              ('Negated', 'negated'),
              ('PreferredTerm', 'candidatepreferred'))
    entry = map_dict[candidatematch]
    for key, tag in fields:
        entry[key] = candidate.find(tag).get_text()
    return map_dict
def handle_multi_word_candidates(candidatematch, start, extra, extra_d):
    '''
    Track cases where a MetaMap candidate spans several lexical units.

    e.g. 'heart' and 'attack' are two separate lexical units but one
    semantic unit, 'heart attack'. Each surface word is mapped (by word
    and start position) back to the candidate string that owns its
    metainformation, which would otherwise be attached to a single word.

    Arguments:
        candidatematch (str): The substring of the utterance that
                              triggered a MetaMap candidate match.
        start (str): The index location of the substring. Possibly useful
                     in cases where identical substrings may have different
                     syntactical/semantic mappings.
        extra (str): A substring of `utt_text` that is being used as a key
                     to reference the full utterance and location.
        extra_d (defaultdict): A container for tracking the lexical-semantic
                               string differences.

    Returns:
        extra_d (defaultdict): An updated version of the input.
    '''
    extra_key = extra.lower()
    # Only the first candidate seen for a (word, position) pair wins.
    already_tracked = extra_key in extra_d and start in extra_d[extra_key]
    if not already_tracked:
        extra_d[extra_key][start]['T'] = candidatematch
    return extra_d
def handle_syntax_units(inputmatch, output_d, extra_d, map_dict,
                        utt_text, lexcat):
    '''
    Emit one output row per word of a MetaMap syntactical unit.

    For each word in the unit, the stored metainformation is looked up —
    first directly in map_dict, then via the multi-word tracking in
    extra_d — and appended to the column lists of the output container.
    Words with no recorded metainformation get empty-string columns.

    Arguments:
        inputmatch (str): The full string that led to a MetaMap match
        output_d (defaultdict): A container for holding the final output
                                information.
        extra_d (defaultdict): A container for tracking the lexical-semantic
                               string differences.
        map_dict (defaultdict): A dictionary providing mappings
                                from candidate matches to their
                                metainformation.
        utt_text (str): The text of the MetaMap utterance.
        lexcat (str): The part of speech as tagged by MetaMap

    Returns:
        output_d (defaultdict): An updated version of the input.
    '''
    meta_keys = ('CUI', 'SemanticType', 'PreferredTerm', 'Negated')
    match = inputmatch.lower()
    match_loc = utt_text.lower().find(match)
    # Resolve the metainformation source once; it is identical for every
    # word of this unit.
    if match in map_dict:
        meta = map_dict[match]
    elif match in extra_d and match_loc in extra_d[match]:
        # Multi-word case: follow the tracked candidate string back to
        # its metainformation.
        meta = map_dict[extra_d[match][match_loc]['T']]
    else:
        meta = None
    for word in inputmatch.split():
        output_d['FullLexicalUnit'].append(inputmatch)
        output_d['Word'].append(word)
        output_d['LexicalCategory'].append(lexcat)
        for key in meta_keys:
            output_d[key].append(meta[key] if meta is not None else '')
    return output_d
def extract_results_from_soup(soup):
    '''
    Parse a MetaMap XML soup into ordered metainformation containers.

    Arguments:
        soup (beautifulsoup object): A beautiful soup object created from
                                     a MetaMap xml output string.

    Returns:
        output_d (defaultdict): A container for holding the final output
                                information.
        extra_d (defaultdict): A container for tracking the lexical-semantic
                               string differences.
    '''
    extra_d = defaultdict(lambda: defaultdict(dict))
    output_d = defaultdict(list)
    for utterance in soup.find_all('utterance'):
        # Sentence-level pass: fresh candidate map per utterance.
        utt_text = utterance.find('utttext').get_text()
        map_dict = defaultdict(dict)
        for candidate in utterance.find_all('candidate'):
            # ~Word-level pass over the candidates.
            matched = candidate.find('candidatematched').get_text().lower()
            length = int(candidate.find('length').get_text())
            # A length mismatch or embedded whitespace signals that several
            # lexical units informed this single candidate.
            if len(matched.split()) > 1 or len(matched) != length:
                start = int(candidate.find('startpos').get_text())
                span = utt_text[start:start + length]
                for extra in span.split():
                    extra_d = handle_multi_word_candidates(matched, start,
                                                           extra, extra_d)
            if matched not in map_dict:
                map_dict = parse_candidate(candidate, matched, map_dict)
        for unit in utterance.find_all('syntaxunit'):
            try:
                lexcat = unit.find('lexcat').get_text()
            except AttributeError:
                # Punctuation units carry no <lexcat> tag.
                lexcat = 'punc'
            inputmatch = unit.find('inputmatch').get_text()
            output_d = handle_syntax_units(inputmatch, output_d,
                                           extra_d, map_dict,
                                           utt_text, lexcat)
    return output_d, extra_d