-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathutils.py
More file actions
192 lines (144 loc) · 6.87 KB
/
utils.py
File metadata and controls
192 lines (144 loc) · 6.87 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
import re
from bs4 import BeautifulSoup
from collections import defaultdict
def xml_to_soup(xml_string):
    '''
    Build a BeautifulSoup parse tree from a MetaMap XML output string.

    Any leading non-XML noise before the '<?xml' declaration is stripped
    so the parser starts at the document itself.

    Arguments:
        xml_string (str): Raw MetaMap output containing an XML document.

    Returns:
        BeautifulSoup: Parse tree built with the 'lxml' parser.
    '''
    xml_start = xml_string.find('<?xml')
    return BeautifulSoup(xml_string[xml_start:], 'lxml')
def parse_candidate(candidate, candidatematch, map_dict):
    '''
    Store the metainformation of a MetaMap candidate under its match string.

    Pulls the CUI, semantic type, negation flag and preferred term out of
    the candidate's XML subtree and records them in map_dict so later
    syntax-unit processing can look them up by the matched text.

    Arguments:
        candidate (beautifulsoup object): A BS object containing
                                          the candidate metainformation.
        candidatematch (str): The substring of the utterance that
                              triggered a MetaMap candidate match.
        map_dict (defaultdict): A dictionary providing mappings
                                from candidate matches to their
                                metainformation.

    Returns:
        map_dict (defaultdict): An updated version of the input.
    '''
    # Output key -> XML tag holding its value.
    fields = (('CUI', 'candidatecui'),
              ('SemanticType', 'semtype'),
              ('Negated', 'negated'),
              ('PreferredTerm', 'candidatepreferred'))
    entry = map_dict[candidatematch]
    for key, tag in fields:
        entry[key] = candidate.find(tag).get_text()
    return map_dict
def handle_multi_word_candidates(candidatematch, start, extra, extra_d):
    '''
    Track cases where a MetaMap candidate spans several lexical units.

    e.g. 'heart' and 'attack' are two separate lexical units but one
    semantic unit, 'heart attack'. Each surface word is mapped (by word
    and start position) back to the candidate string that owns its
    metainformation, which would otherwise be attached to a single word.

    Arguments:
        candidatematch (str): The substring of the utterance that
                              triggered a MetaMap candidate match.
        start (str): The index location of the substring. Possibly useful
                     in cases where identical substrings may have different
                     syntactical/semantic mappings.
        extra (str): A substring of `utt_text` that is being used as a key
                     to reference the full utterance and location.
        extra_d (defaultdict): A container for tracking the lexical-semantic
                               string differences.

    Returns:
        extra_d (defaultdict): An updated version of the input.
    '''
    extra_key = extra.lower()
    # Only the first candidate seen for a (word, position) pair wins.
    already_tracked = extra_key in extra_d and start in extra_d[extra_key]
    if not already_tracked:
        extra_d[extra_key][start]['T'] = candidatematch
    return extra_d
def handle_syntax_units(inputmatch, output_d, extra_d, map_dict,
                        utt_text, lexcat):
    '''
    Emit one output row per word of a MetaMap syntactical unit.

    For each word in the unit, the stored metainformation is looked up —
    first directly in map_dict, then via the multi-word tracking in
    extra_d — and appended to the column lists of the output container.
    Words with no recorded metainformation get empty-string columns.

    Arguments:
        inputmatch (str): The full string that led to a MetaMap match
        output_d (defaultdict): A container for holding the final output
                                information.
        extra_d (defaultdict): A container for tracking the lexical-semantic
                               string differences.
        map_dict (defaultdict): A dictionary providing mappings
                                from candidate matches to their
                                metainformation.
        utt_text (str): The text of the MetaMap utterance.
        lexcat (str): The part of speech as tagged by MetaMap

    Returns:
        output_d (defaultdict): An updated version of the input.
    '''
    meta_keys = ('CUI', 'SemanticType', 'PreferredTerm', 'Negated')
    match = inputmatch.lower()
    match_loc = utt_text.lower().find(match)
    # Resolve the metainformation source once; it is identical for every
    # word of this unit.
    if match in map_dict:
        meta = map_dict[match]
    elif match in extra_d and match_loc in extra_d[match]:
        # Multi-word case: follow the tracked candidate string back to
        # its metainformation.
        meta = map_dict[extra_d[match][match_loc]['T']]
    else:
        meta = None
    for word in inputmatch.split():
        output_d['FullLexicalUnit'].append(inputmatch)
        output_d['Word'].append(word)
        output_d['LexicalCategory'].append(lexcat)
        for key in meta_keys:
            output_d[key].append(meta[key] if meta is not None else '')
    return output_d
def extract_results_from_soup(soup):
    '''
    Parse a MetaMap XML soup into ordered metainformation containers.

    Arguments:
        soup (beautifulsoup object): A beautiful soup object created from
                                     a MetaMap xml output string.

    Returns:
        output_d (defaultdict): A container for holding the final output
                                information.
        extra_d (defaultdict): A container for tracking the lexical-semantic
                               string differences.
    '''
    extra_d = defaultdict(lambda: defaultdict(dict))
    output_d = defaultdict(list)
    for utterance in soup.find_all('utterance'):
        # Sentence-level pass: fresh candidate map per utterance.
        utt_text = utterance.find('utttext').get_text()
        map_dict = defaultdict(dict)
        for candidate in utterance.find_all('candidate'):
            # ~Word-level pass over the candidates.
            matched = candidate.find('candidatematched').get_text().lower()
            length = int(candidate.find('length').get_text())
            # A length mismatch or embedded whitespace signals that several
            # lexical units informed this single candidate.
            if len(matched.split()) > 1 or len(matched) != length:
                start = int(candidate.find('startpos').get_text())
                span = utt_text[start:start + length]
                for extra in span.split():
                    extra_d = handle_multi_word_candidates(matched, start,
                                                           extra, extra_d)
            if matched not in map_dict:
                map_dict = parse_candidate(candidate, matched, map_dict)
        for unit in utterance.find_all('syntaxunit'):
            try:
                lexcat = unit.find('lexcat').get_text()
            except AttributeError:
                # Punctuation units carry no <lexcat> tag.
                lexcat = 'punc'
            inputmatch = unit.find('inputmatch').get_text()
            output_d = handle_syntax_units(inputmatch, output_d,
                                           extra_d, map_dict,
                                           utt_text, lexcat)
    return output_d, extra_d