-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathdictionary_preprocessing.py
More file actions
122 lines (107 loc) · 5.36 KB
/
dictionary_preprocessing.py
File metadata and controls
122 lines (107 loc) · 5.36 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
"""
This file is used to preprocess the dictionary files
"""
import re, io, json, os
# json.dumps() may hand back a byte string on Python 2, while a file opened
# through io.open() in text mode only accepts unicode text. Pick the text type
# of the running interpreter so the same code works on Python 2 and 3.
try:
    _TEXT_TYPE = unicode  # Python 2
except NameError:
    _TEXT_TYPE = str  # Python 3

# Patterns are compiled once because they are applied to every input line.
_TOKEN_SPLIT = re.compile(r'\s+|=+')
_SENSE_SPLIT = re.compile(r'#+')
_NON_LETTERS = re.compile(r'[^a-z]')


def read_lines(path, errors='strict'):
    """Return all lines of the UTF-8 text file at *path*.

    *errors* is forwarded to io.open() and selects how undecodable bytes are
    treated ('strict' raises, 'replace' substitutes a placeholder character).
    """
    with io.open(path, encoding='utf-8', errors=errors) as handle:
        return handle.readlines()


def write_json(path, items):
    """Serialise *items* as JSON (non-ASCII kept verbatim) into *path*."""
    with io.open(path, 'w', encoding='utf-8') as outfile:
        outfile.write(_TEXT_TYPE(json.dumps(items, ensure_ascii=False)))


def parse_subjectivity_line(line):
    """Turn one ``key=value ...`` subjectivity-clue line into a dictionary item.

    Fields are looked up by key (``word1``, ``type``, ``priorpolarity``) rather
    than by position, so a stray token inside the line cannot shift the values.
    NOTE(review): the key names assume the MPQA subjectivity-clue layout of the
    ``.tff`` source file - confirm against the data.

    Returns None for a blank line.
    Raises ValueError when one of the three required fields is missing.
    """
    if not line.strip():
        return None
    fields = {}
    for token in line.split():
        key, separator, value = token.partition('=')
        if separator:
            fields[key] = value
    try:
        return {'word': fields['word1'],
                'type': fields['type'],
                'priorpolarity': fields['priorpolarity']}
    except KeyError as missing:
        raise ValueError("malformed subjectivity clue line (missing %s): %r"
                         % (missing, line))


def normalize_word(line):
    """Return the first token of *line*, lower-cased and reduced to a-z.

    Returns '' for a blank line or when nothing is left after stripping.
    """
    tokens = line.split()
    if not tokens:
        return ''
    return _NON_LETTERS.sub('', tokens[0].lower())


def build_polarity_items(negative_lines, positive_lines, negation_lines):
    """Build the negative / positive / negation dictionary items.

    Every usable negative and positive word is kept, in input order. A
    negation word is added only when that word is not already present in the
    dictionary.
    """
    items = []
    known_words = set()
    for lines, polarity in ((negative_lines, 'negative'),
                            (positive_lines, 'positive')):
        for line in lines:
            word = normalize_word(line)
            if word:
                items.append({'word': word, 'type': 'none',
                              'priorpolarity': polarity})
                known_words.add(word)
    for line in negation_lines:
        word = normalize_word(line)
        # Membership is tested against the set of words; comparing the string
        # with the list of item dictionaries could never match.
        if word and word not in known_words:
            items.append({'word': word, 'type': 'none',
                          'priorpolarity': 'negation'})
            known_words.add(word)
    return items


def accumulate_sentiwordnet_line(line, entries, positions):
    """Add the ``word#number`` tokens of one SentiWordNet line to *entries*.

    Each token contributes ``number`` as its quantity and ``number`` times the
    line's third / fourth column as its positive / negative mass. *entries* is
    the list of per-word totals (first-seen order) and *positions* maps a word
    to its index in *entries*; both are updated in place.

    Raises ValueError / IndexError when a matching token sits on a line whose
    number or score columns cannot be read.
    """
    tokens = _TOKEN_SPLIT.split(line)
    for token in tokens:
        if '#' not in token:
            continue
        parts = _SENSE_SPLIT.split(token)
        if len(parts) != 2 or parts[0] == '':
            continue
        word = parts[0]
        weight = float(parts[1])
        positive = weight * float(tokens[2])
        negative = weight * float(tokens[3])
        slot = positions.get(word)
        if slot is None:
            # First occurrence: store it once (no second pass that would
            # count the very first word twice).
            positions[word] = len(entries)
            entries.append({'word': word, 'quantity': weight,
                            'pos': positive, 'neg': negative})
        else:
            entry = entries[slot]
            entry['quantity'] += weight
            entry['pos'] += positive
            entry['neg'] += negative


def average_scores(entries):
    """Return one ``{'word', 'pos', 'neg'}`` item per entry, mass / quantity."""
    return [{'word': entry['word'],
             'pos': float(entry['pos']) / float(entry['quantity']),
             'neg': float(entry['neg']) / float(entry['quantity'])}
            for entry in entries]


def is_scored_plain_word(entry):
    """Tell whether *entry* has a non-zero score and no '-' or '_' in its word."""
    if float(entry['pos']) == 0.0 and float(entry['neg']) == 0.0:
        return False
    return '-' not in entry['word'] and '_' not in entry['word']


if __name__ == "__main__":
    # The first dictionary
    clue_items = []
    for line in read_lines("data/dictionary/source/subjclueslen1-HLTEMNLP05.tff"):
        clue = parse_subjectivity_line(line)
        if clue is not None:
            clue_items.append(clue)
    write_json('data/dictionary/mydictionary_6bins.json', clue_items)

    # The second dictionary
    # Every character outside a-z is stripped during normalisation, so
    # undecodable bytes can be replaced instead of aborting the run.
    polarity_items = build_polarity_items(
        read_lines("data/dictionary/source/neg.txt", errors='replace'),
        read_lines("data/dictionary/source/pos.txt", errors='replace'),
        read_lines("data/dictionary/source/negation.txt", errors='replace'))
    write_json('data/dictionary/mydictionary_3bins.json', polarity_items)

    # The third dictionary
    senti_lines = read_lines("data/dictionary/source/SentiWordNet_3.0.0_20130122.txt")
    total = str(len(senti_lines))
    entries = []
    positions = {}
    print("Step 1\n")
    for number, line in enumerate(senti_lines, 1):
        print("line " + str(number) + " of " + total + '\n')
        try:
            accumulate_sentiwordnet_line(line, entries, positions)
        except (ValueError, IndexError):
            # Show the offending input line before the error propagates.
            print(line)
            raise

    print("Step 2\n")
    averaged = average_scores(entries)
    print(averaged)

    print("Step 3\n")
    kept = []
    for entry in averaged:
        if is_scored_plain_word(entry):
            kept.append(entry)
            print(str(len(kept)))
    write_json('data/dictionary/mydictionary_2bins.json', kept)

    # Results are filtered in memory, so no staging file is written. Delete a
    # staging file left behind by an older run so the output directory ends up
    # in the same state as before.
    if os.path.isfile('data/dictionary/mydictionary_2bins_temporary.json'):
        os.remove('data/dictionary/mydictionary_2bins_temporary.json')
    print("Finish preprocessing dictionaries!")