-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathClassnameProcessing.py
More file actions
93 lines (75 loc) · 3.49 KB
/
ClassnameProcessing.py
File metadata and controls
93 lines (75 loc) · 3.49 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import torch
import torch.nn as nn
import _pickle as pickle
import numpy as np
import torch.nn.functional as F
def class_embedding(sentence, word2vec, emb_dim):
    """Encode top-k class-name word-index sequences into LSTM features.

    Arguments:
    sentence -- tensor of word indices, shape (batch, num_class, max_words),
                where num_class is the number of top-k class ids
    word2vec -- pretrained embedding matrix, shape (vocab_size, emb_dim)
    emb_dim  -- dimensionality of each word embedding

    Returns:
    output -- last-step LSTM output reshaped to (batch, rnn_size, num_class)

    NOTE(review): both LSTMs are constructed with fresh random weights on
    every call, so this function is not trainable as written — presumably a
    TensorFlow-graph port; confirm against the caller.
    """
    batch, num_class, max_words = sentence.shape
    rnn_size = 1024
    # Flatten (batch, num_class) so each class-name sequence is one row.
    sentence = sentence.reshape(batch * num_class, max_words)
    sentence = sentence.long()
    # Word-embedding table initialised from the pretrained word2vec matrix.
    embed_ques_W = word2vec.clone().detach().requires_grad_(True)
    embed_ques_W = torch.nn.Parameter(embed_ques_W)
    # Two stacked LSTMs with dropout applied to each layer's input.
    lstm_1 = nn.LSTM(emb_dim, rnn_size, batch_first=True)
    lstm_dropout_1 = nn.Dropout(0.2, inplace=False)
    lstm_2 = nn.LSTM(rnn_size, rnn_size, batch_first=True)
    lstm_dropout_2 = nn.Dropout(0.2, inplace=False)
    # Hidden/cell state carried across word steps; its batch dim is num_class.
    state = (torch.zeros(1, num_class, rnn_size),
             torch.zeros(1, num_class, rnn_size))
    for i in range(max_words):
        # Look up embeddings for the i-th word of every flattened sequence.
        # (Removed a leftover debug print that ran on every iteration.)
        cls_emb_linear = F.embedding(sentence[:, i], embed_ques_W)
        # NOTE(review): p=.8 drops 80% of activations (and F.dropout defaults
        # to training=True); if this was ported from TF's keep_prob=0.8, the
        # intended value is p=0.2 — confirm before training with this.
        cls_emb_drop = F.dropout(cls_emb_linear, .8)
        cls_emb = torch.tanh(cls_emb_drop)
        cls_emb = cls_emb.view(batch, num_class, emb_dim)
        cls_emb = cls_emb.permute(1, 0, 2)  # -> (num_class, batch, emb_dim)
        with torch.no_grad():
            # NOTE(review): the same `state` tuple is consumed and overwritten
            # by BOTH layers — verify this state sharing is intentional.
            output, state = lstm_1(lstm_dropout_1(cls_emb), state)
            output, state = lstm_2(lstm_dropout_2(output), state)
    # Reshape (not permute) the final step's output — preserves the original
    # memory-order semantics even though axes are not transposed.
    output = output.reshape(batch, rnn_size, num_class)
    return output
def sentences_to_indices(X, word_to_index, max_len):
    """
    Converts an array of sentences (strings) into an array of indices
    corresponding to words in the sentences.

    The output shape is suitable for feeding to `Embedding()`.

    Arguments:
    X -- array of sentences (strings), of shape (m, 1)
    word_to_index -- a dictionary mapping each word to its index
    max_len -- maximum number of words kept per sentence; longer
               sentences are truncated

    Returns:
    X_indices -- array of shape (m, max_len); unused trailing slots stay 0

    Raises:
    KeyError -- if a word in X is missing from `word_to_index`
    """
    m = X.shape[0]  # number of training examples
    # Zero-initialise so short sentences are right-padded with index 0.
    X_indices = np.zeros((m, max_len))
    for i in range(m):
        # Lower-case, split on whitespace, and truncate to max_len words,
        # e.g. ['american', 'three', 'toed', 'woodpecker'].
        sentence_words = X[i].lower().split()[:max_len]
        # enumerate replaces the original hand-rolled `j` counter.
        for j, w in enumerate(sentence_words):
            X_indices[i, j] = word_to_index[w]
    return X_indices
def read_glove_vecs(glove_file, dictionary_file):
    """Load the word/index dictionaries and the saved embedding matrix.

    Arguments:
    glove_file -- path to a .npy file holding the embedding matrix
    dictionary_file -- path to a pickle whose first two entries are the
                       word->index and index->word mappings

    Returns:
    (words_to_index, index_to_words, word_to_vec_map)

    SECURITY: pickle.load executes arbitrary code during deserialization —
    only call this on trusted files.
    """
    # Context manager fixes the original's leaked file handle.
    with open(dictionary_file, 'rb') as f:
        d = pickle.load(f)
    word_to_vec_map = np.load(glove_file)
    words_to_index = d[0]
    index_to_words = d[1]
    return words_to_index, index_to_words, word_to_vec_map