-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathClassnameProcessing.py
More file actions
93 lines (75 loc) · 3.49 KB
/
ClassnameProcessing.py
File metadata and controls
93 lines (75 loc) · 3.49 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import torch
import torch.nn as nn
import _pickle as pickle
import numpy as np
import torch.nn.functional as F
def class_embedding(sentence, word2vec, emb_dim):
    """Encode top-k class-name word-index sequences into LSTM features.

    Arguments:
    sentence -- tensor of word indices, shape (batch, num_class, max_words),
                where num_class is the number of top-k class ids
    word2vec -- pretrained embedding matrix, shape (vocab_size, emb_dim)
    emb_dim  -- dimensionality of each word embedding

    Returns:
    output -- last-step LSTM output reshaped to (batch, rnn_size, num_class)

    NOTE(review): both LSTMs are constructed with fresh random weights on
    every call, so this function is not trainable as written — presumably a
    TensorFlow-graph port; confirm against the caller.
    """
    batch, num_class, max_words = sentence.shape
    rnn_size = 1024
    # Flatten (batch, num_class) so each class-name sequence is one row.
    sentence = sentence.reshape(batch * num_class, max_words)
    sentence = sentence.long()
    # Word-embedding table initialised from the pretrained word2vec matrix.
    embed_ques_W = word2vec.clone().detach().requires_grad_(True)
    embed_ques_W = torch.nn.Parameter(embed_ques_W)
    # Two stacked LSTMs with dropout applied to each layer's input.
    lstm_1 = nn.LSTM(emb_dim, rnn_size, batch_first=True)
    lstm_dropout_1 = nn.Dropout(0.2, inplace=False)
    lstm_2 = nn.LSTM(rnn_size, rnn_size, batch_first=True)
    lstm_dropout_2 = nn.Dropout(0.2, inplace=False)
    # Hidden/cell state carried across word steps; its batch dim is num_class.
    state = (torch.zeros(1, num_class, rnn_size),
             torch.zeros(1, num_class, rnn_size))
    for i in range(max_words):
        # Look up embeddings for the i-th word of every flattened sequence.
        # (Removed a leftover debug print that ran on every iteration.)
        cls_emb_linear = F.embedding(sentence[:, i], embed_ques_W)
        # NOTE(review): p=.8 drops 80% of activations (and F.dropout defaults
        # to training=True); if this was ported from TF's keep_prob=0.8, the
        # intended value is p=0.2 — confirm before training with this.
        cls_emb_drop = F.dropout(cls_emb_linear, .8)
        cls_emb = torch.tanh(cls_emb_drop)
        cls_emb = cls_emb.view(batch, num_class, emb_dim)
        cls_emb = cls_emb.permute(1, 0, 2)  # -> (num_class, batch, emb_dim)
        with torch.no_grad():
            # NOTE(review): the same `state` tuple is consumed and overwritten
            # by BOTH layers — verify this state sharing is intentional.
            output, state = lstm_1(lstm_dropout_1(cls_emb), state)
            output, state = lstm_2(lstm_dropout_2(output), state)
    # Reshape (not permute) the final step's output — preserves the original
    # memory-order semantics even though axes are not transposed.
    output = output.reshape(batch, rnn_size, num_class)
    return output
def sentences_to_indices(X, word_to_index, max_len):
    """
    Converts an array of sentences (strings) into an array of indices
    corresponding to words in the sentences.

    The output shape is suitable for feeding to `Embedding()`.

    Arguments:
    X -- array of sentences (strings), of shape (m, 1)
    word_to_index -- a dictionary mapping each word to its index
    max_len -- maximum number of words kept per sentence; longer
               sentences are truncated

    Returns:
    X_indices -- array of shape (m, max_len); unused trailing slots stay 0

    Raises:
    KeyError -- if a word in X is missing from `word_to_index`
    """
    m = X.shape[0]  # number of training examples
    # Zero-initialise so short sentences are right-padded with index 0.
    X_indices = np.zeros((m, max_len))
    for i in range(m):
        # Lower-case, split on whitespace, and truncate to max_len words,
        # e.g. ['american', 'three', 'toed', 'woodpecker'].
        sentence_words = X[i].lower().split()[:max_len]
        # enumerate replaces the original hand-rolled `j` counter.
        for j, w in enumerate(sentence_words):
            X_indices[i, j] = word_to_index[w]
    return X_indices
def read_glove_vecs(glove_file, dictionary_file):
    """Load the word/index dictionaries and the saved embedding matrix.

    Arguments:
    glove_file -- path to a .npy file holding the embedding matrix
    dictionary_file -- path to a pickle whose first two entries are the
                       word->index and index->word mappings

    Returns:
    (words_to_index, index_to_words, word_to_vec_map)

    SECURITY: pickle.load executes arbitrary code during deserialization —
    only call this on trusted files.
    """
    # Context manager fixes the original's leaked file handle.
    with open(dictionary_file, 'rb') as f:
        d = pickle.load(f)
    word_to_vec_map = np.load(glove_file)
    words_to_index = d[0]
    index_to_words = d[1]
    return words_to_index, index_to_words, word_to_vec_map