-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathexamples.py
More file actions
82 lines (69 loc) · 3.21 KB
/
Copy pathexamples.py
File metadata and controls
82 lines (69 loc) · 3.21 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import numpy as np
import torch
import torch.optim as optim
from tqdm import tqdm
from BERT.model.BERT import BERT
from BERT.model.TrainableBERT import TrainableBERT
from BERT.utils.BERTLoss import BERTLoss
from BERT.utils.BERTTokenizer import load_vocab
from BERT.utils.BERTTrainer import BERTTrainer
from BERT.utils.BERTTrainingDataset import BERTTrainingDataset
from BERT.utils.HuggingfaceUtils import load_huggingface_pretrained_bert
def _train_toy_model(ds, criterion, device):
    """Train a small, randomly initialised TrainableBERT for a few iterations.

    Args:
        ds: The BERTTrainingDataset handed to the trainer.
        criterion: Loss object passed through to ``BERTTrainer.train``.
        device: ``torch.device`` the model is moved to.

    The model, optimizer and trainer are locals, so they are released when
    this function returns (the original script used explicit ``del`` for the
    same purpose before loading the much larger pre-trained weights).
    """
    config = {
        "hidden_size": 256,  # Hidden size for the network
        "vocab_size": 30522,  # The size of the vocabulary including [MASK], [PAD], [CLS], and [SEP]
        "num_layers": 3,  # Number of transformer blocks
        "positional_learnt": False,  # Whether to learn positional embeddings or use sinusoids
        "n_heads": 2,  # Number of attention heads
        "attention_dropout_rate": 0.,  # Dropout probability for attention
        "dropout_rate": 0.,  # Dropout probability for feed-forward layers
        "max_sentence_length": 100,  # Maximum sentence length
        "bottleneck_size": 256,  # Bottleneck size for the residual links
        "eps_value": 1e-12,  # Epsilon for layer normalization
    }
    model = TrainableBERT(config).to(device)
    optimizer = optim.Adam(model.parameters(), lr=1e-5)
    trainer = BERTTrainer(ds, run_desc="toy_run", log=True)
    trainer.train(model, criterion, optimizer, iters=5, save_every=100)


def _evaluate_pretrained(ds, criterion, device, archive_path):
    """Load pre-trained weights into TrainableBERT and print the mean loss over ``ds``.

    Args:
        ds: Iterable of batches; each batch is indexed with ``"input"`` and ``"tags"``.
        criterion: Callable invoked as ``criterion(model_output, tags)``.
            Assumed to return a single-element value convertible with
            ``float()`` -- TODO confirm against BERTLoss.
        device: ``torch.device`` the model is moved to.
        archive_path: Path of the pre-trained archive to load.
    """
    print("Loading pre-trained BERT...")
    config, state_dict = load_huggingface_pretrained_bert(archive_path)
    model = TrainableBERT(config)
    model.load_state_dict(state_dict)
    model = model.to(device)
    # Switch to inference mode so stochastic layers such as dropout do not
    # perturb the reported loss.
    model.eval()

    print("Evaluating pre-trained BERT...")
    losses = []
    with torch.no_grad():
        for batch in tqdm(ds):
            loss = criterion(model(batch["input"]), batch["tags"])
            # Store plain Python floats: np.mean cannot consume a list of
            # CUDA tensors, and floats do not keep device memory alive.
            losses.append(float(loss))

    # Avoid NumPy's "mean of empty slice" warning when the dataset is empty.
    mean_loss = np.mean(losses) if losses else float("nan")
    print(f"Loss for pre-trained BERT: {mean_loss}")


def _encode_first_sentence(device, archive_path):
    """Encode the first sentence of the toy dataset with pre-trained BERT and print the result.

    Args:
        device: ``torch.device`` used for the model and its inputs.
        archive_path: Path of the pre-trained archive to load.
    """
    # You should download this from the HuggingFace repository
    # NOTE(review): the positional ``False`` presumably selects the weights
    # for the plain BERT model rather than TrainableBERT -- confirm against
    # load_huggingface_pretrained_bert.
    config, state_dict = load_huggingface_pretrained_bert(archive_path, False)
    model = BERT(config)
    model.load_state_dict(state_dict)
    model = model.to(device)
    # Inference only: disable dropout so the printed encoding is deterministic.
    model.eval()

    vocab, _ = load_vocab("../data/uncased_vocab.txt")
    sentence = np.load("../data/dataset_toy.npy")[0]
    print("Encoding the first sentence in the dataset")

    # Adding CLS and SEP token
    n_tokens = len(sentence)
    input_ = np.full((1, n_tokens + 2), vocab["[PAD]"], dtype=np.int64)
    input_[0, 0] = vocab['[CLS]']
    input_[0, 1:n_tokens + 1] = sentence
    input_[0, n_tokens + 1] = vocab['[SEP]']
    input_ = torch.tensor(input_).to(device)

    batch = {
        "sentences": input_,
        "sentence_type": torch.ones_like(input_),
        "sentence_mask": torch.ones_like(input_).float(),
    }

    np.set_printoptions(formatter={'float': '{: 0.3f}'.format}, threshold=10)
    with torch.no_grad():
        # Prints the first item of the model's second output.
        print(model(batch)[1][0].cpu().numpy())


def main():
    """Run the three demo stages in sequence.

    1. Train a tiny TrainableBERT on the toy dataset for a few iterations.
    2. Load the pre-trained weights and report the mean loss on the same dataset.
    3. Encode the first sentence of the toy dataset with the pre-trained BERT.
    """
    pretrained_archive = "../pretrained/bert-base-uncased.tar.gz"
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    ds = BERTTrainingDataset(total_size=1000, batch_size=5, device=device)
    criterion = BERTLoss()

    _train_toy_model(ds, criterion, device)
    _evaluate_pretrained(ds, criterion, device, pretrained_archive)

    # The dataset is no longer needed; drop it before loading the next model.
    del ds

    _encode_first_sentence(device, pretrained_archive)
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()