diff --git a/.gitignore b/.gitignore
deleted file mode 100644
index 20ad5d8..0000000
--- a/.gitignore
+++ /dev/null
@@ -1,130 +0,0 @@
-##################### PYTHON GIT IGNORES #################################
-# Byte-compiled / optimized / DLL files
-__pycache__/
-*.py[cod]
-*$py.class
-
-# C extensions
-*.so
-
-# Distribution / packaging
-.Python
-build/
-develop-eggs/
-dist/
-downloads/
-eggs/
-.eggs/
-lib/
-lib64/
-parts/
-sdist/
-var/
-wheels/
-pip-wheel-metadata/
-share/python-wheels/
-*.egg-info/
-.installed.cfg
-*.egg
-MANIFEST
-
-# PyInstaller
-# Usually these files are written by a python script from a template
-# before PyInstaller builds the exe, so as to inject date/other infos into it.
-*.manifest
-*.spec
-
-# Installer logs
-pip-log.txt
-pip-delete-this-directory.txt
-
-# Unit test / coverage reports
-htmlcov/
-.tox/
-.nox/
-.coverage
-.coverage.*
-.cache
-nosetests.xml
-coverage.xml
-*.cover
-*.py,cover
-.hypothesis/
-.pytest_cache/
-
-# Translations
-*.mo
-*.pot
-
-# Django stuff:
-*.log
-local_settings.py
-db.sqlite3
-db.sqlite3-journal
-
-# Flask stuff:
-instance/
-.webassets-cache
-
-# Scrapy stuff:
-.scrapy
-
-# Sphinx documentation
-docs/_build/
-
-# PyBuilder
-target/
-
-# Jupyter Notebook
-.ipynb_checkpoints
-
-# IPython
-profile_default/
-ipython_config.py
-
-# pyenv
-.python-version
-
-# pipenv
-# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
-# However, in case of collaboration, if having platform-specific dependencies or dependencies
-# having no cross-platform support, pipenv may install dependencies that don't work, or not
-# install all needed dependencies.
-#Pipfile.lock
-
-# PEP 582; used by e.g. github.com/David-OConnor/pyflow
-__pypackages__/
-
-# Celery stuff
-celerybeat-schedule
-celerybeat.pid
-
-# SageMath parsed files
-*.sage.py
-
-# Environments
-.env
-.venv
-env/
-venv/
-ENV/
-env.bak/
-venv.bak/
-
-# Spyder project settings
-.spyderproject
-.spyproject
-
-# Rope project settings
-.ropeproject
-
-# mkdocs documentation
-/site
-
-# mypy
-.mypy_cache/
-.dmypy.json
-dmypy.json
-
-# Pyre type checker
-.pyre/
diff --git a/README.md b/README.md
deleted file mode 100644
index c12f11c..0000000
--- a/README.md
+++ /dev/null
@@ -1,6 +0,0 @@
-# DiversityDataAugmentation
-
-## Data
-- Download IMDB data [here](https://www.kaggle.com/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews)
-- `python -m spacy download en` following torchtext tokenization [tutorial](https://pytorch.org/tutorials/beginner/torchtext_translation_tutorial.html)
-- Download SST-2
diff --git a/__init__.py b/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/bart_rl.py b/bart_rl.py
deleted file mode 100644
index 8224794..0000000
--- a/bart_rl.py
+++ /dev/null
@@ -1,201 +0,0 @@
-from transformers import (
- MaxLengthCriteria,
- TemperatureLogitsWarper,
- TopKLogitsWarper,
- MinLengthLogitsProcessor,
- LogitsProcessorList
-)
-from transformers import (
- BartTokenizerFast, BartForConditionalGeneration
-)
-import torch
-from utils import DEV, LOG_EPS, MODEL_KEY
-
-
-def load_bart_model(layers=None):
- '''
- Load pretrained BartForConditionalGeneration
- '''
- config = dict()
- if layers:
- assert type(layers) == int
- config = dict(encoder_layers=layers, decoder_layers=layers)
- model = BartForConditionalGeneration.from_pretrained(MODEL_KEY, **config)
- return model
-
-def load_bart_tokenizer():
- '''
- Load pretrained bart tokenizer
- '''
- return BartTokenizerFast.from_pretrained(MODEL_KEY)
-
-class BartReinforce():
- '''
- BART for RL - specifically policy gradient with REINFORCE
- '''
- def __init__(self, model, device):
- self.model = model
- self.model = self.model.to(device)
- self.device = device
- # Contains actions from latest batch. List of token_id LongTensors
- self.actions = list()
- # Contains log_probs of actions from latest batch. FloatTensor(batch_size, num_steps) of log probs
- self.log_probs = None
- # if MULTI_GPU:
- # self.parallel_forward = torch.nn.DataParallel(self.model, device_ids=device_ids).to(dev)
- # else:
- self.pad_token_id = self.model.config.pad_token_id
- self.eos_token_id = self.model.config.eos_token_id
- self.bos_token_id = self.model.config.bos_token_id
- # This is 2 which equals , the eos_token_id
- self.decoder_start_token_id = self.model.config.decoder_start_token_id
- self.is_encoder_decoder = self.model.config.is_encoder_decoder
-
- @property
- def encoder(self):
- '''
- Getter for encoder
- '''
- return self.model.model.encoder
-
- def freeze_encoder_params(self):
- '''
- Freeze encoder params from updates
- '''
- for layer in self.encoder.parameters():
- layer.requires_grad= False
-
- def clear_episode_batch(self):
- '''
- Clear actions and log probs from last batch of episodes
- '''
- self.actions = list()
- self.log_probs = None
-
- def sample_policy(self, probs):
- '''
- Epsilon-greedy sampling from softmax distribution
-
- Args:
- probs :torch.FloatTensor of shape (batch_size, vocab_size): Softmax probs for token
- '''
- # epsilon-greedy (note this goes across batch), use uniform probs
- if torch.rand(1).item() < self.epsilon:
- # filter out 0 probability tokens
- num_nonzero = (probs != 0).sum().item()
- sample_probs = torch.ones(probs.shape)/num_nonzero
- sample_probs[probs == 0] = 0.
- # use policy probs
- else:
- sample_probs = probs
- return torch.distributions.Categorical(sample_probs).sample()
-
- def run_step(self, probs, unfinished_sequences):
- '''
- Sample next tokens for batch and store actions and log probabilities
- '''
- next_tokens = self.sample_policy(probs).to(self.device)
- next_tokens = next_tokens * unfinished_sequences + self.pad_token_id * (1 - unfinished_sequences)
- probs = probs + LOG_EPS
- selected_log_probs = torch.log(probs.gather(1, next_tokens.unsqueeze(-1)))
- self.actions.append(next_tokens.cpu())
- if self.log_probs is None:
- self.log_probs = selected_log_probs
- else:
- self.log_probs = torch.cat((self.log_probs, selected_log_probs), 1)
- return next_tokens
-
- def prepare_inputs_for_decoder(self, input_ids, model_kwargs):
- '''
- Run encoder and set up decoder input ids
-
- Returns:
- :torch.LongTensor of shape (batch_size, vocab_size): Decoder input ids
- :dict: Updated model_kwargs with encoder_outputs
- '''
- # Should be True for BART models. Run encoder and set up decoder inputs
- if self.is_encoder_decoder:
- encoder_input_ids, attention_mask = input_ids, model_kwargs['attention_mask']
- # Get encoder outputs of type BaseModelOutput
- self.encoder.to(self.device)
- model_kwargs['encoder_outputs'] = self.encoder(input_ids=encoder_input_ids, attention_mask=attention_mask)
- model_kwargs = self.model._prepare_encoder_decoder_kwargs_for_generation(input_ids, model_kwargs)
- # Set input_ids as decoder_input_ids
- if "decoder_input_ids" in model_kwargs:
- input_ids = model_kwargs.pop("decoder_input_ids")
- else:
- input_ids = self.model._prepare_decoder_input_ids_for_generation(
- input_ids, decoder_start_token_id=self.decoder_start_token_id, bos_token_id=self.bos_token_id
- )
- if "encoder_outputs" not in model_kwargs:
- raise ValueError("Make sure that `model_kwargs` include `encoder_outputs`.")
- return input_ids, model_kwargs
-
-
- def generate_episodes(self, batch, min_length=0, max_length=None, temperature=1.0, epsilon=0.001, topk=500, verbose=False):
- '''
- Generate episodes over batch of sequences
-
- Args:
- batch :List[torch.Tensor]: Contains the below elements (in order)
- input_ids :shape (batch_size, seq_length): Input sequence for generation.
- attention_mask :shape (batch_size, seq_length): Attention mask.
- labels :shape (batch_size, ): Label for each sequence.
- max_length :int: Max output sequence length
- temperature :float: Rescale logits before softmax by `logits = logits/temperature`. Higher temperatures t result in softer probability distribution. which goes to uniform as t->infinity.
- '''
- # Set epsilon for this batch
- self.epsilon = epsilon
- model_kwargs = dict()
- input_ids, attention_mask, labels = batch
- input_ids, attention_mask, labels = input_ids.to(self.device), attention_mask.to(self.device), labels.to(self.device)
- model_kwargs['attention_mask'] = attention_mask
- input_ids, model_kwargs = self.prepare_inputs_for_decoder(input_ids, model_kwargs)
- max_length = max_length if max_length is not None else self.model.config.max_length
- # For setting sequence length limit
- stopping_criteria = MaxLengthCriteria(max_length)
- # Get distribution pre_processing samplers
- logits_warper = LogitsProcessorList()
- if temperature > 0:
- logits_warper.append(TemperatureLogitsWarper(temperature))
- if topk > 0:
- logits_warper.append(TopKLogitsWarper(topk))
- if min_length > 0:
- logits_warper.append(MinLengthLogitsProcessor(min_length, self.eos_token_id))
- ## Generation ##
- # Keep track of which sequences are already finished
- ###
- # Initially unfinished_sequences = sequence of 1s with length batch size
- # and cur_len = tensor of shape (batch_size, 1) containing decoder_start_token_id.
- ####
- unfinished_sequences = input_ids.new(input_ids.shape[0]).fill_(1)
- cur_len = input_ids.shape[-1]
- i = 0
- while True:
- i += 1
- if verbose:
- print("Step", i)
- ## Run decoder for one time step ##
- # Dictionary with masks, decoder input ids, and encoder outputs
- model_inputs = self.model.prepare_inputs_for_generation(input_ids, **model_kwargs)
- # Seq2SeqLMOutput
- outputs = self.model(**model_inputs, return_dict=True)
- # Logits of shape (batch_size, 1, vocab_size) -> (batch_size, vocab_size). See top k with torch.topk
- next_token_scores = logits_warper(input_ids, outputs.logits[:, -1, :])
- probs = torch.nn.functional.softmax(next_token_scores, dim=-1)
- next_tokens = self.run_step(probs, unfinished_sequences)
- ## Update for next step ##
- # append next tokens
- input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1)
- # update past to past_key_values from outputs, attention mask should be same as from model_inputs
- model_kwargs = self.model._update_model_kwargs_for_generation(
- outputs, model_kwargs, is_encoder_decoder=self.is_encoder_decoder
- )
- # update length
- cur_len = cur_len + 1
- # If eos_token was found in one sentence, set sentence to finished
- unfinished_sequences = unfinished_sequences.mul((next_tokens != self.eos_token_id).long())
- # stop when each sentence is finished, or if we exceed the maximum length
- if unfinished_sequences.max() == 0 or stopping_criteria(input_ids, None):
- break
- return input_ids
\ No newline at end of file
diff --git a/clf_sst2.py b/clf_sst2.py
deleted file mode 100644
index 25cfd8c..0000000
--- a/clf_sst2.py
+++ /dev/null
@@ -1,71 +0,0 @@
-from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
-import utils
-from data import SSTLoader, TokenizerWrapper
-import torch
-from torch import nn
-import numpy as np
-from tqdm import tqdm
-from typing import Union, List
-
-class DistilBertSST(nn.Module):
- '''
- Get finetuned checkpoint of distilbert on sst2
- '''
- model_key = 'distilbert-base-uncased-finetuned-sst-2-english'
- def __init__(self, device=utils.DEV):
- super().__init__()
- self.device = device
- self.model = DistilBertForSequenceClassification.from_pretrained(self.model_key).to(self.device)
- self.tokenizer = TokenizerWrapper(
- DistilBertTokenizerFast.from_pretrained(self.model_key),
- {
- 'return_tensors': 'pt',
- 'padding': True,
- 'truncation': True
- }
- )
-
- def forward(self, *args, **kwargs):
- return self.model(*args, **kwargs)
-
- @torch.no_grad()
- def predict_on_text(self, text: Union[str, List[str]]) -> np.ndarray:
- '''
- Get predicted labels from applying model to text or texts.
- Let N := number of input texts and C := the number of classes
-
- Returns:
- :np.ndarray of shape (N, ): Predicted labels
- :np.ndarray of shape (N, C): Probabilities for each class
- '''
- self.model.eval()
- encodings = self.tokenizer.encode(text)
- outputs = self(encodings['input_ids'].to(self.device), encodings['attention_mask'].to(self.device))
- return (
- np.argmax(utils.convert_to_numpy(outputs.logits), axis=1).flatten(),
- nn.functional.softmax(outputs.logits, dim=-1)
- )
-
-def run_validate_model():
- model = DistilBertSST()
- model.to(self.device)
- sst2 = SSTLoader(model.tokenizer, batch_size=128, lim=-1)
- train_loader, val_loader, test_loader = sst2.get_train_loader(), sst2.get_val_loader(), sst2.get_test_loader()
-
- print(f"Testing on {len(val_loader)*val_loader.batch_size} inputs")
- model.eval()
- with tqdm(val_loader, unit="batch") as pbar:
- for batch in pbar:
- accs = []
- with torch.no_grad():
- batch = [data.to(self.device) for data in batch]
- outputs = model(batch[0], batch[1], labels=batch[2])
- accs.append(utils.flat_accuracy(outputs.logits, batch[2]))
- pbar.set_description(f"Mean Accuracy so far = {np.mean(accs)}")
- print("Final Accuracy = ", np.mean(accs))
-
-if __name__ == '__main__':
- run_validate_model()
-
-
-
diff --git a/data.py b/data.py
deleted file mode 100644
index ff236d7..0000000
--- a/data.py
+++ /dev/null
@@ -1,158 +0,0 @@
-import torch
-import torch.utils.data as torch_data
-from typing import List, Dict, Union
-import datasets
-
-class TokenizerWrapper():
- '''
- Wrapper for tokenizer
- '''
- default_encode_config = dict(
- add_special_tokens = True,
- padding=True, truncation=True,
- return_tensors='pt'
- )
- def __init__(self, tokenizer, encode_config=default_encode_config):
- self.t = tokenizer
- self.encode_config = encode_config
-
- @property
- def mask_token_id(self):
- return self.t.mask_token_id
-
- def encode(self, sentences: Union[str, List[str]]) -> Dict[str, torch.Tensor]:
- '''
- Return tokenized sentences
-
- Returns
- encodings: dict with 'input_ids' and 'attention_mask'
- '''
- return self.t(sentences, **self.encode_config)
-
- def decode(self, encodings: torch.LongTensor, skip_special_tokens=True) -> List[str]:
- '''
- Return decoded sentences from token id sequences
- '''
- def decode_id_seq(s):
- decoded = self.t.decode(s, skip_special_tokens=skip_special_tokens)
- try:
- return bytes(decoded, 'utf8').decode('latin1', 'ignore')
- except UnicodeEncodeError as e:
- import pdb; pdb.set_trace()
- _ = 1
- return [decode_id_seq(e) for e in encodings]
-
-class SSTLoader():
- '''
- Data loading for sst data. Does encoding etc
-
- Params:
- lim: Use lim of -1 to use all samples. Otherwise uses up to lim samples.
- batch_size: Batch size for loaders
- tokenizer: Tokenizer
-
- Returns:
- batches from TensorDataset with elements: input_ids, attention_mask, labels
-
- Dataset reference:
- DatasetDict({
- train: Dataset({
- features: ['sentence', 'label', 'idx'],
- num_rows: 67349
- })
- validation: Dataset({
- features: ['sentence', 'label', 'idx'],
- num_rows: 872
- })
- test: Dataset({
- features: ['sentence', 'label', 'idx'],
- num_rows: 1821
- })
- })
- Features:
- {
- 'sentence': Value(dtype='string', id=None),
- 'label': ClassLabel(num_classes=2, names=['negative', 'positive'],
- 'idx': Value(dtype='int32', id=None)
- }
- '''
- def __init__(self, tokenizer: TokenizerWrapper = None, batch_size: int = 8, lim: int = -1):
- self.lim = lim
- self.tokenizer = tokenizer
- self.batch_size = batch_size
- self.__load_sst_binary()
-
-
-
- def __load_sst_binary(self):
- '''
- Set sst train, val and test data
- '''
- # Training data from glue (not tokenized) containing keys ('sentence', 'idx', 'label')
- raw_dss = datasets.load_dataset("sst")
- dss = [raw_dss['train'], raw_dss['validation'], raw_dss['test']]
- for i, ds in enumerate(dss):
- dss[i] = self.preprocess_dataset(ds)
- self.train_dataset, self.val_dataset, self.test_dataset = dss
-
- def __create_torch_dataloader(self, sents, labels, shuffle) -> torch_data.DataLoader:
- encodings = self.tokenizer.encode(sents)
- torch_ds = torch_data.TensorDataset(
- encodings['input_ids'],
- encodings['attention_mask'],
- labels
- )
- return torch_data.DataLoader(torch_ds, batch_size=self.batch_size, shuffle=shuffle)
-
- def __sst_to_loader(self, d, s):
- return self.__create_torch_dataloader(d['sentence'], torch.as_tensor(d['label'], dtype=torch.long), s)
-
- @staticmethod
- def __sentiment_to_binary(example):
- example['label'] = round(example['label'])
- return example
-
- def __preprocess_example_sents(self, example):
- example['sentence'] = self.preprocess_sentence(example['sentence'])
- example['sentence'] = example['sentence'] + " " + str(int(example['label']))
- return example
-
- @staticmethod
- def preprocess_sentence(s: str):
- # remove_punc = "()-[]{};:\",<>/@#$%^&*_~`"
- # s = s.lower().strip()
- # s = ''.join([c for c in s if c not in remove_punc])
- return s
-
- def preprocess_dataset(self, ds: datasets.Dataset) -> datasets.Dataset:
- if self.lim > 0:
- ds = ds.select(range(self.lim))
- ds = ds.map(self.__sentiment_to_binary)
- ds = ds.map(self.__preprocess_example_sents)
- return ds
-
- def get_train_loader(self, shuffle=True):
- '''
- Encodes dataset and return train dataloader (data batches)
- '''
- return self.__sst_to_loader(self.train_dataset, shuffle)
-
- def get_val_loader(self, shuffle=True):
- '''
- Encodes dataset and return val dataloader (data batches)
- '''
- return self.__sst_to_loader(self.val_dataset, shuffle)
-
- def get_test_loader(self, shuffle=True):
- '''
- Encodes dataset and return test dataloader (data batches)
- '''
- return self.__sst_to_loader(self.test_dataset, shuffle)
-
-if __name__ == '__main__':
- from bart_rl import load_bart_tokenizer
- sst2 = SSTLoader(TokenizerWrapper(load_bart_tokenizer()), lim=100)
- # train_loader = sst2.get_train_loader()
-
-
-
diff --git a/deprecated/train.py b/deprecated/train.py
deleted file mode 100644
index a9643d8..0000000
--- a/deprecated/train.py
+++ /dev/null
@@ -1,198 +0,0 @@
-import torch
-from torch import optim
-import transformers
-from transformers import (BartTokenizerFast, PreTrainedTokenizerFast, BartModel, BartForConditionalGeneration,
- LogitsProcessorList, MinLengthLogitsProcessor, StoppingCriteriaList, MaxLengthCriteria,
- AutoModelForSeq2SeqLM)
-from transformers.generation_utils import GenerationMixin
-import datasets
-from fuzzywuzzy import fuzz
-gm = GenerationMixin
-
-import pdb
-if torch.cuda.is_available():
- dev = "cuda:0"
-else:
- dev = "cpu"
-
-print(f'Using device {dev}')
-# MODEL_KEY = 'sshleifer/distilbart-cnn-12-3'
-MODEL_KEY = 'facebook/bart-base'
-
-def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int):
- """
- Shift input ids one token to the right.
- """
- shifted_input_ids = input_ids.new_zeros(input_ids.shape)
- shifted_input_ids[:, 1:] = input_ids[:, :-1].clone()
- shifted_input_ids[:, 0] = decoder_start_token_id
-
- assert pad_token_id is not None, "self.model.config.pad_token_id has to be defined."
- # replace possible -100 values in labels by `pad_token_id`
- shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)
-
- return shifted_input_ids
-
-def load_sst(start=0, end=10):
- return datasets.load_dataset('glue', 'sst2', split=f'train[{start}:{end}]')
-
-def load_model():
- layers = 2 # default is 12
- config = dict(encoder_layers=layers, decoder_layers=layers)
-
- model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_KEY, **config)
- model = model.to(dev)
- return model
-
-def tokenize_sentences(sentences):
- tokenizer = BartTokenizerFast.from_pretrained(MODEL_KEY)
- ftok = lambda z: tokenizer(z, truncation=True, padding='longest', return_tensors='pt')
- tokenized = ftok([s for s in sentences])
- tokenized.input_ids = tokenized.input_ids.to(dev)
- tokenized.attention_mask = tokenized.attention_mask.to(dev)
-
- return tokenized, tokenizer
-
-def reward_fuzz_match(s_in, s_out):
- '''
- Reward based on fuzzy match ratio. Encourage similarity (toy example)
- '''
- r = fuzz.ratio(s_in, s_out) / 100
- return r
-
-def reward_matching_tokens(s_in, s_out):
- '''
- Reward based on number of matching words
- '''
- t1 = set(s_in.split())
- t2= set(s_out.split())
- return len(t1 & t2) / len(t1 | t2)
-
-def get_inputs(model, input_ids):
- decoder_start_token_id = model.config.decoder_start_token_id
- bos_token_id = model.config.bos_token_id
- model_kwargs = dict()
- # prepare attention mask and encoder output
- model_kwargs["attention_mask"] = gm._prepare_attention_mask_for_generation(
- model, input_ids, pad_token_id, eos_token_id)
- encoder_input_ids = input_ids if model.config.is_encoder_decoder else None
- if model.config.is_encoder_decoder:
- model_kwargs = gm._prepare_encoder_decoder_kwargs_for_generation(model, input_ids, model_kwargs)
-
- input_ids = gm.i(
- model, input_ids, decoder_start_token_id=decoder_start_token_id, bos_token_id=bos_token_id)
-
- model_kwargs["use_cache"] = None
-
- logits_processor = gm._get_logits_processor(
- model,
- repetition_penalty=None,
- bad_words_ids=None,
- min_length=10,
- max_length=11,
- eos_token_id=None,
- prefix_allowed_tokens_fn=None,
- num_beam_groups=None,
- diversity_penalty=None,
- no_repeat_ngram_size=None,
- encoder_no_repeat_ngram_size=None,
- encoder_input_ids=encoder_input_ids,
- forced_bos_token_id=None,
- forced_eos_token_id=None,
- num_beams=None,
- remove_invalid_values=True)
- return input_ids, logits_processor, model_kwargs
-
-def compute_sequence_score(sequence_ids, sequence_scores):
- '''
- sequence_ids: sequence token ids of shape (max_length, )
- sequence_scores: sequence_scores of shape (max_length - 1, vocab_size) containing pre-softmax scores
- '''
- sequence_scores = torch.log_softmax(sequence_scores, 1)
- policy_scores = []
- for i, id in enumerate(sequence_ids[1:]):
- # get score for chosen action i.e which token was generated
- score = sequence_scores[i][id]
- policy_scores.append(score)
- # We should have a score for each token in the sequence
- return torch.tensor(policy_scores, requires_grad=True, device=dev).sum()
-
-def decode_sentences(sequences, tokenizer):
- gen_sentences = [tokenizer.decode(s, skip_special_tokens=True).encode('utf-8') for s in outputs['sequences']]
- return gen_sentences
-
-
-if __name__ == '__main__':
- LR = 1e-3
- USE_AMS = False
- EPOCHS = 100
- interval = 10
- TEST = False
- sst_dataset = load_sst(end=1)
- if TEST:
- test_sents = ['test', 'testing', 'test z', 'test y', 'test w']
- encodings, tokenizer = tokenize_sentences(['test', 'testing', 'test z', 'test y', 'test w'])
- else:
- encodings, tokenizer = tokenize_sentences(sst_dataset['sentence'])
- model = load_model()
- pad_token_id = model.config.pad_token_id
- eos_token_id = model.config.eos_token_id
- if TEST:
- batches = [(list(range(len(test_sents))), test_sents, encodings)]
- else:
- batches = [(sst_dataset['idx'], sst_dataset['sentence'], encodings)]
- optimizer = optim.Adam(model.parameters(), lr=LR, amsgrad=USE_AMS)
- for epoch in range(EPOCHS):
- for b in batches:
- optimizer.zero_grad()
- indices, sentences, e = b
- # inputs = get_inputs(model, e.input_ids)
- # input_ids, logits_processor, model_kwargs = inputs
- ### SAMPLE FOR RL ###
- # outputs = gm.sample(
- # model,
- # input_ids,
- # logits_processor=logits_processor,
- # pad_token_id=pad_token_id,
- # eos_token_id=eos_token_id,
- # output_scores=True,
- # return_dict_in_generate=True,
- # **model_kwargs)
-
-
- # pdb.set_trace()
- encoder_output = model.model.encoder(input_ids=e.input_ids, attention_mask=e.attention_mask)
- decoder_input_ids = model._prepare_decoder_input_ids_for_generation(e.input_ids)
- outputs = model.sample(decoder_input_ids, encoder_outputs=encoder_output, stopping_criteria=MaxLengthCriteria(20), output_scores=True, return_dict_in_generate=True)
- # Decode sentences and compute losses
- gen_sentences = decode_sentences(outputs['sequences'], tokenizer)
- # Get generated sequences of ids and reshape for selecting log probs corresponding to actions
- logits = torch.stack(outputs['scores'], dim=0)
- log_probs = torch.log_softmax(logits.squeeze(), dim=1)
- seq = outputs['sequences'].flatten()[1:].unsqueeze(1)
- selected_probs=torch.gather(log_probs, 1, seq)
- # Compute reward
- rewards = []
- for s_in, s_out in zip(sentences, gen_sentences):
- rewards.append(reward_matching_tokens(s_in, s_out.decode('utf-8')))
- rewards = torch.tensor(rewards, requires_grad=False, device=dev)
- rewards = rewards
- # Compute loss
- loss = (rewards * -selected_probs).mean()
- ### SUPERVISED COPY ####
- # if epoch % interval == 0:
- # gen_sentences = model.generate(input_ids=e.input_ids, attention_mask=e.attention_mask)
- # gen_sentences = [tokenizer.decode(s, skip_special_tokens=True).encode('utf-8') for s in gen_sentences]
- # loss = model(input_ids=e.input_ids, labels=e.input_ids).loss
-
- loss.backward()
- optimizer.step()
-
- if epoch % interval == 0:
- print(f"--------------------------EPOCH {epoch}--------------------------")
- print("REWARDS:", rewards)
- print("SELECTED_PROBS:",selected_probs)
- print("LOSS:", loss)
- print("GENERATED:", gen_sentences)
- print("TARGETS:", sentences)
- # print(log_probs, list(model.named_parameters())[:3])
diff --git a/deprecated/train_1014.py b/deprecated/train_1014.py
deleted file mode 100644
index ad27ac5..0000000
--- a/deprecated/train_1014.py
+++ /dev/null
@@ -1,208 +0,0 @@
-import torch
-from torch import optim
-import transformers
-from transformers import (BartTokenizerFast, PreTrainedTokenizerFast, BartModel, BartForConditionalGeneration,
- LogitsProcessorList, MinLengthLogitsProcessor, StoppingCriteriaList, MaxLengthCriteria,
- AutoModelForSeq2SeqLM)
-from transformers.generation_utils import GenerationMixin
-import datasets
-from fuzzywuzzy import fuzz
-gm = GenerationMixin
-
-import pdb
-if torch.cuda.is_available():
- dev = "cuda:0"
-else:
- dev = "cpu"
-dev = "cpu"
-
-print(f'Using device {dev}')
-# MODEL_KEY = 'sshleifer/distilbart-cnn-12-3'
-MODEL_KEY = 'facebook/bart-base'
-
-def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int):
- """
- Shift input ids one token to the right.
- """
- shifted_input_ids = input_ids.new_zeros(input_ids.shape)
- shifted_input_ids[:, 1:] = input_ids[:, :-1].clone()
- shifted_input_ids[:, 0] = decoder_start_token_id
-
- assert pad_token_id is not None, "self.model.config.pad_token_id has to be defined."
- # replace possible -100 values in labels by `pad_token_id`
- shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)
-
- return shifted_input_ids
-
-def load_sst(start=0, end=10):
- return datasets.load_dataset('glue', 'sst2', split=f'train[{start}:{end}]')
-
-def load_model():
- layers = 2 # default is 12
- config = dict(encoder_layers=layers, decoder_layers=layers)
-
- model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_KEY, **config)
- model = model.to(dev)
- return model
-
-def tokenize_sentences(sentences):
- tokenizer = BartTokenizerFast.from_pretrained(MODEL_KEY)
- ftok = lambda z: tokenizer(z, truncation=True, padding='longest', return_tensors='pt')
- tokenized = ftok([s for s in sentences])
- tokenized.input_ids = tokenized.input_ids.to(dev)
- tokenized.attention_mask = tokenized.attention_mask.to(dev)
-
- return tokenized, tokenizer
-
-def reward_match(s_in, s_out):
- '''
- Reward based on fuzzy match ratio. Encourage similarity (toy example)
- '''
- r = fuzz.ratio(s_in, s_out) / 100
- return r
-
-def get_inputs(model, input_ids):
- decoder_start_token_id = model.config.decoder_start_token_id
- bos_token_id = model.config.bos_token_id
- model_kwargs = dict()
- # prepare attention mask and encoder output
- model_kwargs["attention_mask"] = gm._prepare_attention_mask_for_generation(
- model, input_ids, pad_token_id, eos_token_id)
- encoder_input_ids = input_ids if model.config.is_encoder_decoder else None
- if model.config.is_encoder_decoder:
- model_kwargs = gm._prepare_encoder_decoder_kwargs_for_generation(model, input_ids, model_kwargs)
-
- input_ids = gm._prepare_decoder_input_ids_for_generation(
- model, input_ids, decoder_start_token_id=decoder_start_token_id, bos_token_id=bos_token_id)
-
- model_kwargs["use_cache"] = None
-
- logits_processor = gm._get_logits_processor(
- model,
- repetition_penalty=None,
- bad_words_ids=None,
- min_length=10,
- max_length=11,
- eos_token_id=None,
- prefix_allowed_tokens_fn=None,
- num_beam_groups=None,
- diversity_penalty=None,
- no_repeat_ngram_size=None,
- encoder_no_repeat_ngram_size=None,
- encoder_input_ids=encoder_input_ids,
- forced_bos_token_id=None,
- forced_eos_token_id=None,
- num_beams=None,
- remove_invalid_values=True)
- return input_ids, logits_processor, model_kwargs
-
-def compute_sequence_score(sequence_ids, sequence_scores):
- '''
- sequence_ids: sequence token ids of shape (max_length, )
- sequence_scores: sequence_scores of shape (max_length - 1, vocab_size) containing pre-softmax scores
- '''
- sequence_scores = torch.log_softmax(sequence_scores, 1)
- policy_scores = []
- for i, id in enumerate(sequence_ids[1:]):
- # get score for chosen action i.e which token was generated
- score = sequence_scores[i][id]
- policy_scores.append(score)
- # We should have a score for each token in the sequence
- return torch.tensor(policy_scores, requires_grad=True, device=dev).sum()
-
-
-
-
-if __name__ == '__main__':
- LR = 1e-3
- USE_AMS = False
- EPOCHS = 100
- interval = 10
- TEST = False
- sst_dataset = load_sst(end=1)
- if TEST:
- test_sents = ['test', 'testing', 'test z', 'test y', 'test w']
- encodings, tokenizer = tokenize_sentences(['test', 'testing', 'test z', 'test y', 'test w'])
- else:
- encodings, tokenizer = tokenize_sentences(sst_dataset['sentence'])
- model = load_model()
- pad_token_id = model.config.pad_token_id
- eos_token_id = model.config.eos_token_id
- if TEST:
- batches = [(list(range(len(test_sents))), test_sents, encodings)]
- else:
- batches = [(sst_dataset['idx'], sst_dataset['sentence'], encodings)]
- optimizer = optim.Adam(model.parameters(), lr=LR, amsgrad=USE_AMS)
- for epoch in range(EPOCHS):
- print(epoch)
- for b in batches:
- optimizer.zero_grad()
- indices, sentences, e = b
- # inputs = get_inputs(model, e.input_ids)
- # input_ids, logits_processor, model_kwargs = inputs
- ### SAMPLE FOR RL ###
- # outputs = gm.sample(
- # model,
- # input_ids,
- # logits_processor=logits_processor,
- # pad_token_id=pad_token_id,
- # eos_token_id=eos_token_id,
- # output_scores=True,
- # return_dict_in_generate=True,
- # **model_kwargs)
-
-
- # pdb.set_trace()
- encoder_output = model.model.encoder(input_ids=e.input_ids, attention_mask=e.attention_mask)
- decoder_input_ids = model.prepare_decoder_input_ids_from_labels(e.input_ids)
- outputs = model.sample(decoder_input_ids, encoder_outputs=encoder_output, stopping_criteria=MaxLengthCriteria(30), output_scores=True, return_dict_in_generate=True)
- # Decode sentences and compute losses
- gen_sentences = [tokenizer.decode(s, skip_special_tokens=True).encode('utf-8') for s in outputs['sequences']]
- # get generated sequences of ids and reshape for selecting log probs corresponding to actions
- logits = torch.stack(outputs['scores'], dim=0)
- log_probs = torch.log_softmax(logits.squeeze(), dim=1)
-
- # seq = outputs['sequences'][:,1:]
- # seq = seq.reshape((seq.shape[1], -1))
- # selected_probs=torch.gather(log_probs, 1, seq)
- rewards = []
- for s_in, s_out in zip(sentences, gen_sentences):
- rewards.append(reward_match(s_in, s_out.decode('utf-8')))
- rewards = torch.tensor(rewards, requires_grad=False, device=dev)
- rewards = rewards
- # loss = (rewards * -selected_probs).mean()
- # labels = e.input_ids[:,1:]
- # # PADDING
- padding = torch.tensor([tokenizer.pad_token_id]*(len(logits)-e.input_ids.shape[1])).unsqueeze(0).to(dev)
- labels = torch.cat((e.input_ids, padding), 1)
- # # PADDING
- padding2 = torch.tensor([tokenizer.pad_token_id]*(labels.shape[1]-len(logits))).unsqueeze(0).to(dev)
- # pdb.set_trace()
- if labels is not None:
- loss_fct = torch.nn.CrossEntropyLoss()
- z1, z2 = logits.squeeze(1), labels.flatten().unsqueeze(1)
- yhat = torch.gather(z1, 1, z2)
- try:
- masked_lm_loss = loss_fct(yhat, labels)
- except:
- pdb.set_trace()
- loss = masked_lm_loss
-
-
- ### SUPERVISED COPY ####
- # if epoch % interval == 0:
- # gen_sentences = model.generate(input_ids=e.input_ids, attention_mask=e.attention_mask)
- # gen_sentences = [tokenizer.decode(s, skip_special_tokens=True).encode('utf-8') for s in gen_sentences]
- # loss = model(input_ids=e.input_ids, labels=e.input_ids).loss
-
- loss.backward()
- optimizer.step()
-
- if epoch % interval == 0:
- print(f"--------------------------EPOCH {epoch}--------------------------")
- print("REWARDS:", rewards)
- # print("SELECTED_PROBS:",selected_probs)
- print("LOSS:", loss)
- print("GENERATED:", gen_sentences)
- print("TARGETS:", sentences)
- # print(log_probs, list(model.named_parameters())[:3])
diff --git a/notebooks/Diversity exploration.ipynb b/notebooks/Diversity exploration.ipynb
deleted file mode 100644
index aef1ff9..0000000
--- a/notebooks/Diversity exploration.ipynb
+++ /dev/null
@@ -1,492 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "code",
- "execution_count": 13,
- "id": "3979e885",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "The autoreload extension is already loaded. To reload it, use:\n",
- " %reload_ext autoreload\n"
- ]
- }
- ],
- "source": [
- "%load_ext autoreload\n",
- "%autoreload 2\n",
- "import pandas as pd\n",
- "from pathlib import Path\n",
- "import sys\n",
- "sys.path.append('..')\n",
- "sys.path.append('../..')\n",
- "###\n",
- "import torchtext\n",
- "import torch\n",
- "from torchtext.data.utils import get_tokenizer\n",
- "from torchtext.vocab import Vocab\n",
- "###\n",
- "from bs4 import BeautifulSoup\n",
- "import re\n",
- "###\n",
- "import spacy\n",
- "# nlp = spacy.load('en_core_web_sm', disable=[\"ner\"])\n",
- "###\n",
- "import random"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "id": "59116071",
- "metadata": {},
- "outputs": [],
- "source": [
- "SEED = 1\n",
- "SAMPLE_FRAC = 0.1\n",
- "random.seed(SEED)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "id": "f2e6f958",
- "metadata": {},
- "outputs": [],
- "source": [
- "### SST-2 ###\n",
- "def parse_line(line):\n",
- " index, sent = line.split('\\t')\n",
- " if 'sentence_index' in index:\n",
- " return (-1,'')\n",
- " sent = re.sub('\\n', '', sent)\n",
- " index = int(index) - 1\n",
- " return (index, sent)\n",
- "\n",
- "def get_original_sst2():\n",
- " # Load SST-2\n",
- " sst_dir = data_dir / 'SST2-Data/SST2-Data/stanfordSentimentTreebank/stanfordSentimentTreebank'\n",
- " fp = sst_dir / 'datasetSentences.txt'\n",
- " sents = {}\n",
- " with fp.open('r') as file:\n",
- " for i, line in enumerate(file):\n",
- " index, sent = parse_line(line)\n",
- " if not (index < 0):\n",
- " sents[index] = sent\n",
- " return sents\n",
- " \n",
- "### IMDB Processing ###\n",
- "#Removing the html strips\n",
- "def strip_html(text):\n",
- " soup = BeautifulSoup(text, \"html.parser\")\n",
- " return soup.get_text()\n",
- "\n",
- "#Removing the square brackets\n",
- "def remove_between_square_brackets(text):\n",
- " return re.sub('\\[[^]]*\\]', '', text)\n",
- "\n",
- "#Removing the noisy text\n",
- "def denoise_text(text):\n",
- " text = strip_html(text)\n",
- " text = remove_between_square_brackets(text)\n",
- " return text"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "id": "f9ef99f2",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "[PosixPath('../../data/.DS_Store'),\n",
- " PosixPath('../../data/SST2-Data'),\n",
- " PosixPath('../../data/IMDB Dataset.csv')]"
- ]
- },
- "execution_count": 5,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "data_dir = Path('../../data')\n",
- "list(data_dir.iterdir())"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "id": "3cab6f7c",
- "metadata": {
- "scrolled": true
- },
- "outputs": [],
- "source": [
- "# # Load IMDB data\n",
- "# df = pd.read_csv(data_dir/'IMDB Dataset.csv')\n",
- "# print(f\"Loaded {len(df)} samples, randomly sampling {int(SAMPLE_FRAC * len(df))} rows\")\n",
- "# # Sample percentage of data\n",
- "# df = df.sample(frac=SAMPLE_FRAC, random_state=SEED)\n",
- "# # Convert sentiment columns to numerical values\n",
- "# df.sentiment = df.sentiment.apply(lambda x: 1 if x=='positive' else 0)\n",
- "# df['review']=df['review'].apply(denoise_text)\n",
- "# df.head(3)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "id": "f0f69f29",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "(11855,\n",
- " \"The gorgeously elaborate continuation of `` The Lord of the Rings '' trilogy is so huge that a column of words can not adequately describe co-writer\\\\/director Peter Jackson 's expanded vision of J.R.R. Tolkien 's Middle-earth .\")"
- ]
- },
- "execution_count": 7,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Load SST-2 and subsample\n",
- "sents = get_original_sst2()\n",
- "# num_samples = int(SAMPLE_FRAC * len(sents)) \n",
- "# print(f\"Got {len(sents)} samples, randomly sampling {num_samples} samples\")\n",
- "# sents = random.sample(sents, num_samples)\n",
- "len(sents), sents[1]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "id": "4cf548ab",
- "metadata": {},
- "outputs": [],
- "source": [
- "# # Tokenize reviews\n",
- "# tokenizer = get_tokenizer('spacy', language='en_core_web_sm')\n",
- "# tokenized_texts = [tokenizer(seq) for seq in df.review]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "id": "6eb6bb87",
- "metadata": {},
- "outputs": [],
- "source": [
- "# # Process reviews for sentences \n",
- "# docs = []\n",
- "# for doc in nlp.pipe(df.review):\n",
- "# docs.append(doc)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 14,
- "id": "ae52f25b",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Tokenize SST-2 Sentences\n",
- "# tokenizer = get_tokenizer('spacy', language='en_core_web_sm')\n",
- "# sents_tokenized = [tokenizer(sent) for sent in sents]\n",
- "\n",
- "from nltk.tokenize import word_tokenize\n",
- "# sents_tokenized = list(map(word_tokenize, sents))\n",
- "# sents_tokenized[0]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 15,
- "id": "b2099317",
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "[nltk_data] Downloading package stopwords to\n",
- "[nltk_data] /Users/Sameer/nltk_data...\n",
- "[nltk_data] Package stopwords is already up-to-date!\n",
- "[nltk_data] Downloading package punkt to /Users/Sameer/nltk_data...\n",
- "[nltk_data] Package punkt is already up-to-date!\n",
- "[nltk_data] Downloading package wordnet to /Users/Sameer/nltk_data...\n",
- "[nltk_data] Package wordnet is already up-to-date!\n",
- "[nltk_data] Downloading package words to /Users/Sameer/nltk_data...\n",
- "[nltk_data] Package words is already up-to-date!\n",
- "../../eda_nlp/code/eda.py:177: SyntaxWarning: \"is not\" with a literal. Did you mean \"!=\"?\n",
- " words = [word for word in words if word is not '']\n"
- ]
- }
- ],
- "source": [
- "from TextGenerationEvaluationMetrics import multiset_distances as MSD\n",
- "from DataAugmentation.data import augmentation\n",
- "from eda_nlp.code.eda import get_only_chars\n",
- "# from DataAugmentation.data import back_translation"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "30c7c225",
- "metadata": {},
- "outputs": [],
- "source": [
- "# from inspect import getmembers, isfunction\n",
- "# print(getmembers(augmentation, isfunction))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 17,
- "id": "076805ca",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "(11855, 11855)"
- ]
- },
- "execution_count": 17,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "'''\n",
- "Generating eda samples:\n",
- "python eda_nlp/code/augment.py \n",
- " --input=data/SST2-Data/SST2-Data/stanfordSentimentTreebank/stanfordSentimentTreebank/datasetSentences.txt \n",
- " --output=./sst2_augmented.txt \n",
- " --num_aug=5 --alpha_sr=0.3 --alpha_rd=0.1 --alpha_ri=0.1 --alpha_rs=0.0\n",
- "'''\n",
- "def get_augmented_sst2():\n",
- " fp = Path('../../sst2_augmented.txt')\n",
- " sents = {}\n",
- " with fp.open('r') as file:\n",
- " for i, line in enumerate(file):\n",
- " index, sent = parse_line(line)\n",
- " if not (index < 0):\n",
- " if index in sents:\n",
- " sents[index].append(sent)\n",
- " else:\n",
- " sents[index] = [sent]\n",
- " return sents\n",
- "\n",
- "orig_sents = get_original_sst2()\n",
- "orig_sents = {j:get_only_chars(t) for j, t in orig_sents.items()}\n",
- "aug_sents = get_augmented_sst2()\n",
- "len(orig_sents), len(aug_sents)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 18,
- "id": "895edd7b",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "('the rock is destined to be the st century s new conan and that he s going to make a splash even greater than arnold schwarzenegger jean claud van damme or steven segal ',\n",
- " ['the shake is bound to be the st hundred s newly conan and that he s going to make a splash even great than matthew arnold schwarzenegger blue jean claud caravan damme or steven george segal',\n",
- " 'the sway is destined to be the st c s new conan and that he s live on to make believe a splash yet nifty than benedict arnold schwarzenegger jean claud new wave damme or steven george segal',\n",
- " 'the destined to be the st century s new conan and that he s going to make a splash greater than arnold schwarzenegger jean claud van damme steven segal',\n",
- " 'the rock is destined to be george segal the st century s new conan and that he s going to make a splash even matthew arnold greater than arnold schwarzenegger jean claud van damme or steven atomic number segal',\n",
- " 'the rock is destined to be the st century s new conan and that he s going to make a splash even greater than arnold atomic number schwarzenegger jean atomic number claud van damme represent or steven segal',\n",
- " 'the rock is destined to be the st century s new conan and that he s going to make a splash even greater than arnold schwarzenegger jean claud van damme or steven segal '])"
- ]
- },
- "execution_count": 18,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "orig_sents[0], aug_sents[0]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 32,
- "id": "299b938b",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Jaccard distances preprocess upto 5!\n"
- ]
- },
- {
- "data": {
- "text/plain": [
- "{1: 0.0233949945593036, 2: 0.0, 3: 0.0, 4: 0.0, 5: 0.0}"
- ]
- },
- "execution_count": 32,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "index = 1\n",
- "\n",
- "# ref1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', 'ensures', 'that', 'the', 'military', 'will', 'forever', 'heed', 'Party', 'commands']\n",
- "# ref2 = ['It', 'is', 'the', 'guiding', 'principle', 'which', 'guarantees', 'the', 'military', 'forces', 'always', 'being', 'under', 'the', 'command', 'of', 'the', 'Party']\n",
- "# ref3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the', 'army', 'always', 'to', 'heed', 'the', 'directions', 'of', 'the', 'party']\n",
- "# sen1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which', 'ensures', 'that', 'the', 'military', 'always', 'obeys', 'the', 'commands', 'of', 'the', 'party']\n",
- "# sen2 = ['he', 'read', 'the', 'book', 'because', 'he', 'was', 'interested', 'in', 'world', 'history']\n",
- "\n",
- "references = map(word_tokenize, aug_sents[index])\n",
- "sentences = map(word_tokenize, [orig_sents[index]])\n",
- "sentences, references = map(list, (sentences, references))\n",
- "\n",
- "msd = MSD.MultisetDistances(references=references, min_n=1, max_n=5)\n",
- "msj_distance = msd.get_jaccard_score(sentences=sentences)\n",
- "msj_distance\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 29,
- "id": "1e99f1e1",
- "metadata": {
- "scrolled": false
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "['the gorgeously elaborated continuation of the lord of the anchor ring trilogy is so brobdingnagian that a tower of words can not adequately draw co writer director tool old hickory s flesh out visual sensation of joule r r tolkien s middle ground',\n",
- " 'the gorgeously work out continuance of the lord of the reverberate trilogy is so brobdingnagian that a newspaper column of words can not adequately discover co author managing director peter andrew jackson s expanded vision of j radius radius tolkien s middle terra firma',\n",
- " 'the gorgeously elaborate continuation of the lord of the rings trilogy is so huge that a column of words can not adequately describe co writer director peter jackson s expanded vision and then of j r r tolkien immense michael joe jackson s middle earth',\n",
- " 'the gorgeously of the lord of the rings trilogy is so that a column of words can not adequately co director peter s expanded vision of j r r tolkien s middle earth',\n",
- " 'the gorgeously elaborate continuation of the lord the rings trilogy is so huge that a column of words can not adequately describe co director peter jackson s expanded vision of j r r tolkien s middle earth',\n",
- " 'the gorgeously elaborate continuation of the lord of the rings trilogy is so huge that a column of words can not adequately describe co writer director peter jackson s expanded vision of j r r tolkien s middle earth']"
- ]
- },
- "execution_count": 29,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "[' '.join(s) for s in references]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "6f4db50e",
- "metadata": {},
- "outputs": [],
- "source": [
- "' '.join(sentences[0])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 31,
- "id": "e0b2f334",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "['the',\n",
- " 'gorgeously',\n",
- " 'elaborate',\n",
- " 'continuation',\n",
- " 'of',\n",
- " 'the',\n",
- " 'lord',\n",
- " 'of',\n",
- " 'the',\n",
- " 'rings',\n",
- " 'trilogy',\n",
- " 'is',\n",
- " 'so',\n",
- " 'huge',\n",
- " 'that',\n",
- " 'a',\n",
- " 'column',\n",
- " 'of',\n",
- " 'words',\n",
- " 'can',\n",
- " 'not',\n",
- " 'adequately',\n",
- " 'describe',\n",
- " 'co',\n",
- " 'writer',\n",
- " 'director',\n",
- " 'peter',\n",
- " 'jackson',\n",
- " 's',\n",
- " 'expanded',\n",
- " 'vision',\n",
- " 'of',\n",
- " 'j',\n",
- " 'r',\n",
- " 'r',\n",
- " 'tolkien',\n",
- " 's',\n",
- " 'middle',\n",
- " 'earth']"
- ]
- },
- "execution_count": 31,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "sentences[0]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "b9dc23dd",
- "metadata": {},
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "dda",
- "language": "python",
- "name": "dda"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.8.0"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
diff --git a/package.json b/package.json
new file mode 100755
index 0000000..b026c76
--- /dev/null
+++ b/package.json
@@ -0,0 +1,47 @@
+{
+ "name": "starter-react-app",
+ "version": "1.0.0",
+ "description": "A starter React app based on the latest standards with TypeScript.",
+ "main": "src/index.tsx",
+ "scripts": {
+ "start": "react-scripts start",
+ "build": "react-scripts build",
+ "test": "react-scripts test",
+ "eject": "react-scripts eject"
+ },
+ "dependencies": {
+ "@types/jest": "^26.0.15",
+ "@types/node": "^12.0.0",
+ "@types/react": "^17.0.0",
+ "@types/react-dom": "^17.0.0",
+ "firebase": "^8.2.1",
+ "react": "^17.0.2",
+ "react-dom": "^17.0.2",
+ "react-scripts": "4.0.3",
+ "typescript": "^4.1.2"
+ },
+ "devDependencies": {
+ "@typescript-eslint/eslint-plugin": "^4.5.0",
+ "@typescript-eslint/parser": "^4.5.0",
+ "eslint": "^7.11.0",
+ "eslint-plugin-react": "^7.21.5"
+ },
+ "eslintConfig": {
+ "extends": [
+ "react-app",
+ "react-app/jest"
+ ]
+ },
+ "browserslist": {
+ "production": [
+ ">0.2%",
+ "not dead",
+ "not op_mini all"
+ ],
+ "development": [
+ "last 1 chrome version",
+ "last 1 firefox version",
+ "last 1 safari version"
+ ]
+ }
+}
\ No newline at end of file
diff --git a/public/index.html b/public/index.html
new file mode 100755
index 0000000..88a13f6
--- /dev/null
+++ b/public/index.html
@@ -0,0 +1,12 @@
+
+
+
{error}
} +