From d22723ca3feb263dee8666c201ff8b82d9da9218 Mon Sep 17 00:00:00 2001 From: Martin Tutek Date: Tue, 1 Dec 2020 18:42:12 +0100 Subject: [PATCH 01/25] Initial proposal --- podium/storage/__init__.py | 3 +- podium/storage/vocab.py | 136 ++++++++++++++++++------------------- 2 files changed, 66 insertions(+), 73 deletions(-) diff --git a/podium/storage/__init__.py b/podium/storage/__init__.py index 14f940c4..49253a4b 100644 --- a/podium/storage/__init__.py +++ b/podium/storage/__init__.py @@ -12,7 +12,7 @@ from .vectorizers.impl import GloVe, NlplVectorizer from .vectorizers.tfidf import TfIdfVectorizer from .vectorizers.vectorizer import BasicVectorStorage, VectorStorage -from .vocab import SpecialVocabSymbols, Vocab +from .vocab import Vocab # Convention: class imports from same module are continuous in one line until the length @@ -37,7 +37,6 @@ "SCPLargeResource", "VectorStorage", "BasicVectorStorage", - "SpecialVocabSymbols", "Vocab", "Example", "ExampleFactory", diff --git a/podium/storage/vocab.py b/podium/storage/vocab.py index a3f7aa13..1d36237f 100644 --- a/podium/storage/vocab.py +++ b/podium/storage/vocab.py @@ -1,4 +1,5 @@ """Module contains classes related to the vocabulary.""" +import abc import logging from collections import Counter from enum import Enum @@ -32,35 +33,64 @@ def unique(values: Iterable): yield element -class VocabDict(dict): - """Vocab dictionary class that is used like default dict but without adding missing - key to the dictionary.""" +# Make specials singletons +class Special(str): + @abc.abstractmethod + def apply(self, sequence_or_token): + # Method should ONLY be used in Vocab.numericalize + pass - def __init__(self, default_factory=None, *args, **kwargs): - super().__init__(*args, **kwargs) - self._default_factory = default_factory + def __hash__(self): + # Hash class instead of value + return hash(self.__class__) - def __missing__(self, key): - if self._default_factory is None: - raise KeyError( - "Default factory is not 
defined and key is not in " "the dictionary." - ) - return self._default_factory() + def __eq__(self, other): + # Check equals via class instead of value + return self.__class__ == other.__class__ -class SpecialVocabSymbols(Enum): - """Class for special vocabular symbols +class BOS(Special): + def __init__(cls, token=''): + return super(BOS, cls).__new__(cls, token) - Attributes - ---------- - UNK : str - Tag for unknown word - PAD : str - TAG for padding symbol - """ + def apply(self, sequence): + return [self] + sequence + + +class EOS(Special): + def __new__(cls, token=''): + return super(EOS, cls).__new__(cls, token) + + def apply(self, sequence): + return sequence + [self] + +################# +# Core specials # +################# - UNK = "" - PAD = "" +class MASK(Special): + def __new__(cls, token=''): + return super(MASK, cls).__new__(cls, token) + + def apply(self, sequence): + # Core special, handled by Vocab + return sequence + +class UNK(Special): + def __new__(cls, token=''): + return super(UNK, cls).__new__(cls, token) + + def apply(self, sequence): + # Core special, handled by Vocab + return sequence + +class PAD(Special): + def __new__(cls, token=''): + return super(PAD, cls).__new__(cls, token) + + def apply(self, sequence): + # Core special, handled by Vocab + return sequence class Vocab: @@ -81,9 +111,10 @@ def __init__( self, max_size=None, min_freq=1, - specials=(SpecialVocabSymbols.UNK, SpecialVocabSymbols.PAD), + specials=(UNK(), PAD()), keep_freqs=False, eager=True, + filter_unk=False ): """Vocab constructor. Specials are first in the vocabulary. 
@@ -110,56 +141,14 @@ def __init__( self._has_specials = len(self.specials) > 0 self.itos = list(self.specials) - self._default_unk_index = self._init_default_unk_index(self.specials) - self.stoi = VocabDict(self._default_unk) - self.stoi.update({k: v for v, k in enumerate(self.itos)}) + #self._default_unk_index = self._init_default_unk_index(self.specials) + self.stoi = {k: v for v, k in enumerate(self.itos)} self._max_size = max_size self.eager = eager self.finalized = False # flag to know if we're ready to numericalize _LOGGER.debug("Vocabulary has been created and initialized.") - @staticmethod - def _init_default_unk_index(specials): - """Method computes index of default unknown symbol in given collection. - - Parameters - ---------- - specials : iter(SpecialVocabSymbols) - collection of special vocab symbols - - Returns - ------- - index : int or None - index of default unkwnown symbol or None if it doesn't exist - """ - ind = 0 - for spec in specials: - if spec == SpecialVocabSymbols.UNK: - return ind - ind += 1 - return None - - def _default_unk(self): - """Method obtains default unknown symbol index. Used for stoi. - - Returns - ------- - index: int - index of default unknown symbol - - Raises - ------ - ValueError - If unknown symbol is not present in the vocab. - """ - if self._default_unk_index is None: - raise ValueError( - "Unknown symbol is not present in the vocab but " - "the user asked for the word that isn't in the vocab." - ) - return self._default_unk_index - def get_freqs(self): """Method obtains vocabulary frequencies. @@ -194,9 +183,9 @@ def padding_index(self): ValueError If the padding symbol is not present in the vocabulary. """ - if SpecialVocabSymbols.PAD not in self.stoi: + if PAD not in self.stoi: raise ValueError("Padding symbol is not in the vocabulary.") - return self.stoi[SpecialVocabSymbols.PAD] + return self.stoi[PAD] def __iadd__(self, values: Union["Vocab", Iterable]): """Adds additional values or another Vocab to this Vocab. 
@@ -405,7 +394,12 @@ def numericalize(self, data): "Cannot numericalize if the vocabulary has not been " "finalized because itos and stoi are not yet built." ) - return np.array([self.stoi[token] for token in data]) + + if UNK in self.stoi: + return np.array([self.stoi[token] if token in stoi else stoi[UNK] + for token in data]) + else: + return np.array([self.stoi[token] for token in data]) def reverse_numericalize(self, numericalized_data: Iterable): """Transforms an iterable containing numericalized data into a list of tokens. From 6a74734344ab52001eaaa51d259f1bb39a81ab29 Mon Sep 17 00:00:00 2001 From: Martin Tutek Date: Tue, 1 Dec 2020 19:28:04 +0100 Subject: [PATCH 02/25] init -> new for bos --- podium/storage/vocab.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/podium/storage/vocab.py b/podium/storage/vocab.py index 1d36237f..d486d8b2 100644 --- a/podium/storage/vocab.py +++ b/podium/storage/vocab.py @@ -50,7 +50,7 @@ def __eq__(self, other): class BOS(Special): - def __init__(cls, token=''): + def __new__(cls, token=''): return super(BOS, cls).__new__(cls, token) def apply(self, sequence): From d598afa306df47f6bd4b021fbce933c8987b95ee Mon Sep 17 00:00:00 2001 From: Martin Tutek Date: Wed, 2 Dec 2020 19:17:56 +0100 Subject: [PATCH 03/25] Add UNK filtering, static constructors --- podium/storage/vocab.py | 51 ++++++++++++++++++++++++++++++++++++++--- 1 file changed, 48 insertions(+), 3 deletions(-) diff --git a/podium/storage/vocab.py b/podium/storage/vocab.py index d486d8b2..7903e34d 100644 --- a/podium/storage/vocab.py +++ b/podium/storage/vocab.py @@ -113,8 +113,7 @@ def __init__( min_freq=1, specials=(UNK(), PAD()), keep_freqs=False, - eager=True, - filter_unk=False + eager=True ): """Vocab constructor. Specials are first in the vocabulary. 
@@ -149,6 +148,46 @@ def __init__( self.finalized = False # flag to know if we're ready to numericalize _LOGGER.debug("Vocabulary has been created and initialized.") + @classmethod + def from_itos(cls, itos): + """Method constructs a vocab from a predefined index-to-string mapping. + + Parameters + ---------- + itos: list | tuple + The index-to-string mapping for tokens in the vocabulary + """ + specials = [token for token in itos if isinstance(token, Special)] + + vocab = cls(specials=specials) + vocab.itos = itos + vocab.stoi = {k: v for k,v in enumerate(itos)} + vocab.finalized = True + + return vocab + + @classmethod + def from_stoi(cls, stoi): + """Method constructs a vocab from a predefined index-to-string mapping. + + Parameters + ---------- + stoi: dict + The string-to-index mapping for the vocabulary + """ + specials = [token for token in stoi.keys() if isinstance(token, Special)] + + vocab = cls(specials=specials) + vocab.stoi = stoi + vocab_max_index = max(stoi.values()) + itos = [None]*(vocab_max_index+1) + for token, index in stoi.items(): + itos[index] = token + vocab.itos = itos + vocab.finalized = True + + return vocab + def get_freqs(self): """Method obtains vocabulary frequencies. @@ -396,10 +435,16 @@ def numericalize(self, data): ) if UNK in self.stoi: + # In order to replace unknown token with UNK, right now both UNK + # has to be present in the Vocab and filter_unk has to be set. + # I'm not sure in which case UNK would be present but filter_unk + # would be set to True. return np.array([self.stoi[token] if token in stoi else stoi[UNK] for token in data]) else: - return np.array([self.stoi[token] for token in data]) + # Either UNK is not in Vocab or the user has requested unknown tokens + # to be filtered out of the instances. + return np.array([self.stoi[token] for token in data if token in stoi]) def reverse_numericalize(self, numericalized_data: Iterable): """Transforms an iterable containing numericalized data into a list of tokens. 
From 91f86e26a4d9a8140e889fb9ccd65b80a9d63e05 Mon Sep 17 00:00:00 2001 From: Martin Tutek Date: Thu, 3 Dec 2020 17:05:29 +0100 Subject: [PATCH 04/25] Add uniqueness check for specials --- podium/storage/vocab.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/podium/storage/vocab.py b/podium/storage/vocab.py index 7903e34d..f83829f3 100644 --- a/podium/storage/vocab.py +++ b/podium/storage/vocab.py @@ -139,6 +139,13 @@ def __init__( self.specials = (self.specials,) self._has_specials = len(self.specials) > 0 + # Apply uniqueness check + if len(specials) > len(set(specials)): + error_msg = ( + f"Specials may not contain multiple instances of same type." + ) + raise ValueError(error_msg) + self.itos = list(self.specials) #self._default_unk_index = self._init_default_unk_index(self.specials) self.stoi = {k: v for v, k in enumerate(self.itos)} From 6736b748a2543a5f7109c19ceae7fb888b8a0a5d Mon Sep 17 00:00:00 2001 From: Martin Tutek Date: Wed, 16 Dec 2020 00:07:10 +0100 Subject: [PATCH 05/25] Disable caching for nondeterministic numericalizers, style changes --- podium/storage/field.py | 6 ++++ podium/storage/vocab.py | 65 ++++++++++++++++++++++++++++++++--------- 2 files changed, 57 insertions(+), 14 deletions(-) diff --git a/podium/storage/field.py b/podium/storage/field.py index d1595b27..646e271f 100644 --- a/podium/storage/field.py +++ b/podium/storage/field.py @@ -790,6 +790,12 @@ def get_numericalization_for_example( cache_field_name = f"{self.name}_" numericalization = example.get(cache_field_name) + # Check if this concrete field can be cached. Fields that have + # non-deterministic numericalizers cannot be cached. Currently, + # we only expect vocabs to be non-deterministic. 
+ cache_field = not self.use_vocab or self.use_vocab and self.vocab.deterministic + cache = cache and cache_field + if numericalization is None: example_data = example[self.name] numericalization = self.numericalize(example_data) diff --git a/podium/storage/vocab.py b/podium/storage/vocab.py index 6e8dfd5a..6a2f7213 100644 --- a/podium/storage/vocab.py +++ b/podium/storage/vocab.py @@ -110,7 +110,8 @@ def __init__( min_freq=1, specials=(UNK(), PAD()), keep_freqs=False, - eager=True + eager=True, + deterministic=True ): """Vocab constructor. Specials are first in the vocabulary. @@ -126,14 +127,30 @@ def __init__( keep_freqs : bool if true word frequencies will be saved for later use on the finalization + eager : bool + if `True` the frequencies will be built immediately upon + dataset loading. While not obvious, the main effect of + this argument if set to `True` is that the frequencies of + the vocabulary will be built on based _all_ datasets + that use this vocabulary, while if set to `False`, the + vocabulary will be built by iterating again over the + datasets passed as argument to the `finalize_fields` + function. + deterministic : bool + if `True`, the numericalization for an instance will + not change between function calls. An example where + this argument should be set to `False` is when the + Vocabulary uses Masking. Setting `deterministic` to + `False` will disable caching for all Fields that + use this Vocabulary. 
""" self._freqs = Counter() self._keep_freqs = keep_freqs self._min_freq = min_freq - self.specials = () if specials is None else specials + self._specials = () if specials is None else specials if not isinstance(self.specials, (tuple, list)): - self.specials = (self.specials,) + self._specials = (self._specials,) self._has_specials = len(self.specials) > 0 # Apply uniqueness check @@ -143,13 +160,33 @@ def __init__( ) raise ValueError(error_msg) - self.itos = list(self.specials) + self._itos = list(self.specials) #self._default_unk_index = self._init_default_unk_index(self.specials) - self.stoi = {k: v for v, k in enumerate(self.itos)} + self._stoi = {k: v for v, k in enumerate(self.itos)} self._max_size = max_size - self.eager = eager - self.finalized = False # flag to know if we're ready to numericalize + self._eager = eager + self._finalized = False # flag to know if we're ready to numericalize + + @property + def eager(self): + return self._eager + + @property + def finalized(self): + return self._finalized + + @property + def specials(self): + return self._specials + + @property + def itos(self): + return self._itos + + @property + def stoi(self): + return self._stoi @classmethod def from_itos(cls, itos): @@ -163,9 +200,9 @@ def from_itos(cls, itos): specials = [token for token in itos if isinstance(token, Special)] vocab = cls(specials=specials) - vocab.itos = itos - vocab.stoi = {k: v for k,v in enumerate(itos)} - vocab.finalized = True + vocab._itos = itos + vocab._stoi = {k: v for k,v in enumerate(itos)} + vocab._finalized = True return vocab @@ -181,13 +218,13 @@ def from_stoi(cls, stoi): specials = [token for token in stoi.keys() if isinstance(token, Special)] vocab = cls(specials=specials) - vocab.stoi = stoi + vocab._stoi = stoi vocab_max_index = max(stoi.values()) itos = [None]*(vocab_max_index+1) for token, index in stoi.items(): itos[index] = token - vocab.itos = itos - vocab.finalized = True + vocab._itos = itos + vocab._finalized = True return 
vocab @@ -410,7 +447,7 @@ def finalize(self): if not self._keep_freqs: self._freqs = None # release memory - self.finalized = True + self._finalized = True def numericalize(self, data): """Method numericalizes given tokens. From ccdd440d0384737984a898b71485c13146e4dddb Mon Sep 17 00:00:00 2001 From: Martin Tutek Date: Wed, 16 Dec 2020 01:23:48 +0100 Subject: [PATCH 06/25] Finalize masking functionality --- podium/storage/vocab.py | 141 +++++++++++++++++++++++++++++++++------- 1 file changed, 117 insertions(+), 24 deletions(-) diff --git a/podium/storage/vocab.py b/podium/storage/vocab.py index 6a2f7213..636c66a6 100644 --- a/podium/storage/vocab.py +++ b/podium/storage/vocab.py @@ -47,42 +47,46 @@ def __eq__(self, other): class BOS(Special): - def __new__(cls, token=''): + def __new__(cls, token=""): return super(BOS, cls).__new__(cls, token) def apply(self, sequence): - return [self] + sequence + return [self] + sequence class EOS(Special): - def __new__(cls, token=''): + def __new__(cls, token=""): return super(EOS, cls).__new__(cls, token) def apply(self, sequence): return sequence + [self] + ################# # Core specials # ################# + class MASK(Special): - def __new__(cls, token=''): + def __new__(cls, token=""): return super(MASK, cls).__new__(cls, token) def apply(self, sequence): # Core special, handled by Vocab return sequence + class UNK(Special): - def __new__(cls, token=''): + def __new__(cls, token=""): return super(UNK, cls).__new__(cls, token) def apply(self, sequence): # Core special, handled by Vocab return sequence + class PAD(Special): - def __new__(cls, token=''): + def __new__(cls, token=""): return super(PAD, cls).__new__(cls, token) def apply(self, sequence): @@ -139,7 +143,7 @@ def __init__( deterministic : bool if `True`, the numericalization for an instance will not change between function calls. 
An example where - this argument should be set to `False` is when the + this argument should be set to `False` is when the Vocabulary uses Masking. Setting `deterministic` to `False` will disable caching for all Fields that use this Vocabulary. @@ -155,19 +159,22 @@ def __init__( # Apply uniqueness check if len(specials) > len(set(specials)): - error_msg = ( - f"Specials may not contain multiple instances of same type." - ) + error_msg = f"Specials may not contain multiple instances of same type." raise ValueError(error_msg) self._itos = list(self.specials) - #self._default_unk_index = self._init_default_unk_index(self.specials) + # self._default_unk_index = self._init_default_unk_index(self.specials) self._stoi = {k: v for v, k in enumerate(self.itos)} self._max_size = max_size self._eager = eager + self._deterministic = deterministic self._finalized = False # flag to know if we're ready to numericalize + @property + def freqs(self): + return self._freqs + @property def eager(self): return self._eager @@ -188,6 +195,10 @@ def itos(self): def stoi(self): return self._stoi + @property + def deterministic(self): + return self._deterministic + @classmethod def from_itos(cls, itos): """Method constructs a vocab from a predefined index-to-string mapping. @@ -201,7 +212,7 @@ def from_itos(cls, itos): vocab = cls(specials=specials) vocab._itos = itos - vocab._stoi = {k: v for k,v in enumerate(itos)} + vocab._stoi = {k: v for k, v in enumerate(itos)} vocab._finalized = True return vocab @@ -220,7 +231,7 @@ def from_stoi(cls, stoi): vocab = cls(specials=specials) vocab._stoi = stoi vocab_max_index = max(stoi.values()) - itos = [None]*(vocab_max_index+1) + itos = [None] * (vocab_max_index + 1) for token, index in stoi.items(): itos[index] = token vocab._itos = itos @@ -247,7 +258,7 @@ def get_freqs(self): "User specified that frequencies aren't kept in " "vocabulary but the get_freqs method is called." 
) - return self._freqs + return self.freqs def padding_index(self): """Method returns padding symbol index. @@ -474,16 +485,15 @@ def numericalize(self, data): ) if UNK in self.stoi: - # In order to replace unknown token with UNK, right now both UNK - # has to be present in the Vocab and filter_unk has to be set. - # I'm not sure in which case UNK would be present but filter_unk - # would be set to True. - return np.array([self.stoi[token] if token in stoi else stoi[UNK] - for token in data]) + # If UNK is not in the vocabulary, we _erase_ the unknown tokens + # from the instances. + return np.array( + [self.stoi[token] if token in self.stoi else stoi[UNK] for token in data] + ) else: # Either UNK is not in Vocab or the user has requested unknown tokens # to be filtered out of the instances. - return np.array([self.stoi[token] for token in data if token in stoi]) + return np.array([self.stoi[token] for token in data if token in self.stoi]) def reverse_numericalize(self, numericalized_data: Iterable): """Transforms an iterable containing numericalized data into a list of tokens. 
@@ -532,7 +542,7 @@ def __len__(self): """ if self.finalized: return len(self.itos) - return len(self._freqs) + return len(self.freqs) def __eq__(self, other): """Two vocabs are same if they have same finalization status, their @@ -552,7 +562,7 @@ def __eq__(self, other): return False if self.finalized != other.finalized: return False - if self._freqs != other._freqs: + if self.freqs != other.freqs: return False if self.stoi != other.stoi: return False @@ -572,7 +582,7 @@ def __iter__(self): iterator over vocab tokens """ if not self.finalized: - return iter(self._freqs.keys()) + return iter(self.freqs.keys()) return iter(self.itos) def __repr__(self): @@ -606,3 +616,86 @@ def __getitem__(self, token): ) return self.stoi[token] + + +class MaskVocab(Vocab): + def __init__(self, vocab, masking_probability=0.15): + if MASK() not in vocab.specials: + # Todo: flesh out error, proof of concept for now + raise ValueError("Mask token not in vocabulary of MaskVocab") + + self.mask_token = vocab.itos[vocab.stoi[MASK()]] + self._vocab = vocab + self._deterministic = False + self.masking_probability = masking_probability + + @property + def vocab(self): + return self._vocab + + @property + def eager(self): + return self.vocab.eager + + @property + def finalized(self): + return self.vocab.finalized + + @property + def specials(self): + return self.vocab.specials + + @property + def itos(self): + return self.vocab.itos + + @property + def stoi(self): + return self.vocab.stoi + + @property + def freqs(self): + return self.vocab.freqs + + @property + def deterministic(self): + return self._deterministic + + + def numericalize(self, data): + """Method numericalizes given tokens. + + Parameters + ---------- + data : iter(str) + iterable collection of tokens + + Returns + ------- + numericalized_vector : array-like + numpy array of numericalized tokens + + Raises + ------ + RuntimeError + If the vocabulary is not finalized. 
+ """ + + # Ensures data is a numpy array + numericalized_data = self.vocab.numericalize(data) + # Create a boolean vector of tokens which should be masked + mask = np.random.binomial(1, self.masking_probability, len(data)).astype(bool) + # Retrieve index of mask token from vocab + mask_index = self[MASK()] + # Overwrite data which should be masked with the mask index + numericalized_data[mask] = mask_index + + return numericalized_data + + def __iadd__(self, values: Union["Vocab", Iterable]): + self.vocab.__iadd__(values) + return self + + def __add__(self, values: Union["Vocab", Iterable]): + self.vocab.__add__(values) + return self From 64a8cd72ef66648ae607d9ce97554724bae93293 Mon Sep 17 00:00:00 2001 From: Martin Tutek Date: Wed, 16 Dec 2020 01:24:45 +0100 Subject: [PATCH 07/25] black, isort --- podium/storage/vocab.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/podium/storage/vocab.py b/podium/storage/vocab.py index 636c66a6..561753cd 100644 --- a/podium/storage/vocab.py +++ b/podium/storage/vocab.py @@ -115,7 +115,7 @@ def __init__( specials=(UNK(), PAD()), keep_freqs=False, eager=True, - deterministic=True + deterministic=True, ): """Vocab constructor. Specials are first in the vocabulary. @@ -661,7 +661,6 @@ def freqs(self): def deterministic(self): return self._deterministic - def numericalize(self, data): """Method numericalizes given tokens. 
From 2f4e9eedcae5899c8edeef8d11542d0e184a725d Mon Sep 17 00:00:00 2001 From: Martin Tutek Date: Fri, 18 Dec 2020 12:10:21 +0100 Subject: [PATCH 08/25] Stash --- podium/storage/vocab.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/podium/storage/vocab.py b/podium/storage/vocab.py index 561753cd..11a94565 100644 --- a/podium/storage/vocab.py +++ b/podium/storage/vocab.py @@ -30,7 +30,6 @@ def unique(values: Iterable): yield element -# Make specials singletons class Special(str): @abc.abstractmethod def apply(self, sequence_or_token): @@ -484,7 +483,7 @@ def numericalize(self, data): "finalized because itos and stoi are not yet built." ) - if UNK in self.stoi: + if UNK() in self.stoi: # If UNK is not in the vocabulary, we _erase_ the unknown tokens # from the instances. return np.array( From 681311d0f4267d0270c6f2c90df1d8c9917127f2 Mon Sep 17 00:00:00 2001 From: Martin Tutek Date: Tue, 22 Dec 2020 14:01:10 +0100 Subject: [PATCH 09/25] Remove maskvocab, document specials, fix fields --- podium/storage/field.py | 346 +++++++++++++++++++++++----------------- podium/storage/vocab.py | 172 +++++++------------- 2 files changed, 254 insertions(+), 264 deletions(-) diff --git a/podium/storage/field.py b/podium/storage/field.py index 646e271f..635460da 100644 --- a/podium/storage/field.py +++ b/podium/storage/field.py @@ -63,148 +63,6 @@ def clear(self): self.hooks.clear() -class MultioutputField: - """Field that does pretokenization and tokenization once and passes it to its - output fields. Output fields are any type of field. The output fields are used only - for posttokenization processing (posttokenization hooks and vocab updating).""" - - def __init__( - self, - output_fields: List["Field"], - tokenizer: TokenizerType = "split", - pretokenize_hooks: Optional[Iterable[PretokenizationHookType]] = None, - ): - """Field that does pretokenization and tokenization once and passes it to its - output fields. Output fields are any type of field. 
The output fields are used - only for posttokenization processing (posttokenization hooks and vocab updating). - - Parameters - ---------- - output_fields : List[Field], - List containig the output fields. The pretokenization hooks and tokenizer - in these fields are ignored and only posttokenization hooks are used. - tokenizer : Optional[Union[str, Callable]] - The tokenizer that is to be used when preprocessing raw data - (only if 'tokenize' is True). The user can provide his own - tokenizer as a callable object or specify one of the premade - tokenizers by a string. The available premade tokenizers are: - - - 'split' - default str.split() - - 'spacy-lang' - the spacy tokenizer. The language model can be defined - by replacing `lang` with the language model name. For example `spacy-en` - - pretokenize_hooks: Iterable[Callable[[Any], Any]] - Iterable containing pretokenization hooks. Providing hooks in this way is - identical to calling `add_pretokenize_hook`. - """ - - self._tokenizer_arg = tokenizer - self._pretokenization_pipeline = PretokenizationPipeline() - - if pretokenize_hooks is not None: - if not isinstance(pretokenize_hooks, (list, tuple)): - pretokenize_hooks = [pretokenize_hooks] - for hook in pretokenize_hooks: - self.add_pretokenize_hook(hook) - - self._tokenizer = get_tokenizer(tokenizer) - self._output_fields = deque(output_fields) - - def add_pretokenize_hook(self, hook: PretokenizationHookType): - """Add a pre-tokenization hook to the MultioutputField. - If multiple hooks are added to the field, the order of their execution - will be the same as the order in which they were added to the field, - each subsequent hook taking the output of the previous hook as its - input. - If the same function is added to the Field as a hook multiple times, - it will be executed that many times. - The output of the final pre-tokenization hook is the raw data that the - tokenizer will get as its input. 
- - Pretokenize hooks have the following signature: - func pre_tok_hook(raw_data): - raw_data_out = do_stuff(raw_data) - return raw_data_out - - This can be used to eliminate encoding errors in data, replace numbers - and names, etc. - - Parameters - ---------- - hook : Callable[[Any], Any] - The pre-tokenization hook that we want to add to the field. - """ - self._pretokenization_pipeline.add_hook(hook) - - def _run_pretokenization_hooks(self, data: Any) -> Any: - """Runs pretokenization hooks on the raw data and returns the result. - - Parameters - ---------- - data : Any - data to be processed - - Returns - ------- - Any - processed data - - """ - - return self._pretokenization_pipeline(data) - - def add_output_field(self, field: "Field"): - """ - Adds the passed field to this field's output fields. - - Parameters - ---------- - field : Field - Field to add to output fields. - """ - self._output_fields.append(field) - - def preprocess(self, data: Any) -> Iterable[Tuple[str, Tuple[Optional[Any], Any]]]: - """Preprocesses raw data, tokenizing it if required. The outputfields update their - vocabs if required and preserve the raw data if the output field's - 'keep_raw' is true. - - Parameters - ---------- - data : Any - The raw data that needs to be preprocessed. - - Returns - ------- - Iterable[Tuple[str, Tuple[Optional[Any], Any]]] - An Iterable containing the raw and tokenized data of all the output fields. - The structure of the returned tuples is (name, (raw, tokenized)), where 'name' - is the name of the output field and raw and tokenized are processed data. - - Raises - ------ - If data is None and missing data is not allowed. - """ - data = self._run_pretokenization_hooks(data) - tokens = self._tokenizer(data) if self._tokenizer is not None else data - return tuple(field._process_tokens(data, tokens) for field in self._output_fields) - - def get_output_fields(self) -> Iterable["Field"]: - """ - Returns an Iterable of the contained output fields. 
- - Returns - ------- - Iterable[Field] : - an Iterable of the contained output fields. - """ - return self._output_fields - - def remove_pretokenize_hooks(self): - """Remove all the pre-tokenization hooks that were added to the MultioutputField.""" - self._pretokenization_pipeline.clear() - - class Field: """Holds the preprocessing and numericalization logic for a single field of a dataset. @@ -220,6 +78,7 @@ def __init__( fixed_length: Optional[int] = None, allow_missing_data: bool = False, disable_batch_matrix: bool = False, + deterministic: bool = True, padding_token: Union[int, float] = -999, missing_data_token: Union[int, float] = -1, pretokenize_hooks: Optional[Iterable[PretokenizationHookType]] = None, @@ -284,6 +143,15 @@ def __init__( If True, a list of unpadded vectors(or other data type) will be returned instead. For missing data, the value in the list will be None. + deterministic : bool + Flag which determines whether this Field has deterministic or nondeterministic + numericalization (numericalization for the same instance can be different between + function calls). Disables numericalization caching for this Field. The flag is + passed to the numericalizer to indicate to use the nondeterministic setting. + E.g., in the case of masked language modelling, we wish the inputs to be masked + (nondeterministic), and the outputs (labels) to not be masked while using the + same vocabulary. + padding_token : int Padding token used when numericalizer is a callable. If the numericalizer is None or a Vocab, this value is ignored. @@ -314,6 +182,7 @@ def __init__( ) self._name = name self._disable_batch_matrix = disable_batch_matrix + self._deterministic = deterministic self._tokenizer_arg_string = tokenizer if isinstance(tokenizer, str) else None if tokenizer is None: @@ -791,10 +660,9 @@ def get_numericalization_for_example( numericalization = example.get(cache_field_name) # Check if this concrete field can be cached. 
Fields that have - # non-deterministic numericalizers cannot be cached. Currently, - # we only expect vocabs to be non-deterministic. - cache_field = not self.use_vocab or self.use_vocab and self.vocab.deterministic - cache = cache and cache_field + # non-deterministic numericalizers cannot be cached. + + cache = cache and self.deterministic if numericalization is None: example_data = example[self.name] @@ -852,6 +720,148 @@ def get_output_fields(self) -> Iterable["Field"]: return (self,) +class MultioutputField: + """Field that does pretokenization and tokenization once and passes it to its + output fields. Output fields are any type of field. The output fields are used only + for posttokenization processing (posttokenization hooks and vocab updating).""" + + def __init__( + self, + output_fields: List["Field"], + tokenizer: TokenizerType = "split", + pretokenize_hooks: Optional[Iterable[PretokenizationHookType]] = None, + ): + """Field that does pretokenization and tokenization once and passes it to its + output fields. Output fields are any type of field. The output fields are used + only for posttokenization processing (posttokenization hooks and vocab updating). + + Parameters + ---------- + output_fields : List[Field], + List containig the output fields. The pretokenization hooks and tokenizer + in these fields are ignored and only posttokenization hooks are used. + tokenizer : Optional[Union[str, Callable]] + The tokenizer that is to be used when preprocessing raw data + (only if 'tokenize' is True). The user can provide his own + tokenizer as a callable object or specify one of the premade + tokenizers by a string. The available premade tokenizers are: + + - 'split' - default str.split() + - 'spacy-lang' - the spacy tokenizer. The language model can be defined + by replacing `lang` with the language model name. For example `spacy-en` + + pretokenize_hooks: Iterable[Callable[[Any], Any]] + Iterable containing pretokenization hooks. 
Providing hooks in this way is + identical to calling `add_pretokenize_hook`. + """ + + self._tokenizer_arg = tokenizer + self._pretokenization_pipeline = PretokenizationPipeline() + + if pretokenize_hooks is not None: + if not isinstance(pretokenize_hooks, (list, tuple)): + pretokenize_hooks = [pretokenize_hooks] + for hook in pretokenize_hooks: + self.add_pretokenize_hook(hook) + + self._tokenizer = get_tokenizer(tokenizer) + self._output_fields = deque(output_fields) + + def add_pretokenize_hook(self, hook: PretokenizationHookType): + """Add a pre-tokenization hook to the MultioutputField. + If multiple hooks are added to the field, the order of their execution + will be the same as the order in which they were added to the field, + each subsequent hook taking the output of the previous hook as its + input. + If the same function is added to the Field as a hook multiple times, + it will be executed that many times. + The output of the final pre-tokenization hook is the raw data that the + tokenizer will get as its input. + + Pretokenize hooks have the following signature: + func pre_tok_hook(raw_data): + raw_data_out = do_stuff(raw_data) + return raw_data_out + + This can be used to eliminate encoding errors in data, replace numbers + and names, etc. + + Parameters + ---------- + hook : Callable[[Any], Any] + The pre-tokenization hook that we want to add to the field. + """ + self._pretokenization_pipeline.add_hook(hook) + + def _run_pretokenization_hooks(self, data: Any) -> Any: + """Runs pretokenization hooks on the raw data and returns the result. + + Parameters + ---------- + data : Any + data to be processed + + Returns + ------- + Any + processed data + + """ + + return self._pretokenization_pipeline(data) + + def add_output_field(self, field: "Field"): + """ + Adds the passed field to this field's output fields. + + Parameters + ---------- + field : Field + Field to add to output fields. 
+ """ + self._output_fields.append(field) + + def preprocess(self, data: Any) -> Iterable[Tuple[str, Tuple[Optional[Any], Any]]]: + """Preprocesses raw data, tokenizing it if required. The outputfields update their + vocabs if required and preserve the raw data if the output field's + 'keep_raw' is true. + + Parameters + ---------- + data : Any + The raw data that needs to be preprocessed. + + Returns + ------- + Iterable[Tuple[str, Tuple[Optional[Any], Any]]] + An Iterable containing the raw and tokenized data of all the output fields. + The structure of the returned tuples is (name, (raw, tokenized)), where 'name' + is the name of the output field and raw and tokenized are processed data. + + Raises + ------ + If data is None and missing data is not allowed. + """ + data = self._run_pretokenization_hooks(data) + tokens = self._tokenizer(data) if self._tokenizer is not None else data + return tuple(field._process_tokens(data, tokens) for field in self._output_fields) + + def get_output_fields(self) -> Iterable["Field"]: + """ + Returns an Iterable of the contained output fields. + + Returns + ------- + Iterable[Field] : + an Iterable of the contained output fields. + """ + return self._output_fields + + def remove_pretokenize_hooks(self): + """Remove all the pre-tokenization hooks that were added to the MultioutputField.""" + self._pretokenization_pipeline.clear() + + class LabelField(Field): """Field subclass used when no tokenization is required. For example, with a field that has a single value denoting a label. 
@@ -860,8 +870,10 @@ class LabelField(Field): def __init__( self, name: str, - numericalizer: NumericalizerType = None, + numericalizer: Optional[Union[Vocab, NumericalizerType]] = None, allow_missing_data: bool = False, + disable_batch_matrix: bool = False, + deterministic: bool = True, is_target: bool = True, missing_data_token: Union[int, float] = -1, pretokenize_hooks: Optional[Iterable[PretokenizationHookType]] = None, @@ -891,6 +903,22 @@ def __init__( If 'allow_missing_data' is True, if a None is sent to be preprocessed, it will be stored and later numericalized properly. + disable_batch_matrix: bool + Whether the batch created for this field will be compressed into a matrix. + If False, the batch returned by an Iterator or Dataset.batch() will contain + a matrix of numericalizations for all examples (if possible). + If True, a list of unpadded vectors(or other data type) will be returned + instead. For missing data, the value in the list will be None. + + deterministic : bool + Flag which determines whether this Field has deterministic or nondeterministic + numericalization (numericalization for the same instance can be different between + function calls). Disables numericalization caching for this Field. The flag is + passed to the numericalizer to indicate to use the nondeterministic setting. + E.g., in the case of masked language modelling, we wish the inputs to be masked + (nondeterministic), and the outputs (labels) to not be masked while using the + same vocabulary. + is_target : bool Whether this field is a target variable. Affects iteration over batches. 
@@ -923,6 +951,8 @@ def __init__( is_target=is_target, fixed_length=1, allow_missing_data=allow_missing_data, + disable_batch_matrix=disable_batch_matrix, + deterministic=deterministic, missing_data_token=missing_data_token, pretokenize_hooks=pretokenize_hooks, ) @@ -937,10 +967,12 @@ def __init__( self, name: str, tokenizer: TokenizerType = None, - numericalizer: NumericalizerType = None, + numericalizer: Optional[Union[Vocab, NumericalizerType]] = None, num_of_classes: Optional[int] = None, is_target: bool = True, allow_missing_data: bool = False, + disable_batch_matrix: bool = False, + deterministic: bool = True, missing_data_token: Union[int, float] = -1, pretokenize_hooks: Optional[Iterable[PretokenizationHookType]] = None, posttokenize_hooks: Optional[Iterable[PosttokenizationHookType]] = None, @@ -991,6 +1023,22 @@ def __init__( If 'allow_missing_data' is True, if a None is sent to be preprocessed, it will be stored and later numericalized properly. + disable_batch_matrix: bool + Whether the batch created for this field will be compressed into a matrix. + If False, the batch returned by an Iterator or Dataset.batch() will contain + a matrix of numericalizations for all examples (if possible). + If True, a list of unpadded vectors(or other data type) will be returned + instead. For missing data, the value in the list will be None. + + deterministic : bool + Flag which determines whether this Field has deterministic or nondeterministic + numericalization (numericalization for the same instance can be different between + function calls). Disables numericalization caching for this Field. The flag is + passed to the numericalizer to indicate to use the nondeterministic setting. + E.g., in the case of masked language modelling, we wish the inputs to be masked + (nondeterministic), and the outputs (labels) to not be masked while using the + same vocabulary. + missing_data_token : Union[int, float] Token to use to mark batch rows as missing. 
If data for a field is missing, its matrix row will be filled with this value. For non-numericalizable fields, @@ -1029,6 +1077,8 @@ def __init__( is_target=is_target, fixed_length=num_of_classes, allow_missing_data=allow_missing_data, + disable_batch_matrix=disable_batch_matrix, + deterministic=deterministic, missing_data_token=missing_data_token, pretokenize_hooks=pretokenize_hooks, posttokenize_hooks=posttokenize_hooks, diff --git a/podium/storage/vocab.py b/podium/storage/vocab.py index 11a94565..59e7c524 100644 --- a/podium/storage/vocab.py +++ b/podium/storage/vocab.py @@ -31,33 +31,66 @@ def unique(values: Iterable): class Special(str): - @abc.abstractmethod - def apply(self, sequence_or_token): - # Method should ONLY be used in Vocab.numericalize - pass - + """Base class for a special token. + + Every special token is a subclass of string (this way one can) + easily modify the concrete string representation of the special. + The functionality of the special token, which acts the same as + a post-tokenization hook should be implemented in the `apply` + instance method for each subclass. We ensure that each special + token will be present in the Vocab. + """ def __hash__(self): - # Hash class instead of value + """Overrides hash. + + Check docs of `__eq__` for motivation. + """ return hash(self.__class__) def __eq__(self, other): - # Check equals via class instead of value + """ Check equals via class instead of value. + The motivation behind this is that we want to be able to + match the special token by class and not by value, as it + is the type of the special token that determines its + functionality. + This way we allow for the concrete string representation + of the special to be easily changed, while retaining simple + existence checks for vocab functionality. + """ return self.__class__ == other.__class__ + def apply(self, sequence): + """Apply (insert) the special token in the adequate + place in the sequence. 
+ """ + raise NotImplementedError + class BOS(Special): + """The beginning-of-sequence special token. + """ def __new__(cls, token=""): + """Provides default value upon creation for the BOS token. + """ return super(BOS, cls).__new__(cls, token) def apply(self, sequence): + """ Apply the BOS token, adding it to the start of the sequence + """ return [self] + sequence class EOS(Special): + """The end-of-sequence special token. + """ def __new__(cls, token=""): + """Provides default value upon creation for the EOS token. + """ return super(EOS, cls).__new__(cls, token) def apply(self, sequence): + """ Apply the EOS token, adding it to the start of the sequence + """ return sequence + [self] @@ -65,31 +98,33 @@ def apply(self, sequence): # Core specials # ################# - -class MASK(Special): - def __new__(cls, token=""): - return super(MASK, cls).__new__(cls, token) - - def apply(self, sequence): - # Core special, handled by Vocab - return sequence - - class UNK(Special): + """The unknown core special token. + """ def __new__(cls, token=""): + """Provides default value upon creation for the UNK token. + """ return super(UNK, cls).__new__(cls, token) def apply(self, sequence): - # Core special, handled by Vocab + """Core special, handled by Vocab + """ + # Perhaps indicate somehow that this call isn't an op. return sequence class PAD(Special): + """The padding core special token. + """ def __new__(cls, token=""): + """Provides default value upon creation for the PAD token. + """ return super(PAD, cls).__new__(cls, token) def apply(self, sequence): - # Core special, handled by Vocab + """Core special, handled by Vocab + """ + # Perhaps indicate somehow that this call isn't an op. return sequence @@ -114,7 +149,6 @@ def __init__( specials=(UNK(), PAD()), keep_freqs=False, eager=True, - deterministic=True, ): """Vocab constructor. Specials are first in the vocabulary. 
@@ -139,13 +173,6 @@ def __init__( vocabulary will be built by iterating again over the datasets passed as argument to the `finalize_fields` function. - deterministic : bool - if `True`, the numericalization for an instance will - not change between function calls. An example where - this argument should be set to `False` is when the - Vocabulary uses Masking. Setting `deterministic` to - `False` will disable caching for all Fields that - use this Vocabulary. """ self._freqs = Counter() self._keep_freqs = keep_freqs @@ -167,7 +194,6 @@ def __init__( self._max_size = max_size self._eager = eager - self._deterministic = deterministic self._finalized = False # flag to know if we're ready to numericalize @property @@ -194,10 +220,6 @@ def itos(self): def stoi(self): return self._stoi - @property - def deterministic(self): - return self._deterministic - @classmethod def from_itos(cls, itos): """Method constructs a vocab from a predefined index-to-string mapping. @@ -272,9 +294,9 @@ def padding_index(self): ValueError If the padding symbol is not present in the vocabulary. """ - if PAD not in self.stoi: + if PAD() not in self.stoi: raise ValueError("Padding symbol is not in the vocabulary.") - return self.stoi[PAD] + return self.stoi[PAD()] def __iadd__(self, values: Union["Vocab", Iterable]): """Adds additional values or another Vocab to this Vocab. 
@@ -615,85 +637,3 @@ def __getitem__(self, token): ) return self.stoi[token] - - -class MaskVocab(Vocab): - def __init__(self, vocab, masking_probability=0.15): - if MASK() not in vocab.specials: - # Todo: flesh out error, proof of concept for now - raise ValueError("Mask token not in vocabulary of MaskVocab") - - self.mask_token = vocab.itos[vocab.stoi[MASK()]] - self._vocab = vocab - self._deterministic = False - self.masking_probability = masking_probability - - @property - def vocab(self): - return self._vocab - - @property - def eager(self): - return self.vocab.eager - - @property - def finalized(self): - return self.vocab.finalized - - @property - def specials(self): - return self.vocab.specials - - @property - def itos(self): - return self.vocab.itos - - @property - def stoi(self): - return self.vocab.stoi - - @property - def freqs(self): - return self.vocab.freqs - - @property - def deterministic(self): - return self._deterministic - - def numericalize(self, data): - """Method numericalizes given tokens. - - Parameters - ---------- - data : iter(str) - iterable collection of tokens - - Returns - ------- - numericalized_vector : array-like - numpy array of numericalized tokens - - Raises - ------ - RuntimeError - If the vocabulary is not finalized. 
- """ - - # Ensures data is a numpy array - numericalized_data = self.vocab.numericalize(data) - # Create a boolean vector of tokens which should be masked - mask = np.random.binomial(1, self.masking_probability, len(data)).astype(bool) - # Retrieve index of mask token from vocab - mask_index = self[MASK()] - # Overwrite data which should be masked with the mask index - numericalized_data[mask] = mask_index - - return numericalized_data - - def __iadd__(self, values: Union["Vocab", Iterable]): - self.vocab.__iadd__(values) - return self - - def __add__(self, values: Union["Vocab", Iterable]): - self.vocab.__add__(values) - return self From 539798044ad762ddc089bde9ecf0482e63eb89d0 Mon Sep 17 00:00:00 2001 From: Martin Tutek Date: Tue, 22 Dec 2020 14:36:28 +0100 Subject: [PATCH 10/25] Fix previous tests --- podium/storage/field.py | 4 ++ podium/storage/vocab.py | 15 +++-- tests/storage/test_field.py | 5 +- tests/storage/test_tfidf.py | 14 ++--- tests/storage/test_vocab.py | 110 +++++++++++++++++++----------------- 5 files changed, 82 insertions(+), 66 deletions(-) diff --git a/podium/storage/field.py b/podium/storage/field.py index 635460da..91329f19 100644 --- a/podium/storage/field.py +++ b/podium/storage/field.py @@ -261,6 +261,10 @@ def vocab(self): """""" return self._vocab + @property + def deterministic(self): + return self._deterministic + @property def use_vocab(self): """A flag that tells whether the field uses a vocab or not. diff --git a/podium/storage/vocab.py b/podium/storage/vocab.py index 59e7c524..730e7765 100644 --- a/podium/storage/vocab.py +++ b/podium/storage/vocab.py @@ -184,7 +184,7 @@ def __init__( self._has_specials = len(self.specials) > 0 # Apply uniqueness check - if len(specials) > len(set(specials)): + if len(self.specials) > len(set(self.specials)): error_msg = f"Specials may not contain multiple instances of same type." 
raise ValueError(error_msg) @@ -349,7 +349,7 @@ def __iadd__(self, values: Union["Vocab", Iterable]): ) # unique is used instead of set to somewhat preserve ordering - self.specials = list(unique(chain(self.specials, other_vocab.specials))) + self._specials = list(unique(chain(self.specials, other_vocab.specials))) self._has_specials = len(self.specials) > 0 self._itos = list(self.specials) self._freqs += other_vocab._freqs # add freqs to this instance @@ -486,8 +486,8 @@ def numericalize(self, data): Parameters ---------- - data : iter(str) - iterable collection of tokens + data : str | iter(str) + a single token or iterable collection of tokens Returns ------- @@ -505,11 +505,16 @@ def numericalize(self, data): "finalized because itos and stoi are not yet built." ) + if isinstance(data, str): + # Wrap string into list + data = [data] + if UNK() in self.stoi: # If UNK is not in the vocabulary, we _erase_ the unknown tokens # from the instances. + unk_token = self.stoi[UNK()] return np.array( - [self.stoi[token] if token in self.stoi else stoi[UNK] for token in data] + [self.stoi[token] if token in self.stoi else unk_token for token in data] ) else: # Either UNK is not in Vocab or the user has requested unknown tokens diff --git a/tests/storage/test_field.py b/tests/storage/test_field.py index 6f785512..150520b0 100644 --- a/tests/storage/test_field.py +++ b/tests/storage/test_field.py @@ -10,10 +10,11 @@ LabelField, MultilabelField, MultioutputField, - SpecialVocabSymbols, Vocab, ) +from podium.storage.vocab import UNK + ONE_TO_FIVE = [1, 2, 3, 4, 5] @@ -520,7 +521,7 @@ def test_multilabel_field_specials_in_vocab_fail(): with pytest.raises(ValueError): MultilabelField( name="bla", - numericalizer=Vocab(specials=(SpecialVocabSymbols.UNK,)), + numericalizer=Vocab(specials=(UNK())), num_of_classes=10, ) diff --git a/tests/storage/test_tfidf.py b/tests/storage/test_tfidf.py index cfe1e1ac..1e93d0ed 100644 --- a/tests/storage/test_tfidf.py +++ 
b/tests/storage/test_tfidf.py @@ -4,7 +4,7 @@ from podium.storage.field import Field from podium.storage.vectorizers.tfidf import CountVectorizer, TfIdfVectorizer -from podium.storage.vocab import SpecialVocabSymbols, Vocab +from podium.storage.vocab import UNK, PAD, Vocab from .conftest import TABULAR_TEXT @@ -49,7 +49,7 @@ def test_build_count_matrix_from_tensor_without_specials(): def test_build_count_matrix_from_tensor_with_specials(): - vocab = Vocab(specials=(SpecialVocabSymbols.UNK, SpecialVocabSymbols.PAD)) + vocab = Vocab(specials=(UNK(), PAD())) for i in DATA: vocab += i.split(" ") vocab.finalize() @@ -72,7 +72,7 @@ def test_build_count_matrix_from_tensor_with_specials(): def test_build_count_matrix_out_of_vocab_words(): - vocab = Vocab(specials=(SpecialVocabSymbols.UNK, SpecialVocabSymbols.PAD)) + vocab = Vocab(specials=(UNK(), PAD())) vocab_words = ["this", "is", "the", "first", "document"] vocab += vocab_words vocab.finalize() @@ -108,12 +108,12 @@ def test_build_count_matrix_costum_specials_vocab_without_specials(): def test_build_count_matrix_costum_specials_vocab_with_specials(): - vocab = Vocab(specials=(SpecialVocabSymbols.UNK, SpecialVocabSymbols.PAD)) + vocab = Vocab(specials=(UNK(), PAD())) vocab_words = ["this", "is", "the", "first", "document"] vocab += vocab_words vocab.finalize() tfidf = TfIdfVectorizer( - vocab=vocab, specials=[SpecialVocabSymbols.PAD, "this", "first"] + vocab=vocab, specials=[PAD(), "this", "first"] ) tfidf._init_special_indexes() @@ -126,7 +126,7 @@ def test_build_count_matrix_costum_specials_vocab_with_specials(): def test_specials_indexes(): - specials = (SpecialVocabSymbols.UNK, SpecialVocabSymbols.PAD) + specials = (UNK(), PAD()) vocab = Vocab(specials=specials) for i in DATA: vocab += i.split(" ") @@ -247,7 +247,7 @@ def test_count_vectorizer_examples_none(tabular_dataset): def test_count_matrix_specials_indexes(): - specials = (SpecialVocabSymbols.UNK, SpecialVocabSymbols.PAD) + specials = (UNK(), PAD()) vocab = 
Vocab(specials=specials) for i in DATA: vocab += i.split(" ") diff --git a/tests/storage/test_vocab.py b/tests/storage/test_vocab.py index c39caf3a..4b04cf77 100644 --- a/tests/storage/test_vocab.py +++ b/tests/storage/test_vocab.py @@ -3,6 +3,8 @@ import dill import pytest +import numpy as np + from podium.storage import vocab @@ -103,23 +105,24 @@ def test_empty_specials_get_pad_symbol(): with pytest.raises(ValueError): voc.padding_index() - -def test_empty_specials_stoi(): - voc = vocab.Vocab(specials=[]) - data = ["tree", "plant", "grass"] - voc = voc + set(data) - voc.finalize() - with pytest.raises(ValueError): - voc.stoi["apple"] +# Won't raise error anymore as we now filter +# unknown tokens +#def test_empty_specials_stoi(): +# voc = vocab.Vocab(specials=[]) +# data = ["tree", "plant", "grass"] +# voc = voc + set(data) +# voc.finalize() +# with pytest.raises(ValueError): +# voc.stoi["apple"] def test_specials_get_pad_symbol(): - voc = vocab.Vocab(specials=(vocab.SpecialVocabSymbols.PAD,)) + voc = vocab.Vocab(specials=(vocab.PAD(),)) data = ["tree", "plant", "grass"] voc = voc + set(data) assert voc.padding_index() == 0 voc.finalize() - assert voc.itos[0] == vocab.SpecialVocabSymbols.PAD + assert voc.itos[0] == vocab.PAD() def test_max_size(): @@ -133,7 +136,7 @@ def test_max_size(): def test_max_size_with_specials(): voc = vocab.Vocab( max_size=2, - specials=[vocab.SpecialVocabSymbols.PAD, vocab.SpecialVocabSymbols.UNK], + specials=[vocab.PAD(), vocab.UNK()], ) data = ["tree", "plant", "grass"] voc = (voc + set(data)) + {"plant"} @@ -142,7 +145,7 @@ def test_max_size_with_specials(): def test_size_after_final_with_specials(): - specials = [vocab.SpecialVocabSymbols.PAD, vocab.SpecialVocabSymbols.UNK] + specials = [vocab.PAD(), vocab.UNK()] voc = vocab.Vocab(specials=specials) data = ["tree", "plant", "grass"] voc = (voc + set(data)) + {"plant"} @@ -151,17 +154,17 @@ def test_size_after_final_with_specials(): def test_enum_special_vocab_symbols(): - assert 
vocab.SpecialVocabSymbols.PAD.value == "" - assert vocab.SpecialVocabSymbols.UNK.value == "" + assert str(vocab.PAD()) == "" + assert str(vocab.UNK()) == "" def test_get_stoi_for_unknown_word_default_unk(): - specials = [vocab.SpecialVocabSymbols.PAD, vocab.SpecialVocabSymbols.UNK] + specials = [vocab.PAD(), vocab.UNK()] voc = vocab.Vocab(specials=specials) data = ["tree", "plant", "grass"] voc = (voc + set(data)) + {"plant"} voc.finalize() - assert voc.stoi["unknown"] == 1 + assert voc.numericalize("unknown") == 1 def test_iadd_word_after_finalization_error(): @@ -189,19 +192,19 @@ def test_add_vocab_to_vocab(): for word in voc._freqs: assert voc._freqs[word] == expected_freq[word] - voc3 = vocab.Vocab(specials=vocab.SpecialVocabSymbols.UNK) + voc3 = vocab.Vocab(specials=vocab.UNK()) voc3 += data1 voc3 += data3 voc3.finalize() - voc4 = vocab.Vocab(specials=vocab.SpecialVocabSymbols.PAD) + voc4 = vocab.Vocab(specials=vocab.PAD()) voc4 += data2 voc4.finalize() voc = voc3 + voc4 assert set(voc.specials) == { - vocab.SpecialVocabSymbols.PAD, - vocab.SpecialVocabSymbols.UNK, + vocab.PAD(), + vocab.UNK(), } assert voc.finalized assert len(voc.itos) == 7 @@ -212,10 +215,10 @@ def test_iadd_vocab_to_vocab(): data2 = ["a1", "a2", "w1"] expected_freqs = {"w1": 2, "w2": 1, "w3": 1, "a1": 1, "a2": 1} - voc1 = vocab.Vocab(specials=vocab.SpecialVocabSymbols.PAD) + voc1 = vocab.Vocab(specials=vocab.PAD()) voc1 += data1 - voc2 = vocab.Vocab(specials=vocab.SpecialVocabSymbols.UNK) + voc2 = vocab.Vocab(specials=vocab.UNK()) voc2 += data2 voc1 += voc2 @@ -223,7 +226,7 @@ def test_iadd_vocab_to_vocab(): assert voc1.get_freqs() == expected_freqs assert all( spec in voc1.specials - for spec in (vocab.SpecialVocabSymbols.PAD, vocab.SpecialVocabSymbols.UNK) + for spec in (vocab.PAD(), vocab.UNK()) ) @@ -352,14 +355,14 @@ def test_equals_two_vocabs_different_freq(): voc2 += ["a"] assert voc1 != voc2 - +# This won't fail anymore, should change to +# test_vocab_filer_unk def 
test_vocab_fail_no_unk(): voc = vocab.Vocab(specials=()) voc += [1, 2, 3, 4, 5] voc.finalize() - with pytest.raises(ValueError): - voc.numericalize([1, 2, 3, 6]) + assert np.array_equal(voc.numericalize([1, 2, 3, 6]), np.array([0,1,2])) def test_vocab_has_no_specials(): @@ -374,33 +377,36 @@ def test_vocab_has_specials(): voc = vocab.Vocab() assert voc.has_specials - voc2 = vocab.Vocab(specials=vocab.SpecialVocabSymbols.UNK) + voc2 = vocab.Vocab(specials=vocab.UNK()) assert voc2._has_specials - assert voc2.specials == (vocab.SpecialVocabSymbols.UNK,) - - -def test_vocab_dict_normal_dict_use(): - vocab_dict = vocab.VocabDict() - vocab_dict["first"] = 2 - vocab_dict["second"] = 5 - assert len(vocab_dict) == 2 - assert vocab_dict["first"] == 2 - assert vocab_dict["second"] == 5 - - -def test_vocab_dict_default_factory(): - vocab_dict = vocab.VocabDict(default_factory=lambda: "default") - vocab_dict["item"] = 1 - assert len(vocab_dict) == 1 - assert vocab_dict["unkown_element"] == "default" - assert "unkown_element" not in vocab_dict - assert len(vocab_dict) == 1 - - -def test_vocab_dict_default_factory_none_error(): - vocab_dict = vocab.VocabDict(default_factory=None) - with pytest.raises(KeyError): - vocab_dict["item_not_in_dict"] + assert voc2.specials == (vocab.UNK(),) + +# There is no more vocabdict (applie to next 2 tests) + +#def test_vocab_dict_normal_dict_use(): +# vocab_dict = vocab.VocabDict() +# vocab_dict["first"] = 2 +# vocab_dict["second"] = 5 +# assert len(vocab_dict) == 2 +# assert vocab_dict["first"] == 2 +# assert vocab_dict["second"] == 5 + + +#def test_vocab_dict_default_factory(): +# vocab_dict = vocab.VocabDict(default_factory=lambda: "default") +# vocab_dict["item"] = 1 +# assert len(vocab_dict) == 1 +# assert vocab_dict["unkown_element"] == "default" +# assert "unkown_element" not in vocab_dict +# assert len(vocab_dict) == 1 + + +# This doesn't raise error anymore as we filter out unknown +# tokens when the UNK special isn't defined +#def 
test_vocab_dict_default_factory_none_error(): +# vocab_dict = vocab.VocabDict(default_factory=None) +# with pytest.raises(KeyError): +# vocab_dict["item_not_in_dict"] def test_reverse_numericalize(): From a5b671bb4f0626bec5d76303cc555bd3207e7fa9 Mon Sep 17 00:00:00 2001 From: Martin Tutek Date: Wed, 23 Dec 2020 14:27:54 +0100 Subject: [PATCH 11/25] Wrap up tests & style --- podium/storage/vocab.py | 48 ++++++-------- tests/storage/conftest.py | 11 ++- tests/storage/test_field.py | 9 +-- tests/storage/test_iterator.py | 16 +++++ tests/storage/test_tfidf.py | 6 +- tests/storage/test_vocab.py | 118 +++++++++++++++++++++------------ 6 files changed, 124 insertions(+), 84 deletions(-) diff --git a/podium/storage/vocab.py b/podium/storage/vocab.py index 730e7765..49d59de5 100644 --- a/podium/storage/vocab.py +++ b/podium/storage/vocab.py @@ -40,6 +40,7 @@ class Special(str): instance method for each subclass. We ensure that each special token will be present in the Vocab. """ + def __hash__(self): """Overrides hash. @@ -48,10 +49,10 @@ def __hash__(self): return hash(self.__class__) def __eq__(self, other): - """ Check equals via class instead of value. + """Check equals via class instead of value. The motivation behind this is that we want to be able to match the special token by class and not by value, as it - is the type of the special token that determines its + is the type of the special token that determines its functionality. This way we allow for the concrete string representation of the special to be easily changed, while retaining simple @@ -67,30 +68,26 @@ def apply(self, sequence): class BOS(Special): - """The beginning-of-sequence special token. - """ + """The beginning-of-sequence special token.""" + def __new__(cls, token=""): - """Provides default value upon creation for the BOS token. 
- """ + """Provides default value upon creation for the BOS token.""" return super(BOS, cls).__new__(cls, token) def apply(self, sequence): - """ Apply the BOS token, adding it to the start of the sequence - """ + """Apply the BOS token, adding it to the start of the sequence""" return [self] + sequence class EOS(Special): - """The end-of-sequence special token. - """ + """The end-of-sequence special token.""" + def __new__(cls, token=""): - """Provides default value upon creation for the EOS token. - """ + """Provides default value upon creation for the EOS token.""" return super(EOS, cls).__new__(cls, token) def apply(self, sequence): - """ Apply the EOS token, adding it to the start of the sequence - """ + """Apply the EOS token, adding it to the start of the sequence""" return sequence + [self] @@ -98,32 +95,29 @@ def apply(self, sequence): # Core specials # ################# + class UNK(Special): - """The unknown core special token. - """ + """The unknown core special token.""" + def __new__(cls, token=""): - """Provides default value upon creation for the UNK token. - """ + """Provides default value upon creation for the UNK token.""" return super(UNK, cls).__new__(cls, token) def apply(self, sequence): - """Core special, handled by Vocab - """ + """Core special, handled by Vocab""" # Perhaps indicate somehow that this call isn't an op. return sequence class PAD(Special): - """The padding core special token. - """ + """The padding core special token.""" + def __new__(cls, token=""): - """Provides default value upon creation for the PAD token. - """ + """Provides default value upon creation for the PAD token.""" return super(PAD, cls).__new__(cls, token) def apply(self, sequence): - """Core special, handled by Vocab - """ + """Core special, handled by Vocab""" # Perhaps indicate somehow that this call isn't an op. 
return sequence @@ -233,7 +227,7 @@ def from_itos(cls, itos): vocab = cls(specials=specials) vocab._itos = itos - vocab._stoi = {k: v for k, v in enumerate(itos)} + vocab._stoi = {v: k for k, v in enumerate(itos)} vocab._finalized = True return vocab diff --git a/tests/storage/conftest.py b/tests/storage/conftest.py index d823f7a4..0cb0dfa6 100644 --- a/tests/storage/conftest.py +++ b/tests/storage/conftest.py @@ -51,18 +51,27 @@ def vocab(tabular_dataset_fields): return tabular_dataset_fields["text"].vocab +@pytest.fixture +@pytest.mark.usefixtures("json_file_path") +def cache_disabled_tabular_dataset(json_file_path): + return create_tabular_dataset_from_json( + tabular_dataset_fields(deterministic=False), json_file_path + ) + + @pytest.fixture @pytest.mark.usefixtures("json_file_path") def tabular_dataset(json_file_path): return create_tabular_dataset_from_json(tabular_dataset_fields(), json_file_path) -def tabular_dataset_fields(fixed_length=None): +def tabular_dataset_fields(fixed_length=None, deterministic=True): text = Field( "text", numericalizer=Vocab(eager=True), fixed_length=fixed_length, allow_missing_data=False, + deterministic=deterministic, ) text_missing = Field( "text_with_missing_data", diff --git a/tests/storage/test_field.py b/tests/storage/test_field.py index 150520b0..a547e53d 100644 --- a/tests/storage/test_field.py +++ b/tests/storage/test_field.py @@ -5,14 +5,7 @@ import numpy as np import pytest -from podium.storage import ( - Field, - LabelField, - MultilabelField, - MultioutputField, - Vocab, -) - +from podium.storage import Field, LabelField, MultilabelField, MultioutputField, Vocab from podium.storage.vocab import UNK diff --git a/tests/storage/test_iterator.py b/tests/storage/test_iterator.py index c7ec3fd0..ffe98a2b 100644 --- a/tests/storage/test_iterator.py +++ b/tests/storage/test_iterator.py @@ -166,6 +166,22 @@ def test_lazy_numericalization_caching(tabular_dataset): assert np.all(numericalized_data == cached_data) 
+@pytest.mark.usefixtures("cache_disabled_tabular_dataset") +def test_caching_disabled(tabular_dataset): + # Run one epoch to cause lazy numericalization + for _ in Iterator(dataset=tabular_dataset, batch_size=10): + pass + + cache_disabled_fields = [f for f in tabular_dataset.fields if not f.deterministic] + # Test if cached data is equal to numericalized data + for example in tabular_dataset: + for field in cache_disabled_fields: + + cache_field_name = f"{field.name}_" + numericalization = example.get(cache_field_name) + assert numericalization is None + + @pytest.mark.usefixtures("tabular_dataset") def test_sort_key(tabular_dataset): def text_len_sort_key(example): diff --git a/tests/storage/test_tfidf.py b/tests/storage/test_tfidf.py index 1e93d0ed..f64f369b 100644 --- a/tests/storage/test_tfidf.py +++ b/tests/storage/test_tfidf.py @@ -4,7 +4,7 @@ from podium.storage.field import Field from podium.storage.vectorizers.tfidf import CountVectorizer, TfIdfVectorizer -from podium.storage.vocab import UNK, PAD, Vocab +from podium.storage.vocab import PAD, UNK, Vocab from .conftest import TABULAR_TEXT @@ -112,9 +112,7 @@ def test_build_count_matrix_costum_specials_vocab_with_specials(): vocab_words = ["this", "is", "the", "first", "document"] vocab += vocab_words vocab.finalize() - tfidf = TfIdfVectorizer( - vocab=vocab, specials=[PAD(), "this", "first"] - ) + tfidf = TfIdfVectorizer(vocab=vocab, specials=[PAD(), "this", "first"]) tfidf._init_special_indexes() numericalized_data = get_numericalized_data(data=DATA, vocab=vocab) diff --git a/tests/storage/test_vocab.py b/tests/storage/test_vocab.py index 4b04cf77..5ae2e266 100644 --- a/tests/storage/test_vocab.py +++ b/tests/storage/test_vocab.py @@ -1,9 +1,8 @@ import os import dill -import pytest - import numpy as np +import pytest from podium.storage import vocab @@ -105,15 +104,45 @@ def test_empty_specials_get_pad_symbol(): with pytest.raises(ValueError): voc.padding_index() -# Won't raise error anymore as we now 
filter -# unknown tokens -#def test_empty_specials_stoi(): -# voc = vocab.Vocab(specials=[]) -# data = ["tree", "plant", "grass"] -# voc = voc + set(data) -# voc.finalize() -# with pytest.raises(ValueError): -# voc.stoi["apple"] + +def test_no_unk_filters_unknown_tokens(): + voc = vocab.Vocab(specials=[]) + data = ["tree", "plant", "grass"] + voc = voc + set(data) + voc.finalize() + + # Tree is in vocab + assert len(voc.numericalize("tree")) == 1 + # Apple isn't in vocab + assert len(voc.numericalize("apple")) == 0 + # Try with list argument + assert len(voc.numericalize(["tree", "apple"])) == 1 + + +def test_specials_uniqueness(): + with pytest.raises(ValueError): + voc = vocab.Vocab(specials=[vocab.UNK(), vocab.UNK()]) + + with pytest.raises(ValueError): + voc = vocab.Vocab(specials=[vocab.UNK(), vocab.UNK("")]) + + with pytest.raises(ValueError): + voc = vocab.Vocab(specials=[vocab.PAD(), vocab.PAD()]) + + with pytest.raises(ValueError): + voc = vocab.Vocab(specials=[vocab.PAD(), vocab.PAD("")]) + + with pytest.raises(ValueError): + voc = vocab.Vocab(specials=[vocab.BOS(), vocab.BOS()]) + + with pytest.raises(ValueError): + voc = vocab.Vocab(specials=[vocab.BOS(), vocab.BOS("")]) + + with pytest.raises(ValueError): + voc = vocab.Vocab(specials=[vocab.EOS(), vocab.EOS()]) + + with pytest.raises(ValueError): + voc = vocab.Vocab(specials=[vocab.EOS(), vocab.EOS("")]) def test_specials_get_pad_symbol(): @@ -153,10 +182,17 @@ def test_size_after_final_with_specials(): assert len(voc) == len(data) + len(specials) -def test_enum_special_vocab_symbols(): +def test_special_vocab_symbols(): assert str(vocab.PAD()) == "" assert str(vocab.UNK()) == "" + assert str(vocab.PAD("")) == "" + assert str(vocab.UNK("")) == "" + + # These hold due to overloaded hash/eq + assert vocab.PAD("") == vocab.PAD() + assert vocab.UNK("") == vocab.UNK() + def test_get_stoi_for_unknown_word_default_unk(): specials = [vocab.PAD(), vocab.UNK()] @@ -224,10 +260,7 @@ def 
test_iadd_vocab_to_vocab(): voc1 += voc2 assert voc1.get_freqs() == expected_freqs - assert all( - spec in voc1.specials - for spec in (vocab.PAD(), vocab.UNK()) - ) + assert all(spec in voc1.specials for spec in (vocab.PAD(), vocab.UNK())) def test_add_vocab_to_vocab_error(): @@ -355,6 +388,7 @@ def test_equals_two_vocabs_different_freq(): voc2 += ["a"] assert voc1 != voc2 + # This won't fail anymore, should change to # test_vocab_filer_unk def test_vocab_fail_no_unk(): @@ -362,7 +396,7 @@ def test_vocab_fail_no_unk(): voc += [1, 2, 3, 4, 5] voc.finalize() - assert np.array_equal(voc.numericalize([1, 2, 3, 6]), np.array([0,1,2])) + assert np.array_equal(voc.numericalize([1, 2, 3, 6]), np.array([0, 1, 2])) def test_vocab_has_no_specials(): @@ -381,33 +415,6 @@ def test_vocab_has_specials(): assert voc2._has_specials assert voc2.specials == (vocab.UNK(),) -# There is no more vocabdict (applie to next 2 tests) - -#def test_vocab_dict_normal_dict_use(): -# vocab_dict = vocab.VocabDict() -# vocab_dict["first"] = 2 -# vocab_dict["second"] = 5 -# assert len(vocab_dict) == 2 -# assert vocab_dict["first"] == 2 -# assert vocab_dict["second"] == 5 - - -#def test_vocab_dict_default_factory(): -# vocab_dict = vocab.VocabDict(default_factory=lambda: "default") -# vocab_dict["item"] = 1 -# assert len(vocab_dict) == 1 -# assert vocab_dict["unkown_element"] == "default" -# assert "unkown_element" not in vocab_dict -# assert len(vocab_dict) == 1 - - -# This doesn't raise error anymore as we filter out unknown -# tokens when the UNK special isn't defined -#def test_vocab_dict_default_factory_none_error(): -# vocab_dict = vocab.VocabDict(default_factory=None) -# with pytest.raises(KeyError): -# vocab_dict["item_not_in_dict"] - def test_reverse_numericalize(): words = ["first", "second", "third"] @@ -427,3 +434,26 @@ def test_reverse_numericalize_not_finalized(): with pytest.raises(RuntimeError): voc.reverse_numericalize(voc.numericalize(words)) + + +def 
test_vocab_static_constructors(): + specials = [vocab.PAD(), vocab.UNK()] + voc = vocab.Vocab(specials=specials) + data = ["tree", "plant", "grass"] + voc = (voc + set(data)) + {"plant"} + voc.finalize() + + itos2voc = vocab.Vocab.from_itos(voc.itos) + # Only the frequencies will be different because + # we don't transfer this information, so the full + # vocab1 == vocab2 will fail. Perhaps split equality + # checks for vocab on before/after finalization? + + assert itos2voc.itos == voc.itos + assert itos2voc.stoi == voc.stoi + assert itos2voc.specials == voc.specials + + stoi2voc = vocab.Vocab.from_stoi(voc.stoi) + assert stoi2voc.itos == voc.itos + assert stoi2voc.stoi == voc.stoi + assert stoi2voc.specials == voc.specials From e37f10f2025338c795eaaf951d9890e48a2e7c7b Mon Sep 17 00:00:00 2001 From: Martin Tutek Date: Wed, 23 Dec 2020 14:31:07 +0100 Subject: [PATCH 12/25] Flake --- podium/storage/vocab.py | 4 +--- tests/storage/test_vocab.py | 16 ++++++++-------- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/podium/storage/vocab.py b/podium/storage/vocab.py index 49d59de5..50c9ff28 100644 --- a/podium/storage/vocab.py +++ b/podium/storage/vocab.py @@ -1,8 +1,6 @@ """Module contains classes related to the vocabulary.""" -import abc import warnings from collections import Counter -from enum import Enum from itertools import chain from typing import Iterable, Union @@ -179,7 +177,7 @@ def __init__( # Apply uniqueness check if len(self.specials) > len(set(self.specials)): - error_msg = f"Specials may not contain multiple instances of same type." + error_msg = "Specials may not contain multiple instances of same type." 
raise ValueError(error_msg) self._itos = list(self.specials) diff --git a/tests/storage/test_vocab.py b/tests/storage/test_vocab.py index 5ae2e266..ab086002 100644 --- a/tests/storage/test_vocab.py +++ b/tests/storage/test_vocab.py @@ -121,28 +121,28 @@ def test_no_unk_filters_unknown_tokens(): def test_specials_uniqueness(): with pytest.raises(ValueError): - voc = vocab.Vocab(specials=[vocab.UNK(), vocab.UNK()]) + vocab.Vocab(specials=[vocab.UNK(), vocab.UNK()]) with pytest.raises(ValueError): - voc = vocab.Vocab(specials=[vocab.UNK(), vocab.UNK("")]) + vocab.Vocab(specials=[vocab.UNK(), vocab.UNK("")]) with pytest.raises(ValueError): - voc = vocab.Vocab(specials=[vocab.PAD(), vocab.PAD()]) + vocab.Vocab(specials=[vocab.PAD(), vocab.PAD()]) with pytest.raises(ValueError): - voc = vocab.Vocab(specials=[vocab.PAD(), vocab.PAD("")]) + vocab.Vocab(specials=[vocab.PAD(), vocab.PAD("")]) with pytest.raises(ValueError): - voc = vocab.Vocab(specials=[vocab.BOS(), vocab.BOS()]) + vocab.Vocab(specials=[vocab.BOS(), vocab.BOS()]) with pytest.raises(ValueError): - voc = vocab.Vocab(specials=[vocab.BOS(), vocab.BOS("")]) + vocab.Vocab(specials=[vocab.BOS(), vocab.BOS("")]) with pytest.raises(ValueError): - voc = vocab.Vocab(specials=[vocab.EOS(), vocab.EOS()]) + vocab.Vocab(specials=[vocab.EOS(), vocab.EOS()]) with pytest.raises(ValueError): - voc = vocab.Vocab(specials=[vocab.EOS(), vocab.EOS("")]) + vocab.Vocab(specials=[vocab.EOS(), vocab.EOS("")]) def test_specials_get_pad_symbol(): From 329b7f3b6d39f790ea6244eea499e39e278429b0 Mon Sep 17 00:00:00 2001 From: Martin Tutek Date: Wed, 23 Dec 2020 14:50:20 +0100 Subject: [PATCH 13/25] Apply specials --- podium/storage/field.py | 9 +++++++++ tests/storage/test_field.py | 32 +++++++++++++++++++++++++++++++- 2 files changed, 40 insertions(+), 1 deletion(-) diff --git a/podium/storage/field.py b/podium/storage/field.py index 91329f19..74f605e5 100644 --- a/podium/storage/field.py +++ b/podium/storage/field.py @@ -487,8 +487,17 @@ 
def _process_tokens( """ raw, tokenized = self._run_posttokenization_hooks(raw, tokens) + + # Apply the special tokens. These act as a post-tokenization + # hook, but are applied separately as we want to encapsulate + # that logic in their class to minimize code changes. + if self.use_vocab: + for special_token in self.vocab.specials: + tokenized = special_token.apply(tokenized) + raw = raw if self._keep_raw else None + # Self.eager checks if a vocab is used so this won't error if self.eager and not self.vocab.finalized: self.update_vocab(tokenized) return self.name, (raw, tokenized) diff --git a/tests/storage/test_field.py b/tests/storage/test_field.py index a547e53d..de04da5c 100644 --- a/tests/storage/test_field.py +++ b/tests/storage/test_field.py @@ -6,7 +6,7 @@ import pytest from podium.storage import Field, LabelField, MultilabelField, MultioutputField, Vocab -from podium.storage.vocab import UNK +from podium.storage.vocab import BOS, EOS, PAD, UNK ONE_TO_FIVE = [1, 2, 3, 4, 5] @@ -37,6 +37,7 @@ def __init__(self, eager=True): self.finalized = False self.numericalized = False self.eager = eager + self.specials = () def padding_index(self): return PAD_NUM @@ -411,6 +412,35 @@ def to_lower_hook(raw, tokenized): assert to_lower_hook.call_count == 2 +def test_field_applies_specials(): + bos, eos = BOS(), EOS() + vocab = Vocab(specials=(bos, eos)) + f = Field(name="F", tokenizer="split", numericalizer=vocab, keep_raw=True) + + _, received = f.preprocess("asd 123 BLA")[0] + expected = ("asd 123 BLA", [bos, "asd", "123", "BLA", eos]) + + assert received == expected + + # Test with empty specials + vocab = Vocab(specials=()) + f = Field(name="F", tokenizer="split", numericalizer=vocab, keep_raw=True) + + _, received = f.preprocess("asd 123 BLA")[0] + expected = ("asd 123 BLA", ["asd", "123", "BLA"]) + + assert received == expected + + # Test core specials are a no-op + vocab = Vocab(specials=(PAD(), UNK())) + f = Field(name="F", tokenizer="split", 
numericalizer=vocab, keep_raw=True) + + _, received = f.preprocess("asd 123 BLA")[0] + expected = ("asd 123 BLA", ["asd", "123", "BLA"]) + + assert received == expected + + def test_field_is_target(): f1 = Field(name="text", is_target=False) f2 = Field(name="label", is_target=True) From 9d4a3742a1f3b292b2ebf22efc9c6fb173dc7a2c Mon Sep 17 00:00:00 2001 From: Martin Tutek Date: Wed, 23 Dec 2020 14:52:46 +0100 Subject: [PATCH 14/25] Typo --- podium/storage/vocab.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/podium/storage/vocab.py b/podium/storage/vocab.py index 50c9ff28..900d0bf4 100644 --- a/podium/storage/vocab.py +++ b/podium/storage/vocab.py @@ -85,7 +85,7 @@ def __new__(cls, token=""): return super(EOS, cls).__new__(cls, token) def apply(self, sequence): - """Apply the EOS token, adding it to the start of the sequence""" + """Apply the EOS token, adding it to the end of the sequence""" return sequence + [self] From 42d74e678fa9c7ff5efc10bc75009b151e10d714 Mon Sep 17 00:00:00 2001 From: Martin Tutek Date: Wed, 23 Dec 2020 21:24:03 +0100 Subject: [PATCH 15/25] That's what i get for using the web editor --- podium/storage/vocab.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/podium/storage/vocab.py b/podium/storage/vocab.py index c4769969..9ebc1b91 100644 --- a/podium/storage/vocab.py +++ b/podium/storage/vocab.py @@ -221,11 +221,6 @@ def stoi(self): def from_itos(cls, itos): """Method constructs a vocab from a predefined index-to-string mapping. - @staticmethod - def _init_default_unk_index(specials): - """ - Method computes index of default unknown symbol in given collection. 
- Parameters ---------- itos: list | tuple From 84040b620bd6a1a95ad003b4db7dafe0a93200e0 Mon Sep 17 00:00:00 2001 From: Martin Tutek Date: Wed, 23 Dec 2020 21:28:05 +0100 Subject: [PATCH 16/25] Black --- podium/storage/field.py | 51 +++++++++++++--------- podium/storage/vocab.py | 94 ++++++++++++++++++++++++++--------------- 2 files changed, 90 insertions(+), 55 deletions(-) diff --git a/podium/storage/field.py b/podium/storage/field.py index 198baeff..d74721db 100644 --- a/podium/storage/field.py +++ b/podium/storage/field.py @@ -762,9 +762,13 @@ def get_output_fields(self) -> Iterable["Field"]: class MultioutputField: - """Field that does pretokenization and tokenization once and passes it to its - output fields. Output fields are any type of field. The output fields are used only - for posttokenization processing (posttokenization hooks and vocab updating).""" + """ + Field that does pretokenization and tokenization once and passes it to its + output fields. + + Output fields are any type of field. The output fields are used only for + posttokenization processing (posttokenization hooks and vocab updating). + """ def __init__( self, @@ -772,9 +776,11 @@ def __init__( tokenizer: TokenizerType = "split", pretokenize_hooks: Optional[Iterable[PretokenizationHookType]] = None, ): - """Field that does pretokenization and tokenization once and passes it to its - output fields. Output fields are any type of field. The output fields are used - only for posttokenization processing (posttokenization hooks and vocab updating). + """ + Field that does pretokenization and tokenization once and passes it to + its output fields. Output fields are any type of field. The output + fields are used only for posttokenization processing (posttokenization + hooks and vocab updating). 
Parameters ---------- @@ -809,15 +815,14 @@ def __init__( self._output_fields = deque(output_fields) def add_pretokenize_hook(self, hook: PretokenizationHookType): - """Add a pre-tokenization hook to the MultioutputField. - If multiple hooks are added to the field, the order of their execution - will be the same as the order in which they were added to the field, - each subsequent hook taking the output of the previous hook as its - input. - If the same function is added to the Field as a hook multiple times, - it will be executed that many times. - The output of the final pre-tokenization hook is the raw data that the - tokenizer will get as its input. + """ + Add a pre-tokenization hook to the MultioutputField. If multiple hooks + are added to the field, the order of their execution will be the same as + the order in which they were added to the field, each subsequent hook + taking the output of the previous hook as its input. If the same + function is added to the Field as a hook multiple times, it will be + executed that many times. The output of the final pre-tokenization hook + is the raw data that the tokenizer will get as its input. Pretokenize hooks have the following signature: func pre_tok_hook(raw_data): @@ -835,7 +840,8 @@ def add_pretokenize_hook(self, hook: PretokenizationHookType): self._pretokenization_pipeline.add_hook(hook) def _run_pretokenization_hooks(self, data: Any) -> Any: - """Runs pretokenization hooks on the raw data and returns the result. + """ + Runs pretokenization hooks on the raw data and returns the result. Parameters ---------- @@ -846,7 +852,6 @@ def _run_pretokenization_hooks(self, data: Any) -> Any: ------- Any processed data - """ return self._pretokenization_pipeline(data) @@ -863,9 +868,10 @@ def add_output_field(self, field: "Field"): self._output_fields.append(field) def preprocess(self, data: Any) -> Iterable[Tuple[str, Tuple[Optional[Any], Any]]]: - """Preprocesses raw data, tokenizing it if required. 
The outputfields update their - vocabs if required and preserve the raw data if the output field's - 'keep_raw' is true. + """ + Preprocesses raw data, tokenizing it if required. The outputfields + update their vocabs if required and preserve the raw data if the output + field's 'keep_raw' is true. Parameters ---------- @@ -899,7 +905,10 @@ def get_output_fields(self) -> Iterable["Field"]: return self._output_fields def remove_pretokenize_hooks(self): - """Remove all the pre-tokenization hooks that were added to the MultioutputField.""" + """ + Remove all the pre-tokenization hooks that were added to the + MultioutputField. + """ self._pretokenization_pipeline.clear() diff --git a/podium/storage/vocab.py b/podium/storage/vocab.py index 9ebc1b91..4cbc6451 100644 --- a/podium/storage/vocab.py +++ b/podium/storage/vocab.py @@ -32,63 +32,76 @@ def unique(values: Iterable): class Special(str): - """Base class for a special token. - - Every special token is a subclass of string (this way one can) - easily modify the concrete string representation of the special. - The functionality of the special token, which acts the same as - a post-tokenization hook should be implemented in the `apply` - instance method for each subclass. We ensure that each special - token will be present in the Vocab. + """ + Base class for a special token. + + Every special token is a subclass of string (this way one can) easily modify + the concrete string representation of the special. The functionality of the + special token, which acts the same as a post-tokenization hook should be + implemented in the `apply` instance method for each subclass. We ensure that + each special token will be present in the Vocab. """ def __hash__(self): - """Overrides hash. + """ + Overrides hash. Check docs of `__eq__` for motivation. """ return hash(self.__class__) def __eq__(self, other): - """Check equals via class instead of value. 
- The motivation behind this is that we want to be able to - match the special token by class and not by value, as it - is the type of the special token that determines its - functionality. - This way we allow for the concrete string representation - of the special to be easily changed, while retaining simple - existence checks for vocab functionality. + """ + Check equals via class instead of value. + + The motivation behind this is that we want to be able to match the + special token by class and not by value, as it is the type of the + special token that determines its functionality. This way we allow for + the concrete string representation of the special to be easily changed, + while retaining simple existence checks for vocab functionality. """ return self.__class__ == other.__class__ def apply(self, sequence): - """Apply (insert) the special token in the adequate - place in the sequence. + """ + Apply (insert) the special token in the adequate place in the sequence. """ raise NotImplementedError class BOS(Special): - """The beginning-of-sequence special token.""" + """ + The beginning-of-sequence special token. + """ def __new__(cls, token=""): - """Provides default value upon creation for the BOS token.""" + """ + Provides default value upon creation for the BOS token. + """ return super(BOS, cls).__new__(cls, token) def apply(self, sequence): - """Apply the BOS token, adding it to the start of the sequence""" + """ + Apply the BOS token, adding it to the start of the sequence. + """ return [self] + sequence class EOS(Special): - """The end-of-sequence special token.""" + """ + The end-of-sequence special token. + """ def __new__(cls, token=""): - """Provides default value upon creation for the EOS token.""" + """ + Provides default value upon creation for the EOS token. 
+ """ return super(EOS, cls).__new__(cls, token) def apply(self, sequence): - """Apply the EOS token, adding it to the end of the sequence""" + """ + Apply the EOS token, adding it to the end of the sequence. + """ return sequence + [self] @@ -98,27 +111,39 @@ def apply(self, sequence): class UNK(Special): - """The unknown core special token.""" + """ + The unknown core special token. + """ def __new__(cls, token=""): - """Provides default value upon creation for the UNK token.""" + """ + Provides default value upon creation for the UNK token. + """ return super(UNK, cls).__new__(cls, token) def apply(self, sequence): - """Core special, handled by Vocab""" + """ + Core special, handled by Vocab. + """ # Perhaps indicate somehow that this call isn't an op. return sequence class PAD(Special): - """The padding core special token.""" + """ + The padding core special token. + """ def __new__(cls, token=""): - """Provides default value upon creation for the PAD token.""" + """ + Provides default value upon creation for the PAD token. + """ return super(PAD, cls).__new__(cls, token) def apply(self, sequence): - """Core special, handled by Vocab""" + """ + Core special, handled by Vocab. + """ # Perhaps indicate somehow that this call isn't an op. return sequence @@ -219,7 +244,8 @@ def stoi(self): @classmethod def from_itos(cls, itos): - """Method constructs a vocab from a predefined index-to-string mapping. + """ + Method constructs a vocab from a predefined index-to-string mapping. Parameters ---------- @@ -233,12 +259,12 @@ def from_itos(cls, itos): vocab._stoi = {v: k for k, v in enumerate(itos)} vocab._finalized = True - return vocab @classmethod def from_stoi(cls, stoi): - """Method constructs a vocab from a predefined index-to-string mapping. + """ + Method constructs a vocab from a predefined index-to-string mapping. 
Parameters ---------- From f471919aefff4a51dab73b1b13e38bc114f87d61 Mon Sep 17 00:00:00 2001 From: Martin Tutek Date: Mon, 28 Dec 2020 14:49:25 +0100 Subject: [PATCH 17/25] Change default value handling in specials --- podium/storage/vocab.py | 47 +++++++++++++++++++++++------------------ 1 file changed, 27 insertions(+), 20 deletions(-) diff --git a/podium/storage/vocab.py b/podium/storage/vocab.py index 4cbc6451..ba6dfc62 100644 --- a/podium/storage/vocab.py +++ b/podium/storage/vocab.py @@ -42,6 +42,29 @@ class Special(str): each special token will be present in the Vocab. """ + default_value = None + + def __new__(cls, token=None): + """ + Provides default value initialization for subclasses. + + If creating a new instance without a string argument, the + `default_value` class attribute must be set in the subclass + implementation. + """ + + if token is None and cls.default_value is None: + error_msg = ( + "When initializing a special token without argument" + f" the {cls.__class__}.default_value attribute must be set." + ) + raise RuntimeError(error_msg) + + if token is None: + token = cls.default_value + + return super(Special, cls).__new__(cls, token) + def __hash__(self): """ Overrides hash. @@ -74,11 +97,7 @@ class BOS(Special): The beginning-of-sequence special token. """ - def __new__(cls, token=""): - """ - Provides default value upon creation for the BOS token. - """ - return super(BOS, cls).__new__(cls, token) + default_value = "" def apply(self, sequence): """ @@ -92,11 +111,7 @@ class EOS(Special): The end-of-sequence special token. """ - def __new__(cls, token=""): - """ - Provides default value upon creation for the EOS token. - """ - return super(EOS, cls).__new__(cls, token) + default_value = "" def apply(self, sequence): """ @@ -115,11 +130,7 @@ class UNK(Special): The unknown core special token. """ - def __new__(cls, token=""): - """ - Provides default value upon creation for the UNK token. 
- """ - return super(UNK, cls).__new__(cls, token) + default_value = "" def apply(self, sequence): """ @@ -134,11 +145,7 @@ class PAD(Special): The padding core special token. """ - def __new__(cls, token=""): - """ - Provides default value upon creation for the PAD token. - """ - return super(PAD, cls).__new__(cls, token) + default_value = "" def apply(self, sequence): """ From 5eee1d13a65600d59ec01a56545556f87ea7b8c2 Mon Sep 17 00:00:00 2001 From: Martin Tutek Date: Mon, 4 Jan 2021 21:43:58 +0100 Subject: [PATCH 18/25] Address comments --- podium/storage/field.py | 11 ++++++----- podium/storage/vocab.py | 19 +++++++++++++------ 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/podium/storage/field.py b/podium/storage/field.py index ef4b6375..50b6d783 100644 --- a/podium/storage/field.py +++ b/podium/storage/field.py @@ -148,11 +148,12 @@ def __init__( instead. For missing data, the value in the list will be None. deterministic : bool - Flag which determines whether this Field has deterministic or nondeterministic - numericalization (numericalization for the same instance can be different between - function calls). Disables numericalization caching for this Field. The flag is - passed to the numericalizer to indicate to use the nondeterministic setting. - E.g., in the case of masked language modelling, we wish the inputs to be masked + The Flag which determines whether this Field has deterministic or nondeterministic + numericalization (numericalization is nondeterministic when, for the same instance, + it can be differ between function calls). When set to False, it Disables + numericalization caching for this Field. The flag is passed to the numericalizer + to indicate to use the nondeterministic setting. This flag should be used in the + case of masked language modelling, where we wish the inputs to be masked (nondeterministic), and the outputs (labels) to not be masked while using the same vocabulary. 
diff --git a/podium/storage/vocab.py b/podium/storage/vocab.py index ba6dfc62..247fadcd 100644 --- a/podium/storage/vocab.py +++ b/podium/storage/vocab.py @@ -170,6 +170,9 @@ class Vocab: mapping from word string to index """ + _unk = UNK() + _pad = PAD() + def __init__( self, max_size=None, @@ -327,9 +330,9 @@ def padding_index(self): ValueError If the padding symbol is not present in the vocabulary. """ - if PAD() not in self.stoi: + if Vocab._pad not in self.stoi: raise ValueError("Padding symbol is not in the vocabulary.") - return self.stoi[PAD()] + return self.stoi[Vocab._pad] def __iadd__(self, values: Union["Vocab", Iterable]): """ @@ -340,7 +343,9 @@ def __iadd__(self, values: Union["Vocab", Iterable]): values : Iterable or Vocab Values to be added to this Vocab. If Vocab, all of the token frequencies and specials from that Vocab will be - added to this Vocab. + added to this Vocab. Wheen adding two Vocabs with a different string values + for a special token, only the special token instance with the valuefrom the + LHS operand will be used. If Iterable, all of the tokens from the Iterable will be added to this Vocab, increasing the frequencies of those tokens. @@ -410,7 +415,9 @@ def __add__(self, values: Union["Vocab", Iterable]): ---------- values : Iterable or Vocab If Vocab, a new Vocab will be created containing all of the special symbols - and tokens from both Vocabs. + and tokens from both Vocabs. Wheen adding two Vocabs with a different string + values for a special token, only the special token instance with the value + from the first operand will be used. If Iterable, a new Vocab will be returned containing a copy of this Vocab with the iterables' tokens added. @@ -546,10 +553,10 @@ def numericalize(self, data): # Wrap string into list data = [data] - if UNK() in self.stoi: + if Vocab._unk in self.stoi: # If UNK is not in the vocabulary, we _erase_ the unknown tokens # from the instances. 
- unk_token = self.stoi[UNK()] + unk_token = self.stoi[Vocab._unk] return np.array( [self.stoi[token] if token in self.stoi else unk_token for token in data] ) From 810ff98242763940239244e908bcbd11323e4ef9 Mon Sep 17 00:00:00 2001 From: Martin Tutek Date: Mon, 4 Jan 2021 21:52:42 +0100 Subject: [PATCH 19/25] Address comments --- podium/storage/field.py | 2 +- podium/storage/vocab.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/podium/storage/field.py b/podium/storage/field.py index 50b6d783..8a467d91 100644 --- a/podium/storage/field.py +++ b/podium/storage/field.py @@ -152,7 +152,7 @@ def __init__( numericalization (numericalization is nondeterministic when, for the same instance, it can be differ between function calls). When set to False, it Disables numericalization caching for this Field. The flag is passed to the numericalizer - to indicate to use the nondeterministic setting. This flag should be used in the + to indicate to use the nondeterministic setting. This flag should be used in the case of masked language modelling, where we wish the inputs to be masked (nondeterministic), and the outputs (labels) to not be masked while using the same vocabulary. diff --git a/podium/storage/vocab.py b/podium/storage/vocab.py index 247fadcd..fb1e431e 100644 --- a/podium/storage/vocab.py +++ b/podium/storage/vocab.py @@ -415,7 +415,7 @@ def __add__(self, values: Union["Vocab", Iterable]): ---------- values : Iterable or Vocab If Vocab, a new Vocab will be created containing all of the special symbols - and tokens from both Vocabs. Wheen adding two Vocabs with a different string + and tokens from both Vocabs. Wheen adding two Vocabs with a different string values for a special token, only the special token instance with the value from the first operand will be used. 
If Iterable, a new Vocab will be returned containing a copy of this Vocab From d5a85ccdab6cf5be14d4d55bcc0b53ed9c8b258a Mon Sep 17 00:00:00 2001 From: Martin Tutek Date: Mon, 4 Jan 2021 22:54:52 +0100 Subject: [PATCH 20/25] Add rst documentation page for specials --- docs/source/advanced.rst | 58 +++++++++++++++++++++++++++++++++++++ docs/source/index.rst | 1 + docs/source/specials.rst | 29 +++++++++++++++++++ docs/source/walkthrough.rst | 2 +- 4 files changed, 89 insertions(+), 1 deletion(-) create mode 100644 docs/source/specials.rst diff --git a/docs/source/advanced.rst b/docs/source/advanced.rst index e6cc723e..4351a241 100644 --- a/docs/source/advanced.rst +++ b/docs/source/advanced.rst @@ -133,6 +133,64 @@ We can see that our hooks worked: the raw data was lowercased prior to tokenizat We have prepared a number of predefined hooks which are ready for you to use. You can see them here: :ref:`predefined-hooks`. +Special tokens +=============== +We have earlier mentioned special tokens, but now is the time to elaborate on what exactly they are. In Podium, each special token is a subclass of the python ``string`` which also encapsulates the functionality for adding that special token in the tokenized sequence. The Vocab handles special tokens differently -- each special token is *guaranteed* a place in the Vocab, which is what makes them... special. + +Since our idea of special tokens was made to be extensible, we will take a brief look at how they are implemented, so we can better understand how to use them. We mentioned that each special token is a subclass of the python string, but there is an intermediary -- the :class:`podium.storage.vocab.Special` base class. The ``Special`` base class implements the following functionality, while still being an instance of a string: + + 1. Extending the constructor of the special token with a default value functionality. 
The default value for each special token should be set via the ``default_value`` class attribute, while if another value is passed upon creation, it will be used. + 2. Adds a stub ``apply`` method which accepts a sequence of tokens and adds the special token to that sequence. In its essence, the apply method is a post-tokenization hook which doesn't see the raw data whose job is to add the special token to the sequence or replace some of the existing tokens with the special token. The special tokens are applied after all post-tokenization hooks in the order they are passed to the :class:`podium.storage.vocab.Vocab` constructor. Each concrete implementation of a Special token has to implement this method. + 3. Implements singleton-like hash and equality checks. The ``Special`` class overrides the default hash and equals and instead of checking for string value equality, it checks for *class name equality*. We use this type of check to ensure that each Vocab has a single instance of each Special and for simpler referencing and contains checks. + +To better understand how specials work, we will walk through the implementation of one of the special tokens implemented in Podium: the beginning-of-sequence (BOS) token. + +.. code-block:: python + + >>> from podium.storage.vocab import Special + >>> class BOS(Special): + >>> default_value = "" + >>> + >>> def apply(self, sequence): + >>> # Prepend to the sequence + >>> return [self] + sequence + >>> + >>> bos = BOS() + >>> print(bos) + + +This code block is the full implementation of a special token! All we needed to do is set the default value and implement the ``apply`` function. The default value is ``None`` by default and if not set, you have to make sure it is passed upon construction, like so: + +..
code-block:: python + + >>> my_bos = BOS("") + >>> print(my_bos) + + >>> print(bos == my_bos) + True + +We can also see that although we have changed the string representation of the special token, the equality check will still return True due to the ``Special`` base class changes mentioned earlier. + +To see the effect of the ``apply`` method, we will once again take a look at the SST dataset: + +.. code-block:: python + + >>> from podium import Vocab, Field, LabelField + >>> from podium.datasets import SST + >>> + >>> vocab = Vocab(specials=(bos)) + >>> text = Field(name='text', numericalizer=vocab) + >>> label = LabelField(name='label') + >>> fields = {'text': text, 'label': label} + >>> sst_train, sst_test, sst_dev = SST.get_dataset_splits(fields=fields) + >>> print(sst_train[222].text) + (None, ['', 'A', 'slick', ',', 'engrossing', 'melodrama', '.']) + +Where we can see that the special token was indeed added to the beginning of the tokenized sequence. + +Finally, it is important to note that there is an implicit distinction between special tokens. The unknown (:class:`podium.storage.vocab.UNK`) and padding (:class:`podium.storage.vocab.PAD`) special tokens are something we refer to as **core** special tokens, whose functionality is hardcoded in the implementation of the Vocab due to them being deeply integrated with the way iterators and numericalization work. +The only difference between normal and core specials is that core specials have an identity ``apply`` function, which simply returns the argument, while the tokens themselves are added to the sequence by other Podium classes. 
+ Custom numericalization functions =========================================== diff --git a/docs/source/index.rst b/docs/source/index.rst index 854dc3c6..64a3aef2 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -41,6 +41,7 @@ The documentation is organized in four parts: :caption: Core package Reference: vocab_and_fields + specials datasets iterators vectorizers diff --git a/docs/source/specials.rst b/docs/source/specials.rst new file mode 100644 index 00000000..b5768ac2 --- /dev/null +++ b/docs/source/specials.rst @@ -0,0 +1,29 @@ +Special tokens +=============== +.. autoclass:: podium.storage.vocab.Special + :members: + :no-undoc-members: + +The unknown token +^^^^^^^^^^^^^^^^^^ +.. autoclass:: podium.storage.vocab.UNK + :members: + :no-undoc-members: + +The padding token +^^^^^^^^^^^^^^^^^^ +.. autoclass:: podium.storage.vocab.PAD + :members: + :no-undoc-members: + +The beginning-of-sequence token +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. autoclass:: podium.storage.vocab.BOS + :members: + :no-undoc-members: + +The end-of-sequence token +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. autoclass:: podium.storage.vocab.EOS + :members: + :no-undoc-members: diff --git a/docs/source/walkthrough.rst b/docs/source/walkthrough.rst index 59e6722a..abc81221 100644 --- a/docs/source/walkthrough.rst +++ b/docs/source/walkthrough.rst @@ -246,7 +246,7 @@ For this dataset, we need to define three Fields. We also might want the fields >>> print(dataset) TabularDataset[Size: 1, Fields: ['premise', 'hypothesis', 'label']] >>> print(shared_vocab.itos) - ['>, '>, 'man', 'A', 'inspects', 'the', 'uniform', 'of', 'a', 'figure', 'in', 'some', 'East', 'Asian', 'country', '.', 'The', 'is', 'sleeping'] + ['', '', 'man', 'A', 'inspects', 'the', 'uniform', 'of', 'a', 'figure', 'in', 'some', 'East', 'Asian', 'country', '.', 'The', 'is', 'sleeping'] .. 
_hf-loading: From 1851bb5da539a6cdca255ca86442afad48efe2c1 Mon Sep 17 00:00:00 2001 From: Martin Tutek Date: Fri, 8 Jan 2021 19:33:41 +0100 Subject: [PATCH 21/25] Fix docs --- docs/source/specials.rst | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/source/specials.rst b/docs/source/specials.rst index b5768ac2..d9c8801b 100644 --- a/docs/source/specials.rst +++ b/docs/source/specials.rst @@ -1,29 +1,29 @@ Special tokens =============== -.. autoclass:: podium.storage.vocab.Special +.. autoclass:: podium.vocab.Special :members: :no-undoc-members: The unknown token ^^^^^^^^^^^^^^^^^^ -.. autoclass:: podium.storage.vocab.UNK +.. autoclass:: podium.vocab.UNK :members: :no-undoc-members: The padding token ^^^^^^^^^^^^^^^^^^ -.. autoclass:: podium.storage.vocab.PAD +.. autoclass:: podium.vocab.PAD :members: :no-undoc-members: The beginning-of-sequence token ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. autoclass:: podium.storage.vocab.BOS +.. autoclass:: podium.vocab.BOS :members: :no-undoc-members: The end-of-sequence token ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. autoclass:: podium.storage.vocab.EOS +.. autoclass:: podium.vocab.EOS :members: :no-undoc-members: From e210118ab93ca78e49e6fc523b2cb5454f046dfd Mon Sep 17 00:00:00 2001 From: Martin Tutek Date: Tue, 12 Jan 2021 17:58:36 +0100 Subject: [PATCH 22/25] Address all comments --- docs/source/advanced.rst | 11 +++--- podium/field.py | 62 ++++++++++++++++++--------------- tests/conftest.py | 6 ++-- tests/datasets/test_iterator.py | 2 +- tests/test_vocab.py | 33 +++++++----------- 5 files changed, 56 insertions(+), 58 deletions(-) diff --git a/docs/source/advanced.rst b/docs/source/advanced.rst index 4351a241..e5f060ec 100644 --- a/docs/source/advanced.rst +++ b/docs/source/advanced.rst @@ -100,7 +100,7 @@ And we're done! 
We can now add our hook to the text field either through the :me Removing punctuation as a posttokenization hook ----------------------------------------------- -We will now similarly define a posttokenization hook to remove punctuation. We will use the punctuation list from python's built-in ``string`` module, which we will store as an attribute of our hook. +We will now similarly define a posttokenization hook to remove punctuation. We will use the punctuation list from python's built-in ``string`` module, which we will store as an attribute of our hook. .. code-block:: python @@ -135,14 +135,17 @@ We have prepared a number of predefined hooks which are ready for you to use. Yo Special tokens =============== -We have earlier mentioned special tokens, but now is the time to elaborate on what exactly they are. In Podium, each special token is a subclass of the python ``string`` which also encapsulates the functionality for adding that special token in the tokenized sequence. The Vocab handles special tokens differently -- each special token is *guaranteed* a place in the Vocab, which is what makes them... special. +We have earlier mentioned special tokens, but now is the time to elaborate on what exactly they are. In Podium, each special token is a subclass of the python ``str`` which also encapsulates the functionality for adding that special token in the tokenized sequence. The Vocab handles special tokens differently -- each special token is guaranteed a place in the Vocab, which is what makes them... *special*. Since our idea of special tokens was made to be extensible, we will take a brief look at how they are implemented, so we can better understand how to use them. We mentioned that each special token is a subclass of the python string, but there is an intermediary -- the :class:`podium.storage.vocab.Special` base class. The ``Special`` base class implements the following functionality, while still being an instance of a string: 1. 
Extending the constructor of the special token with a default value functionality. The default value for each special token should be set via the ``default_value`` class attribute, while if another value is passed upon creation, it will be used. - 2. Adds a stub ``apply`` method which accepts a sequence of tokens and adds the special token to that sequence. In its essence, the apply method is a post-tokenization hook which doesn't see the raw data whose job is to add the special token to the sequence of replace some of the existing tokens with the special token. The special tokens are applied after all post-tokenization hooks in the order they are passed to the :class:`podium.storage.vocab.Vocab` constructor. Each concrete implementation of a Special token has to implement this method. + 2. Adds a stub ``apply`` method which accepts a sequence of tokens and adds the special token to that sequence. In its essence, the apply method is a post-tokenization hook (applied to the tokenized sequence after other post-tokenization hooks) which doesn't see the raw data whose job is to add the special token to the sequence of replace some of the existing tokens with the special token. The special tokens are applied after all post-tokenization hooks in the order they are passed to the :class:`podium.storage.vocab.Vocab` constructor. Each concrete implementation of a Special token has to implement this method. 3. Implements singleton-like hash and equality checks. The ``Special`` class overrides the default hash and equals and instead of checking for string value equality, it checks for *class name equality*. We use this type of check to ensure that each Vocab has a single instance of each Special and for simpler referencing and contains checks. +There is a number of special tokens used throughout NLP for a number of purposes. 
The most frequently used ones are the unknown token (UNK), which is used as a catch-all substitute for tokens which are not present in the vocabulary, and the padding token (PAD), which is used to nicely pack variable length sequences into fixed size batch tensors. +Alongside these two, common special tokens include the beginning-of-sequence and end-of-sequence tokens (BOS, EOS), the separator token (SEP) and the mask token introduced in BERT (MASK). + To better understand how specials work, we will walk through the implementation of one of special tokens implemented in Podium: the beginning-of-sequence (BOS) token. .. code-block:: python @@ -189,7 +192,7 @@ To see the effect of the ``apply`` method, we will once again take a look at the Where we can see that the special token was indeed added to the beginning of the tokenized sequence. Finally, it is important to note that there is an implicit distinction between special tokens. The unknown (:class:`podium.storage.vocab.UNK`) and padding (:class:`podium.storage.vocab.PAD`) special tokens are something we refer to as **core** special tokens, whose functionality is hardcoded in the implementation of the Vocab due to them being deeply integrated with the way iterators and numericalization work. -The only difference between normal and core specials is that core specials have an identity ``apply`` function, which simply returns the argument, while the tokens themselves are added to the sequence by other Podium classes. +The only difference between normal and core specials is that core specials are added to the sequence by other Podium classes (their behavior is hardcoded) instead of by their apply method. 
Custom numericalization functions =========================================== diff --git a/podium/field.py b/podium/field.py index 5c9fb668..6894de01 100644 --- a/podium/field.py +++ b/podium/field.py @@ -81,7 +81,7 @@ def __init__( fixed_length: Optional[int] = None, allow_missing_data: bool = False, disable_batch_matrix: bool = False, - deterministic: bool = True, + disable_numericalize_caching: bool = False, padding_token: Union[int, float] = -999, missing_data_token: Union[int, float] = -1, pretokenize_hooks: Optional[Iterable[PretokenizationHookType]] = None, @@ -147,12 +147,13 @@ def __init__( If True, a list of unpadded vectors(or other data type) will be returned instead. For missing data, the value in the list will be None. - deterministic : bool - The Flag which determines whether this Field has deterministic or nondeterministic - numericalization (numericalization is nondeterministic when, for the same instance, - it can be differ between function calls). When set to False, it Disables - numericalization caching for this Field. The flag is passed to the numericalizer - to indicate to use the nondeterministic setting. This flag should be used in the + disable_numericalize_caching : bool + The flag which determines whether the numericalization of this field should be + cached. This flag should be set to True if the numericalization can differ + between `numericalize` function calls for the same instance. When set to False, + the numericalization values will be cached and reused each time the instance + is used as part of a batch. The flag is passed to the numericalizer to indicate + use of its nondeterministic setting. This flag is mainly intended to be used in the case of masked language modelling, where we wish the inputs to be masked (nondeterministic), and the outputs (labels) to not be masked while using the same vocabulary. 
@@ -187,7 +188,7 @@ def __init__( ) self._name = name self._disable_batch_matrix = disable_batch_matrix - self._deterministic = deterministic + self._disable_numericalize_caching = disable_numericalize_caching self._tokenizer_arg_string = tokenizer if isinstance(tokenizer, str) else None if tokenizer is None: @@ -277,8 +278,8 @@ def vocab(self): return self._vocab @property - def deterministic(self): - return self._deterministic + def disable_numericalize_caching(self): + return self._disable_numericalize_caching @property def use_vocab(self): @@ -698,10 +699,9 @@ def get_numericalization_for_example( cache_field_name = f"{self.name}_" numericalization = example.get(cache_field_name) - # Check if this concrete field can be cached. Fields that have - # non-deterministic numericalizers cannot be cached. + # Check if this concrete field can be cached. - cache = cache and self.deterministic + cache = cache and not self.disable_numericalize_caching if numericalization is None: example_data = example[self.name] @@ -926,7 +926,7 @@ def __init__( numericalizer: Optional[Union[Vocab, NumericalizerType]] = None, allow_missing_data: bool = False, disable_batch_matrix: bool = False, - deterministic: bool = True, + disable_numericalize_caching: bool = False, is_target: bool = True, missing_data_token: Union[int, float] = -1, pretokenize_hooks: Optional[Iterable[PretokenizationHookType]] = None, @@ -963,12 +963,14 @@ def __init__( If True, a list of unpadded vectors(or other data type) will be returned instead. For missing data, the value in the list will be None. - deterministic : bool - Flag which determines whether this Field has deterministic or nondeterministic - numericalization (numericalization for the same instance can be different between - function calls). Disables numericalization caching for this Field. The flag is - passed to the numericalizer to indicate to use the nondeterministic setting. 
- E.g., in the case of masked language modelling, we wish the inputs to be masked + disable_numericalize_caching : bool + The flag which determines whether the numericalization of this field should be + cached. This flag should be set to True if the numericalization can differ + between `numericalize` function calls for the same instance. When set to False, + the numericalization values will be cached and reused each time the instance + is used as part of a batch. The flag is passed to the numericalizer to indicate + use of its nondeterministic setting. This flag is mainly intended to be used in the + case of masked language modelling, where we wish the inputs to be masked (nondeterministic), and the outputs (labels) to not be masked while using the same vocabulary. @@ -1005,7 +1007,7 @@ def __init__( fixed_length=1, allow_missing_data=allow_missing_data, disable_batch_matrix=disable_batch_matrix, - deterministic=deterministic, + disable_numericalize_caching=disable_numericalize_caching, missing_data_token=missing_data_token, pretokenize_hooks=pretokenize_hooks, ) @@ -1027,7 +1029,7 @@ def __init__( is_target: bool = True, allow_missing_data: bool = False, disable_batch_matrix: bool = False, - deterministic: bool = True, + disable_numericalize_caching: bool = False, missing_data_token: Union[int, float] = -1, pretokenize_hooks: Optional[Iterable[PretokenizationHookType]] = None, posttokenize_hooks: Optional[Iterable[PosttokenizationHookType]] = None, @@ -1086,12 +1088,14 @@ def __init__( If True, a list of unpadded vectors(or other data type) will be returned instead. For missing data, the value in the list will be None. - deterministic : bool - Flag which determines whether this Field has deterministic or nondeterministic - numericalization (numericalization for the same instance can be different between - function calls). Disables numericalization caching for this Field. The flag is - passed to the numericalizer to indicate to use the nondeterministic setting. 
- E.g., in the case of masked language modelling, we wish the inputs to be masked + disable_numericalize_caching : bool + The flag which determines whether the numericalization of this field should be + cached. This flag should be set to True if the numericalization can differ + between `numericalize` function calls for the same instance. When set to False, + the numericalization values will be cached and reused each time the instance + is used as part of a batch. The flag is passed to the numericalizer to indicate + use of its nondeterministic setting. This flag is mainly intended to be used in the + case of masked language modelling, where we wish the inputs to be masked (nondeterministic), and the outputs (labels) to not be masked while using the same vocabulary. @@ -1134,7 +1138,7 @@ def __init__( fixed_length=num_of_classes, allow_missing_data=allow_missing_data, disable_batch_matrix=disable_batch_matrix, - deterministic=deterministic, + disable_numericalize_caching=disable_numericalize_caching, missing_data_token=missing_data_token, pretokenize_hooks=pretokenize_hooks, posttokenize_hooks=posttokenize_hooks, diff --git a/tests/conftest.py b/tests/conftest.py index 67674e15..6cb5a120 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -55,7 +55,7 @@ def vocab(tabular_dataset_fields): @pytest.mark.usefixtures("json_file_path") def cache_disabled_tabular_dataset(json_file_path): return create_tabular_dataset_from_json( - tabular_dataset_fields(deterministic=False), json_file_path + tabular_dataset_fields(disable_numericalize_caching=True), json_file_path ) @@ -65,13 +65,13 @@ def tabular_dataset(json_file_path): return create_tabular_dataset_from_json(tabular_dataset_fields(), json_file_path) -def tabular_dataset_fields(fixed_length=None, deterministic=True): +def tabular_dataset_fields(fixed_length=None, disable_numericalize_caching=False): text = Field( "text", numericalizer=Vocab(eager=True), fixed_length=fixed_length, allow_missing_data=False, - 
deterministic=deterministic, + disable_numericalize_caching=disable_numericalize_caching, ) text_missing = Field( "text_with_missing_data", diff --git a/tests/datasets/test_iterator.py b/tests/datasets/test_iterator.py index 256595ec..59890f26 100644 --- a/tests/datasets/test_iterator.py +++ b/tests/datasets/test_iterator.py @@ -173,7 +173,7 @@ def test_caching_disabled(tabular_dataset): for _ in Iterator(dataset=tabular_dataset, batch_size=10): pass - cache_disabled_fields = [f for f in tabular_dataset.fields if not f.deterministic] + cache_disabled_fields = [f for f in tabular_dataset.fields if f.disable_numericalize_caching] # Test if cached data is equal to numericalized data for example in tabular_dataset: for field in cache_disabled_fields: diff --git a/tests/test_vocab.py b/tests/test_vocab.py index 1438d768..cfd64ff1 100644 --- a/tests/test_vocab.py +++ b/tests/test_vocab.py @@ -119,30 +119,21 @@ def test_no_unk_filters_unknown_tokens(): assert len(voc.numericalize(["tree", "apple"])) == 1 -def test_specials_uniqueness(): +@pytest.mark.parametrize( + "default_instance, second_default_instance, custom_instance", + [ + (vocab.UNK(), vocab.UNK(), vocab.UNK("")), + (vocab.PAD(), vocab.PAD(), vocab.PAD("")), + (vocab.BOS(), vocab.BOS(), vocab.BOS("")), + (vocab.EOS(), vocab.EOS(), vocab.EOS("")), + ], +) +def test_specials_uniqueness(default_instance, second_default_instance, custom_instance): with pytest.raises(ValueError): - vocab.Vocab(specials=[vocab.UNK(), vocab.UNK()]) + vocab.Vocab(specials=[default_instance, second_default_instance]) with pytest.raises(ValueError): - vocab.Vocab(specials=[vocab.UNK(), vocab.UNK("")]) - - with pytest.raises(ValueError): - vocab.Vocab(specials=[vocab.PAD(), vocab.PAD()]) - - with pytest.raises(ValueError): - vocab.Vocab(specials=[vocab.PAD(), vocab.PAD("")]) - - with pytest.raises(ValueError): - vocab.Vocab(specials=[vocab.BOS(), vocab.BOS()]) - - with pytest.raises(ValueError): - vocab.Vocab(specials=[vocab.BOS(), 
vocab.BOS("")]) - - with pytest.raises(ValueError): - vocab.Vocab(specials=[vocab.EOS(), vocab.EOS()]) - - with pytest.raises(ValueError): - vocab.Vocab(specials=[vocab.EOS(), vocab.EOS("")]) + vocab.Vocab(specials=[default_instance, custom_instance]) def test_specials_get_pad_symbol(): From 7b0209d592fd7f923e5a86a992011fddea74e64c Mon Sep 17 00:00:00 2001 From: Martin Tutek Date: Tue, 12 Jan 2021 18:00:29 +0100 Subject: [PATCH 23/25] Tick vocab --- docs/source/advanced.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/advanced.rst b/docs/source/advanced.rst index e5f060ec..ab8f8685 100644 --- a/docs/source/advanced.rst +++ b/docs/source/advanced.rst @@ -135,13 +135,13 @@ We have prepared a number of predefined hooks which are ready for you to use. Yo Special tokens =============== -We have earlier mentioned special tokens, but now is the time to elaborate on what exactly they are. In Podium, each special token is a subclass of the python ``str`` which also encapsulates the functionality for adding that special token in the tokenized sequence. The Vocab handles special tokens differently -- each special token is guaranteed a place in the Vocab, which is what makes them... *special*. +We have earlier mentioned special tokens, but now is the time to elaborate on what exactly they are. In Podium, each special token is a subclass of the python ``str`` which also encapsulates the functionality for adding that special token in the tokenized sequence. The ``Vocab`` handles special tokens differently -- each special token is guaranteed a place in the ``Vocab``, which is what makes them... *special*. Since our idea of special tokens was made to be extensible, we will take a brief look at how they are implemented, so we can better understand how to use them. We mentioned that each special token is a subclass of the python string, but there is an intermediary -- the :class:`podium.storage.vocab.Special` base class. 
The ``Special`` base class implements the following functionality, while still being an instance of a string: 1. Extending the constructor of the special token with a default value functionality. The default value for each special token should be set via the ``default_value`` class attribute, while if another value is passed upon creation, it will be used. 2. Adds a stub ``apply`` method which accepts a sequence of tokens and adds the special token to that sequence. In its essence, the apply method is a post-tokenization hook (applied to the tokenized sequence after other post-tokenization hooks) which doesn't see the raw data whose job is to add the special token to the sequence of replace some of the existing tokens with the special token. The special tokens are applied after all post-tokenization hooks in the order they are passed to the :class:`podium.storage.vocab.Vocab` constructor. Each concrete implementation of a Special token has to implement this method. - 3. Implements singleton-like hash and equality checks. The ``Special`` class overrides the default hash and equals and instead of checking for string value equality, it checks for *class name equality*. We use this type of check to ensure that each Vocab has a single instance of each Special and for simpler referencing and contains checks. + 3. Implements singleton-like hash and equality checks. The ``Special`` class overrides the default hash and equals and instead of checking for string value equality, it checks for *class name equality*. We use this type of check to ensure that each ``Vocab`` has a single instance of each Special and for simpler referencing and contains checks. There is a number of special tokens used throughout NLP for a number of purposes. 
The most frequently used ones are the unknown token (UNK), which is used as a catch-all substitute for tokens which are not present in the vocabulary, and the padding token (PAD), which is used to nicely pack variable length sequences into fixed size batch tensors. Alongside these two, common special tokens include the beginning-of-sequence and end-of-sequence tokens (BOS, EOS), the separator token (SEP) and the mask token introduced in BERT (MASK). @@ -191,7 +191,7 @@ To see the effect of the ``apply`` method, we will once again take a look at the Where we can see that the special token was indeed added to the beginning of the tokenized sequence. -Finally, it is important to note that there is an implicit distinction between special tokens. The unknown (:class:`podium.storage.vocab.UNK`) and padding (:class:`podium.storage.vocab.PAD`) special tokens are something we refer to as **core** special tokens, whose functionality is hardcoded in the implementation of the Vocab due to them being deeply integrated with the way iterators and numericalization work. +Finally, it is important to note that there is an implicit distinction between special tokens. The unknown (:class:`podium.storage.vocab.UNK`) and padding (:class:`podium.storage.vocab.PAD`) special tokens are something we refer to as **core** special tokens, whose functionality is hardcoded in the implementation of the ``Vocab`` due to them being deeply integrated with the way iterators and numericalization work. The only difference between normal and core specials is that core specials are added to the sequence by other Podium classes (their behavior is hardcoded) instead of by their apply method. 
Custom numericalization functions From 8e8c070cf71a87490d49327c3c07163af21b98e3 Mon Sep 17 00:00:00 2001 From: Martin Tutek Date: Tue, 12 Jan 2021 18:02:01 +0100 Subject: [PATCH 24/25] Style --- tests/datasets/test_iterator.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/datasets/test_iterator.py b/tests/datasets/test_iterator.py index 59890f26..c550a7e9 100644 --- a/tests/datasets/test_iterator.py +++ b/tests/datasets/test_iterator.py @@ -173,7 +173,9 @@ def test_caching_disabled(tabular_dataset): for _ in Iterator(dataset=tabular_dataset, batch_size=10): pass - cache_disabled_fields = [f for f in tabular_dataset.fields if f.disable_numericalize_caching] + cache_disabled_fields = [ + f for f in tabular_dataset.fields if f.disable_numericalize_caching + ] # Test if cached data is equal to numericalized data for example in tabular_dataset: for field in cache_disabled_fields: From e5c90761b851cfa2643eea24be56555cf7d3f11b Mon Sep 17 00:00:00 2001 From: Martin Tutek Date: Wed, 13 Jan 2021 16:08:40 +0100 Subject: [PATCH 25/25] Final comments --- docs/source/advanced.rst | 2 ++ docs/source/walkthrough.rst | 2 +- podium/vocab.py | 28 +++++++++------------------- 3 files changed, 12 insertions(+), 20 deletions(-) diff --git a/docs/source/advanced.rst b/docs/source/advanced.rst index ab8f8685..cee349a9 100644 --- a/docs/source/advanced.rst +++ b/docs/source/advanced.rst @@ -133,6 +133,8 @@ We can see that our hooks worked: the raw data was lowercased prior to tokenizat We have prepared a number of predefined hooks which are ready for you to use. You can see them here: :ref:`predefined-hooks`. +.. _specials: + Special tokens =============== We have earlier mentioned special tokens, but now is the time to elaborate on what exactly they are. In Podium, each special token is a subclass of the python ``str`` which also encapsulates the functionality for adding that special token in the tokenized sequence. 
The ``Vocab`` handles special tokens differently -- each special token is guaranteed a place in the ``Vocab``, which is what makes them... *special*. diff --git a/docs/source/walkthrough.rst b/docs/source/walkthrough.rst index abc81221..13c3e40c 100644 --- a/docs/source/walkthrough.rst +++ b/docs/source/walkthrough.rst @@ -127,7 +127,7 @@ That's it! We have defined our Fields. In order for them to be initialized, we n >>> print(small_vocabulary) Vocab[finalized: True, size: 5000] -Our new Vocab has been limited to the 5000 most frequent words. The remaining words will be replaced by the unknown (````) token, which is one of the default `special` tokens in the Vocab. +Our new Vocab has been limited to the 5000 most frequent words. If your `Vocab` contains the unknown special token :class:`podium.vocab.UNK`, the words not present in the vocabulary will be set to the value of the unknown token. The unknown token is one of the default `special` tokens in the Vocab, alongside the padding token :class:`podium.vocab.PAD`. You can read more about these in :ref:`specials`. You might have noticed that we used a different type of Field: :class:`podium.storage.LabelField` for the label. LabelField is one of the predefined custom Field classes with sensible default constructor arguments for its concrete use-case. We'll take a closer look at LabelFields in the following subsection. diff --git a/podium/vocab.py b/podium/vocab.py index fb1e431e..e24ecb46 100644 --- a/podium/vocab.py +++ b/podium/vocab.py @@ -88,8 +88,10 @@ def __eq__(self, other): def apply(self, sequence): """ Apply (insert) the special token in the adequate place in the sequence. + + By default, returns the unchanged sequence. """ - raise NotImplementedError + return sequence class BOS(Special): @@ -128,32 +130,22 @@ def apply(self, sequence): class UNK(Special): """ The unknown core special token. + + Functionality handled by Vocab. 
""" default_value = "" - def apply(self, sequence): - """ - Core special, handled by Vocab. - """ - # Perhaps indicate somehow that this call isn't an op. - return sequence - class PAD(Special): """ The padding core special token. + + Functionality handled by Vocab. """ default_value = "" - def apply(self, sequence): - """ - Core special, handled by Vocab. - """ - # Perhaps indicate somehow that this call isn't an op. - return sequence - class Vocab: """ @@ -554,15 +546,13 @@ def numericalize(self, data): data = [data] if Vocab._unk in self.stoi: - # If UNK is not in the vocabulary, we _erase_ the unknown tokens - # from the instances. + # If UNK is in the vocabulary, substitute unknown words with its value unk_token = self.stoi[Vocab._unk] return np.array( [self.stoi[token] if token in self.stoi else unk_token for token in data] ) else: - # Either UNK is not in Vocab or the user has requested unknown tokens - # to be filtered out of the instances. + # If UNK is not in the vocabulary we filter out unknown words return np.array([self.stoi[token] for token in data if token in self.stoi]) def reverse_numericalize(self, numericalized_data: Iterable):