diff --git a/README.md b/README.md index 34b0e57a..47cc3f20 100644 --- a/README.md +++ b/README.md @@ -64,7 +64,7 @@ For usage examples see the documentation pages [walkthrough](http://takelab.fer. Use some of our pre-defined datasets: ```python ->>> from podium.datasets import SST +>>> from podium import SST >>> sst_train, sst_test, sst_dev = SST.get_dataset_splits() >>> print(sst_train) SST({ @@ -93,7 +93,7 @@ Load datasets from [🤗/datasets](https://github.com/huggingface/datasets): ```python - >>> from podium.datasets.hf import HFDatasetConverter + >>> from podium import HFDatasetConverter >>> import datasets >>> # Load the huggingface dataset >>> imdb = datasets.load_dataset('imdb') @@ -124,8 +124,7 @@ Load datasets from [🤗/datasets](https://github.com/huggingface/datasets): Load your own dataset from a standardized tabular format (e.g. `csv`, `tsv`, `jsonl`): ```python ->>> from podium.datasets import TabularDataset ->>> from podium import Vocab, Field, LabelField +>>> from podium import Vocab, Field, LabelField, TabularDataset >>> fields = {'premise': Field('premise', numericalizer=Vocab()), ... 'hypothesis':Field('hypothesis', numericalizer=Vocab()), ... 'label': LabelField('label')} diff --git a/docs/source/advanced.rst b/docs/source/advanced.rst index 89ca53e3..e795db2d 100644 --- a/docs/source/advanced.rst +++ b/docs/source/advanced.rst @@ -1,8 +1,6 @@ .. testsetup:: * - from podium import Field, LabelField, Vocab, Iterator, TabularDataset - from podium.datasets import SST - from podium.vectorizers import GloVe, TfIdfVectorizer + from podium import Field, LabelField, Vocab, Iterator, TabularDataset, SST, GloVe, TfIdfVectorizer The Podium data flow ==================== @@ -14,7 +12,7 @@ The data is processed immediately when the instance is loaded from disk and then .. 
doctest:: sst_field - >>> from podium.datasets import SST + >>> from podium import SST >>> sst_train, sst_test, sst_dev = SST.get_dataset_splits() >>> print(sst_train[222]) Example({'text': (None, ['A', 'slick', ',', 'engrossing', 'melodrama', '.']), 'label': (None, 'positive')}) @@ -159,7 +157,7 @@ To better understand how specials work, we will walk through the implementation .. doctest:: specials - >>> from podium.vocab import Special + >>> from podium import Special >>> class BOS(Special): ... default_value = "" ... @@ -187,8 +185,7 @@ To see the effect of the ``apply`` method, we will once again take a look at the .. doctest:: specials - >>> from podium import Vocab, Field, LabelField - >>> from podium.datasets import SST + >>> from podium import Vocab, Field, LabelField, SST >>> >>> vocab = Vocab(specials=(bos)) >>> text = Field(name='text', numericalizer=vocab) @@ -236,8 +233,7 @@ We have so far covered the case where you have a single input column, tokenize a .. doctest:: multioutput - >>> from podium import Vocab, Field, LabelField - >>> from podium.datasets import SST + >>> from podium import Vocab, Field, LabelField, SST >>> char = Field(name='char', numericalizer=Vocab(), tokenizer=list) >>> text = Field(name='word', numericalizer=Vocab()) >>> label = LabelField(name='label') @@ -303,8 +299,7 @@ For this reason, usage of :class:`podium.datasets.BucketIterator` is recommended .. 
code-block:: python - >>> from podium import Vocab, Field, LabelField - >>> from podium.datasets import SST, IMDB + >>> from podium import Vocab, Field, LabelField, SST, IMDB >>> vocab = Vocab() >>> text = Field(name='text', numericalizer=vocab) >>> label = LabelField(name='label') @@ -343,7 +338,7 @@ The ``bucket_sort_key`` function defines how the instances in the dataset should For Iterator, padding = 148141 out of 281696 = 52.588961149608096% For BucketIterator, padding = 2125 out of 135680 = 1.5661851415094339% -As we can see, the difference between using a regular Iterator and a BucketIterator is massive. Not only do we reduce the amount of padding, we have reduced the total amount of tokens processed by about 50%. The SST dataset, however, is a relatively small dataset so this experiment might be a bit biased. Let's take a look at the same statistics for the :class:`podium.datasets.IMDB` dataset. After changing the highligted data loading line in the first snippet to: +As we can see, the difference between using a regular ``Iterator`` and a ``BucketIterator`` is massive. Not only do we reduce the amount of padding, we have reduced the total amount of tokens processed by about 50%. The SST dataset, however, is a relatively small dataset so this experiment might be a bit biased. Let's take a look at the same statistics for the :class:`podium.datasets.IMDB` dataset. After changing the highlighted data loading line in the first snippet to: .. code-block:: python @@ -374,8 +369,7 @@ As an example, we will again turn to the SST dataset and some of our previously ..
doctest:: saveload :options: +NORMALIZE_WHITESPACE - >>> from podium import Vocab, Field, LabelField - >>> from podium.datasets import SST + >>> from podium import Vocab, Field, LabelField, SST >>> >>> vocab = Vocab(max_size=5000, min_freq=2) >>> text = Field(name='text', numericalizer=vocab) diff --git a/docs/source/faq.rst b/docs/source/faq.rst index acf966a5..67cfe505 100644 --- a/docs/source/faq.rst +++ b/docs/source/faq.rst @@ -9,7 +9,7 @@ FAQ .. code-block:: python - >>> from podium.datasets import SST + >>> from podium import SST >>> sst_train, sst_test, sst_dev = SST.get_dataset_splits() >>> x, y = sst_train.batch() >>> print(x.text.shape, y.label.shape, sep='\n') @@ -20,8 +20,7 @@ Be aware that you will get a dataset as a matrix by default -- meaning that all .. code-block:: python - >>> from podium.datasets import SST - >>> from podium import Vocab, Field, LabelField + >>> from podium import Vocab, Field, LabelField, SST >>> text = Field(name='text', numericalizer=Vocab(), disable_batch_matrix=True) >>> label = LabelField(name='label') >>> fields = {'text':text, 'label':label} diff --git a/docs/source/preprocessing.rst b/docs/source/preprocessing.rst index ab9897f2..01d897e9 100644 --- a/docs/source/preprocessing.rst +++ b/docs/source/preprocessing.rst @@ -43,8 +43,7 @@ Regex Replace .. code-block:: python - >>> from podium import Field, LabelField, Vocab - >>> from podium.datasets import SST + >>> from podium import Field, LabelField, Vocab, SST >>> >>> text = Field('text', numericalizer=Vocab()) >>> label = LabelField('label') @@ -123,7 +122,7 @@ Truecase .. code-block:: python - >>> from podium.preproc import truecase + >>> from podium import truecase >>> apply_truecase = truecase(oov='as-is') >>> print(apply_truecase('hey, what is the weather in new york?')) Hey, what is the weather in New York? 
diff --git a/docs/source/walkthrough.rst b/docs/source/walkthrough.rst index 18143227..eb952a61 100644 --- a/docs/source/walkthrough.rst +++ b/docs/source/walkthrough.rst @@ -1,9 +1,7 @@ .. testsetup:: * - from podium import Field, LabelField, Vocab, Iterator, TabularDataset - from podium.datasets import SST - from podium.vectorizers import GloVe, TfIdfVectorizer + from podium import Field, LabelField, Vocab, Iterator, TabularDataset, SST, GloVe, TfIdfVectorizer Walkthrough @@ -29,7 +27,7 @@ One built-in dataset available in Podium is the `Stanford Sentiment Treebank >> from podium.datasets import SST + >>> from podium import SST >>> sst_train, sst_test, sst_valid = SST.get_dataset_splits() # doctest:+ELLIPSIS >>> print(sst_train) SST({ @@ -100,7 +98,7 @@ This way, we can define a static dictionary which we might have obtained on anot .. doctest:: custom_vocab - >>> from podium.vocab import UNK + >>> from podium import UNK >>> custom_itos = [UNK(), 'this', 'is', 'a', 'sample'] >>> vocab = Vocab.from_itos(custom_itos) >>> print(vocab) @@ -285,7 +283,7 @@ The output of the function call is a numpy matrix of word embeddings which you c .. code-block:: python - >>> from podium.vectorizers import GloVe + >>> from podium import GloVe >>> vocab = fields['text'].vocab >>> glove = GloVe() >>> embeddings = glove.load_vocab(vocab) @@ -308,8 +306,7 @@ As we intend to use the whole dataset at once, we will also set ``disable_batch_ .. doctest:: vectorizer - >>> from podium.datasets import SST - >>> from podium import Vocab, Field, LabelField + >>> from podium import Vocab, Field, LabelField, SST >>> vocab = Vocab(max_size=5000) >>> text = Field(name='text', numericalizer=vocab, disable_batch_matrix=True) >>> label = LabelField(name='label') @@ -320,7 +317,7 @@ Since the Tf-Idf vectorizer needs information from the dataset to compute the in .. 
doctest:: vectorizer - >>> from podium.vectorizers.tfidf import TfIdfVectorizer + >>> from podium import TfIdfVectorizer >>> tfidf_vectorizer = TfIdfVectorizer() >>> tfidf_vectorizer.fit(dataset=sst_train, field=text) @@ -433,7 +430,7 @@ You can load a dataset in 🤗/datasets and then convert it to a Podium dataset .. code-block:: python - >>> from podium.datasets.hf import HFDatasetConverter + >>> from podium import HFDatasetConverter >>> import datasets >>> # Loading a huggingface dataset returns an instance of DatasetDict >>> # which contains the dataset splits (usually: train, valid, test, diff --git a/podium/__init__.py b/podium/__init__.py index d57b72e5..9f015234 100644 --- a/podium/__init__.py +++ b/podium/__init__.py @@ -4,21 +4,204 @@ See http://takelab.fer.hr/podium/ for complete documentation. """ -import logging - -from .datasets import ( - BucketIterator, - Dataset, - Example, - HierarchicalDataset, - HierarchicalDatasetIterator, - Iterator, - SingleBatchIterator, - TabularDataset, -) -from .field import Field, LabelField, MultilabelField, MultioutputField -from .vocab import Vocab - - -__name__ = "podium" +import importlib +import os +import sys +from types import ModuleType +from typing import TYPE_CHECKING, Any + + __version__ = "1.1.0" + + +def _is_package_available(package_name): + return importlib.util.find_spec(package_name) is not None + + +_import_structure = { + "field": ["Field", "LabelField", "MultilabelField", "MultioutputField"], + "vocab": ["Vocab", "Special", "BOS", "EOS", "UNK", "PAD"], + "datasets.dataset": [ + "Dataset", + "DatasetBase", + "DatasetConcatView", + "DatasetIndexedView", + "DatasetSlicedView", + "concat", + "create_view", + "rationed_split", + "stratified_split", + ], + "datasets.example_factory": ["Example", "ExampleFactory", "ExampleFormat"], + "datasets.hierarhical_dataset": ["HierarchicalDataset"], + "datasets.impl.catacx_dataset": ["CatacxDataset"], + "datasets.impl.conllu_dataset": ["CoNLLUDataset"], + 
"datasets.impl.cornell_movie_dialogs_dataset": [ + "CornellMovieDialogsConversationalDataset" + ], + "datasets.impl.eurovoc_dataset": ["EuroVocDataset"], + "datasets.impl.imdb_sentiment_dataset": ["IMDB"], + "datasets.impl.pauza_dataset": ["PauzaHRDataset"], + "datasets.impl.snli_dataset": ["SNLIDataset", "SNLISimple"], + "datasets.impl.sst_sentiment_dataset": ["SST"], + "datasets.iterator": [ + "BucketIterator", + "HierarchicalDatasetIterator", + "Iterator", + "SingleBatchIterator", + ], + "datasets.tabular_dataset": ["TabularDataset"], + "preproc.functional": ["remove_stopwords", "truecase"], + "preproc.hooks": [ + "MosesNormalizer", + "NLTKStemmer", + "RegexReplace", + "SpacyLemmatizer", + "TextCleanUp", + ], + "preproc.lemmatizer": ["CroatianLemmatizer"], + "preproc.sentencizers": ["SpacySentencizer"], + "preproc.stemmer": ["CroatianStemmer"], + "preproc.tokenizers": ["get_tokenizer"], + "storage.resources.downloader": [ + "BaseDownloader", + "HttpDownloader", + "SCPDownloader", + "SimpleHttpDownloader", + ], + "storage.resources.large_resource": ["LargeResource", "SCPLargeResource"], + "vectorizers.impl": ["GloVe", "NlplVectorizer"], + "vectorizers.tfidf": ["TfIdfVectorizer"], + "vectorizers.vectorizer": [ + "BasicVectorStorage", + "VectorStorage", + "random_normal_default_vector", + "zeros_default_vector", + ], +} + +if _is_package_available("pyarrow"): + _import_structure["datasets.arrow"] = ["DiskBackedDataset"] +if _is_package_available("datasets"): + _import_structure["datasets.hf"] = [ + "convert_features_to_fields", + "HFDatasetConverter", + ] +if _is_package_available("yake"): + _import_structure["preproc.yake"] = ["YAKE"] + + +if TYPE_CHECKING: + from .datasets.dataset import ( + Dataset, + DatasetBase, + DatasetConcatView, + DatasetIndexedView, + DatasetSlicedView, + concat, + create_view, + rationed_split, + stratified_split, + ) + from .datasets.example_factory import Example, ExampleFactory, ExampleFormat + from .datasets.hierarhical_dataset 
import HierarchicalDataset + from .datasets.impl.catacx_dataset import CatacxDataset + from .datasets.impl.conllu_dataset import CoNLLUDataset + from .datasets.impl.cornell_movie_dialogs_dataset import ( + CornellMovieDialogsConversationalDataset, + ) + from .datasets.impl.eurovoc_dataset import EuroVocDataset + from .datasets.impl.imdb_sentiment_dataset import IMDB + from .datasets.impl.pauza_dataset import PauzaHRDataset + from .datasets.impl.snli_dataset import SNLIDataset, SNLISimple + from .datasets.impl.sst_sentiment_dataset import SST + from .datasets.iterator import ( + BucketIterator, + HierarchicalDatasetIterator, + Iterator, + SingleBatchIterator, + ) + from .datasets.tabular_dataset import TabularDataset + from .field import Field, LabelField, MultilabelField, MultioutputField + from .preproc.functional import remove_stopwords, truecase + from .preproc.hooks import ( + MosesNormalizer, + NLTKStemmer, + RegexReplace, + SpacyLemmatizer, + TextCleanUp, + ) + from .preproc.lemmatizer import CroatianLemmatizer + from .preproc.sentencizers import SpacySentencizer + from .preproc.stemmer import CroatianStemmer + from .preproc.tokenizers import get_tokenizer + from .storage.resources.downloader import ( + BaseDownloader, + HttpDownloader, + SCPDownloader, + SimpleHttpDownloader, + ) + from .storage.resources.large_resource import LargeResource, SCPLargeResource + from .vectorizers.impl import GloVe, NlplVectorizer + from .vectorizers.tfidf import TfIdfVectorizer + from .vectorizers.vectorizer import ( + BasicVectorStorage, + VectorStorage, + random_normal_default_vector, + zeros_default_vector, + ) + from .vocab import Vocab + + if _is_package_available("pyarrow"): + from .datasets.arrow import DiskBackedDataset + if _is_package_available("datasets"): + from .datasets.hf import HFDatasetConverter, convert_features_to_fields + if _is_package_available("yake"): + from .preproc.yake import YAKE +else: + + class _LazyModule(ModuleType): + """ + Module class that 
surfaces all objects but only performs associated + imports when the objects are requested. + """ + + __file__ = globals()["__file__"] + __path__ = [os.path.dirname(__file__)] + + # This code is inspired by: + # https://github.com/huggingface/transformers/blob/master/src/transformers/__init__.py + def __init__(self, name, import_structure): + super().__init__(name) + self._modules = set(import_structure.keys()) + self._class_to_module = {} + for key, values in import_structure.items(): + for value in values: + self._class_to_module[value] = key + # Needed for autocompletion in an IDE + self.__all__ = list(import_structure.keys()) + sum( + import_structure.values(), [] + ) + + # Needed for autocompletion in an IDE + def __dir__(self): + return super().__dir__() + self.__all__ + + def __getattr__(self, name: str) -> Any: + if name == "__version__": + return __version__ + elif name in self._modules: + value = self._get_module(name) + elif name in self._class_to_module.keys(): + module = self._get_module(self._class_to_module[name]) + value = getattr(module, name) + else: + raise AttributeError(f"module {self.__name__} has no attribute {name}") + + setattr(self, name, value) + return value + + def _get_module(self, module_name: str): + return importlib.import_module("."
+ module_name, self.__name__) + + sys.modules[__name__] = _LazyModule(__name__, _import_structure) diff --git a/podium/datasets/impl/cornell_movie_dialogs_dataset.py b/podium/datasets/impl/cornell_movie_dialogs_dataset.py index 389fe310..fd23852d 100644 --- a/podium/datasets/impl/cornell_movie_dialogs_dataset.py +++ b/podium/datasets/impl/cornell_movie_dialogs_dataset.py @@ -6,6 +6,8 @@ import re from collections import namedtuple +import pandas as pd + from podium.datasets.dataset import Dataset from podium.datasets.example_factory import ExampleFactory from podium.field import Field @@ -13,16 +15,6 @@ from podium.vocab import Vocab -try: - import pandas as pd -except ImportError: - print( - "Problem occured while trying to import pandas. If the library is not " - "installed visit https://pandas.pydata.org/ for more details." - ) - raise - - CornellMovieDialogsNamedTuple = namedtuple( "CornellMovieDialogsNamedTuple", ["titles", "conversations", "lines", "characters", "url"], diff --git a/podium/datasets/impl/snli_dataset.py b/podium/datasets/impl/snli_dataset.py index ac91807a..1ced2da6 100644 --- a/podium/datasets/impl/snli_dataset.py +++ b/podium/datasets/impl/snli_dataset.py @@ -5,6 +5,8 @@ """ import os +from nltk import Tree + from podium.datasets import Dataset from podium.datasets.example_factory import ExampleFactory from podium.field import Field, LabelField @@ -189,17 +191,6 @@ class _TreeFactory: """ def __call__(self, text): - - try: - from nltk import Tree - except ImportError: - print( - "Problem occurred while trying to import nltk. " - "If the library is not installed visit " - "https://www.nltk.org/ for more details." - ) - raise - if text[0] != "(": text = "(" + text + ")" return Tree.fromstring(text)