diff --git a/.github/workflows/scheduled.yml b/.github/workflows/scheduled.yml new file mode 100644 index 00000000..941a3d26 --- /dev/null +++ b/.github/workflows/scheduled.yml @@ -0,0 +1,33 @@ +name: Scheduled jobs + +on: + pull_request: + branches: + - master + repository_dispatch: + schedule: + - cron: "0 0 * * *" + +jobs: + check_notebooks: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: [3.6] + defaults: + run: + working-directory: docs/source/scripts + + steps: + - uses: actions/checkout@v2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt + - name: Execute notebooks + run: | + python check_notebooks.py --num_proc auto --ignore_whitespace diff --git a/README.md b/README.md index 84ea9eef..e7489d16 100644 --- a/README.md +++ b/README.md @@ -207,7 +207,7 @@ Example({ }) ``` -For a more interactive introduction, check out the quickstart on Google Colab: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/takelab/podium/blob/master/docs/source/notebooks/quickstart.ipynb) +For a more interactive introduction, check out the quickstart on Google Colab: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/TakeLab/podium/blob/master/docs/source/notebooks/quickstart.ipynb) More complex examples can be found in our [examples folder](./examples). 
diff --git a/docs/source/advanced.rst b/docs/source/advanced.rst index 25a939eb..705df020 100644 --- a/docs/source/advanced.rst +++ b/docs/source/advanced.rst @@ -379,7 +379,9 @@ For a simple example, we will take a look at the built-in SST and IMDB datasets: >>> from podium import Field, LabelField, Vocab >>> # Load the datasets >>> imdb_train, imdb_test = IMDB.get_dataset_splits() + >>> imdb_train.finalize_fields() >>> sst_train, sst_dev, sst_test = SST.get_dataset_splits() + >>> sst_train.finalize_fields() >>> >>> # Luckily, both label vocabularies are already equal >>> print(imdb_train.field('label').vocab.itos) @@ -416,6 +418,7 @@ For this reason, usage of :class:`podium.datasets.BucketIterator` is recommended >>> fields = {'text': text, 'label': label} >>> >>> train, valid, test = SST.get_dataset_splits(fields=fields) + >>> train.finalize_fields() >>> >>> # Define the iterators and our sort key >>> from podium import Iterator, BucketIterator @@ -423,14 +426,14 @@ For this reason, usage of :class:`podium.datasets.BucketIterator` is recommended >>> # Use the text Field >>> raw, tokenized = instance.text >>> return len(tokenized) - >>> bucket_iter = BucketIterator(train, batch_size=32, bucket_sort_key=instance_length) + >>> bucket_iter = BucketIterator(sst_train, batch_size=32, bucket_sort_key=instance_length) The ``bucket_sort_key`` function defines how the instances in the dataset should be sorted. The method accepts an instance of the dataset, and should return a value which will be used as a sort key in the ``BucketIterator``. It might be interesting (and surprising) to see how much space (and time) do we earn by bucketing. We will define a naive iterator on the same dataset and measure the total amount of padding used when iterating over a dataset. .. 
code-block:: python >>> import numpy as np - >>> vanilla_iter = Iterator(train, batch_size=32) + >>> vanilla_iter = Iterator(sst_train, batch_size=32) >>> >>> def count_padding(batch, padding_idx): >>> return np.count_nonzero(batch == padding_idx) @@ -518,7 +521,7 @@ Each ``Dataset`` instance in the SST dataset splits contains ``Field``\s and a ` >>> import pickle >>> >>> cache_dir = Path('cache') - >>> cache_dir.mkdir() + >>> cache_dir.mkdir(exist_ok=True) >>> >>> dataset_store_path = cache_dir.joinpath('sst_preprocessed.pkl') >>> diff --git a/docs/source/installation.md b/docs/source/installation.md index 7c67d2fb..bcc7ab55 100644 --- a/docs/source/installation.md +++ b/docs/source/installation.md @@ -23,5 +23,5 @@ Coming soon! ## Installing from source To install from source via terminal: -1. Clone the repository: `git clone git@github.com:takelab/podium.git && cd podium` +1. Clone the repository: `git clone git@github.com:TakeLab/podium.git && cd podium` 2. Install podium: `pip install .` diff --git a/docs/source/notebooks/advanced.ipynb b/docs/source/notebooks/advanced.ipynb index c9f7d874..b5dd9b4c 100644 --- a/docs/source/notebooks/advanced.ipynb +++ b/docs/source/notebooks/advanced.ipynb @@ -9,7 +9,7 @@ "# Podium installation\n", "! pip install podium-nlp\n", "# To install from source instead of the last release, comment the command above and uncomment the following one.\n", - "# ! pip install git+https://github.com/takelab/podium\n", + "# ! pip install git+https://github.com/TakeLab/podium.git\n", "\n", "# Additional dependencies required to run this notebook:\n", "! 
pip install transformers spacy\n", @@ -134,7 +134,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "" + "" ] }, { @@ -337,7 +337,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "" + "" ] }, { @@ -477,7 +477,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "" + "" ] }, { @@ -798,7 +798,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "" + "" ] }, { @@ -843,7 +843,9 @@ "from podium import Field, LabelField, Vocab\n", "# Load the datasets\n", "imdb_train, imdb_test = IMDB.get_dataset_splits()\n", + "imdb_train.finalize_fields()\n", "sst_train, sst_dev, sst_test = SST.get_dataset_splits()\n", + "sst_train.finalize_fields()\n", "\n", "# Luckily, both label vocabularies are already equal\n", "print(imdb_train.field('label').vocab.itos)\n", @@ -867,7 +869,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "" + "" ] }, { @@ -900,6 +902,7 @@ "fields = {'text': text, 'label': label}\n", "\n", "train, valid, test = SST.get_dataset_splits(fields=fields)\n", + "train.finalize_fields()\n", "\n", "# Define the iterators and our sort key\n", "from podium import Iterator, BucketIterator\n", @@ -907,7 +910,7 @@ " # Use the text Field\n", " raw, tokenized = instance.text\n", " return len(tokenized)\n", - "bucket_iter = BucketIterator(train, batch_size=32, bucket_sort_key=instance_length)" + "bucket_iter = BucketIterator(sst_train, batch_size=32, bucket_sort_key=instance_length)" ] }, { @@ -936,7 +939,7 @@ ], "source": [ "import numpy as np\n", - "vanilla_iter = Iterator(train, batch_size=32)\n", + "vanilla_iter = Iterator(sst_train, batch_size=32)\n", "\n", "def count_padding(batch, padding_idx):\n", " return np.count_nonzero(batch == padding_idx)\n", @@ -1092,7 +1095,7 @@ "import pickle\n", "\n", "cache_dir = Path('cache')\n", - "cache_dir.mkdir()\n", + "cache_dir.mkdir(exist_ok=True)\n", "\n", "dataset_store_path = cache_dir.joinpath('sst_preprocessed.pkl')\n", "\n", diff --git a/docs/source/notebooks/preprocessing.ipynb 
b/docs/source/notebooks/preprocessing.ipynb index 53e59961..609c259d 100644 --- a/docs/source/notebooks/preprocessing.ipynb +++ b/docs/source/notebooks/preprocessing.ipynb @@ -9,11 +9,12 @@ "# Podium installation\n", "! pip install podium-nlp\n", "# To install from source instead of the last release, comment the command above and uncomment the following one.\n", - "# ! pip install git+https://github.com/takelab/podium\n", + "# ! pip install git+https://github.com/TakeLab/podium.git\n", "\n", "# Additional dependencies required to run this notebook:\n", - "! pip install sacremoses clean-text spacy truecase https://github.com/LIAAD/yake/archive/v0.4.2.tar.gz\n", - "! python -m spacy download en_core_web_sm" + "! pip install sacremoses clean-text spacy spacy-lookups-data truecase https://github.com/LIAAD/yake/archive/v0.4.2.tar.gz\n", + "! python -m spacy download en_core_web_sm\n", + "! python -m nltk.downloader stopwords" ] }, { @@ -387,7 +388,7 @@ { "data": { "text/plain": [ - "(None, [opinion', 'exciting', 'funny', 'movie'])" + "(None, ['opinion', 'exciting', 'funny', 'movie'])" ] }, "execution_count": null, diff --git a/docs/source/notebooks/quickstart.ipynb b/docs/source/notebooks/quickstart.ipynb index 7d23db64..6522f6f2 100644 --- a/docs/source/notebooks/quickstart.ipynb +++ b/docs/source/notebooks/quickstart.ipynb @@ -9,7 +9,7 @@ "# Podium installation\n", "! pip install podium-nlp\n", "# To install from source instead of the last release, comment the command above and uncomment the following one.\n", - "# ! pip install git+https://github.com/takelab/podium" + "# ! 
pip install git+https://github.com/TakeLab/podium.git" ] }, { @@ -131,9 +131,9 @@ "data": { "text/plain": [ "Example({\n", - " input_text: (None, ['Amazingly', 'lame', '.']),\n", - " input_chars: (None, ['A', 'm', 'a', 'z', 'i', 'n', 'g', 'l', 'y', ' ', 'l', 'a', 'm', 'e', ' ', '.']),\n", - " target: (None, 'negative')\n", + " input_text: (None, ['Amazingly', 'lame', '.']),\n", + " input_chars: (None, ['A', 'm', 'a', 'z', 'i', 'n', 'g', 'l', 'y', ' ', 'l', 'a', 'm', 'e', ' ', '.']),\n", + " target: (None, 'negative')\n", "})" ] }, diff --git a/docs/source/notebooks/sample_dataset.csv b/docs/source/notebooks/sample_dataset.csv new file mode 100644 index 00000000..7827a95f --- /dev/null +++ b/docs/source/notebooks/sample_dataset.csv @@ -0,0 +1,3 @@ +text,label +Absorbing character study .,positive +Amazingly lame .,negative diff --git a/docs/source/notebooks/walkthrough.ipynb b/docs/source/notebooks/walkthrough.ipynb index f78d4d45..8ff1658d 100644 --- a/docs/source/notebooks/walkthrough.ipynb +++ b/docs/source/notebooks/walkthrough.ipynb @@ -9,7 +9,7 @@ "# Podium installation\n", "! pip install podium-nlp\n", "# To install from source instead of the last release, comment the command above and uncomment the following one.\n", - "# ! pip install git+https://github.com/takelab/podium\n", + "# ! pip install git+https://github.com/TakeLab/podium.git\n", "\n", "# Additional dependencies required to run this notebook:\n", "! 
pip install datasets spacy\n", @@ -49,7 +49,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "" + "" ] }, { @@ -104,7 +104,7 @@ ], "source": [ "from podium.datasets import SST\n", - "sst_train, sst_dev, sst_test = SST.get_dataset_splits() # doctest:+ELLIPSIS\n", + "sst_train, sst_dev, sst_test = SST.get_dataset_splits()\n", "sst_train.finalize_fields()\n", "print(sst_train)\n", "print(sst_train[222]) # A short example" @@ -121,7 +121,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "" + "" ] }, { @@ -159,16 +159,20 @@ } ], "source": [ - "import datasets\n", + "import os\n", "from pprint import pprint\n", + "\n", + "from datasets import load_dataset\n", + "\n", "# Loading a huggingface dataset returns an instance of DatasetDict\n", "# which contains the dataset splits (usually: train, valid, test) \n", - "imdb = datasets.load_dataset('imdb')\n", + "imdb = load_dataset('imdb')\n", + "\n", "print(imdb.keys())\n", "\n", "# Each dataset has a set of features which need to be mapped\n", "# to Podium Fields.\n", - "print(imdb['train'].features)" + "pprint(imdb['train'].features)" ] }, { @@ -188,16 +192,15 @@ { "data": { "text/plain": [ - "{'label': LabelField({\n", - " name: 'label',\n", - " keep_raw: False,\n", - " is_target: True\n", - "}),\n", - " 'text': Field({\n", - " name: 'text',\n", - " keep_raw: False,\n", - " is_target: False,\n", - " vocab: Vocab({specials: ('', ''), eager: False, is_finalized: True, size: 280619})\n", + "{'text': Field({\n", + " name: 'text',\n", + " keep_raw: False,\n", + " is_target: False,\n", + " vocab: Vocab({specials: ('', ''), eager: False, is_finalized: True, size: 280619})\n", + " }), 'label': LabelField({\n", + " name: 'label',\n", + " keep_raw: False,\n", + " is_target: True\n", "})}" ] }, @@ -213,7 +216,7 @@ "imdb_train, imdb_test, imdb_unsupervised = HF.from_dataset_dict(imdb).values()\n", "imdb_train.finalize_fields()\n", "\n", - "print(imdb_train.field_dict())" + "print(imdb_train.field_dict)" ] }, { @@ 
-266,7 +269,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "" + "" ] }, { @@ -432,7 +435,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "" + "" ] }, { @@ -494,7 +497,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "" + "" ] }, { @@ -682,7 +685,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "" + "" ] }, { @@ -817,7 +820,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "" + "" ] }, { @@ -994,7 +997,7 @@ { "data": { "text/plain": [ - "For vocabulary of size: 21701 loaded embedding matrix of shape: (21701, 300)\n", + "For vocabulary of size: 21701 loaded embedding matrix of shape: (16284, 300)\n", "Vector for sport: [ 0.34566 0.15934 0.48444 -0.13693 0.18737 0.2678\n", " -0.39159 0.4931 -0.76111 -1.4586 0.41475 0.55837\n", " ...\n", @@ -1083,12 +1086,12 @@ "data": { "text/plain": [ " (6920, 4998)\n", - "(0, 2111) 0.617113703893198\n", - "(0, 549) 0.5208201737884445\n", - "(0, 499) 0.5116152860290002\n", - "(0, 19) 0.2515101839877878\n", - "(0, 1) 0.12681755258500052\n", - "(0, 0) 0.08262419651916046" + " (0, 2111) 0.617113703893198\n", + " (0, 549) 0.5208201737884445\n", + " (0, 499) 0.5116152860290002\n", + " (0, 19) 0.2515101839877878\n", + " (0, 1) 0.12681755258500052\n", + " (0, 0) 0.08262419651916046" ] }, "execution_count": null, diff --git a/docs/source/preprocessing.rst b/docs/source/preprocessing.rst index 58e2c86c..c0f5a3bd 100644 --- a/docs/source/preprocessing.rst +++ b/docs/source/preprocessing.rst @@ -159,7 +159,7 @@ Stopword removal >>> raw_text = None >>> tokenized_text = ['in', 'my', 'opinion', 'an', 'exciting', 'and', 'funny', 'movie'] >>> print(remove_stopwords_hook(raw_text, tokenized_text)) - (None, [opinion', 'exciting', 'funny', 'movie']) + (None, ['opinion', 'exciting', 'funny', 'movie']) Keyword extraction ------------------ diff --git a/docs/source/quickstart.rst b/docs/source/quickstart.rst index 715f3452..0466b095 100644 --- a/docs/source/quickstart.rst +++ 
b/docs/source/quickstart.rst @@ -69,9 +69,9 @@ You might wonder, why not simply use the input column names from the header to s >>> dataset_with_chars.finalize_fields() >>> print(dataset_with_chars[1]) Example({ - input_text: (None, ['Amazingly', 'lame', '.']), - input_chars: (None, ['A', 'm', 'a', 'z', 'i', 'n', 'g', 'l', 'y', ' ', 'l', 'a', 'm', 'e', ' ', '.']), - target: (None, 'negative') + input_text: (None, ['Amazingly', 'lame', '.']), + input_chars: (None, ['A', 'm', 'a', 'z', 'i', 'n', 'g', 'l', 'y', ' ', 'l', 'a', 'm', 'e', ' ', '.']), + target: (None, 'negative') }) You might wonder what the ``None``\s we've been seeing represent. For each Field, we store raw and processed data as a tuple. The first element of the tuple is reserved for raw data, by default blank to preserve memory. For a detailed overview of the Field constructor arguments and how to use them, check :ref:`fields`. diff --git a/docs/source/scripts/check_notebooks.py b/docs/source/scripts/check_notebooks.py new file mode 100644 index 00000000..8671a0c8 --- /dev/null +++ b/docs/source/scripts/check_notebooks.py @@ -0,0 +1,186 @@ +import argparse +import copy +import os +import re +import shutil +import string +import subprocess +import textwrap +from functools import partial +from pathlib import Path + +import multiprocess +import nbformat +from nbconvert.preprocessors import ExecutePreprocessor + + +NOTEBOOKS_PATH = "../notebooks" +INSTALL_RELEASE_VERSION_COMMAND = "! pip install podium-nlp" +INSTALL_SOURCE_VERSION_COMMAND = "# ! 
NOTEBOOKS_PATH = "../notebooks"
INSTALL_RELEASE_VERSION_COMMAND = "! pip install podium-nlp"
INSTALL_SOURCE_VERSION_COMMAND = "# ! pip install git+https://github.com/TakeLab/podium.git"
INSTALL_SST_COMMAND = "python -c \"from podium.datasets import SST; SST.get_dataset_splits()\""
# Translation table that deletes every whitespace character; used for
# whitespace-insensitive comparison of cell outputs.
TRANS_TABLE = str.maketrans(dict.fromkeys(string.whitespace))

# Shell commands embedded in notebook code cells, e.g. `! pip install foo`
# or `! python -m spacy download ...`.
_re_pip_install = re.compile(r"!\s*(pip\s+install\s+[^\\\"]*)")
_re_python = re.compile(r"!\s*(python[^\\\"]*)")


def init(notebook_paths):
    """
    Run every install/download shell command found in the given notebooks
    once, up front, so parallel workers do not race on package installs or
    dataset downloads.
    """
    all_commands = []
    for notebook_path in notebook_paths:
        with open(notebook_path, encoding="utf-8") as f:
            notebook_raw = f.read()
        all_commands.extend(
            _re_pip_install.findall(notebook_raw) + _re_python.findall(notebook_raw)
        )

    # Windows cmd chains commands with "&", POSIX shells with ";".
    delim = "&" if os.name == "nt" else ";"
    # NOTE(review): shell=True with commands scraped from the notebooks is
    # acceptable only because the notebooks are trusted project files.
    subprocess.call(
        delim.join(
            [*all_commands, INSTALL_SOURCE_VERSION_COMMAND[4:], INSTALL_SST_COMMAND]
        ),
        shell=True,
        cwd=Path(NOTEBOOKS_PATH).absolute(),
        stdout=subprocess.DEVNULL,
        stderr=subprocess.STDOUT,
    )


def cleanup(snap_before_exec, snap_after_exec):
    """
    Delete every path present in the 'after' snapshot but not the 'before'
    one, i.e. all artifacts created while executing the notebooks.
    """
    created_paths = set(snap_after_exec) - set(snap_before_exec)
    for path in created_paths:
        if path.is_dir():
            shutil.rmtree(path)
        else:
            path.unlink()


def print_notebook_name_with_error(func):
    """
    Decorator that prints which notebook failed before re-raising any
    exception from the wrapped function.
    """

    def wrapper(*args, **kwargs):
        # The wrapped callable takes the notebook path as its first argument.
        # Fix: the previous version did ``kwargs.pop("self")``, which raised
        # KeyError (there is no "self" key) and would also have removed the
        # argument from kwargs before the call.
        if args:
            notebook_path = args[0]
        else:
            notebook_path = kwargs.get("notebook_path", "<unknown>")
        try:
            return func(*args, **kwargs)
        except Exception as err:
            print(f"Error in notebook {Path(notebook_path).name}:\n{err}")
            raise

    return wrapper


def replace_install_release_with_source(nb):
    """
    In the notebook's first (installation) cell, comment out the release
    install command and uncomment the install-from-source command, so the
    notebook is executed against the current repository state.

    Mutates ``nb`` in place; raises AssertionError when the first cell does
    not have the expected installation-cell layout.
    """
    cell = nb["cells"][0]
    # sanity check: the first cell must be the installation code cell
    assert cell["cell_type"] == "code"
    assert isinstance(cell["source"], str)

    assert INSTALL_RELEASE_VERSION_COMMAND in cell["source"]
    cell["source"] = cell["source"].replace(
        INSTALL_RELEASE_VERSION_COMMAND, "# " + INSTALL_RELEASE_VERSION_COMMAND
    )

    assert INSTALL_SOURCE_VERSION_COMMAND in cell["source"]
    cell["source"] = cell["source"].replace(
        INSTALL_SOURCE_VERSION_COMMAND, INSTALL_SOURCE_VERSION_COMMAND[2:]
    )
@print_notebook_name_with_error
def check_notebook_output(notebook_path, env="python3", ignore_whitespace=False):
    """
    Execute ``notebook_path`` in a fresh kernel and compare the freshly
    produced stdout of each code cell against the output recorded in the
    notebook file.

    Returns ``(notebook_name, report)`` where ``report`` is a list of
    ``(cell_index, recorded_stdout, executed_stdout)`` tuples, one per
    mismatching cell (empty when everything matches).
    """
    with open(notebook_path, encoding="utf-8") as f:
        nb = nbformat.read(f, as_version=4)

    original_nb = nb
    new_nb = copy.deepcopy(nb)
    replace_install_release_with_source(new_nb)

    # Run the whole notebook; paths inside it are relative to its directory.
    ep = ExecutePreprocessor(kernel_name=env)
    ep.preprocess(new_nb, {"metadata": {"path": str(Path(notebook_path).parent)}})

    # sanity check: execution must not add or drop cells
    assert len(original_nb["cells"]) == len(new_nb["cells"])

    report = []
    for i, (original_cell, new_cell) in enumerate(
        zip(original_nb["cells"], new_nb["cells"])
    ):
        # Only code cells with recorded outputs are comparable.  Cells
        # carrying the "elippsis" flag (sic -- the key is spelled this way by
        # convert_doc_to_notebooks.py) had a doctest ELLIPSIS option, so
        # their recorded output is intentionally partial and is skipped.
        if (
            original_cell["cell_type"] != "code"
            or original_cell["outputs"] == []
            or original_cell["metadata"].get("elippsis")
        ):
            continue

        # sanity check
        assert isinstance(original_cell["source"], str)
        # Skip cells that run shell/magic commands -- their output is
        # environment-dependent.  Fix: the previous version's ``continue``
        # only advanced the inner line loop, so such cells were never
        # actually skipped.
        if any(
            line.strip().startswith(("!", "%"))
            for line in original_cell["source"].splitlines()
        ):
            continue

        # sanity check: the converter records exactly one execute_result
        assert len(original_cell["outputs"]) == 1
        original_cell_stdout = original_cell["outputs"][0]["data"]["text/plain"]
        assert isinstance(original_cell_stdout, str)

        # Collect everything the executed cell printed to stdout.
        new_cell_stdout = "".join(
            new_cell_output["text"]
            for new_cell_output in new_cell["outputs"]
            if new_cell_output["output_type"] == "stream"
            and new_cell_output["name"] == "stdout"
        )

        # Keep the untouched versions for the human-readable report.
        original_cell_stdout_ = original_cell_stdout
        new_cell_stdout_ = new_cell_stdout

        if ignore_whitespace:
            original_cell_stdout = original_cell_stdout.translate(TRANS_TABLE)
            new_cell_stdout = new_cell_stdout.translate(TRANS_TABLE)
        elif new_cell_stdout.endswith("\n") and not original_cell_stdout.endswith("\n"):
            # Printed output ends with a newline the recorded output lacks.
            # Fix: endswith() is safe on empty strings, unlike indexing [-1],
            # which raised IndexError when either output was empty.
            original_cell_stdout += "\n"

        if original_cell_stdout != new_cell_stdout:
            report.append((i, original_cell_stdout_, new_cell_stdout_))

    return notebook_path.name, report
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--env", default="python3", help="kernel that executes the notebook")
    parser.add_argument("--num_proc", help="number of processes for parallel execution ('auto' = CPU count)")
    parser.add_argument("--ignore_whitespace", action="store_true", help="ignore whitespace when comparing cell outputs")
    parser.add_argument("--keep_artifacts", action="store_true", help="save files/directories created during execution")
    args = parser.parse_args()

    # Resolve the requested degree of parallelism.
    if args.num_proc is None:
        num_proc = 1
    elif args.num_proc == "auto":
        num_proc = multiprocess.cpu_count()
    else:
        num_proc = int(args.num_proc)

    # Collect notebooks, skipping editor checkpoints and the examples dir.
    notebook_paths = [
        notebook_path
        for notebook_path in Path(NOTEBOOKS_PATH).rglob("*.ipynb")
        if not (
            notebook_path.name.endswith("-checkpoint.ipynb")
            or notebook_path.parts[-2] == "examples"
        )
    ]

    # Snapshot the notebook directory so artifacts created by execution can
    # be removed afterwards (unless --keep_artifacts is given).
    snap_before_exec = list(Path(NOTEBOOKS_PATH).iterdir())

    # Never use more workers than CPUs or notebooks.  Fix: the max(1, ...)
    # guard keeps num_proc at 1 when no notebooks are found; previously an
    # empty directory yielded num_proc == 0 and multiprocess.Pool(0) failed.
    num_proc = max(1, min(num_proc, multiprocess.cpu_count(), len(notebook_paths)))
    if num_proc == 1:
        reports = [
            check_notebook_output(
                notebook_path, env=args.env, ignore_whitespace=args.ignore_whitespace
            )
            for notebook_path in notebook_paths
        ]
    else:
        # Install packages and predownload datasets/vectorizers up front so
        # the worker processes do not race on downloads.
        init(notebook_paths)
        with multiprocess.Pool(num_proc) as pool:
            reports = pool.map(
                partial(
                    check_notebook_output,
                    env=args.env,
                    ignore_whitespace=args.ignore_whitespace,
                ),
                notebook_paths,
            )

    if not args.keep_artifacts:
        snap_after_exec = list(Path(NOTEBOOKS_PATH).iterdir())
        cleanup(snap_before_exec, snap_after_exec)

    # Fail loudly (non-zero exit for CI) when any notebook's output changed.
    if any(report for _, report in reports):
        reports_str = "\n\n".join(
            f"In notebook {notebook}:\n"
            + textwrap.indent(
                "\n".join(
                    f"Cell {i}\n" + "=" * len(f"Cell {i}") + "\n"
                    + f"Original output:\n{original_output}\nAfter execution:\n{new_output}"
                    for i, original_output, new_output in report
                ),
                " " * 4,
            )
            for notebook, report in reports
            if len(report) > 0
        )
        raise Exception(
            "❌❌ Mismatches found in the outputs of the notebooks:\n\n" + reports_str
        )
a/docs/source/scripts/convert_doc_to_notebooks.py b/docs/source/scripts/convert_doc_to_notebooks.py index 1feb0731..e6f91e6c 100644 --- a/docs/source/scripts/convert_doc_to_notebooks.py +++ b/docs/source/scripts/convert_doc_to_notebooks.py @@ -273,7 +273,7 @@ def convert_math(text): def convert_anchor(text): """ Convert text to an anchor that can be used in the notebook.""" anchor_name = _re_anchor_section.search(text).groups()[0] - return f"" + return f"" ################################### @@ -357,6 +357,8 @@ def split_frameworks(code): # Matches any doctest pattern. _re_doctest = re.compile(r"^(>>>|\.\.\.)") +# Re pattern that matches doctest options in code blocks. +_re_doctest_option = re.compile(r'#\s*doctest:\s*([^\n\'"]*)$', re.MULTILINE) def parse_code_and_output(code): @@ -395,10 +397,16 @@ def code_cell(code, output=None): if output is None or len(output) == 0: outputs = [] else: + metadata = {} + for m in _re_doctest_option.finditer(code): + group = m.group(1) + if group == "+ELIPPSIS": + metadata["elippsis"] = True + code.replace(group, "") outputs = [nbformat.notebooknode.NotebookNode({ 'data': {'text/plain': output}, 'execution_count': None, - 'metadata': {}, + 'metadata': metadata, 'output_type': 'execute_result' })] return nbformat.notebooknode.NotebookNode( @@ -428,7 +436,7 @@ def rm_first_line(text): INSTALL_CODE = """# Podium installation ! pip install podium-nlp # To install from source instead of the last release, comment the command above and uncomment the following one. -# ! pip install git+https://github.com/takelab/podium +# ! pip install git+https://github.com/TakeLab/podium.git """ ADDITIONAL_DEPS = { @@ -440,8 +448,9 @@ def rm_first_line(text): ), "preprocessing.rst": textwrap.dedent( """\ - ! pip install sacremoses clean-text spacy truecase https://github.com/LIAAD/yake/archive/v0.4.2.tar.gz + ! pip install sacremoses clean-text spacy spacy-lookups-data truecase https://github.com/LIAAD/yake/archive/v0.4.2.tar.gz ! 
python -m spacy download en_core_web_sm + ! python -m nltk.downloader stopwords """ ), "walkthrough.rst": textwrap.dedent( diff --git a/docs/source/scripts/requirements.txt b/docs/source/scripts/requirements.txt new file mode 100644 index 00000000..a5fc2b3c --- /dev/null +++ b/docs/source/scripts/requirements.txt @@ -0,0 +1,5 @@ +ipykernel +ipywidgets +nbformat +nbconvert +multiprocess \ No newline at end of file diff --git a/docs/source/walkthrough.rst b/docs/source/walkthrough.rst index fa2150f4..cc573c28 100644 --- a/docs/source/walkthrough.rst +++ b/docs/source/walkthrough.rst @@ -33,7 +33,7 @@ One built-in dataset available in Podium is the `Stanford Sentiment Treebank >> from podium.datasets import SST - >>> sst_train, sst_dev, sst_test = SST.get_dataset_splits() # doctest:+ELLIPSIS + >>> sst_train, sst_dev, sst_test = SST.get_dataset_splits() >>> sst_train.finalize_fields() >>> print(sst_train) SST({ @@ -73,17 +73,21 @@ Converting a dataset from 🤗 datasets into Podium requires some work from your .. code-block:: python - >>> import datasets + >>> import os >>> from pprint import pprint + >>> + >>> from datasets import load_dataset + >>> >>> # Loading a huggingface dataset returns an instance of DatasetDict >>> # which contains the dataset splits (usually: train, valid, test) - >>> imdb = datasets.load_dataset('imdb') + >>> imdb = load_dataset('imdb') + >>> >>> print(imdb.keys()) dict_keys(['train', 'test', 'unsupervised']) >>> >>> # Each dataset has a set of features which need to be mapped >>> # to Podium Fields. 
- >>> print(imdb['train'].features) + >>> pprint(imdb['train'].features) {'label': ClassLabel(num_classes=2, names=['neg', 'pos'], names_file=None, id=None), 'text': Value(dtype='string', id=None)} @@ -99,17 +103,16 @@ Datasets from 🤗 need to either (1) be wrapped them in :class:`podium.datasets >>> imdb_train, imdb_test, imdb_unsupervised = HF.from_dataset_dict(imdb).values() >>> imdb_train.finalize_fields() >>> - >>> print(imdb_train.field_dict()) - {'label': LabelField({ - name: 'label', - keep_raw: False, - is_target: True - }), - 'text': Field({ - name: 'text', - keep_raw: False, - is_target: False, - vocab: Vocab({specials: ('', ''), eager: False, is_finalized: True, size: 280619}) + >>> print(imdb_train.field_dict) + {'text': Field({ + name: 'text', + keep_raw: False, + is_target: False, + vocab: Vocab({specials: ('', ''), eager: False, is_finalized: True, size: 280619}) + }), 'label': LabelField({ + name: 'label', + keep_raw: False, + is_target: True })} .. note:: @@ -509,7 +512,7 @@ The output of the function call is a numpy matrix of word embeddings which you c >>> glove = GloVe() >>> embeddings = glove.load_vocab(vocab) >>> print(f"For vocabulary of size: {len(vocab)} loaded embedding matrix of shape: {embeddings.shape}") - For vocabulary of size: 21701 loaded embedding matrix of shape: (21701, 300) + For vocabulary of size: 21701 loaded embedding matrix of shape: (16284, 300) >>> # We can obtain vectors for a single word (given the word is loaded) like this: >>> word = "sport" >>> print(f"Vector for {word}: {glove.token_to_vector(word)}") @@ -555,12 +558,12 @@ Now our vectorizer has seen the dataset as well as the vocabulary and has all th >>> print(type(tfidf_batch), tfidf_batch.shape) (6920, 4998) >>> print(tfidf_batch[222]) - (0, 2111) 0.617113703893198 - (0, 549) 0.5208201737884445 - (0, 499) 0.5116152860290002 - (0, 19) 0.2515101839877878 - (0, 1) 0.12681755258500052 - (0, 0) 0.08262419651916046 + (0, 2111) 0.617113703893198 + (0, 549) 
0.5208201737884445 + (0, 499) 0.5116152860290002 + (0, 19) 0.2515101839877878 + (0, 1) 0.12681755258500052 + (0, 0) 0.08262419651916046 The Tf-Idf counts are highly sparse since not all words from the vocabulary are present in every instance. To reduce the memory footprint of count-based numericalization, we store the values in a `SciPy `__ `sparse matrix `__, which can be used in various `scikit-learn `__ models. diff --git a/podium/datasets/arrow.py b/podium/datasets/arrow.py index b69b20b5..0cceb874 100644 --- a/podium/datasets/arrow.py +++ b/podium/datasets/arrow.py @@ -299,7 +299,7 @@ def from_tabular_file( format = format.lower() csv_reader_params = {} if csv_reader_params is None else csv_reader_params - with open(os.path.expanduser(path), encoding="utf8") as f: + with open(os.path.expanduser(path), encoding="utf-8") as f: if format in {"csv", "tsv"}: delimiter = "," if format == "csv" else "\t" reader = csv.reader(f, delimiter=delimiter, **csv_reader_params) @@ -542,7 +542,7 @@ def load_cache(cache_path) -> "DiskBackedDataset": """ # load fields fields_file_path = os.path.join(cache_path, CACHE_FIELDS_FILENAME) - with open(fields_file_path, "rb") as fields_cache_file: + with open(os.path.expanduser(fields_file_path), "rb") as fields_cache_file: fields = pickle.load(fields_cache_file) # load dataset as memory mapped arrow table @@ -587,7 +587,7 @@ def dump_cache(self, cache_path: Optional[str] = None) -> str: # pickle fields cache_fields_path = os.path.join(cache_path, CACHE_FIELDS_FILENAME) - with open(cache_fields_path, "wb") as fields_cache_file: + with open(os.path.expanduser(cache_fields_path), "wb") as fields_cache_file: pickle.dump(self.fields, fields_cache_file) # dump table diff --git a/podium/datasets/dataset.py b/podium/datasets/dataset.py index f26b3a6e..ab681adf 100644 --- a/podium/datasets/dataset.py +++ b/podium/datasets/dataset.py @@ -278,7 +278,7 @@ def shuffled(self) -> "DatasetBase": return self[shuffled_indices] def __repr__(self): - 
fields_str = ",\n".join(textwrap.indent(repr(f), " " * 8) for f in self.fields) + fields_str = ",\n".join(textwrap.indent(repr(f), " " * 4) for f in self.fields) fields_str = f"[\n{fields_str}\n \n]" attrs = {"size": len(self), "fields": fields_str} return repr_type_and_attrs(self, attrs, with_newlines=True, repr_values=False) diff --git a/podium/datasets/impl/conllu_dataset.py b/podium/datasets/impl/conllu_dataset.py index 6a377e28..5872e7c4 100644 --- a/podium/datasets/impl/conllu_dataset.py +++ b/podium/datasets/impl/conllu_dataset.py @@ -2,6 +2,7 @@ Module contains the CoNLL-U dataset. """ import collections +import os from podium.datasets import Dataset from podium.datasets.example_factory import ExampleFactory @@ -87,7 +88,7 @@ def safe_conllu_parse(in_file): example_factory = ExampleFactory(fields) examples = [] - with open(file_path, encoding="utf-8") as in_file: + with open(os.path.expanduser(file_path), encoding="utf-8") as in_file: for tokenlist in safe_conllu_parse(in_file): example_dict = collections.defaultdict(lambda: []) for token in tokenlist: diff --git a/podium/datasets/impl/imdb.py b/podium/datasets/impl/imdb.py index 500cd563..17950c49 100644 --- a/podium/datasets/impl/imdb.py +++ b/podium/datasets/impl/imdb.py @@ -153,7 +153,9 @@ def _create_labeled_examples(dir_path, label, fields): ] examples = [] for file_path in files_list: - with open(file=os.path.join(dir_path, file_path), encoding="utf8") as fpr: + with open( + os.path.expanduser(os.path.join(dir_path, file_path)), encoding="utf-8" + ) as fpr: data = {IMDB.TEXT_FIELD_NAME: fpr.read(), IMDB.LABEL_FIELD_NAME: label} examples.append(example_factory.from_dict(data)) return examples diff --git a/podium/datasets/impl/snli.py b/podium/datasets/impl/snli.py index bebc522e..52503052 100644 --- a/podium/datasets/impl/snli.py +++ b/podium/datasets/impl/snli.py @@ -99,7 +99,7 @@ def _create_examples(file_path, fields): example_factory = ExampleFactory(fields) examples = [] - with 
open(file=file_path, encoding="utf8") as in_file: + with open(os.path.expanduser(file_path), encoding="utf-8") as in_file: for line in in_file: examples.append(example_factory.from_json(line)) return examples diff --git a/podium/datasets/impl/sst.py b/podium/datasets/impl/sst.py index e0a40f45..f6200f25 100644 --- a/podium/datasets/impl/sst.py +++ b/podium/datasets/impl/sst.py @@ -123,7 +123,7 @@ def label_trf(label): return label_to_string_map[label] examples = [] - with open(file=file_path, encoding="utf8") as fpr: + with open(os.path.expanduser(file_path), encoding="utf-8") as fpr: for line in fpr: example = example_factory.from_fields_tree( diff --git a/podium/field.py b/podium/field.py index cd946577..fc554412 100644 --- a/podium/field.py +++ b/podium/field.py @@ -954,7 +954,7 @@ def remove_pretokenize_hooks(self): def __repr__(self): fields_str = ",\n".join( - textwrap.indent(repr(f), " " * 8) for f in self._output_fields + textwrap.indent(repr(f), " " * 4) for f in self._output_fields ) fields_str = f"[\n{fields_str}\n \n]" attrs = {"fields": fields_str} diff --git a/podium/vectorizers/vectorizer.py b/podium/vectorizers/vectorizer.py index 2f7b1e1a..982e1f20 100644 --- a/podium/vectorizers/vectorizer.py +++ b/podium/vectorizers/vectorizer.py @@ -298,7 +298,7 @@ def _cache_vectors(self): """ Method for caching loaded vectors to cache_dir. 
""" - with open(self._cache_path, "wb") as cache_file: + with open(os.path.expanduser(self._cache_path), "wb") as cache_file: for word in self._vectors: vector_values_string = " ".join(map(str, self._vectors[word])) cache_file.write(f"{word} {vector_values_string}\n".encode("utf-8")) @@ -362,7 +362,9 @@ def _load_vectors(self, vocab=None): vocab = set(vocab) open_mode, split_delimiter = ("rb", b" ") if self._binary else ("r", " ") - with open(curr_path, open_mode, encoding=self._encoding) as vector_file: + with open( + os.path.expanduser(curr_path), open_mode, encoding=self._encoding + ) as vector_file: vectors_loaded = 0 header_lines = 0 diff --git a/setup.py b/setup.py index 9b643cad..ef4dc1f0 100644 --- a/setup.py +++ b/setup.py @@ -97,7 +97,6 @@ def _get_version(): 'sphinx_rtd_theme', 'sphinx-copybutton', 'recommonmark', - 'nbformat', 'datasets', ]