diff --git a/.github/workflows/scheduled.yml b/.github/workflows/scheduled.yml
new file mode 100644
index 00000000..941a3d26
--- /dev/null
+++ b/.github/workflows/scheduled.yml
@@ -0,0 +1,33 @@
+name: Scheduled jobs
+
+on:
+ pull_request:
+ branches:
+ - master
+ repository_dispatch:
+ schedule:
+ - cron: "0 0 * * *"
+
+jobs:
+ check_notebooks:
+ runs-on: ubuntu-latest
+ strategy:
+ matrix:
+ python-version: [3.6]
+ defaults:
+ run:
+ working-directory: docs/source/scripts
+
+ steps:
+ - uses: actions/checkout@v2
+ - name: Set up Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v2
+ with:
+ python-version: ${{ matrix.python-version }}
+ - name: Install dependencies
+ run: |
+ python -m pip install --upgrade pip
+ pip install -r requirements.txt
+ - name: Execute notebooks
+ run: |
+ python check_notebooks.py --num_proc auto --ignore_whitespace
diff --git a/README.md b/README.md
index 84ea9eef..e7489d16 100644
--- a/README.md
+++ b/README.md
@@ -207,7 +207,7 @@ Example({
})
```
-For a more interactive introduction, check out the quickstart on Google Colab: [](https://colab.research.google.com/github/takelab/podium/blob/master/docs/source/notebooks/quickstart.ipynb)
+For a more interactive introduction, check out the quickstart on Google Colab: [](https://colab.research.google.com/github/TakeLab/podium/blob/master/docs/source/notebooks/quickstart.ipynb)
More complex examples can be found in our [examples folder](./examples).
diff --git a/docs/source/advanced.rst b/docs/source/advanced.rst
index 25a939eb..705df020 100644
--- a/docs/source/advanced.rst
+++ b/docs/source/advanced.rst
@@ -379,7 +379,9 @@ For a simple example, we will take a look at the built-in SST and IMDB datasets:
>>> from podium import Field, LabelField, Vocab
>>> # Load the datasets
>>> imdb_train, imdb_test = IMDB.get_dataset_splits()
+ >>> imdb_train.finalize_fields()
>>> sst_train, sst_dev, sst_test = SST.get_dataset_splits()
+ >>> sst_train.finalize_fields()
>>>
>>> # Luckily, both label vocabularies are already equal
>>> print(imdb_train.field('label').vocab.itos)
@@ -416,6 +418,7 @@ For this reason, usage of :class:`podium.datasets.BucketIterator` is recommended
>>> fields = {'text': text, 'label': label}
>>>
>>> train, valid, test = SST.get_dataset_splits(fields=fields)
+ >>> train.finalize_fields()
>>>
>>> # Define the iterators and our sort key
>>> from podium import Iterator, BucketIterator
@@ -423,14 +426,14 @@ For this reason, usage of :class:`podium.datasets.BucketIterator` is recommended
>>> # Use the text Field
>>> raw, tokenized = instance.text
>>> return len(tokenized)
- >>> bucket_iter = BucketIterator(train, batch_size=32, bucket_sort_key=instance_length)
+ >>> bucket_iter = BucketIterator(sst_train, batch_size=32, bucket_sort_key=instance_length)
The ``bucket_sort_key`` function defines how the instances in the dataset should be sorted. The method accepts an instance of the dataset, and should return a value which will be used as a sort key in the ``BucketIterator``. It might be interesting (and surprising) to see how much space (and time) do we earn by bucketing. We will define a naive iterator on the same dataset and measure the total amount of padding used when iterating over a dataset.
.. code-block:: python
>>> import numpy as np
- >>> vanilla_iter = Iterator(train, batch_size=32)
+ >>> vanilla_iter = Iterator(sst_train, batch_size=32)
>>>
>>> def count_padding(batch, padding_idx):
>>> return np.count_nonzero(batch == padding_idx)
@@ -518,7 +521,7 @@ Each ``Dataset`` instance in the SST dataset splits contains ``Field``\s and a `
>>> import pickle
>>>
>>> cache_dir = Path('cache')
- >>> cache_dir.mkdir()
+ >>> cache_dir.mkdir(exist_ok=True)
>>>
>>> dataset_store_path = cache_dir.joinpath('sst_preprocessed.pkl')
>>>
diff --git a/docs/source/installation.md b/docs/source/installation.md
index 7c67d2fb..bcc7ab55 100644
--- a/docs/source/installation.md
+++ b/docs/source/installation.md
@@ -23,5 +23,5 @@ Coming soon!
## Installing from source
To install from source via terminal:
-1. Clone the repository: `git clone git@github.com:takelab/podium.git && cd podium`
+1. Clone the repository: `git clone git@github.com:TakeLab/podium.git && cd podium`
2. Install podium: `pip install .`
diff --git a/docs/source/notebooks/advanced.ipynb b/docs/source/notebooks/advanced.ipynb
index c9f7d874..b5dd9b4c 100644
--- a/docs/source/notebooks/advanced.ipynb
+++ b/docs/source/notebooks/advanced.ipynb
@@ -9,7 +9,7 @@
"# Podium installation\n",
"! pip install podium-nlp\n",
"# To install from source instead of the last release, comment the command above and uncomment the following one.\n",
- "# ! pip install git+https://github.com/takelab/podium\n",
+ "# ! pip install git+https://github.com/TakeLab/podium.git\n",
"\n",
"# Additional dependencies required to run this notebook:\n",
"! pip install transformers spacy\n",
@@ -134,7 +134,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- ""
+ ""
]
},
{
@@ -337,7 +337,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- ""
+ ""
]
},
{
@@ -477,7 +477,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- ""
+ ""
]
},
{
@@ -798,7 +798,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- ""
+ ""
]
},
{
@@ -843,7 +843,9 @@
"from podium import Field, LabelField, Vocab\n",
"# Load the datasets\n",
"imdb_train, imdb_test = IMDB.get_dataset_splits()\n",
+ "imdb_train.finalize_fields()\n",
"sst_train, sst_dev, sst_test = SST.get_dataset_splits()\n",
+ "sst_train.finalize_fields()\n",
"\n",
"# Luckily, both label vocabularies are already equal\n",
"print(imdb_train.field('label').vocab.itos)\n",
@@ -867,7 +869,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- ""
+ ""
]
},
{
@@ -900,6 +902,7 @@
"fields = {'text': text, 'label': label}\n",
"\n",
"train, valid, test = SST.get_dataset_splits(fields=fields)\n",
+ "train.finalize_fields()\n",
"\n",
"# Define the iterators and our sort key\n",
"from podium import Iterator, BucketIterator\n",
@@ -907,7 +910,7 @@
" # Use the text Field\n",
" raw, tokenized = instance.text\n",
" return len(tokenized)\n",
- "bucket_iter = BucketIterator(train, batch_size=32, bucket_sort_key=instance_length)"
+ "bucket_iter = BucketIterator(sst_train, batch_size=32, bucket_sort_key=instance_length)"
]
},
{
@@ -936,7 +939,7 @@
],
"source": [
"import numpy as np\n",
- "vanilla_iter = Iterator(train, batch_size=32)\n",
+ "vanilla_iter = Iterator(sst_train, batch_size=32)\n",
"\n",
"def count_padding(batch, padding_idx):\n",
" return np.count_nonzero(batch == padding_idx)\n",
@@ -1092,7 +1095,7 @@
"import pickle\n",
"\n",
"cache_dir = Path('cache')\n",
- "cache_dir.mkdir()\n",
+ "cache_dir.mkdir(exist_ok=True)\n",
"\n",
"dataset_store_path = cache_dir.joinpath('sst_preprocessed.pkl')\n",
"\n",
diff --git a/docs/source/notebooks/preprocessing.ipynb b/docs/source/notebooks/preprocessing.ipynb
index 53e59961..609c259d 100644
--- a/docs/source/notebooks/preprocessing.ipynb
+++ b/docs/source/notebooks/preprocessing.ipynb
@@ -9,11 +9,12 @@
"# Podium installation\n",
"! pip install podium-nlp\n",
"# To install from source instead of the last release, comment the command above and uncomment the following one.\n",
- "# ! pip install git+https://github.com/takelab/podium\n",
+ "# ! pip install git+https://github.com/TakeLab/podium.git\n",
"\n",
"# Additional dependencies required to run this notebook:\n",
- "! pip install sacremoses clean-text spacy truecase https://github.com/LIAAD/yake/archive/v0.4.2.tar.gz\n",
- "! python -m spacy download en_core_web_sm"
+ "! pip install sacremoses clean-text spacy spacy-lookups-data truecase https://github.com/LIAAD/yake/archive/v0.4.2.tar.gz\n",
+ "! python -m spacy download en_core_web_sm\n",
+ "! python -m nltk.downloader stopwords"
]
},
{
@@ -387,7 +388,7 @@
{
"data": {
"text/plain": [
- "(None, [opinion', 'exciting', 'funny', 'movie'])"
+ "(None, ['opinion', 'exciting', 'funny', 'movie'])"
]
},
"execution_count": null,
diff --git a/docs/source/notebooks/quickstart.ipynb b/docs/source/notebooks/quickstart.ipynb
index 7d23db64..6522f6f2 100644
--- a/docs/source/notebooks/quickstart.ipynb
+++ b/docs/source/notebooks/quickstart.ipynb
@@ -9,7 +9,7 @@
"# Podium installation\n",
"! pip install podium-nlp\n",
"# To install from source instead of the last release, comment the command above and uncomment the following one.\n",
- "# ! pip install git+https://github.com/takelab/podium"
+ "# ! pip install git+https://github.com/TakeLab/podium.git"
]
},
{
@@ -131,9 +131,9 @@
"data": {
"text/plain": [
"Example({\n",
- " input_text: (None, ['Amazingly', 'lame', '.']),\n",
- " input_chars: (None, ['A', 'm', 'a', 'z', 'i', 'n', 'g', 'l', 'y', ' ', 'l', 'a', 'm', 'e', ' ', '.']),\n",
- " target: (None, 'negative')\n",
+ " input_text: (None, ['Amazingly', 'lame', '.']),\n",
+ " input_chars: (None, ['A', 'm', 'a', 'z', 'i', 'n', 'g', 'l', 'y', ' ', 'l', 'a', 'm', 'e', ' ', '.']),\n",
+ " target: (None, 'negative')\n",
"})"
]
},
diff --git a/docs/source/notebooks/sample_dataset.csv b/docs/source/notebooks/sample_dataset.csv
new file mode 100644
index 00000000..7827a95f
--- /dev/null
+++ b/docs/source/notebooks/sample_dataset.csv
@@ -0,0 +1,3 @@
+text,label
+Absorbing character study .,positive
+Amazingly lame .,negative
diff --git a/docs/source/notebooks/walkthrough.ipynb b/docs/source/notebooks/walkthrough.ipynb
index f78d4d45..8ff1658d 100644
--- a/docs/source/notebooks/walkthrough.ipynb
+++ b/docs/source/notebooks/walkthrough.ipynb
@@ -9,7 +9,7 @@
"# Podium installation\n",
"! pip install podium-nlp\n",
"# To install from source instead of the last release, comment the command above and uncomment the following one.\n",
- "# ! pip install git+https://github.com/takelab/podium\n",
+ "# ! pip install git+https://github.com/TakeLab/podium.git\n",
"\n",
"# Additional dependencies required to run this notebook:\n",
"! pip install datasets spacy\n",
@@ -49,7 +49,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- ""
+ ""
]
},
{
@@ -104,7 +104,7 @@
],
"source": [
"from podium.datasets import SST\n",
- "sst_train, sst_dev, sst_test = SST.get_dataset_splits() # doctest:+ELLIPSIS\n",
+ "sst_train, sst_dev, sst_test = SST.get_dataset_splits()\n",
"sst_train.finalize_fields()\n",
"print(sst_train)\n",
"print(sst_train[222]) # A short example"
@@ -121,7 +121,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- ""
+ ""
]
},
{
@@ -159,16 +159,20 @@
}
],
"source": [
- "import datasets\n",
+ "import os\n",
"from pprint import pprint\n",
+ "\n",
+ "from datasets import load_dataset\n",
+ "\n",
"# Loading a huggingface dataset returns an instance of DatasetDict\n",
"# which contains the dataset splits (usually: train, valid, test) \n",
- "imdb = datasets.load_dataset('imdb')\n",
+ "imdb = load_dataset('imdb')\n",
+ "\n",
"print(imdb.keys())\n",
"\n",
"# Each dataset has a set of features which need to be mapped\n",
"# to Podium Fields.\n",
- "print(imdb['train'].features)"
+ "pprint(imdb['train'].features)"
]
},
{
@@ -188,16 +192,15 @@
{
"data": {
"text/plain": [
- "{'label': LabelField({\n",
- " name: 'label',\n",
- " keep_raw: False,\n",
- " is_target: True\n",
- "}),\n",
- " 'text': Field({\n",
- " name: 'text',\n",
- " keep_raw: False,\n",
- " is_target: False,\n",
- " vocab: Vocab({specials: ('', ''), eager: False, is_finalized: True, size: 280619})\n",
+ "{'text': Field({\n",
+ " name: 'text',\n",
+ " keep_raw: False,\n",
+ " is_target: False,\n",
+ " vocab: Vocab({specials: ('', ''), eager: False, is_finalized: True, size: 280619})\n",
+ " }), 'label': LabelField({\n",
+ " name: 'label',\n",
+ " keep_raw: False,\n",
+ " is_target: True\n",
"})}"
]
},
@@ -213,7 +216,7 @@
"imdb_train, imdb_test, imdb_unsupervised = HF.from_dataset_dict(imdb).values()\n",
"imdb_train.finalize_fields()\n",
"\n",
- "print(imdb_train.field_dict())"
+ "print(imdb_train.field_dict)"
]
},
{
@@ -266,7 +269,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- ""
+ ""
]
},
{
@@ -432,7 +435,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- ""
+ ""
]
},
{
@@ -494,7 +497,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- ""
+ ""
]
},
{
@@ -682,7 +685,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- ""
+ ""
]
},
{
@@ -817,7 +820,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- ""
+ ""
]
},
{
@@ -994,7 +997,7 @@
{
"data": {
"text/plain": [
- "For vocabulary of size: 21701 loaded embedding matrix of shape: (21701, 300)\n",
+ "For vocabulary of size: 21701 loaded embedding matrix of shape: (16284, 300)\n",
"Vector for sport: [ 0.34566 0.15934 0.48444 -0.13693 0.18737 0.2678\n",
" -0.39159 0.4931 -0.76111 -1.4586 0.41475 0.55837\n",
" ...\n",
@@ -1083,12 +1086,12 @@
"data": {
"text/plain": [
" (6920, 4998)\n",
- "(0, 2111) 0.617113703893198\n",
- "(0, 549) 0.5208201737884445\n",
- "(0, 499) 0.5116152860290002\n",
- "(0, 19) 0.2515101839877878\n",
- "(0, 1) 0.12681755258500052\n",
- "(0, 0) 0.08262419651916046"
+ " (0, 2111) 0.617113703893198\n",
+ " (0, 549) 0.5208201737884445\n",
+ " (0, 499) 0.5116152860290002\n",
+ " (0, 19) 0.2515101839877878\n",
+ " (0, 1) 0.12681755258500052\n",
+ " (0, 0) 0.08262419651916046"
]
},
"execution_count": null,
diff --git a/docs/source/preprocessing.rst b/docs/source/preprocessing.rst
index 58e2c86c..c0f5a3bd 100644
--- a/docs/source/preprocessing.rst
+++ b/docs/source/preprocessing.rst
@@ -159,7 +159,7 @@ Stopword removal
>>> raw_text = None
>>> tokenized_text = ['in', 'my', 'opinion', 'an', 'exciting', 'and', 'funny', 'movie']
>>> print(remove_stopwords_hook(raw_text, tokenized_text))
- (None, [opinion', 'exciting', 'funny', 'movie'])
+ (None, ['opinion', 'exciting', 'funny', 'movie'])
Keyword extraction
------------------
diff --git a/docs/source/quickstart.rst b/docs/source/quickstart.rst
index 715f3452..0466b095 100644
--- a/docs/source/quickstart.rst
+++ b/docs/source/quickstart.rst
@@ -69,9 +69,9 @@ You might wonder, why not simply use the input column names from the header to s
>>> dataset_with_chars.finalize_fields()
>>> print(dataset_with_chars[1])
Example({
- input_text: (None, ['Amazingly', 'lame', '.']),
- input_chars: (None, ['A', 'm', 'a', 'z', 'i', 'n', 'g', 'l', 'y', ' ', 'l', 'a', 'm', 'e', ' ', '.']),
- target: (None, 'negative')
+ input_text: (None, ['Amazingly', 'lame', '.']),
+ input_chars: (None, ['A', 'm', 'a', 'z', 'i', 'n', 'g', 'l', 'y', ' ', 'l', 'a', 'm', 'e', ' ', '.']),
+ target: (None, 'negative')
})
You might wonder what the ``None``\s we've been seeing represent. For each Field, we store raw and processed data as a tuple. The first element of the tuple is reserved for raw data, by default blank to preserve memory. For a detailed overview of the Field constructor arguments and how to use them, check :ref:`fields`.
diff --git a/docs/source/scripts/check_notebooks.py b/docs/source/scripts/check_notebooks.py
new file mode 100644
index 00000000..8671a0c8
--- /dev/null
+++ b/docs/source/scripts/check_notebooks.py
@@ -0,0 +1,186 @@
+import argparse
+import copy
+import os
+import re
+import shutil
+import string
+import subprocess
+import textwrap
+from functools import partial
+from pathlib import Path
+
+import multiprocess
+import nbformat
+from nbconvert.preprocessors import ExecutePreprocessor
+
+
+NOTEBOOKS_PATH = "../notebooks"
+INSTALL_RELEASE_VERSION_COMMAND = "! pip install podium-nlp"
+INSTALL_SOURCE_VERSION_COMMAND = "# ! pip install git+https://github.com/TakeLab/podium.git"
+INSTALL_SST_COMMAND = "python -c \"from podium.datasets import SST; SST.get_dataset_splits()\""
+TRANS_TABLE = str.maketrans(dict.fromkeys(string.whitespace))
+
+_re_pip_install = re.compile(r"!\s*(pip\s+install\s+[^\\\"]*)")
+_re_python = re.compile(r"!\s*(python[^\\\"]*)")
+
+
+def init(notebook_paths):
+ all_commands = []
+ for notebook_path in notebook_paths:
+ with open(notebook_path, encoding="utf-8") as f:
+ notebook_raw = f.read()
+ commands = _re_pip_install.findall(notebook_raw) + _re_python.findall(notebook_raw)
+ all_commands.extend(commands)
+
+ delim = "&" if os.name == "nt" else ";"
+ subprocess.call(
+ delim.join([*all_commands, INSTALL_SOURCE_VERSION_COMMAND[4:], INSTALL_SST_COMMAND]),
+ shell=True,
+ cwd=Path(NOTEBOOKS_PATH).absolute(),
+ stdout=subprocess.DEVNULL,
+ stderr=subprocess.STDOUT,
+ )
+
+
+def cleanup(snap_before_exec, snap_after_exec):
+ created_paths = set(snap_after_exec) - set(snap_before_exec)
+ for path in created_paths:
+ if path.is_dir():
+ shutil.rmtree(path)
+ else:
+ path.unlink()
+
+
+def print_notebook_name_with_error(func):
+ def wrapper(*args, **kwargs):
+ if args:
+ notebook_path = args[0]
+ else:
+            notebook_path = kwargs["notebook_path"]
+ try:
+ return func(*args, **kwargs)
+ except Exception as err:
+ print(f"Error in notebook {Path(notebook_path).name}:\n{err}")
+ raise
+ return wrapper
+
+
+def replace_install_release_with_source(nb):
+ cell = nb["cells"][0]
+ # sanity check
+ assert cell["cell_type"] == "code"
+ assert isinstance(cell["source"], str)
+
+ assert INSTALL_RELEASE_VERSION_COMMAND in cell["source"]
+ cell["source"] = cell["source"].replace(INSTALL_RELEASE_VERSION_COMMAND, "# " + INSTALL_RELEASE_VERSION_COMMAND)
+
+ assert INSTALL_SOURCE_VERSION_COMMAND in cell["source"]
+ cell["source"] = cell["source"].replace(INSTALL_SOURCE_VERSION_COMMAND, INSTALL_SOURCE_VERSION_COMMAND[2:])
+
+
+@print_notebook_name_with_error
+def check_notebook_output(notebook_path, env="python3", ignore_whitespace=False):
+ with open(notebook_path, encoding="utf-8") as f:
+ nb = nbformat.read(f, as_version=4)
+
+ original_nb = nb
+ new_nb = copy.deepcopy(nb)
+ replace_install_release_with_source(new_nb)
+
+ ep = ExecutePreprocessor(kernel_name=env)
+ ep.preprocess(new_nb, {"metadata": {"path": str(Path(notebook_path).parent)}})
+
+ assert len(original_nb["cells"]) == len(new_nb["cells"])
+
+ report = []
+ for i, (original_cell, new_cell) in enumerate(zip(original_nb["cells"], new_nb["cells"])):
+ # consider only cells with code
+ if original_cell["cell_type"] != "code" or original_cell["outputs"] == [] or original_cell["metadata"].get("elippsis"):
+ continue
+
+ # sanity check
+ assert isinstance(original_cell["source"], str)
+        # skip cells with shell/magic commands (their output is environment-dependent)
+        if any(line.strip().startswith(("!", "%"))
+               for line in original_cell["source"].splitlines()):
+            continue
+
+ # sanity check
+ assert len(original_cell["outputs"]) == 1
+ original_cell_stdout = original_cell["outputs"][0]["data"]["text/plain"]
+ assert isinstance(original_cell_stdout, str)
+
+ new_cell_stdout = "".join([
+ new_cell_output["text"]
+ for new_cell_output in new_cell["outputs"] if new_cell_output["output_type"] == "stream" and new_cell_output["name"] == "stdout"
+ ])
+
+ original_cell_stdout_ = original_cell_stdout
+ new_cell_stdout_ = new_cell_stdout
+
+ if ignore_whitespace:
+ original_cell_stdout = original_cell_stdout.translate(TRANS_TABLE)
+ new_cell_stdout = new_cell_stdout.translate(TRANS_TABLE)
+ else:
+            if new_cell_stdout.endswith("\n") and not original_cell_stdout.endswith("\n"):
+ original_cell_stdout += "\n"
+
+ if original_cell_stdout != new_cell_stdout:
+ report.append((i, original_cell_stdout_, new_cell_stdout_))
+
+ return notebook_path.name, report
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--env", default="python3", help="kernel that executes the notebook")
+ parser.add_argument("--num_proc", help="number of processes for parallel execution")
+ parser.add_argument("--ignore_whitespace", action="store_true", help="ignore whitespace when comparing cell outputs")
+ parser.add_argument("--keep_artifacts", action="store_true", help="save files/directories created during execution")
+ args = parser.parse_args()
+
+ if args.num_proc is None:
+ num_proc = 1
+ elif args.num_proc == "auto":
+ num_proc = multiprocess.cpu_count()
+ else:
+ num_proc = int(args.num_proc)
+
+ notebook_paths = [
+ notebook_path
+ for notebook_path in Path(NOTEBOOKS_PATH).rglob("*.ipynb")
+ if not (notebook_path.name.endswith("-checkpoint.ipynb") or notebook_path.parts[-2] == "examples")
+ ]
+
+ snap_before_exec = list(Path(NOTEBOOKS_PATH).iterdir())
+
+ num_proc = min(min(num_proc, multiprocess.cpu_count()), len(notebook_paths))
+ if num_proc == 1:
+ reports = []
+ for notebook_path in notebook_paths:
+ report = check_notebook_output(notebook_path, env=args.env, ignore_whitespace=args.ignore_whitespace)
+ reports.append(report)
+ else:
+ # install packages and predownload datasets/vectorizers to prevent parallel download
+ init(notebook_paths)
+ with multiprocess.Pool(num_proc) as pool:
+ reports = pool.map(partial(check_notebook_output, env=args.env, ignore_whitespace=args.ignore_whitespace), notebook_paths)
+
+ if args.keep_artifacts is False:
+ snap_after_exec = list(Path(NOTEBOOKS_PATH).iterdir())
+ cleanup(snap_before_exec, snap_after_exec)
+
+ if any(report for _, report in reports):
+ reports_str = "\n\n".join([
+ f"In notebook {notebook}:\n" + textwrap.indent(
+ "\n".join(
+ f"Cell {i}\n" + "=" * len(f"Cell {i}") + "\n" +
+ f"Original output:\n{original_output}\nAfter execution:\n{new_output}"
+ for i, original_output, new_output in report),
+ " " * 4,
+ )
+ for notebook, report in reports if len(report) > 0
+ ])
+ raise Exception(
+ "❌❌ Mismatches found in the outputs of the notebooks:\n\n" + reports_str
+ )
diff --git a/docs/source/scripts/convert_doc_to_notebooks.py b/docs/source/scripts/convert_doc_to_notebooks.py
index 1feb0731..e6f91e6c 100644
--- a/docs/source/scripts/convert_doc_to_notebooks.py
+++ b/docs/source/scripts/convert_doc_to_notebooks.py
@@ -273,7 +273,7 @@ def convert_math(text):
def convert_anchor(text):
""" Convert text to an anchor that can be used in the notebook."""
anchor_name = _re_anchor_section.search(text).groups()[0]
- return f""
+ return f""
###################################
@@ -357,6 +357,8 @@ def split_frameworks(code):
# Matches any doctest pattern.
_re_doctest = re.compile(r"^(>>>|\.\.\.)")
+# Re pattern that matches doctest options in code blocks.
+_re_doctest_option = re.compile(r'#\s*doctest:\s*([^\n\'"]*)$', re.MULTILINE)
def parse_code_and_output(code):
@@ -395,10 +397,16 @@ def code_cell(code, output=None):
if output is None or len(output) == 0:
outputs = []
else:
+ metadata = {}
+ for m in _re_doctest_option.finditer(code):
+ group = m.group(1)
+        if group == "+ELLIPSIS":
+            metadata["elippsis"] = True
+            code = code.replace(m.group(0), "")
outputs = [nbformat.notebooknode.NotebookNode({
'data': {'text/plain': output},
'execution_count': None,
- 'metadata': {},
+ 'metadata': metadata,
'output_type': 'execute_result'
})]
return nbformat.notebooknode.NotebookNode(
@@ -428,7 +436,7 @@ def rm_first_line(text):
INSTALL_CODE = """# Podium installation
! pip install podium-nlp
# To install from source instead of the last release, comment the command above and uncomment the following one.
-# ! pip install git+https://github.com/takelab/podium
+# ! pip install git+https://github.com/TakeLab/podium.git
"""
ADDITIONAL_DEPS = {
@@ -440,8 +448,9 @@ def rm_first_line(text):
),
"preprocessing.rst": textwrap.dedent(
"""\
- ! pip install sacremoses clean-text spacy truecase https://github.com/LIAAD/yake/archive/v0.4.2.tar.gz
+ ! pip install sacremoses clean-text spacy spacy-lookups-data truecase https://github.com/LIAAD/yake/archive/v0.4.2.tar.gz
! python -m spacy download en_core_web_sm
+ ! python -m nltk.downloader stopwords
"""
),
"walkthrough.rst": textwrap.dedent(
diff --git a/docs/source/scripts/requirements.txt b/docs/source/scripts/requirements.txt
new file mode 100644
index 00000000..a5fc2b3c
--- /dev/null
+++ b/docs/source/scripts/requirements.txt
@@ -0,0 +1,5 @@
+ipykernel
+ipywidgets
+nbformat
+nbconvert
+multiprocess
\ No newline at end of file
diff --git a/docs/source/walkthrough.rst b/docs/source/walkthrough.rst
index fa2150f4..cc573c28 100644
--- a/docs/source/walkthrough.rst
+++ b/docs/source/walkthrough.rst
@@ -33,7 +33,7 @@ One built-in dataset available in Podium is the `Stanford Sentiment Treebank >> from podium.datasets import SST
- >>> sst_train, sst_dev, sst_test = SST.get_dataset_splits() # doctest:+ELLIPSIS
+ >>> sst_train, sst_dev, sst_test = SST.get_dataset_splits()
>>> sst_train.finalize_fields()
>>> print(sst_train)
SST({
@@ -73,17 +73,21 @@ Converting a dataset from 🤗 datasets into Podium requires some work from your
.. code-block:: python
- >>> import datasets
+ >>> import os
>>> from pprint import pprint
+ >>>
+ >>> from datasets import load_dataset
+ >>>
>>> # Loading a huggingface dataset returns an instance of DatasetDict
>>> # which contains the dataset splits (usually: train, valid, test)
- >>> imdb = datasets.load_dataset('imdb')
+ >>> imdb = load_dataset('imdb')
+ >>>
>>> print(imdb.keys())
dict_keys(['train', 'test', 'unsupervised'])
>>>
>>> # Each dataset has a set of features which need to be mapped
>>> # to Podium Fields.
- >>> print(imdb['train'].features)
+ >>> pprint(imdb['train'].features)
{'label': ClassLabel(num_classes=2, names=['neg', 'pos'], names_file=None, id=None),
'text': Value(dtype='string', id=None)}
@@ -99,17 +103,16 @@ Datasets from 🤗 need to either (1) be wrapped them in :class:`podium.datasets
>>> imdb_train, imdb_test, imdb_unsupervised = HF.from_dataset_dict(imdb).values()
>>> imdb_train.finalize_fields()
>>>
- >>> print(imdb_train.field_dict())
- {'label': LabelField({
- name: 'label',
- keep_raw: False,
- is_target: True
- }),
- 'text': Field({
- name: 'text',
- keep_raw: False,
- is_target: False,
- vocab: Vocab({specials: ('', ''), eager: False, is_finalized: True, size: 280619})
+ >>> print(imdb_train.field_dict)
+ {'text': Field({
+ name: 'text',
+ keep_raw: False,
+ is_target: False,
+ vocab: Vocab({specials: ('', ''), eager: False, is_finalized: True, size: 280619})
+ }), 'label': LabelField({
+ name: 'label',
+ keep_raw: False,
+ is_target: True
})}
.. note::
@@ -509,7 +512,7 @@ The output of the function call is a numpy matrix of word embeddings which you c
>>> glove = GloVe()
>>> embeddings = glove.load_vocab(vocab)
>>> print(f"For vocabulary of size: {len(vocab)} loaded embedding matrix of shape: {embeddings.shape}")
- For vocabulary of size: 21701 loaded embedding matrix of shape: (21701, 300)
+ For vocabulary of size: 21701 loaded embedding matrix of shape: (16284, 300)
>>> # We can obtain vectors for a single word (given the word is loaded) like this:
>>> word = "sport"
>>> print(f"Vector for {word}: {glove.token_to_vector(word)}")
@@ -555,12 +558,12 @@ Now our vectorizer has seen the dataset as well as the vocabulary and has all th
>>> print(type(tfidf_batch), tfidf_batch.shape)
(6920, 4998)
>>> print(tfidf_batch[222])
- (0, 2111) 0.617113703893198
- (0, 549) 0.5208201737884445
- (0, 499) 0.5116152860290002
- (0, 19) 0.2515101839877878
- (0, 1) 0.12681755258500052
- (0, 0) 0.08262419651916046
+ (0, 2111) 0.617113703893198
+ (0, 549) 0.5208201737884445
+ (0, 499) 0.5116152860290002
+ (0, 19) 0.2515101839877878
+ (0, 1) 0.12681755258500052
+ (0, 0) 0.08262419651916046
The Tf-Idf counts are highly sparse since not all words from the vocabulary are present in every instance. To reduce the memory footprint of count-based numericalization, we store the values in a `SciPy `__ `sparse matrix `__, which can be used in various `scikit-learn `__ models.
diff --git a/podium/datasets/arrow.py b/podium/datasets/arrow.py
index b69b20b5..0cceb874 100644
--- a/podium/datasets/arrow.py
+++ b/podium/datasets/arrow.py
@@ -299,7 +299,7 @@ def from_tabular_file(
format = format.lower()
csv_reader_params = {} if csv_reader_params is None else csv_reader_params
- with open(os.path.expanduser(path), encoding="utf8") as f:
+ with open(os.path.expanduser(path), encoding="utf-8") as f:
if format in {"csv", "tsv"}:
delimiter = "," if format == "csv" else "\t"
reader = csv.reader(f, delimiter=delimiter, **csv_reader_params)
@@ -542,7 +542,7 @@ def load_cache(cache_path) -> "DiskBackedDataset":
"""
# load fields
fields_file_path = os.path.join(cache_path, CACHE_FIELDS_FILENAME)
- with open(fields_file_path, "rb") as fields_cache_file:
+ with open(os.path.expanduser(fields_file_path), "rb") as fields_cache_file:
fields = pickle.load(fields_cache_file)
# load dataset as memory mapped arrow table
@@ -587,7 +587,7 @@ def dump_cache(self, cache_path: Optional[str] = None) -> str:
# pickle fields
cache_fields_path = os.path.join(cache_path, CACHE_FIELDS_FILENAME)
- with open(cache_fields_path, "wb") as fields_cache_file:
+ with open(os.path.expanduser(cache_fields_path), "wb") as fields_cache_file:
pickle.dump(self.fields, fields_cache_file)
# dump table
diff --git a/podium/datasets/dataset.py b/podium/datasets/dataset.py
index f26b3a6e..ab681adf 100644
--- a/podium/datasets/dataset.py
+++ b/podium/datasets/dataset.py
@@ -278,7 +278,7 @@ def shuffled(self) -> "DatasetBase":
return self[shuffled_indices]
def __repr__(self):
- fields_str = ",\n".join(textwrap.indent(repr(f), " " * 8) for f in self.fields)
+ fields_str = ",\n".join(textwrap.indent(repr(f), " " * 4) for f in self.fields)
fields_str = f"[\n{fields_str}\n \n]"
attrs = {"size": len(self), "fields": fields_str}
return repr_type_and_attrs(self, attrs, with_newlines=True, repr_values=False)
diff --git a/podium/datasets/impl/conllu_dataset.py b/podium/datasets/impl/conllu_dataset.py
index 6a377e28..5872e7c4 100644
--- a/podium/datasets/impl/conllu_dataset.py
+++ b/podium/datasets/impl/conllu_dataset.py
@@ -2,6 +2,7 @@
Module contains the CoNLL-U dataset.
"""
import collections
+import os
from podium.datasets import Dataset
from podium.datasets.example_factory import ExampleFactory
@@ -87,7 +88,7 @@ def safe_conllu_parse(in_file):
example_factory = ExampleFactory(fields)
examples = []
- with open(file_path, encoding="utf-8") as in_file:
+ with open(os.path.expanduser(file_path), encoding="utf-8") as in_file:
for tokenlist in safe_conllu_parse(in_file):
example_dict = collections.defaultdict(lambda: [])
for token in tokenlist:
diff --git a/podium/datasets/impl/imdb.py b/podium/datasets/impl/imdb.py
index 500cd563..17950c49 100644
--- a/podium/datasets/impl/imdb.py
+++ b/podium/datasets/impl/imdb.py
@@ -153,7 +153,9 @@ def _create_labeled_examples(dir_path, label, fields):
]
examples = []
for file_path in files_list:
- with open(file=os.path.join(dir_path, file_path), encoding="utf8") as fpr:
+ with open(
+ os.path.expanduser(os.path.join(dir_path, file_path)), encoding="utf-8"
+ ) as fpr:
data = {IMDB.TEXT_FIELD_NAME: fpr.read(), IMDB.LABEL_FIELD_NAME: label}
examples.append(example_factory.from_dict(data))
return examples
diff --git a/podium/datasets/impl/snli.py b/podium/datasets/impl/snli.py
index bebc522e..52503052 100644
--- a/podium/datasets/impl/snli.py
+++ b/podium/datasets/impl/snli.py
@@ -99,7 +99,7 @@ def _create_examples(file_path, fields):
example_factory = ExampleFactory(fields)
examples = []
- with open(file=file_path, encoding="utf8") as in_file:
+ with open(os.path.expanduser(file_path), encoding="utf-8") as in_file:
for line in in_file:
examples.append(example_factory.from_json(line))
return examples
diff --git a/podium/datasets/impl/sst.py b/podium/datasets/impl/sst.py
index e0a40f45..f6200f25 100644
--- a/podium/datasets/impl/sst.py
+++ b/podium/datasets/impl/sst.py
@@ -123,7 +123,7 @@ def label_trf(label):
return label_to_string_map[label]
examples = []
- with open(file=file_path, encoding="utf8") as fpr:
+ with open(os.path.expanduser(file_path), encoding="utf-8") as fpr:
for line in fpr:
example = example_factory.from_fields_tree(
diff --git a/podium/field.py b/podium/field.py
index cd946577..fc554412 100644
--- a/podium/field.py
+++ b/podium/field.py
@@ -954,7 +954,7 @@ def remove_pretokenize_hooks(self):
def __repr__(self):
fields_str = ",\n".join(
- textwrap.indent(repr(f), " " * 8) for f in self._output_fields
+ textwrap.indent(repr(f), " " * 4) for f in self._output_fields
)
fields_str = f"[\n{fields_str}\n \n]"
attrs = {"fields": fields_str}
diff --git a/podium/vectorizers/vectorizer.py b/podium/vectorizers/vectorizer.py
index 2f7b1e1a..982e1f20 100644
--- a/podium/vectorizers/vectorizer.py
+++ b/podium/vectorizers/vectorizer.py
@@ -298,7 +298,7 @@ def _cache_vectors(self):
"""
Method for caching loaded vectors to cache_dir.
"""
- with open(self._cache_path, "wb") as cache_file:
+ with open(os.path.expanduser(self._cache_path), "wb") as cache_file:
for word in self._vectors:
vector_values_string = " ".join(map(str, self._vectors[word]))
cache_file.write(f"{word} {vector_values_string}\n".encode("utf-8"))
@@ -362,7 +362,9 @@ def _load_vectors(self, vocab=None):
vocab = set(vocab)
open_mode, split_delimiter = ("rb", b" ") if self._binary else ("r", " ")
- with open(curr_path, open_mode, encoding=self._encoding) as vector_file:
+ with open(
+ os.path.expanduser(curr_path), open_mode, encoding=self._encoding
+ ) as vector_file:
vectors_loaded = 0
header_lines = 0
diff --git a/setup.py b/setup.py
index 9b643cad..ef4dc1f0 100644
--- a/setup.py
+++ b/setup.py
@@ -97,7 +97,6 @@ def _get_version():
'sphinx_rtd_theme',
'sphinx-copybutton',
'recommonmark',
- 'nbformat',
'datasets',
]