From 51eb9e7e85776566a501e4a63da68e2fc10b7822 Mon Sep 17 00:00:00 2001 From: mariosasko Date: Sat, 27 Mar 2021 13:48:13 +0100 Subject: [PATCH 01/15] Fix references in colab --- docs/source/scripts/convert_doc_to_notebooks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/scripts/convert_doc_to_notebooks.py b/docs/source/scripts/convert_doc_to_notebooks.py index 1feb0731..913c406c 100644 --- a/docs/source/scripts/convert_doc_to_notebooks.py +++ b/docs/source/scripts/convert_doc_to_notebooks.py @@ -273,7 +273,7 @@ def convert_math(text): def convert_anchor(text): """ Convert text to an anchor that can be used in the notebook.""" anchor_name = _re_anchor_section.search(text).groups()[0] - return f"" + return f"" ################################### From b1303b7b3f53d9d8c9c0c6b93d7c91adea66cf16 Mon Sep 17 00:00:00 2001 From: mariosasko Date: Wed, 31 Mar 2021 15:57:36 +0200 Subject: [PATCH 02/15] Add check_notebooks script --- .github/workflows/scheduled.yml | 28 ++ README.md | 2 +- docs/source/installation.md | 2 +- docs/source/notebooks/advanced.ipynb | 443 +++++++++++------- docs/source/scripts/check_notebooks.py | 106 +++++ .../scripts/convert_doc_to_notebooks.py | 2 +- 6 files changed, 409 insertions(+), 174 deletions(-) create mode 100644 .github/workflows/scheduled.yml create mode 100644 docs/source/scripts/check_notebooks.py diff --git a/.github/workflows/scheduled.yml b/.github/workflows/scheduled.yml new file mode 100644 index 00000000..9f394b8c --- /dev/null +++ b/.github/workflows/scheduled.yml @@ -0,0 +1,28 @@ +name: Scheduled jobs + +on: + repository_dispatch: + schedule: + - cron: "0 0 * * *" + +jobs: + check_notebooks: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: [3.6] + + steps: + - uses: actions/checkout@v2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip 
install --upgrade pip + pip install + - name: Execute notebooks + run: | + cd docs/source/scripts + python check_notebooks.py --num_proc auto diff --git a/README.md b/README.md index 84ea9eef..e7489d16 100644 --- a/README.md +++ b/README.md @@ -207,7 +207,7 @@ Example({ }) ``` -For a more interactive introduction, check out the quickstart on Google Colab: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/takelab/podium/blob/master/docs/source/notebooks/quickstart.ipynb) +For a more interactive introduction, check out the quickstart on Google Colab: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/TakeLab/podium/blob/master/docs/source/notebooks/quickstart.ipynb) More complex examples can be found in our [examples folder](./examples). diff --git a/docs/source/installation.md b/docs/source/installation.md index 7c67d2fb..bcc7ab55 100644 --- a/docs/source/installation.md +++ b/docs/source/installation.md @@ -23,5 +23,5 @@ Coming soon! ## Installing from source To install from source via terminal: -1. Clone the repository: `git clone git@github.com:takelab/podium.git && cd podium` +1. Clone the repository: `git clone git@github.com:TakeLab/podium.git && cd podium` 2. 
Install podium: `pip install .` diff --git a/docs/source/notebooks/advanced.ipynb b/docs/source/notebooks/advanced.ipynb index c9f7d874..73484cb5 100644 --- a/docs/source/notebooks/advanced.ipynb +++ b/docs/source/notebooks/advanced.ipynb @@ -2,14 +2,117 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Collecting git+https://github.com/TakeLab/podium.git\n", + " Cloning https://github.com/TakeLab/podium.git to c:\\users\\mario\\appdata\\local\\temp\\pip-req-build-ixj3hts4\n", + "Requirement already satisfied: dill in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from podium==1.1.1) (0.3.3)\n", + "Requirement already satisfied: nltk>=3.0 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from podium==1.1.1) (3.5)\n", + "Requirement already satisfied: paramiko in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from podium==1.1.1) (2.7.2)\n", + "Requirement already satisfied: requests in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from podium==1.1.1) (2.24.0)\n", + "Requirement already satisfied: scikit-learn in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from podium==1.1.1) (0.23.2)\n", + "Requirement already satisfied: tqdm in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from podium==1.1.1) (4.49.0)\n", + "Requirement already satisfied: numpy<=1.19 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from podium==1.1.1) (1.19.0)\n", + "Requirement already satisfied: pandas<1.2.0 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from podium==1.1.1) (1.1.2)\n", + "Requirement already satisfied: scipy<1.6.0 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from podium==1.1.1) (1.5.2)\n", + "Requirement already satisfied: 
dataclasses in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from podium==1.1.1) (0.7)\n", + "Requirement already satisfied: regex in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from nltk>=3.0->podium==1.1.1) (2020.11.13)\n", + "Requirement already satisfied: joblib in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from nltk>=3.0->podium==1.1.1) (0.16.0)\n", + "Requirement already satisfied: click in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from nltk>=3.0->podium==1.1.1) (7.1.2)\n", + "Requirement already satisfied: pytz>=2017.2 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from pandas<1.2.0->podium==1.1.1) (2020.1)\n", + "Requirement already satisfied: python-dateutil>=2.7.3 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from pandas<1.2.0->podium==1.1.1) (2.8.1)\n", + "Requirement already satisfied: six>=1.5 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from python-dateutil>=2.7.3->pandas<1.2.0->podium==1.1.1) (1.15.0)\n", + "Requirement already satisfied: pynacl>=1.0.1 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from paramiko->podium==1.1.1) (1.4.0)\n", + "Requirement already satisfied: bcrypt>=3.1.3 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from paramiko->podium==1.1.1) (3.2.0)\n", + "Requirement already satisfied: cryptography>=2.5 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from paramiko->podium==1.1.1) (3.1.1)\n", + "Requirement already satisfied: cffi>=1.1 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from bcrypt>=3.1.3->paramiko->podium==1.1.1) (1.14.3)\n", + "Requirement already satisfied: pycparser in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from cffi>=1.1->bcrypt>=3.1.3->paramiko->podium==1.1.1) (2.20)\n", + "Requirement already satisfied: 
urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from requests->podium==1.1.1) (1.25.10)\n", + "Requirement already satisfied: idna<3,>=2.5 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from requests->podium==1.1.1) (2.10)\n", + "Requirement already satisfied: certifi>=2017.4.17 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from requests->podium==1.1.1) (2020.6.20)\n", + "Requirement already satisfied: chardet<4,>=3.0.2 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from requests->podium==1.1.1) (3.0.4)\n", + "Requirement already satisfied: threadpoolctl>=2.0.0 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from scikit-learn->podium==1.1.1) (2.1.0)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " Running command git clone -q https://github.com/TakeLab/podium.git 'C:\\Users\\Mario\\AppData\\Local\\Temp\\pip-req-build-ixj3hts4'\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: transformers in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (4.4.0.dev0)\n", + "Requirement already satisfied: spacy in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (2.2.3)\n", + "Requirement already satisfied: requests<3.0.0,>=2.13.0 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy) (2.24.0)\n", + "Requirement already satisfied: plac<1.2.0,>=0.9.6 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy) (0.9.6)\n", + "Requirement already satisfied: wasabi<1.1.0,>=0.4.0 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy) (0.8.2)\n", + "Requirement already satisfied: numpy>=1.15.0 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy) (1.19.0)\n", + "Requirement already satisfied: 
blis<0.5.0,>=0.4.0 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy) (0.4.1)\n", + "Requirement already satisfied: cymem<2.1.0,>=2.0.2 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy) (2.0.5)\n", + "Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy) (0.28.0)\n", + "Requirement already satisfied: thinc<7.4.0,>=7.3.0 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy) (7.3.1)\n", + "Requirement already satisfied: preshed<3.1.0,>=3.0.2 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy) (3.0.5)\n", + "Requirement already satisfied: catalogue<1.1.0,>=0.0.7 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy) (1.0.0)\n", + "Requirement already satisfied: setuptools in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy) (52.0.0.post20210125)\n", + "Requirement already satisfied: srsly<1.1.0,>=0.1.0 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy) (1.0.5)\n", + "Requirement already satisfied: importlib-metadata>=0.20 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from catalogue<1.1.0,>=0.0.7->spacy) (2.0.0)\n", + "Requirement already satisfied: zipp>=0.5 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from importlib-metadata>=0.20->catalogue<1.1.0,>=0.0.7->spacy) (3.2.0)\n", + "Requirement already satisfied: idna<3,>=2.5 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from requests<3.0.0,>=2.13.0->spacy) (2.10)\n", + "Requirement already satisfied: certifi>=2017.4.17 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from requests<3.0.0,>=2.13.0->spacy) (2020.6.20)\n", + "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in 
c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from requests<3.0.0,>=2.13.0->spacy) (1.25.10)\n", + "Requirement already satisfied: chardet<4,>=3.0.2 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from requests<3.0.0,>=2.13.0->spacy) (3.0.4)\n", + "Requirement already satisfied: tqdm<5.0.0,>=4.10.0 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from thinc<7.4.0,>=7.3.0->spacy) (4.49.0)\n", + "Requirement already satisfied: packaging in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from transformers) (20.4)\n", + "Requirement already satisfied: filelock in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from transformers) (3.0.12)\n", + "Requirement already satisfied: regex!=2019.12.17 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from transformers) (2020.11.13)\n", + "Requirement already satisfied: sacremoses in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from transformers) (0.0.43)\n", + "Requirement already satisfied: dataclasses in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from transformers) (0.7)\n", + "Requirement already satisfied: tokenizers<0.11,>=0.10.1 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from transformers) (0.10.1)\n", + "Requirement already satisfied: six in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from packaging->transformers) (1.15.0)\n", + "Requirement already satisfied: pyparsing>=2.0.2 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from packaging->transformers) (2.4.7)\n", + "Requirement already satisfied: click in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from sacremoses->transformers) (7.1.2)\n", + "Requirement already satisfied: joblib in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from sacremoses->transformers) (0.16.0)\n", + 
"Collecting en_core_web_sm==2.2.5\n", + " Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz (12.0 MB)\n", + "Requirement already satisfied: spacy>=2.2.2 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from en_core_web_sm==2.2.5) (2.2.3)\n", + "Requirement already satisfied: cymem<2.1.0,>=2.0.2 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (2.0.5)\n", + "Requirement already satisfied: setuptools in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (52.0.0.post20210125)\n", + "Requirement already satisfied: srsly<1.1.0,>=0.1.0 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (1.0.5)\n", + "Requirement already satisfied: catalogue<1.1.0,>=0.0.7 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (1.0.0)\n", + "Requirement already satisfied: preshed<3.1.0,>=3.0.2 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (3.0.5)\n", + "Requirement already satisfied: thinc<7.4.0,>=7.3.0 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (7.3.1)\n", + "Requirement already satisfied: blis<0.5.0,>=0.4.0 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (0.4.1)\n", + "Requirement already satisfied: wasabi<1.1.0,>=0.4.0 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (0.8.2)\n", + "Requirement already satisfied: numpy>=1.15.0 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (1.19.0)\n", + "Requirement already satisfied: 
murmurhash<1.1.0,>=0.28.0 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (0.28.0)\n", + "Requirement already satisfied: plac<1.2.0,>=0.9.6 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (0.9.6)\n", + "Requirement already satisfied: requests<3.0.0,>=2.13.0 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (2.24.0)\n", + "Requirement already satisfied: importlib-metadata>=0.20 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from catalogue<1.1.0,>=0.0.7->spacy>=2.2.2->en_core_web_sm==2.2.5) (2.0.0)\n", + "Requirement already satisfied: zipp>=0.5 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from importlib-metadata>=0.20->catalogue<1.1.0,>=0.0.7->spacy>=2.2.2->en_core_web_sm==2.2.5) (3.2.0)\n", + "Requirement already satisfied: idna<3,>=2.5 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from requests<3.0.0,>=2.13.0->spacy>=2.2.2->en_core_web_sm==2.2.5) (2.10)\n", + "Requirement already satisfied: chardet<4,>=3.0.2 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from requests<3.0.0,>=2.13.0->spacy>=2.2.2->en_core_web_sm==2.2.5) (3.0.4)\n", + "Requirement already satisfied: certifi>=2017.4.17 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from requests<3.0.0,>=2.13.0->spacy>=2.2.2->en_core_web_sm==2.2.5) (2020.6.20)\n", + "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from requests<3.0.0,>=2.13.0->spacy>=2.2.2->en_core_web_sm==2.2.5) (1.25.10)\n", + "Requirement already satisfied: tqdm<5.0.0,>=4.10.0 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from thinc<7.4.0,>=7.3.0->spacy>=2.2.2->en_core_web_sm==2.2.5) (4.49.0)\n", + "[+] Download and 
installation successful\n", + "You can now load the model via spacy.load('en_core_web_sm')\n" + ] + } + ], "source": [ "# Podium installation\n", - "! pip install podium-nlp\n", + "# ! pip install podium-nlp\n", "# To install from source instead of the last release, comment the command above and uncomment the following one.\n", - "# ! pip install git+https://github.com/takelab/podium\n", + "! pip install git+https://github.com/TakeLab/podium.git\n", "\n", "# Additional dependencies required to run this notebook:\n", "! pip install transformers spacy\n", @@ -34,7 +137,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -67,19 +170,16 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "None ['A', 'slick', ',', 'engrossing', 'melodrama', '.']\n", - "None positive" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "None ['A', 'slick', ',', 'engrossing', 'melodrama', '.']\n", + "None positive\n" + ] } ], "source": [ @@ -98,18 +198,15 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "('A slick , engrossing melodrama .', ['A', 'slick', ',', 'engrossing', 'melodrama', '.'])" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "('A slick , engrossing melodrama .', ['A', 'slick', ',', 'engrossing', 'melodrama', '.'])\n" + ] } ], "source": [ @@ -169,7 +266,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -187,7 +284,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -206,7 +303,7 @@ }, 
{ "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -242,7 +339,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -274,7 +371,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -297,18 +394,15 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "('a slick , engrossing melodrama .', ['a', 'slick', 'engrossing', 'melodrama'])" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "('a slick , engrossing melodrama .', ['a', 'slick', 'engrossing', 'melodrama'])\n" + ] } ], "source": [ @@ -367,18 +461,15 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] } ], "source": [ @@ -403,19 +494,16 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "\n", - "True" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "True\n" + ] } ], "source": [ @@ -435,18 +523,15 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "(None, ['', 'A', 'slick', ',', 'engrossing', 'melodrama', '.'])" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "(None, ['', 'A', 
'slick', ',', 'engrossing', 'melodrama', '.'])\n" + ] } ], "source": [ @@ -498,18 +583,15 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "(None, ['a', 'slick', ',', 'eng', '##ross', '##ing', 'mel', '##od', '##rama', '.'])" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "(None, ['a', 'slick', ',', 'eng', '##ross', '##ing', 'mel', '##od', '##rama', '.'])\n" + ] } ], "source": [ @@ -545,19 +627,16 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "(None, ['A', 'slick', ',', 'engrossing', 'melodrama', '.'])\n", - "(None, ['A', ' ', 's', 'l', 'i', 'c', 'k', ' ', ',', ' ', 'e', 'n', 'g', 'r', 'o', 's', 's', 'i', 'n', 'g', ' ', 'm', 'e', 'l', 'o', 'd', 'r', 'a', 'm', 'a', ' ', '.'])" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "(None, ['A', 'slick', ',', 'engrossing', 'melodrama', '.'])\n", + "(None, ['A', ' ', 's', 'l', 'i', 'c', 'k', ' ', ',', ' ', 'e', 'n', 'g', 'r', 'o', 's', 's', 'i', 'n', 'g', ' ', 'm', 'e', 'l', 'o', 'd', 'r', 'a', 'm', 'a', ' ', '.'])\n" + ] } ], "source": [ @@ -597,19 +676,16 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "(None, ['A', 'slick', ',', 'engrossing', 'melodrama', '.'])\n", - "(None, ['DET', 'ADJ', 'PUNCT', 'VERB', 'NOUN', 'PUNCT'])" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "(None, ['A', 'slick', ',', 'engrossing', 'melodrama', '.'])\n", + "(None, ['DET', 'ADJ', 'PUNCT', 'VERB', 'NOUN', 'PUNCT'])\n" + ] } ], "source": [ @@ -673,18 +749,15 @@ 
}, { "cell_type": "code", - "execution_count": null, + "execution_count": 18, "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "0.5 0.3 0.2" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "0.5 0.3 0.2\n" + ] } ], "source": [ @@ -704,20 +777,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "{'negative': 0.47803468208092487, 'positive': 0.5219653179190752}\n", - "{'negative': 0.48458574181117536, 'positive': 0.5154142581888247}\n", - "{'negative': 0.46965317919075145, 'positive': 0.5303468208092486}" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "{'negative': 0.47803468208092487, 'positive': 0.5219653179190752}\n", + "{'negative': 0.48458574181117536, 'positive': 0.5154142581888247}\n", + "{'negative': 0.46965317919075145, 'positive': 0.5303468208092486}\n" + ] } ], "source": [ @@ -742,18 +812,15 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 20, "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "0.5 0.3 0.2" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "0.5 0.3 0.2\n" + ] } ], "source": [ @@ -771,20 +838,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 21, "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "{'negative': 0.47832369942196534, 'positive': 0.5216763005780347}\n", - "{'negative': 0.47832369942196534, 'positive': 0.5216763005780347}\n", - "{'negative': 0.47832369942196534, 'positive': 0.5216763005780347}" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + 
"{'negative': 0.47832369942196534, 'positive': 0.5216763005780347}\n", + "{'negative': 0.47832369942196534, 'positive': 0.5216763005780347}\n", + "{'negative': 0.47832369942196534, 'positive': 0.5216763005780347}\n" + ] } ], "source": [ @@ -822,20 +886,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 22, "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "['positive', 'negative']\n", - "['positive', 'negative']\n", - "31920 = 25000 + 6920" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|█████████████████████████████████████████████████████████████████████████████| 84.1M/84.1M [00:27<00:00, 3.04MB/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['positive', 'negative']\n", + "['positive', 'negative']\n", + "31920 = 25000 + 6920\n" + ] } ], "source": [ @@ -888,7 +956,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 23, "metadata": {}, "outputs": [], "source": [ @@ -919,19 +987,16 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 24, "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "For Iterator, padding = 148141 out of 281696 = 52.588961149608096%\n", - "For BucketIterator, padding = 2125 out of 135680 = 1.5661851415094339%" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "For Iterator, padding = 145749 out of 279304 = 52.18%\n", + "For BucketIterator, padding = 2125 out of 135680 = 1.57%\n" + ] } ], "source": [ @@ -1011,7 +1076,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 25, "metadata": {}, "outputs": [ { @@ -1070,10 +1135,11 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 26, "metadata": {}, "outputs": [ { +<<<<<<< HEAD "data": { "text/plain": [ 
"Example({\n", @@ -1085,6 +1151,16 @@ "execution_count": null, "metadata": {}, "output_type": "execute_result" +======= + "name": "stdout", + "output_type": "stream", + "text": [ + "Example({\n", + " text: (None, ['A', 'slick', ',', 'engrossing', 'melodrama', '.']),\n", + " label: (None, 'positive')\n", + "})\n" + ] +>>>>>>> Add check_notebooks script } ], "source": [ @@ -1116,10 +1192,11 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 27, "metadata": {}, "outputs": [ { +<<<<<<< HEAD "data": { "text/plain": [ "[[ 14 1144 9 2955 8 27 4 2956 3752 10 149 62 0 64\n", @@ -1130,6 +1207,15 @@ "execution_count": null, "metadata": {}, "output_type": "execute_result" +======= + "name": "stdout", + "output_type": "stream", + "text": [ + "[[ 14 1057 10 2580 8 28 4 3334 3335 9 154 68 0 67\n", + " 5 11 81 9 274 8 83 6 4683 74 2901 38 1410 2581\n", + " 3 0 2102 0 49 870 0 2]]\n" + ] +>>>>>>> Add check_notebooks script } ], "source": [ @@ -1156,19 +1242,16 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 28, "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "True\n", - "True" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "True\n", + "True\n" + ] } ], "source": [ @@ -1188,7 +1271,25 @@ ] } ], - "metadata": {}, + "metadata": { + "kernelspec": { + "display_name": "test-podium", + "language": "python", + "name": "test-podium" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.12" + } + }, "nbformat": 4, "nbformat_minor": 4 } diff --git a/docs/source/scripts/check_notebooks.py b/docs/source/scripts/check_notebooks.py new file mode 100644 index 00000000..55833d23 --- /dev/null +++ 
b/docs/source/scripts/check_notebooks.py @@ -0,0 +1,106 @@ +import argparse +import multiprocessing +import textwrap +from functools import partial +from pathlib import Path + +import nbformat +from nbconvert.preprocessors import CellExecutionError, ExecutePreprocessor + + +NOTEBOOKS_PATH = "../notebooks" +INSTALL_RELEASE_VERSION_COMMAND = "! pip install podium-nlp\n" +INSTALL_SOURCE_VERSION_COMMAND = "# ! pip install git+https://github.com/TakeLab/podium.git\n" + + +def replace_install_release_with_source(nb): + cell = nb.cells[0] + # sanity check + assert cell["cell_type"] == "code" + assert isinstance(cell.source, list) + + irv_idx = cell["source"].index(INSTALL_RELEASE_VERSION_COMMAND) + cell["source"][irv_idx] = "# " + cell["source"][irv_idx] + + isv_idx = cell["source"].index(INSTALL_SOURCE_VERSION_COMMAND) + cell["source"][isv_idx] = cell["source"][isv_idx][cell["source"][isv_idx].index("!"):] + + +def check_notebook_output(notebook_path, env="python3"): + with open(notebook_path, encoding="utf-8") as f: + nb = nbformat.read(f, as_version=4) + + original_nb = nb.copy() + ep = ExecutePreprocessor(kernel_name=env) + new_nb = nb + replace_install_release_with_source(new_nb) + try: + ep.preprocess(new_nb, {"metadata": {"path": str(Path(notebook_path).parent)}}) + except CellExecutionError: + print(f"Error happened while executing the notebook {notebook_path.name}") + raise + + report = [] + assert len(original_nb["cells"]) == len(new_nb["cells"]) + for i, (original_cell, new_cell) in enumerate(zip(original_nb["cells"], new_nb["cell"])): + # consider only cells with code + if original_cell["cell_type"] != "code": + continue + + # sanity check + assert isinstance(original_cell["source"], list) + # skip cells with commands + for line in original_cell["source"]: + if line.strip().startswith(("!", "%")): + continue + + # sanity check + assert isinstance(original_cell["outputs"]["data"]["text/plain"], list) + original_cell_stdout = 
"".join(original_cell["outputs"]["data"]["text/plain"]) + + new_cell_stdout = "".join([ + new_cell_output["text"] + for new_cell_output in new_cell["outputs"] if new_cell_output["name"] == "stdout" + ]) + + if original_cell_stdout != new_cell_stdout: + report.append(i, original_cell_stdout, new_cell_stdout) + + return notebook_path.name, report + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--env", help="kernel that executes the notebook") + parser.add_argument("--num_proc", help="number of processes for parallel execution") + args = parser.parse_args() + + if args.num_proc is None: + num_proc = 1 + elif args.num_proc == "auto": + num_proc = multiprocessing.cpu_count() + else: + num_proc = int(args.num_proc) + + notebook_paths = [notebook_path for notebook_path in Path(NOTEBOOKS_PATH).rglob("*.ipynb")] + num_proc = min(min(num_proc, multiprocessing.cpu_count()), len(notebook_paths)) + if num_proc == 1: + reports = [] + for notebook_path in notebook_paths: + report = check_notebook_output(notebook_path, env=args.env) + reports.append(report) + else: + with multiprocessing.Pool(num_proc) as pool: + reports = pool.map(partial(check_notebook_output, env=args.env), notebook_paths) + + if any(report for _, report in reports): + reports_str = "\n\n".join([ + f"In notebook {notebook}:\n" + textwrap.indent( + "\n".join( + f"Original output:\n{original_output}\nAfter execution:\n{new_output}" + for original_output, new_output in report), " " * 4) + for notebook, report in reports + ]) + raise Exception( + "❌❌ Found mismatches in the outputs of the notebooks:\n\n" + reports_str + ) diff --git a/docs/source/scripts/convert_doc_to_notebooks.py b/docs/source/scripts/convert_doc_to_notebooks.py index 913c406c..dd870b94 100644 --- a/docs/source/scripts/convert_doc_to_notebooks.py +++ b/docs/source/scripts/convert_doc_to_notebooks.py @@ -428,7 +428,7 @@ def rm_first_line(text): INSTALL_CODE = """# Podium installation ! 
pip install podium-nlp # To install from source instead of the last release, comment the command above and uncomment the following one. -# ! pip install git+https://github.com/takelab/podium +# ! pip install git+https://github.com/TakeLab/podium.git """ ADDITIONAL_DEPS = { From 977af00dce88699a4df1f3e075b0bb8eae10ce5c Mon Sep 17 00:00:00 2001 From: mariosasko Date: Wed, 31 Mar 2021 19:49:45 +0200 Subject: [PATCH 03/15] Fixes --- docs/source/notebooks/advanced.ipynb | 210 +++++----------------- docs/source/notebooks/preprocessing.ipynb | 2 +- docs/source/notebooks/walkthrough.ipynb | 4 +- docs/source/scripts/check_notebooks.py | 26 +-- 4 files changed, 61 insertions(+), 181 deletions(-) diff --git a/docs/source/notebooks/advanced.ipynb b/docs/source/notebooks/advanced.ipynb index 73484cb5..9a0b0710 100644 --- a/docs/source/notebooks/advanced.ipynb +++ b/docs/source/notebooks/advanced.ipynb @@ -2,117 +2,14 @@ "cells": [ { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Collecting git+https://github.com/TakeLab/podium.git\n", - " Cloning https://github.com/TakeLab/podium.git to c:\\users\\mario\\appdata\\local\\temp\\pip-req-build-ixj3hts4\n", - "Requirement already satisfied: dill in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from podium==1.1.1) (0.3.3)\n", - "Requirement already satisfied: nltk>=3.0 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from podium==1.1.1) (3.5)\n", - "Requirement already satisfied: paramiko in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from podium==1.1.1) (2.7.2)\n", - "Requirement already satisfied: requests in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from podium==1.1.1) (2.24.0)\n", - "Requirement already satisfied: scikit-learn in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from podium==1.1.1) 
(0.23.2)\n", - "Requirement already satisfied: tqdm in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from podium==1.1.1) (4.49.0)\n", - "Requirement already satisfied: numpy<=1.19 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from podium==1.1.1) (1.19.0)\n", - "Requirement already satisfied: pandas<1.2.0 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from podium==1.1.1) (1.1.2)\n", - "Requirement already satisfied: scipy<1.6.0 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from podium==1.1.1) (1.5.2)\n", - "Requirement already satisfied: dataclasses in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from podium==1.1.1) (0.7)\n", - "Requirement already satisfied: regex in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from nltk>=3.0->podium==1.1.1) (2020.11.13)\n", - "Requirement already satisfied: joblib in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from nltk>=3.0->podium==1.1.1) (0.16.0)\n", - "Requirement already satisfied: click in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from nltk>=3.0->podium==1.1.1) (7.1.2)\n", - "Requirement already satisfied: pytz>=2017.2 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from pandas<1.2.0->podium==1.1.1) (2020.1)\n", - "Requirement already satisfied: python-dateutil>=2.7.3 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from pandas<1.2.0->podium==1.1.1) (2.8.1)\n", - "Requirement already satisfied: six>=1.5 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from python-dateutil>=2.7.3->pandas<1.2.0->podium==1.1.1) (1.15.0)\n", - "Requirement already satisfied: pynacl>=1.0.1 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from paramiko->podium==1.1.1) (1.4.0)\n", - "Requirement already satisfied: bcrypt>=3.1.3 in 
c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from paramiko->podium==1.1.1) (3.2.0)\n", - "Requirement already satisfied: cryptography>=2.5 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from paramiko->podium==1.1.1) (3.1.1)\n", - "Requirement already satisfied: cffi>=1.1 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from bcrypt>=3.1.3->paramiko->podium==1.1.1) (1.14.3)\n", - "Requirement already satisfied: pycparser in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from cffi>=1.1->bcrypt>=3.1.3->paramiko->podium==1.1.1) (2.20)\n", - "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from requests->podium==1.1.1) (1.25.10)\n", - "Requirement already satisfied: idna<3,>=2.5 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from requests->podium==1.1.1) (2.10)\n", - "Requirement already satisfied: certifi>=2017.4.17 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from requests->podium==1.1.1) (2020.6.20)\n", - "Requirement already satisfied: chardet<4,>=3.0.2 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from requests->podium==1.1.1) (3.0.4)\n", - "Requirement already satisfied: threadpoolctl>=2.0.0 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from scikit-learn->podium==1.1.1) (2.1.0)\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - " Running command git clone -q https://github.com/TakeLab/podium.git 'C:\\Users\\Mario\\AppData\\Local\\Temp\\pip-req-build-ixj3hts4'\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Requirement already satisfied: transformers in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (4.4.0.dev0)\n", - "Requirement already satisfied: spacy in 
c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (2.2.3)\n", - "Requirement already satisfied: requests<3.0.0,>=2.13.0 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy) (2.24.0)\n", - "Requirement already satisfied: plac<1.2.0,>=0.9.6 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy) (0.9.6)\n", - "Requirement already satisfied: wasabi<1.1.0,>=0.4.0 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy) (0.8.2)\n", - "Requirement already satisfied: numpy>=1.15.0 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy) (1.19.0)\n", - "Requirement already satisfied: blis<0.5.0,>=0.4.0 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy) (0.4.1)\n", - "Requirement already satisfied: cymem<2.1.0,>=2.0.2 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy) (2.0.5)\n", - "Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy) (0.28.0)\n", - "Requirement already satisfied: thinc<7.4.0,>=7.3.0 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy) (7.3.1)\n", - "Requirement already satisfied: preshed<3.1.0,>=3.0.2 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy) (3.0.5)\n", - "Requirement already satisfied: catalogue<1.1.0,>=0.0.7 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy) (1.0.0)\n", - "Requirement already satisfied: setuptools in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy) (52.0.0.post20210125)\n", - "Requirement already satisfied: srsly<1.1.0,>=0.1.0 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy) (1.0.5)\n", - "Requirement already satisfied: importlib-metadata>=0.20 in 
c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from catalogue<1.1.0,>=0.0.7->spacy) (2.0.0)\n", - "Requirement already satisfied: zipp>=0.5 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from importlib-metadata>=0.20->catalogue<1.1.0,>=0.0.7->spacy) (3.2.0)\n", - "Requirement already satisfied: idna<3,>=2.5 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from requests<3.0.0,>=2.13.0->spacy) (2.10)\n", - "Requirement already satisfied: certifi>=2017.4.17 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from requests<3.0.0,>=2.13.0->spacy) (2020.6.20)\n", - "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from requests<3.0.0,>=2.13.0->spacy) (1.25.10)\n", - "Requirement already satisfied: chardet<4,>=3.0.2 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from requests<3.0.0,>=2.13.0->spacy) (3.0.4)\n", - "Requirement already satisfied: tqdm<5.0.0,>=4.10.0 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from thinc<7.4.0,>=7.3.0->spacy) (4.49.0)\n", - "Requirement already satisfied: packaging in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from transformers) (20.4)\n", - "Requirement already satisfied: filelock in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from transformers) (3.0.12)\n", - "Requirement already satisfied: regex!=2019.12.17 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from transformers) (2020.11.13)\n", - "Requirement already satisfied: sacremoses in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from transformers) (0.0.43)\n", - "Requirement already satisfied: dataclasses in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from transformers) (0.7)\n", - "Requirement already satisfied: tokenizers<0.11,>=0.10.1 in 
c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from transformers) (0.10.1)\n", - "Requirement already satisfied: six in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from packaging->transformers) (1.15.0)\n", - "Requirement already satisfied: pyparsing>=2.0.2 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from packaging->transformers) (2.4.7)\n", - "Requirement already satisfied: click in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from sacremoses->transformers) (7.1.2)\n", - "Requirement already satisfied: joblib in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from sacremoses->transformers) (0.16.0)\n", - "Collecting en_core_web_sm==2.2.5\n", - " Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz (12.0 MB)\n", - "Requirement already satisfied: spacy>=2.2.2 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from en_core_web_sm==2.2.5) (2.2.3)\n", - "Requirement already satisfied: cymem<2.1.0,>=2.0.2 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (2.0.5)\n", - "Requirement already satisfied: setuptools in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (52.0.0.post20210125)\n", - "Requirement already satisfied: srsly<1.1.0,>=0.1.0 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (1.0.5)\n", - "Requirement already satisfied: catalogue<1.1.0,>=0.0.7 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (1.0.0)\n", - "Requirement already satisfied: preshed<3.1.0,>=3.0.2 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (3.0.5)\n", - "Requirement already 
satisfied: thinc<7.4.0,>=7.3.0 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (7.3.1)\n", - "Requirement already satisfied: blis<0.5.0,>=0.4.0 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (0.4.1)\n", - "Requirement already satisfied: wasabi<1.1.0,>=0.4.0 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (0.8.2)\n", - "Requirement already satisfied: numpy>=1.15.0 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (1.19.0)\n", - "Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (0.28.0)\n", - "Requirement already satisfied: plac<1.2.0,>=0.9.6 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (0.9.6)\n", - "Requirement already satisfied: requests<3.0.0,>=2.13.0 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (2.24.0)\n", - "Requirement already satisfied: importlib-metadata>=0.20 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from catalogue<1.1.0,>=0.0.7->spacy>=2.2.2->en_core_web_sm==2.2.5) (2.0.0)\n", - "Requirement already satisfied: zipp>=0.5 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from importlib-metadata>=0.20->catalogue<1.1.0,>=0.0.7->spacy>=2.2.2->en_core_web_sm==2.2.5) (3.2.0)\n", - "Requirement already satisfied: idna<3,>=2.5 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from requests<3.0.0,>=2.13.0->spacy>=2.2.2->en_core_web_sm==2.2.5) (2.10)\n", - "Requirement already satisfied: chardet<4,>=3.0.2 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from 
requests<3.0.0,>=2.13.0->spacy>=2.2.2->en_core_web_sm==2.2.5) (3.0.4)\n", - "Requirement already satisfied: certifi>=2017.4.17 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from requests<3.0.0,>=2.13.0->spacy>=2.2.2->en_core_web_sm==2.2.5) (2020.6.20)\n", - "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from requests<3.0.0,>=2.13.0->spacy>=2.2.2->en_core_web_sm==2.2.5) (1.25.10)\n", - "Requirement already satisfied: tqdm<5.0.0,>=4.10.0 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from thinc<7.4.0,>=7.3.0->spacy>=2.2.2->en_core_web_sm==2.2.5) (4.49.0)\n", - "[+] Download and installation successful\n", - "You can now load the model via spacy.load('en_core_web_sm')\n" - ] - } - ], + "outputs": [], "source": [ "# Podium installation\n", - "# ! pip install podium-nlp\n", + "! pip install podium-nlp\n", "# To install from source instead of the last release, comment the command above and uncomment the following one.\n", - "! pip install git+https://github.com/TakeLab/podium.git\n", + "# ! pip install git+https://github.com/TakeLab/podium.git\n", "\n", "# Additional dependencies required to run this notebook:\n", "! 
pip install transformers spacy\n", @@ -137,21 +34,18 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 1, "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "Example({\n", - " text: (None, ['A', 'slick', ',', 'engrossing', 'melodrama', '.']),\n", - " label: (None, 'positive')\n", - "})" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "Example({\n", + " text: (None, ['A', 'slick', ',', 'engrossing', 'melodrama', '.']),\n", + " label: (None, 'positive')\n", + "})\n" + ] } ], "source": [ @@ -170,7 +64,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -198,7 +92,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -266,7 +160,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -284,7 +178,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -303,7 +197,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -339,7 +233,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -371,7 +265,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -394,7 +288,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -431,7 +325,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "" + "" ] }, { @@ -461,7 +355,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -494,7 +388,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 11, 
"metadata": {}, "outputs": [ { @@ -523,7 +417,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -583,13 +477,28 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 13, "metadata": {}, "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "66267ed5720f410faf47766dea3dbad6", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, { "name": "stdout", "output_type": "stream", "text": [ + "\n", "(None, ['a', 'slick', ',', 'eng', '##ross', '##ing', 'mel', '##od', '##rama', '.'])\n" ] } @@ -889,13 +798,6 @@ "execution_count": 22, "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|█████████████████████████████████████████████████████████████████████████████| 84.1M/84.1M [00:27<00:00, 3.04MB/s]\n" - ] - }, { "name": "stdout", "output_type": "stream", @@ -1139,19 +1041,6 @@ "metadata": {}, "outputs": [ { -<<<<<<< HEAD - "data": { - "text/plain": [ - "Example({\n", - " text: (None, ['A', 'slick', ',', 'engrossing', 'melodrama', '.']),\n", - " label: (None, 'positive')\n", - "})" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" -======= "name": "stdout", "output_type": "stream", "text": [ @@ -1160,7 +1049,6 @@ " label: (None, 'positive')\n", "})\n" ] ->>>>>>> Add check_notebooks script } ], "source": [ @@ -1196,7 +1084,6 @@ "metadata": {}, "outputs": [ { -<<<<<<< HEAD "data": { "text/plain": [ "[[ 14 1144 9 2955 8 27 4 2956 3752 10 149 62 0 64\n", @@ -1207,15 +1094,6 @@ "execution_count": null, "metadata": {}, "output_type": "execute_result" -======= - "name": "stdout", - "output_type": "stream", - "text": [ - "[[ 14 1057 10 2580 8 28 4 3334 3335 9 154 68 0 67\n", - " 5 11 81 9 274 
8 83 6 4683 74 2901 38 1410 2581\n", - " 3 0 2102 0 49 870 0 2]]\n" - ] ->>>>>>> Add check_notebooks script } ], "source": [ @@ -1273,9 +1151,9 @@ ], "metadata": { "kernelspec": { - "display_name": "test-podium", + "display_name": "Python 3", "language": "python", - "name": "test-podium" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -1287,7 +1165,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.12" + "version": "3.7.9" } }, "nbformat": 4, diff --git a/docs/source/notebooks/preprocessing.ipynb b/docs/source/notebooks/preprocessing.ipynb index 53e59961..1ade9a99 100644 --- a/docs/source/notebooks/preprocessing.ipynb +++ b/docs/source/notebooks/preprocessing.ipynb @@ -9,7 +9,7 @@ "# Podium installation\n", "! pip install podium-nlp\n", "# To install from source instead of the last release, comment the command above and uncomment the following one.\n", - "# ! pip install git+https://github.com/takelab/podium\n", + "# ! pip install git+https://github.com/TakeLab/podium.git\n", "\n", "# Additional dependencies required to run this notebook:\n", "! pip install sacremoses clean-text spacy truecase https://github.com/LIAAD/yake/archive/v0.4.2.tar.gz\n", diff --git a/docs/source/notebooks/walkthrough.ipynb b/docs/source/notebooks/walkthrough.ipynb index f78d4d45..d6944d23 100644 --- a/docs/source/notebooks/walkthrough.ipynb +++ b/docs/source/notebooks/walkthrough.ipynb @@ -9,7 +9,7 @@ "# Podium installation\n", "! pip install podium-nlp\n", "# To install from source instead of the last release, comment the command above and uncomment the following one.\n", - "# ! pip install git+https://github.com/takelab/podium\n", + "# ! pip install git+https://github.com/TakeLab/podium.git\n", "\n", "# Additional dependencies required to run this notebook:\n", "! 
pip install datasets spacy\n", @@ -49,7 +49,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "" + "" ] }, { diff --git a/docs/source/scripts/check_notebooks.py b/docs/source/scripts/check_notebooks.py index 55833d23..941037ba 100644 --- a/docs/source/scripts/check_notebooks.py +++ b/docs/source/scripts/check_notebooks.py @@ -1,4 +1,5 @@ import argparse +import copy import multiprocessing import textwrap from functools import partial @@ -14,25 +15,25 @@ def replace_install_release_with_source(nb): - cell = nb.cells[0] + cell = nb["cells"][0] # sanity check assert cell["cell_type"] == "code" - assert isinstance(cell.source, list) + assert isinstance(cell["source"], str) - irv_idx = cell["source"].index(INSTALL_RELEASE_VERSION_COMMAND) - cell["source"][irv_idx] = "# " + cell["source"][irv_idx] + assert INSTALL_RELEASE_VERSION_COMMAND in cell["source"] + cell["source"] = cell["source"].replace(INSTALL_RELEASE_VERSION_COMMAND, "# " + INSTALL_RELEASE_VERSION_COMMAND) - isv_idx = cell["source"].index(INSTALL_SOURCE_VERSION_COMMAND) - cell["source"][isv_idx] = cell["source"][isv_idx][cell["source"][isv_idx].index("!"):] + assert INSTALL_SOURCE_VERSION_COMMAND in cell["source"] + cell["source"] = cell["source"].replace(INSTALL_SOURCE_VERSION_COMMAND, INSTALL_SOURCE_VERSION_COMMAND[2:]) def check_notebook_output(notebook_path, env="python3"): with open(notebook_path, encoding="utf-8") as f: nb = nbformat.read(f, as_version=4) - original_nb = nb.copy() + original_nb = nb ep = ExecutePreprocessor(kernel_name=env) - new_nb = nb + new_nb = copy.deepcopy(nb) replace_install_release_with_source(new_nb) try: ep.preprocess(new_nb, {"metadata": {"path": str(Path(notebook_path).parent)}}) @@ -40,6 +41,7 @@ def check_notebook_output(notebook_path, env="python3"): print(f"Error happened while executing the notebook {notebook_path.name}") raise + print(new_nb) report = [] assert len(original_nb["cells"]) == len(new_nb["cells"]) for i, (original_cell, new_cell) in 
enumerate(zip(original_nb["cells"], new_nb["cell"])): @@ -48,9 +50,9 @@ def check_notebook_output(notebook_path, env="python3"): continue # sanity check - assert isinstance(original_cell["source"], list) + assert isinstance(original_cell["source"], str) # skip cells with commands - for line in original_cell["source"]: + for line in original_cell["source"].splitlines(): if line.strip().startswith(("!", "%")): continue @@ -71,7 +73,7 @@ def check_notebook_output(notebook_path, env="python3"): if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("--env", help="kernel that executes the notebook") + parser.add_argument("--env", default="python3", help="kernel that executes the notebook") parser.add_argument("--num_proc", help="number of processes for parallel execution") args = parser.parse_args() @@ -102,5 +104,5 @@ def check_notebook_output(notebook_path, env="python3"): for notebook, report in reports ]) raise Exception( - "❌❌ Found mismatches in the outputs of the notebooks:\n\n" + reports_str + "❌❌ Mismatches found in the outputs of the notebooks:\n\n" + reports_str ) From 0d03cd093bf16da861650cfd8a4605406f8c3014 Mon Sep 17 00:00:00 2001 From: mariosasko Date: Thu, 1 Apr 2021 02:18:26 +0200 Subject: [PATCH 04/15] Add check notebooks output script --- .github/workflows/scheduled.yml | 11 +- docs/source/advanced.rst | 11 +- docs/source/notebooks/advanced.ipynb | 366 ++++++++++-------- docs/source/notebooks/preprocessing.ipynb | 7 +- docs/source/notebooks/quickstart.ipynb | 8 +- docs/source/notebooks/walkthrough.ipynb | 52 ++- docs/source/preprocessing.rst | 2 +- docs/source/quickstart.rst | 6 +- docs/source/scripts/check_notebooks.py | 91 +++-- .../scripts/convert_doc_to_notebooks.py | 13 +- docs/source/scripts/requirements.txt | 4 + docs/source/walkthrough.rst | 40 +- podium/datasets/dataset.py | 2 +- podium/field.py | 2 +- setup.py | 1 - 15 files changed, 348 insertions(+), 268 deletions(-) create mode 100644 
docs/source/scripts/requirements.txt diff --git a/.github/workflows/scheduled.yml index 9f394b8c..856fa93a 100644 --- a/.github/workflows/scheduled.yml +++ b/.github/workflows/scheduled.yml @@ -1,6 +1,9 @@ name: Scheduled jobs on: + pull_request: + branches: + - master repository_dispatch: schedule: - cron: "0 0 * * *" @@ -11,6 +14,9 @@ jobs: check_notebooks: runs-on: ubuntu-latest strategy: matrix: python-version: [3.6] + defaults: + run: + working-directory: docs/source/scripts steps: - uses: actions/checkout@v2 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v2 with: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | python -m pip install --upgrade pip - pip install + pip install -r requirements.txt - name: Execute notebooks run: | - cd docs/source/scripts - python check_notebooks.py --num_proc auto + python check_notebooks.py --num_proc auto --ignore_whitespace diff --git a/docs/source/advanced.rst index 25a939eb..9572daa5 100644 --- a/docs/source/advanced.rst +++ b/docs/source/advanced.rst @@ -379,7 +379,9 @@ For a simple example, we will take a look at the built-in SST and IMDB datasets: >>> from podium import Field, LabelField, Vocab >>> # Load the datasets >>> imdb_train, imdb_test = IMDB.get_dataset_splits() + >>> imdb_train.finalize_fields() >>> sst_train, sst_dev, sst_test = SST.get_dataset_splits() + >>> sst_train.finalize_fields() >>> >>> # Luckily, both label vocabularies are already equal >>> print(imdb_train.field('label').vocab.itos) @@ -415,7 +417,8 @@ For this reason, usage of :class:`podium.datasets.BucketIterator` is recommended >>> label = LabelField(name='label') >>> fields = {'text': text, 'label': label} >>> - >>> train, valid, test = SST.get_dataset_splits(fields=fields) + >>> sst_train, sst_valid, sst_test = SST.get_dataset_splits(fields=fields) + >>> sst_train.finalize_fields() >>> >>> # Define the iterators and our sort key >>> from podium import Iterator, BucketIterator @@ -423,14 +426,14 @@ For this reason, usage of :class:`podium.datasets.BucketIterator` is
recommended >>> # Use the text Field >>> raw, tokenized = instance.text >>> return len(tokenized) - >>> bucket_iter = BucketIterator(train, batch_size=32, bucket_sort_key=instance_length) + >>> bucket_iter = BucketIterator(sst_train, batch_size=32, bucket_sort_key=instance_length) The ``bucket_sort_key`` function defines how the instances in the dataset should be sorted. The method accepts an instance of the dataset, and should return a value which will be used as a sort key in the ``BucketIterator``. It might be interesting (and surprising) to see how much space (and time) do we earn by bucketing. We will define a naive iterator on the same dataset and measure the total amount of padding used when iterating over a dataset. .. code-block:: python >>> import numpy as np - >>> vanilla_iter = Iterator(train, batch_size=32) + >>> vanilla_iter = Iterator(sst_train, batch_size=32) >>> >>> def count_padding(batch, padding_idx): >>> return np.count_nonzero(batch == padding_idx) @@ -518,7 +521,7 @@ Each ``Dataset`` instance in the SST dataset splits contains ``Field``\s and a ` >>> import pickle >>> >>> cache_dir = Path('cache') - >>> cache_dir.mkdir() + >>> cache_dir.mkdir(exist_ok=True) >>> >>> dataset_store_path = cache_dir.joinpath('sst_preprocessed.pkl') >>> diff --git a/docs/source/notebooks/advanced.ipynb b/docs/source/notebooks/advanced.ipynb index 9a0b0710..c5eac847 100644 --- a/docs/source/notebooks/advanced.ipynb +++ b/docs/source/notebooks/advanced.ipynb @@ -34,18 +34,21 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "Example({\n", - " text: (None, ['A', 'slick', ',', 'engrossing', 'melodrama', '.']),\n", - " label: (None, 'positive')\n", - "})\n" - ] + "data": { + "text/plain": [ + "Example({\n", + " text: (None, ['A', 'slick', ',', 'engrossing', 'melodrama', '.']),\n", + " label: (None, 'positive')\n", + "})" + ] + }, + 
"execution_count": null, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ @@ -64,16 +67,19 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "None ['A', 'slick', ',', 'engrossing', 'melodrama', '.']\n", - "None positive\n" - ] + "data": { + "text/plain": [ + "None ['A', 'slick', ',', 'engrossing', 'melodrama', '.']\n", + "None positive" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ @@ -92,15 +98,18 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "('A slick , engrossing melodrama .', ['A', 'slick', ',', 'engrossing', 'melodrama', '.'])\n" - ] + "data": { + "text/plain": [ + "('A slick , engrossing melodrama .', ['A', 'slick', ',', 'engrossing', 'melodrama', '.'])" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ @@ -125,7 +134,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "" + "" ] }, { @@ -160,7 +169,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -178,7 +187,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -197,7 +206,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -233,7 +242,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -265,7 +274,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -288,15 +297,18 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, 
"metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "('a slick , engrossing melodrama .', ['a', 'slick', 'engrossing', 'melodrama'])\n" - ] + "data": { + "text/plain": [ + "('a slick , engrossing melodrama .', ['a', 'slick', 'engrossing', 'melodrama'])" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ @@ -355,15 +367,18 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ @@ -388,16 +403,19 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "True\n" - ] + "data": { + "text/plain": [ + "\n", + "True" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ @@ -417,15 +435,18 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "(None, ['', 'A', 'slick', ',', 'engrossing', 'melodrama', '.'])\n" - ] + "data": { + "text/plain": [ + "(None, ['', 'A', 'slick', ',', 'engrossing', 'melodrama', '.'])" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ @@ -456,7 +477,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "" + "" ] }, { @@ -477,30 +498,18 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "metadata": {}, "outputs": [ { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "66267ed5720f410faf47766dea3dbad6", - "version_major": 2, - "version_minor": 0 - }, "text/plain": [ - 
"HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…" + "(None, ['a', 'slick', ',', 'eng', '##ross', '##ing', 'mel', '##od', '##rama', '.'])" ] }, + "execution_count": null, "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "(None, ['a', 'slick', ',', 'eng', '##ross', '##ing', 'mel', '##od', '##rama', '.'])\n" - ] + "output_type": "execute_result" } ], "source": [ @@ -536,16 +545,19 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": null, "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "(None, ['A', 'slick', ',', 'engrossing', 'melodrama', '.'])\n", - "(None, ['A', ' ', 's', 'l', 'i', 'c', 'k', ' ', ',', ' ', 'e', 'n', 'g', 'r', 'o', 's', 's', 'i', 'n', 'g', ' ', 'm', 'e', 'l', 'o', 'd', 'r', 'a', 'm', 'a', ' ', '.'])\n" - ] + "data": { + "text/plain": [ + "(None, ['A', 'slick', ',', 'engrossing', 'melodrama', '.'])\n", + "(None, ['A', ' ', 's', 'l', 'i', 'c', 'k', ' ', ',', ' ', 'e', 'n', 'g', 'r', 'o', 's', 's', 'i', 'n', 'g', ' ', 'm', 'e', 'l', 'o', 'd', 'r', 'a', 'm', 'a', ' ', '.'])" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ @@ -585,16 +597,19 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": null, "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "(None, ['A', 'slick', ',', 'engrossing', 'melodrama', '.'])\n", - "(None, ['DET', 'ADJ', 'PUNCT', 'VERB', 'NOUN', 'PUNCT'])\n" - ] + "data": { + "text/plain": [ + "(None, ['A', 'slick', ',', 'engrossing', 'melodrama', '.'])\n", + "(None, ['DET', 'ADJ', 'PUNCT', 'VERB', 'NOUN', 'PUNCT'])" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ @@ -658,15 +673,18 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 
null, "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "0.5 0.3 0.2\n" - ] + "data": { + "text/plain": [ + "0.5 0.3 0.2" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ @@ -686,17 +704,20 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": null, "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'negative': 0.47803468208092487, 'positive': 0.5219653179190752}\n", - "{'negative': 0.48458574181117536, 'positive': 0.5154142581888247}\n", - "{'negative': 0.46965317919075145, 'positive': 0.5303468208092486}\n" - ] + "data": { + "text/plain": [ + "{'negative': 0.47803468208092487, 'positive': 0.5219653179190752}\n", + "{'negative': 0.48458574181117536, 'positive': 0.5154142581888247}\n", + "{'negative': 0.46965317919075145, 'positive': 0.5303468208092486}" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ @@ -721,15 +742,18 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": null, "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "0.5 0.3 0.2\n" - ] + "data": { + "text/plain": [ + "0.5 0.3 0.2" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ @@ -747,17 +771,20 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": null, "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'negative': 0.47832369942196534, 'positive': 0.5216763005780347}\n", - "{'negative': 0.47832369942196534, 'positive': 0.5216763005780347}\n", - "{'negative': 0.47832369942196534, 'positive': 0.5216763005780347}\n" - ] + "data": { + "text/plain": [ + "{'negative': 0.47832369942196534, 'positive': 0.5216763005780347}\n", + "{'negative': 0.47832369942196534, 'positive': 0.5216763005780347}\n", + "{'negative': 
0.47832369942196534, 'positive': 0.5216763005780347}" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ @@ -771,7 +798,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "" + "" ] }, { @@ -795,17 +822,20 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": null, "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "['positive', 'negative']\n", - "['positive', 'negative']\n", - "31920 = 25000 + 6920\n" - ] + "data": { + "text/plain": [ + "['positive', 'negative']\n", + "['positive', 'negative']\n", + "31920 = 25000 + 6920" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ @@ -813,7 +843,9 @@ "from podium import Field, LabelField, Vocab\n", "# Load the datasets\n", "imdb_train, imdb_test = IMDB.get_dataset_splits()\n", + "imdb_train.finalize_fields()\n", "sst_train, sst_dev, sst_test = SST.get_dataset_splits()\n", + "sst_train.finalize_fields()\n", "\n", "# Luckily, both label vocabularies are already equal\n", "print(imdb_train.field('label').vocab.itos)\n", @@ -837,7 +869,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "" + "" ] }, { @@ -858,7 +890,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -869,7 +901,8 @@ "label = LabelField(name='label')\n", "fields = {'text': text, 'label': label}\n", "\n", - "train, valid, test = SST.get_dataset_splits(fields=fields)\n", + "sst_train, sst_valid, sst_test = SST.get_dataset_splits(fields=fields)\n", + "sst_train.finalize_fields()\n", "\n", "# Define the iterators and our sort key\n", "from podium import Iterator, BucketIterator\n", @@ -877,7 +910,7 @@ " # Use the text Field\n", " raw, tokenized = instance.text\n", " return len(tokenized)\n", - "bucket_iter = BucketIterator(train, batch_size=32, bucket_sort_key=instance_length)" + "bucket_iter = 
BucketIterator(sst_train, batch_size=32, bucket_sort_key=instance_length)" ] }, { @@ -889,21 +922,24 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": null, "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "For Iterator, padding = 145749 out of 279304 = 52.18%\n", - "For BucketIterator, padding = 2125 out of 135680 = 1.57%\n" - ] + "data": { + "text/plain": [ + "For Iterator, padding = 148141 out of 281696 = 52.588961149608096%\n", + "For BucketIterator, padding = 2125 out of 135680 = 1.5661851415094339%" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ "import numpy as np\n", - "vanilla_iter = Iterator(train, batch_size=32)\n", + "vanilla_iter = Iterator(sst_train, batch_size=32)\n", "\n", "def count_padding(batch, padding_idx):\n", " return np.count_nonzero(batch == padding_idx)\n", @@ -978,7 +1014,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -1037,18 +1073,21 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": null, "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "Example({\n", - " text: (None, ['A', 'slick', ',', 'engrossing', 'melodrama', '.']),\n", - " label: (None, 'positive')\n", - "})\n" - ] + "data": { + "text/plain": [ + "Example({\n", + " text: (None, ['A', 'slick', ',', 'engrossing', 'melodrama', '.']),\n", + " label: (None, 'positive')\n", + "})" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ @@ -1056,7 +1095,7 @@ "import pickle\n", "\n", "cache_dir = Path('cache')\n", - "cache_dir.mkdir()\n", + "cache_dir.mkdir(exist_ok=True)\n", "\n", "dataset_store_path = cache_dir.joinpath('sst_preprocessed.pkl')\n", "\n", @@ -1080,7 +1119,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": null, "metadata": {}, "outputs": [ 
{ @@ -1120,16 +1159,19 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": null, "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "True\n", - "True\n" - ] + "data": { + "text/plain": [ + "True\n", + "True" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ @@ -1149,25 +1191,7 @@ ] } ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.9" - } - }, + "metadata": {}, "nbformat": 4, "nbformat_minor": 4 } diff --git a/docs/source/notebooks/preprocessing.ipynb b/docs/source/notebooks/preprocessing.ipynb index 1ade9a99..609c259d 100644 --- a/docs/source/notebooks/preprocessing.ipynb +++ b/docs/source/notebooks/preprocessing.ipynb @@ -12,8 +12,9 @@ "# ! pip install git+https://github.com/TakeLab/podium.git\n", "\n", "# Additional dependencies required to run this notebook:\n", - "! pip install sacremoses clean-text spacy truecase https://github.com/LIAAD/yake/archive/v0.4.2.tar.gz\n", - "! python -m spacy download en_core_web_sm" + "! pip install sacremoses clean-text spacy spacy-lookups-data truecase https://github.com/LIAAD/yake/archive/v0.4.2.tar.gz\n", + "! python -m spacy download en_core_web_sm\n", + "! 
python -m nltk.downloader stopwords" ] }, { @@ -387,7 +388,7 @@ { "data": { "text/plain": [ - "(None, [opinion', 'exciting', 'funny', 'movie'])" + "(None, ['opinion', 'exciting', 'funny', 'movie'])" ] }, "execution_count": null, diff --git a/docs/source/notebooks/quickstart.ipynb b/docs/source/notebooks/quickstart.ipynb index 7d23db64..6522f6f2 100644 --- a/docs/source/notebooks/quickstart.ipynb +++ b/docs/source/notebooks/quickstart.ipynb @@ -9,7 +9,7 @@ "# Podium installation\n", "! pip install podium-nlp\n", "# To install from source instead of the last release, comment the command above and uncomment the following one.\n", - "# ! pip install git+https://github.com/takelab/podium" + "# ! pip install git+https://github.com/TakeLab/podium.git" ] }, { @@ -131,9 +131,9 @@ "data": { "text/plain": [ "Example({\n", - " input_text: (None, ['Amazingly', 'lame', '.']),\n", - " input_chars: (None, ['A', 'm', 'a', 'z', 'i', 'n', 'g', 'l', 'y', ' ', 'l', 'a', 'm', 'e', ' ', '.']),\n", - " target: (None, 'negative')\n", + " input_text: (None, ['Amazingly', 'lame', '.']),\n", + " input_chars: (None, ['A', 'm', 'a', 'z', 'i', 'n', 'g', 'l', 'y', ' ', 'l', 'a', 'm', 'e', ' ', '.']),\n", + " target: (None, 'negative')\n", "})" ] }, diff --git a/docs/source/notebooks/walkthrough.ipynb b/docs/source/notebooks/walkthrough.ipynb index d6944d23..3cfb3902 100644 --- a/docs/source/notebooks/walkthrough.ipynb +++ b/docs/source/notebooks/walkthrough.ipynb @@ -104,7 +104,7 @@ ], "source": [ "from podium.datasets import SST\n", - "sst_train, sst_dev, sst_test = SST.get_dataset_splits() # doctest:+ELLIPSIS\n", + "sst_train, sst_dev, sst_test = SST.get_dataset_splits()\n", "sst_train.finalize_fields()\n", "print(sst_train)\n", "print(sst_train[222]) # A short example" @@ -121,7 +121,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "" + "" ] }, { @@ -149,8 +149,7 @@ "data": { "text/plain": [ "dict_keys(['train', 'test', 'unsupervised'])\n", - "{'label': ClassLabel(num_classes=2, 
names=['neg', 'pos'], names_file=None, id=None),\n", - " 'text': Value(dtype='string', id=None)}" + "{'text': Value(dtype='string', id=None), 'label': ClassLabel(num_classes=2, names=['neg', 'pos'], names_file=None, id=None)}" ] }, "execution_count": null, @@ -188,16 +187,15 @@ { "data": { "text/plain": [ - "{'label': LabelField({\n", - " name: 'label',\n", - " keep_raw: False,\n", - " is_target: True\n", - "}),\n", - " 'text': Field({\n", - " name: 'text',\n", - " keep_raw: False,\n", - " is_target: False,\n", - " vocab: Vocab({specials: ('', ''), eager: False, is_finalized: True, size: 280619})\n", + "{'text': Field({\n", + " name: 'text',\n", + " keep_raw: False,\n", + " is_target: False,\n", + " vocab: Vocab({specials: ('', ''), eager: False, is_finalized: True, size: 280619})\n", + " }), 'label': LabelField({\n", + " name: 'label',\n", + " keep_raw: False,\n", + " is_target: True\n", "})}" ] }, @@ -213,7 +211,7 @@ "imdb_train, imdb_test, imdb_unsupervised = HF.from_dataset_dict(imdb).values()\n", "imdb_train.finalize_fields()\n", "\n", - "print(imdb_train.field_dict())" + "print(imdb_train.field_dict)" ] }, { @@ -266,7 +264,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "" + "" ] }, { @@ -432,7 +430,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "" + "" ] }, { @@ -494,7 +492,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "" + "" ] }, { @@ -682,7 +680,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "" + "" ] }, { @@ -817,7 +815,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "" + "" ] }, { @@ -994,7 +992,7 @@ { "data": { "text/plain": [ - "For vocabulary of size: 21701 loaded embedding matrix of shape: (21701, 300)\n", + "For vocabulary of size: 21701 loaded embedding matrix of shape: (16284, 300)\n", "Vector for sport: [ 0.34566 0.15934 0.48444 -0.13693 0.18737 0.2678\n", " -0.39159 0.4931 -0.76111 -1.4586 0.41475 0.55837\n", " ...\n", @@ -1083,12 +1081,12 @@ "data": { "text/plain": [ " (6920, 
4998)\n", - "(0, 2111) 0.617113703893198\n", - "(0, 549) 0.5208201737884445\n", - "(0, 499) 0.5116152860290002\n", - "(0, 19) 0.2515101839877878\n", - "(0, 1) 0.12681755258500052\n", - "(0, 0) 0.08262419651916046" + " (0, 2111) 0.617113703893198\n", + " (0, 549) 0.5208201737884445\n", + " (0, 499) 0.5116152860290002\n", + " (0, 19) 0.2515101839877878\n", + " (0, 1) 0.12681755258500052\n", + " (0, 0) 0.08262419651916046" ] }, "execution_count": null, diff --git a/docs/source/preprocessing.rst b/docs/source/preprocessing.rst index 58e2c86c..c0f5a3bd 100644 --- a/docs/source/preprocessing.rst +++ b/docs/source/preprocessing.rst @@ -159,7 +159,7 @@ Stopword removal >>> raw_text = None >>> tokenized_text = ['in', 'my', 'opinion', 'an', 'exciting', 'and', 'funny', 'movie'] >>> print(remove_stopwords_hook(raw_text, tokenized_text)) - (None, [opinion', 'exciting', 'funny', 'movie']) + (None, ['opinion', 'exciting', 'funny', 'movie']) Keyword extraction ------------------ diff --git a/docs/source/quickstart.rst b/docs/source/quickstart.rst index 715f3452..0466b095 100644 --- a/docs/source/quickstart.rst +++ b/docs/source/quickstart.rst @@ -69,9 +69,9 @@ You might wonder, why not simply use the input column names from the header to s >>> dataset_with_chars.finalize_fields() >>> print(dataset_with_chars[1]) Example({ - input_text: (None, ['Amazingly', 'lame', '.']), - input_chars: (None, ['A', 'm', 'a', 'z', 'i', 'n', 'g', 'l', 'y', ' ', 'l', 'a', 'm', 'e', ' ', '.']), - target: (None, 'negative') + input_text: (None, ['Amazingly', 'lame', '.']), + input_chars: (None, ['A', 'm', 'a', 'z', 'i', 'n', 'g', 'l', 'y', ' ', 'l', 'a', 'm', 'e', ' ', '.']), + target: (None, 'negative') }) You might wonder what the ``None``\s we've been seeing represent. For each Field, we store raw and processed data as a tuple. The first element of the tuple is reserved for raw data, by default blank to preserve memory. 
For a detailed overview of the Field constructor arguments and how to use them, check :ref:`fields`. diff --git a/docs/source/scripts/check_notebooks.py b/docs/source/scripts/check_notebooks.py index 941037ba..e8e54566 100644 --- a/docs/source/scripts/check_notebooks.py +++ b/docs/source/scripts/check_notebooks.py @@ -1,17 +1,33 @@ import argparse import copy -import multiprocessing +import multiprocess +import string import textwrap from functools import partial from pathlib import Path import nbformat -from nbconvert.preprocessors import CellExecutionError, ExecutePreprocessor +from nbconvert.preprocessors import ExecutePreprocessor NOTEBOOKS_PATH = "../notebooks" -INSTALL_RELEASE_VERSION_COMMAND = "! pip install podium-nlp\n" -INSTALL_SOURCE_VERSION_COMMAND = "# ! pip install git+https://github.com/TakeLab/podium.git\n" +INSTALL_RELEASE_VERSION_COMMAND = "! pip install podium-nlp" +INSTALL_SOURCE_VERSION_COMMAND = "# ! pip install git+https://github.com/TakeLab/podium.git" +TRANS_TABLE = str.maketrans(dict.fromkeys(string.whitespace)) + + +def print_notebook_name_with_error(func): + def wrapper(*args, **kwargs): + if args: + notebook_path = args[0] + else: + notebook_path = kwargs.pop("self") + try: + return func(*args, **kwargs) + except Exception as err: + print(f"Error in notebook {Path(notebook_path).name}:\n{err}") + raise + return wrapper def replace_install_release_with_source(nb): @@ -27,26 +43,26 @@ def replace_install_release_with_source(nb): cell["source"] = cell["source"].replace(INSTALL_SOURCE_VERSION_COMMAND, INSTALL_SOURCE_VERSION_COMMAND[2:]) -def check_notebook_output(notebook_path, env="python3"): +@print_notebook_name_with_error +def check_notebook_output(notebook_path, env="python3", ignore_whitespace=False): with open(notebook_path, encoding="utf-8") as f: nb = nbformat.read(f, as_version=4) original_nb = nb - ep = ExecutePreprocessor(kernel_name=env) new_nb = copy.deepcopy(nb) replace_install_release_with_source(new_nb) - try: - 
ep.preprocess(new_nb, {"metadata": {"path": str(Path(notebook_path).parent)}}) - except CellExecutionError: - print(f"Error happened while executing the notebook {notebook_path.name}") - raise - print(new_nb) - report = [] + ep = ExecutePreprocessor(kernel_name=env) + + print(str(Path(notebook_path).parent)) + ep.preprocess(new_nb, {"metadata": {"path": str(Path(notebook_path).parent)}}) + assert len(original_nb["cells"]) == len(new_nb["cells"]) - for i, (original_cell, new_cell) in enumerate(zip(original_nb["cells"], new_nb["cell"])): + + report = [] + for i, (original_cell, new_cell) in enumerate(zip(original_nb["cells"], new_nb["cells"])): # consider only cells with code - if original_cell["cell_type"] != "code": + if original_cell["cell_type"] != "code" or original_cell["outputs"] == [] or original_cell["metadata"].get("elippsis"): continue # sanity check @@ -57,16 +73,27 @@ def check_notebook_output(notebook_path, env="python3"): continue # sanity check - assert isinstance(original_cell["outputs"]["data"]["text/plain"], list) - original_cell_stdout = "".join(original_cell["outputs"]["data"]["text/plain"]) + assert len(original_cell["outputs"]) == 1 + original_cell_stdout = original_cell["outputs"][0]["data"]["text/plain"] + assert isinstance(original_cell_stdout, str) new_cell_stdout = "".join([ new_cell_output["text"] for new_cell_output in new_cell["outputs"] if new_cell_output["name"] == "stdout" ]) + original_cell_stdout_ = original_cell_stdout + new_cell_stdout_ = new_cell_stdout + + if ignore_whitespace: + original_cell = original_cell_stdout.translate(TRANS_TABLE) + new_cell_stdout = new_cell_stdout.translate(TRANS_TABLE) + else: + if new_cell_stdout[-1] == "\n" and original_cell_stdout[-1] != "\n": + original_cell_stdout += "\n" + if original_cell_stdout != new_cell_stdout: - report.append(i, original_cell_stdout, new_cell_stdout) + report.append((i, original_cell_stdout_, new_cell_stdout_)) return notebook_path.name, report @@ -75,33 +102,45 @@ def 
check_notebook_output(notebook_path, env="python3"): parser = argparse.ArgumentParser() parser.add_argument("--env", default="python3", help="kernel that executes the notebook") parser.add_argument("--num_proc", help="number of processes for parallel execution") + parser.add_argument("--ignore_whitespace", action="store_true", help="ignore whitespace when comparing cell outputs") args = parser.parse_args() if args.num_proc is None: num_proc = 1 elif args.num_proc == "auto": - num_proc = multiprocessing.cpu_count() + num_proc = multiprocess.cpu_count() else: num_proc = int(args.num_proc) - notebook_paths = [notebook_path for notebook_path in Path(NOTEBOOKS_PATH).rglob("*.ipynb")] - num_proc = min(min(num_proc, multiprocessing.cpu_count()), len(notebook_paths)) + notebook_paths = [ + notebook_path + for notebook_path in Path(NOTEBOOKS_PATH).rglob("*.ipynb") + if not notebook_path.name.endswith("-checkpoint.ipynb") + ] + # print(notebook_paths) + # exit() + # notebook_paths = [notebook_paths[2]] + # print(notebook_paths) + num_proc = min(min(num_proc, multiprocess.cpu_count()), len(notebook_paths)) if num_proc == 1: reports = [] for notebook_path in notebook_paths: - report = check_notebook_output(notebook_path, env=args.env) + report = check_notebook_output(notebook_path, env=args.env, ignore_whitespace=args.ignore_whitespace) reports.append(report) else: - with multiprocessing.Pool(num_proc) as pool: - reports = pool.map(partial(check_notebook_output, env=args.env), notebook_paths) + with multiprocess.Pool(num_proc) as pool: + reports = pool.map(partial(check_notebook_output, env=args.env, ignore_whitespace=args.ignore_whitespace), notebook_paths) if any(report for _, report in reports): reports_str = "\n\n".join([ f"In notebook {notebook}:\n" + textwrap.indent( "\n".join( + f"Cell {i}\n" + "=" * len(f"Cell {i}") + "\n" + f"Original output:\n{original_output}\nAfter execution:\n{new_output}" - for original_output, new_output in report), " " * 4) - for notebook, 
report in reports + for i, original_output, new_output in report), + " " * 4, + ) + for notebook, report in reports ]) raise Exception( "❌❌ Mismatches found in the outputs of the notebooks:\n\n" + reports_str diff --git a/docs/source/scripts/convert_doc_to_notebooks.py b/docs/source/scripts/convert_doc_to_notebooks.py index dd870b94..e6f91e6c 100644 --- a/docs/source/scripts/convert_doc_to_notebooks.py +++ b/docs/source/scripts/convert_doc_to_notebooks.py @@ -357,6 +357,8 @@ def split_frameworks(code): # Matches any doctest pattern. _re_doctest = re.compile(r"^(>>>|\.\.\.)") +# Re pattern that matches doctest options in code blocks. +_re_doctest_option = re.compile(r'#\s*doctest:\s*([^\n\'"]*)$', re.MULTILINE) def parse_code_and_output(code): @@ -395,10 +397,16 @@ def code_cell(code, output=None): if output is None or len(output) == 0: outputs = [] else: + metadata = {} + for m in _re_doctest_option.finditer(code): + group = m.group(1) + if group == "+ELIPPSIS": + metadata["elippsis"] = True + code.replace(group, "") outputs = [nbformat.notebooknode.NotebookNode({ 'data': {'text/plain': output}, 'execution_count': None, - 'metadata': {}, + 'metadata': metadata, 'output_type': 'execute_result' })] return nbformat.notebooknode.NotebookNode( @@ -440,8 +448,9 @@ def rm_first_line(text): ), "preprocessing.rst": textwrap.dedent( """\ - ! pip install sacremoses clean-text spacy truecase https://github.com/LIAAD/yake/archive/v0.4.2.tar.gz + ! pip install sacremoses clean-text spacy spacy-lookups-data truecase https://github.com/LIAAD/yake/archive/v0.4.2.tar.gz ! python -m spacy download en_core_web_sm + ! 
python -m nltk.downloader stopwords """ ), "walkthrough.rst": textwrap.dedent( diff --git a/docs/source/scripts/requirements.txt b/docs/source/scripts/requirements.txt new file mode 100644 index 00000000..5623fe18 --- /dev/null +++ b/docs/source/scripts/requirements.txt @@ -0,0 +1,4 @@ +ipykernel +nbformat +nbconvert +multiprocess \ No newline at end of file diff --git a/docs/source/walkthrough.rst b/docs/source/walkthrough.rst index fa2150f4..5d7089be 100644 --- a/docs/source/walkthrough.rst +++ b/docs/source/walkthrough.rst @@ -33,7 +33,7 @@ One built-in dataset available in Podium is the `Stanford Sentiment Treebank >> from podium.datasets import SST - >>> sst_train, sst_dev, sst_test = SST.get_dataset_splits() # doctest:+ELLIPSIS + >>> sst_train, sst_dev, sst_test = SST.get_dataset_splits() >>> sst_train.finalize_fields() >>> print(sst_train) SST({ @@ -84,8 +84,7 @@ Converting a dataset from 🤗 datasets into Podium requires some work from your >>> # Each dataset has a set of features which need to be mapped >>> # to Podium Fields. >>> print(imdb['train'].features) - {'label': ClassLabel(num_classes=2, names=['neg', 'pos'], names_file=None, id=None), - 'text': Value(dtype='string', id=None)} + {'text': Value(dtype='string', id=None), 'label': ClassLabel(num_classes=2, names=['neg', 'pos'], names_file=None, id=None)} As is the case with loading your custom dataset, ``features`` of 🤗 datasets need to be mapped to Podium Fields in order to direct the data flow for preprocessing. 
@@ -99,17 +98,16 @@ Datasets from 🤗 need to either (1) be wrapped them in :class:`podium.datasets >>> imdb_train, imdb_test, imdb_unsupervised = HF.from_dataset_dict(imdb).values() >>> imdb_train.finalize_fields() >>> - >>> print(imdb_train.field_dict()) - {'label': LabelField({ - name: 'label', - keep_raw: False, - is_target: True - }), - 'text': Field({ - name: 'text', - keep_raw: False, - is_target: False, - vocab: Vocab({specials: ('', ''), eager: False, is_finalized: True, size: 280619}) + >>> print(imdb_train.field_dict) + {'text': Field({ + name: 'text', + keep_raw: False, + is_target: False, + vocab: Vocab({specials: ('', ''), eager: False, is_finalized: True, size: 280619}) + }), 'label': LabelField({ + name: 'label', + keep_raw: False, + is_target: True })} .. note:: @@ -509,7 +507,7 @@ The output of the function call is a numpy matrix of word embeddings which you c >>> glove = GloVe() >>> embeddings = glove.load_vocab(vocab) >>> print(f"For vocabulary of size: {len(vocab)} loaded embedding matrix of shape: {embeddings.shape}") - For vocabulary of size: 21701 loaded embedding matrix of shape: (21701, 300) + For vocabulary of size: 21701 loaded embedding matrix of shape: (16284, 300) >>> # We can obtain vectors for a single word (given the word is loaded) like this: >>> word = "sport" >>> print(f"Vector for {word}: {glove.token_to_vector(word)}") @@ -555,12 +553,12 @@ Now our vectorizer has seen the dataset as well as the vocabulary and has all th >>> print(type(tfidf_batch), tfidf_batch.shape) (6920, 4998) >>> print(tfidf_batch[222]) - (0, 2111) 0.617113703893198 - (0, 549) 0.5208201737884445 - (0, 499) 0.5116152860290002 - (0, 19) 0.2515101839877878 - (0, 1) 0.12681755258500052 - (0, 0) 0.08262419651916046 + (0, 2111) 0.617113703893198 + (0, 549) 0.5208201737884445 + (0, 499) 0.5116152860290002 + (0, 19) 0.2515101839877878 + (0, 1) 0.12681755258500052 + (0, 0) 0.08262419651916046 The Tf-Idf counts are highly sparse since not all words from the vocabulary 
are present in every instance. To reduce the memory footprint of count-based numericalization, we store the values in a `SciPy `__ `sparse matrix `__, which can be used in various `scikit-learn `__ models. diff --git a/podium/datasets/dataset.py b/podium/datasets/dataset.py index f26b3a6e..ab681adf 100644 --- a/podium/datasets/dataset.py +++ b/podium/datasets/dataset.py @@ -278,7 +278,7 @@ def shuffled(self) -> "DatasetBase": return self[shuffled_indices] def __repr__(self): - fields_str = ",\n".join(textwrap.indent(repr(f), " " * 8) for f in self.fields) + fields_str = ",\n".join(textwrap.indent(repr(f), " " * 4) for f in self.fields) fields_str = f"[\n{fields_str}\n \n]" attrs = {"size": len(self), "fields": fields_str} return repr_type_and_attrs(self, attrs, with_newlines=True, repr_values=False) diff --git a/podium/field.py b/podium/field.py index cd946577..fc554412 100644 --- a/podium/field.py +++ b/podium/field.py @@ -954,7 +954,7 @@ def remove_pretokenize_hooks(self): def __repr__(self): fields_str = ",\n".join( - textwrap.indent(repr(f), " " * 8) for f in self._output_fields + textwrap.indent(repr(f), " " * 4) for f in self._output_fields ) fields_str = f"[\n{fields_str}\n \n]" attrs = {"fields": fields_str} diff --git a/setup.py b/setup.py index 9b643cad..ef4dc1f0 100644 --- a/setup.py +++ b/setup.py @@ -97,7 +97,6 @@ def _get_version(): 'sphinx_rtd_theme', 'sphinx-copybutton', 'recommonmark', - 'nbformat', 'datasets', ] From 5fc112d94efabe43bd5bf30c251a48018dd5d1fe Mon Sep 17 00:00:00 2001 From: mariosasko Date: Thu, 1 Apr 2021 02:20:00 +0200 Subject: [PATCH 05/15] Remove comments --- docs/source/scripts/check_notebooks.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/docs/source/scripts/check_notebooks.py b/docs/source/scripts/check_notebooks.py index e8e54566..34556eca 100644 --- a/docs/source/scripts/check_notebooks.py +++ b/docs/source/scripts/check_notebooks.py @@ -53,8 +53,6 @@ def check_notebook_output(notebook_path, 
env="python3", ignore_whitespace=False) replace_install_release_with_source(new_nb) ep = ExecutePreprocessor(kernel_name=env) - - print(str(Path(notebook_path).parent)) ep.preprocess(new_nb, {"metadata": {"path": str(Path(notebook_path).parent)}}) assert len(original_nb["cells"]) == len(new_nb["cells"]) @@ -117,10 +115,7 @@ def check_notebook_output(notebook_path, env="python3", ignore_whitespace=False) for notebook_path in Path(NOTEBOOKS_PATH).rglob("*.ipynb") if not notebook_path.name.endswith("-checkpoint.ipynb") ] - # print(notebook_paths) - # exit() - # notebook_paths = [notebook_paths[2]] - # print(notebook_paths) + num_proc = min(min(num_proc, multiprocess.cpu_count()), len(notebook_paths)) if num_proc == 1: reports = [] From b3d74cc9d58fd0a649297c5ec1e608c981dc5d2c Mon Sep 17 00:00:00 2001 From: mariosasko Date: Thu, 1 Apr 2021 02:22:22 +0200 Subject: [PATCH 06/15] CI fix --- .github/workflows/scheduled.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/scheduled.yml b/.github/workflows/scheduled.yml index 856fa93a..cb6be8d8 100644 --- a/.github/workflows/scheduled.yml +++ b/.github/workflows/scheduled.yml @@ -16,7 +16,7 @@ jobs: python-version: [3.6] defaults: run: - working-direcory: docs/source/scripts + working-directory: docs/source/scripts steps: - uses: actions/checkout@v2 From d13a9c0268c2efeac95cc9b5309c54477e877c2e Mon Sep 17 00:00:00 2001 From: mariosasko Date: Thu, 1 Apr 2021 02:23:31 +0200 Subject: [PATCH 07/15] CI fix #2 --- .github/workflows/scheduled.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/scheduled.yml b/.github/workflows/scheduled.yml index cb6be8d8..941a3d26 100644 --- a/.github/workflows/scheduled.yml +++ b/.github/workflows/scheduled.yml @@ -27,7 +27,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install --r requirements.txt + pip install -r requirements.txt - name: Execute notebooks run: | python 
check_notebooks.py --num_proc auto --ignore_whitespace From 187e30525dca701dd7a5d2a5545b435461328952 Mon Sep 17 00:00:00 2001 From: mariosasko Date: Thu, 1 Apr 2021 12:59:49 +0200 Subject: [PATCH 08/15] Inject SST --- docs/source/advanced.rst | 4 ++-- docs/source/notebooks/advanced.ipynb | 4 ++-- docs/source/scripts/check_notebooks.py | 33 ++++++++++++++++++++++++++ docs/source/scripts/requirements.txt | 1 + podium/datasets/arrow.py | 6 ++--- podium/datasets/impl/conllu_dataset.py | 3 ++- podium/datasets/impl/imdb.py | 4 +++- podium/datasets/impl/snli.py | 2 +- podium/datasets/impl/sst.py | 2 +- podium/vectorizers/vectorizer.py | 6 +++-- 10 files changed, 52 insertions(+), 13 deletions(-) diff --git a/docs/source/advanced.rst b/docs/source/advanced.rst index 9572daa5..705df020 100644 --- a/docs/source/advanced.rst +++ b/docs/source/advanced.rst @@ -417,8 +417,8 @@ For this reason, usage of :class:`podium.datasets.BucketIterator` is recommended >>> label = LabelField(name='label') >>> fields = {'text': text, 'label': label} >>> - >>> sst_train, sst_valid, sst_test = SST.get_dataset_splits(fields=fields) - >>> sst_train.finalize_fields() + >>> train, valid, test = SST.get_dataset_splits(fields=fields) + >>> train.finalize_fields() >>> >>> # Define the iterators and our sort key >>> from podium import Iterator, BucketIterator diff --git a/docs/source/notebooks/advanced.ipynb b/docs/source/notebooks/advanced.ipynb index c5eac847..b5dd9b4c 100644 --- a/docs/source/notebooks/advanced.ipynb +++ b/docs/source/notebooks/advanced.ipynb @@ -901,8 +901,8 @@ "label = LabelField(name='label')\n", "fields = {'text': text, 'label': label}\n", "\n", - "sst_train, sst_valid, sst_test = SST.get_dataset_splits(fields=fields)\n", - "sst_train.finalize_fields()\n", + "train, valid, test = SST.get_dataset_splits(fields=fields)\n", + "train.finalize_fields()\n", "\n", "# Define the iterators and our sort key\n", "from podium import Iterator, BucketIterator\n", diff --git 
a/docs/source/scripts/check_notebooks.py b/docs/source/scripts/check_notebooks.py index 34556eca..92fd08b2 100644 --- a/docs/source/scripts/check_notebooks.py +++ b/docs/source/scripts/check_notebooks.py @@ -1,7 +1,10 @@ import argparse import copy import multiprocess +import os +import shutil import string +import subprocess import textwrap from functools import partial from pathlib import Path @@ -13,9 +16,30 @@ NOTEBOOKS_PATH = "../notebooks" INSTALL_RELEASE_VERSION_COMMAND = "! pip install podium-nlp" INSTALL_SOURCE_VERSION_COMMAND = "# ! pip install git+https://github.com/TakeLab/podium.git" +INSTALL_SST_COMMAND = "python -c \"from podium.datasets import SST; SST.get_dataset_splits()\"" TRANS_TABLE = str.maketrans(dict.fromkeys(string.whitespace)) +def inject_sst(): + delim = "&" if os.name == "nt" else ";" + subprocess.call( + delim.join([INSTALL_SOURCE_VERSION_COMMAND[4:], INSTALL_SST_COMMAND]), + shell=True, + cwd=Path(NOTEBOOKS_PATH).absolute(), + stdout=subprocess.DEVNULL, + stderr=subprocess.STDOUT, + ) + + +def cleanup(snap_before_exec, snap_after_exec): + created_paths = set(snap_after_exec) - set(snap_before_exec) + for path in created_paths: + if path.is_dir(): + shutil.rmtree(path) + else: + path.unlink() + + def print_notebook_name_with_error(func): def wrapper(*args, **kwargs): if args: @@ -101,6 +125,7 @@ def check_notebook_output(notebook_path, env="python3", ignore_whitespace=False) parser.add_argument("--env", default="python3", help="kernel that executes the notebook") parser.add_argument("--num_proc", help="number of processes for parallel execution") parser.add_argument("--ignore_whitespace", action="store_true", help="ignore whitespace when comparing cell outputs") + parser.add_argument("--keep_artifacts", action="store_true", help="save files/directories created during execution") args = parser.parse_args() if args.num_proc is None: @@ -116,6 +141,8 @@ def check_notebook_output(notebook_path, env="python3", ignore_whitespace=False) if not 
notebook_path.name.endswith("-checkpoint.ipynb") ] + snap_before_exec = list(Path(NOTEBOOKS_PATH).iterdir()) + num_proc = min(min(num_proc, multiprocess.cpu_count()), len(notebook_paths)) if num_proc == 1: reports = [] @@ -123,9 +150,15 @@ def check_notebook_output(notebook_path, env="python3", ignore_whitespace=False) report = check_notebook_output(notebook_path, env=args.env, ignore_whitespace=args.ignore_whitespace) reports.append(report) else: + # inject the SST dataset to prevent parallel download + inject_sst() with multiprocess.Pool(num_proc) as pool: reports = pool.map(partial(check_notebook_output, env=args.env, ignore_whitespace=args.ignore_whitespace), notebook_paths) + if args.keep_artefacts is False: + snap_after_exec = list(Path(NOTEBOOKS_PATH).iterdir()) + cleanup(snap_before_exec, snap_after_exec) + if any(report for _, report in reports): reports_str = "\n\n".join([ f"In notebook {notebook}:\n" + textwrap.indent( diff --git a/docs/source/scripts/requirements.txt b/docs/source/scripts/requirements.txt index 5623fe18..a5fc2b3c 100644 --- a/docs/source/scripts/requirements.txt +++ b/docs/source/scripts/requirements.txt @@ -1,4 +1,5 @@ ipykernel +ipywidgets nbformat nbconvert multiprocess \ No newline at end of file diff --git a/podium/datasets/arrow.py b/podium/datasets/arrow.py index b69b20b5..0cceb874 100644 --- a/podium/datasets/arrow.py +++ b/podium/datasets/arrow.py @@ -299,7 +299,7 @@ def from_tabular_file( format = format.lower() csv_reader_params = {} if csv_reader_params is None else csv_reader_params - with open(os.path.expanduser(path), encoding="utf8") as f: + with open(os.path.expanduser(path), encoding="utf-8") as f: if format in {"csv", "tsv"}: delimiter = "," if format == "csv" else "\t" reader = csv.reader(f, delimiter=delimiter, **csv_reader_params) @@ -542,7 +542,7 @@ def load_cache(cache_path) -> "DiskBackedDataset": """ # load fields fields_file_path = os.path.join(cache_path, CACHE_FIELDS_FILENAME) - with open(fields_file_path, 
"rb") as fields_cache_file: + with open(os.path.expanduser(fields_file_path), "rb") as fields_cache_file: fields = pickle.load(fields_cache_file) # load dataset as memory mapped arrow table @@ -587,7 +587,7 @@ def dump_cache(self, cache_path: Optional[str] = None) -> str: # pickle fields cache_fields_path = os.path.join(cache_path, CACHE_FIELDS_FILENAME) - with open(cache_fields_path, "wb") as fields_cache_file: + with open(os.path.expanduser(cache_fields_path), "wb") as fields_cache_file: pickle.dump(self.fields, fields_cache_file) # dump table diff --git a/podium/datasets/impl/conllu_dataset.py b/podium/datasets/impl/conllu_dataset.py index 6a377e28..5872e7c4 100644 --- a/podium/datasets/impl/conllu_dataset.py +++ b/podium/datasets/impl/conllu_dataset.py @@ -2,6 +2,7 @@ Module contains the CoNLL-U dataset. """ import collections +import os from podium.datasets import Dataset from podium.datasets.example_factory import ExampleFactory @@ -87,7 +88,7 @@ def safe_conllu_parse(in_file): example_factory = ExampleFactory(fields) examples = [] - with open(file_path, encoding="utf-8") as in_file: + with open(os.path.expanduser(file_path), encoding="utf-8") as in_file: for tokenlist in safe_conllu_parse(in_file): example_dict = collections.defaultdict(lambda: []) for token in tokenlist: diff --git a/podium/datasets/impl/imdb.py b/podium/datasets/impl/imdb.py index 500cd563..17950c49 100644 --- a/podium/datasets/impl/imdb.py +++ b/podium/datasets/impl/imdb.py @@ -153,7 +153,9 @@ def _create_labeled_examples(dir_path, label, fields): ] examples = [] for file_path in files_list: - with open(file=os.path.join(dir_path, file_path), encoding="utf8") as fpr: + with open( + os.path.expanduser(os.path.join(dir_path, file_path)), encoding="utf-8" + ) as fpr: data = {IMDB.TEXT_FIELD_NAME: fpr.read(), IMDB.LABEL_FIELD_NAME: label} examples.append(example_factory.from_dict(data)) return examples diff --git a/podium/datasets/impl/snli.py b/podium/datasets/impl/snli.py index 
bebc522e..52503052 100644 --- a/podium/datasets/impl/snli.py +++ b/podium/datasets/impl/snli.py @@ -99,7 +99,7 @@ def _create_examples(file_path, fields): example_factory = ExampleFactory(fields) examples = [] - with open(file=file_path, encoding="utf8") as in_file: + with open(os.path.expanduser(file_path), encoding="utf-8") as in_file: for line in in_file: examples.append(example_factory.from_json(line)) return examples diff --git a/podium/datasets/impl/sst.py b/podium/datasets/impl/sst.py index e0a40f45..f6200f25 100644 --- a/podium/datasets/impl/sst.py +++ b/podium/datasets/impl/sst.py @@ -123,7 +123,7 @@ def label_trf(label): return label_to_string_map[label] examples = [] - with open(file=file_path, encoding="utf8") as fpr: + with open(os.path.expanduser(file_path), encoding="utf-8") as fpr: for line in fpr: example = example_factory.from_fields_tree( diff --git a/podium/vectorizers/vectorizer.py b/podium/vectorizers/vectorizer.py index 2f7b1e1a..982e1f20 100644 --- a/podium/vectorizers/vectorizer.py +++ b/podium/vectorizers/vectorizer.py @@ -298,7 +298,7 @@ def _cache_vectors(self): """ Method for caching loaded vectors to cache_dir. 
""" - with open(self._cache_path, "wb") as cache_file: + with open(os.path.expanduser(self._cache_path), "wb") as cache_file: for word in self._vectors: vector_values_string = " ".join(map(str, self._vectors[word])) cache_file.write(f"{word} {vector_values_string}\n".encode("utf-8")) @@ -362,7 +362,9 @@ def _load_vectors(self, vocab=None): vocab = set(vocab) open_mode, split_delimiter = ("rb", b" ") if self._binary else ("r", " ") - with open(curr_path, open_mode, encoding=self._encoding) as vector_file: + with open( + os.path.expanduser(curr_path), open_mode, encoding=self._encoding + ) as vector_file: vectors_loaded = 0 header_lines = 0 From 41fd885d3c0de451ecba71a18c6ee411a6f62ba2 Mon Sep 17 00:00:00 2001 From: mariosasko Date: Thu, 1 Apr 2021 13:13:00 +0200 Subject: [PATCH 09/15] Small fix --- docs/source/scripts/check_notebooks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/scripts/check_notebooks.py b/docs/source/scripts/check_notebooks.py index 92fd08b2..6020388c 100644 --- a/docs/source/scripts/check_notebooks.py +++ b/docs/source/scripts/check_notebooks.py @@ -101,7 +101,7 @@ def check_notebook_output(notebook_path, env="python3", ignore_whitespace=False) new_cell_stdout = "".join([ new_cell_output["text"] - for new_cell_output in new_cell["outputs"] if new_cell_output["name"] == "stdout" + for new_cell_output in new_cell["outputs"] if new_cell_output["output_type"] == "stream" and new_cell_output["name"] == "stdout" ]) original_cell_stdout_ = original_cell_stdout From c4c650f5669e0415140fe660e1dab64a3a35a0a6 Mon Sep 17 00:00:00 2001 From: mariosasko Date: Thu, 1 Apr 2021 13:20:26 +0200 Subject: [PATCH 10/15] Fix argparse argument name --- docs/source/scripts/check_notebooks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/scripts/check_notebooks.py b/docs/source/scripts/check_notebooks.py index 6020388c..ab4442c6 100644 --- a/docs/source/scripts/check_notebooks.py +++ 
b/docs/source/scripts/check_notebooks.py @@ -155,7 +155,7 @@ def check_notebook_output(notebook_path, env="python3", ignore_whitespace=False) with multiprocess.Pool(num_proc) as pool: reports = pool.map(partial(check_notebook_output, env=args.env, ignore_whitespace=args.ignore_whitespace), notebook_paths) - if args.keep_artefacts is False: + if args.keep_artifacts is False: snap_after_exec = list(Path(NOTEBOOKS_PATH).iterdir()) cleanup(snap_before_exec, snap_after_exec) From 79868fb47fea58ac30eed77362c96acc99def059 Mon Sep 17 00:00:00 2001 From: mariosasko Date: Thu, 1 Apr 2021 15:04:30 +0200 Subject: [PATCH 11/15] Fix comparison --- docs/source/scripts/check_notebooks.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/source/scripts/check_notebooks.py b/docs/source/scripts/check_notebooks.py index ab4442c6..648ca6df 100644 --- a/docs/source/scripts/check_notebooks.py +++ b/docs/source/scripts/check_notebooks.py @@ -1,6 +1,5 @@ import argparse import copy -import multiprocess import os import shutil import string @@ -9,6 +8,7 @@ from functools import partial from pathlib import Path +import multiprocess import nbformat from nbconvert.preprocessors import ExecutePreprocessor @@ -20,7 +20,7 @@ TRANS_TABLE = str.maketrans(dict.fromkeys(string.whitespace)) -def inject_sst(): +def inject_shared_download(): delim = "&" if os.name == "nt" else ";" subprocess.call( delim.join([INSTALL_SOURCE_VERSION_COMMAND[4:], INSTALL_SST_COMMAND]), @@ -108,7 +108,7 @@ def check_notebook_output(notebook_path, env="python3", ignore_whitespace=False) new_cell_stdout_ = new_cell_stdout if ignore_whitespace: - original_cell = original_cell_stdout.translate(TRANS_TABLE) + original_cell_stdout = original_cell_stdout.translate(TRANS_TABLE) new_cell_stdout = new_cell_stdout.translate(TRANS_TABLE) else: if new_cell_stdout[-1] == "\n" and original_cell_stdout[-1] != "\n": @@ -150,8 +150,8 @@ def check_notebook_output(notebook_path, env="python3", 
ignore_whitespace=False) report = check_notebook_output(notebook_path, env=args.env, ignore_whitespace=args.ignore_whitespace) reports.append(report) else: - # inject the SST dataset to prevent parallel download - inject_sst() + # predownload datasets/vectorizers to prevent parallel download + inject_shared_download() with multiprocess.Pool(num_proc) as pool: reports = pool.map(partial(check_notebook_output, env=args.env, ignore_whitespace=args.ignore_whitespace), notebook_paths) @@ -168,7 +168,7 @@ def check_notebook_output(notebook_path, env="python3", ignore_whitespace=False) for i, original_output, new_output in report), " " * 4, ) - for notebook, report in reports + for notebook, report in reports if len(report) > 0 ]) raise Exception( "❌❌ Mismatches found in the outputs of the notebooks:\n\n" + reports_str From 4ba16f808e2bd33df37f9f7720d9b07089890239 Mon Sep 17 00:00:00 2001 From: mariosasko Date: Thu, 1 Apr 2021 15:06:57 +0200 Subject: [PATCH 12/15] Redirect datasets output --- docs/source/notebooks/walkthrough.ipynb | 13 +++++++++---- docs/source/walkthrough.rst | 13 +++++++++---- 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/docs/source/notebooks/walkthrough.ipynb b/docs/source/notebooks/walkthrough.ipynb index 3cfb3902..4e4838da 100644 --- a/docs/source/notebooks/walkthrough.ipynb +++ b/docs/source/notebooks/walkthrough.ipynb @@ -158,11 +158,16 @@ } ], "source": [ - "import datasets\n", + "from datasets import load_dataset\n", + "from contextlib import redirect_stdout\n", "from pprint import pprint\n", - "# Loading a huggingface dataset returns an instance of DatasetDict\n", - "# which contains the dataset splits (usually: train, valid, test) \n", - "imdb = datasets.load_dataset('imdb')\n", + "\n", + "# Silence download logs\n", + "with redirect_stdout(None):\n", + " # Loading a huggingface dataset returns an instance of DatasetDict\n", + " # which contains the dataset splits (usually: train, valid, test) \n", + " imdb = 
load_dataset('imdb')\n", + "\n", "print(imdb.keys())\n", "\n", "# Each dataset has a set of features which need to be mapped\n", diff --git a/docs/source/walkthrough.rst b/docs/source/walkthrough.rst index 5d7089be..c14ae032 100644 --- a/docs/source/walkthrough.rst +++ b/docs/source/walkthrough.rst @@ -73,11 +73,16 @@ Converting a dataset from 🤗 datasets into Podium requires some work from your .. code-block:: python - >>> import datasets + >>> from datasets import load_dataset + >>> from contextlib import redirect_stdout >>> from pprint import pprint - >>> # Loading a huggingface dataset returns an instance of DatasetDict - >>> # which contains the dataset splits (usually: train, valid, test) - >>> imdb = datasets.load_dataset('imdb') + >>> + >>> # Silence download logs + >>> with redirect_stdout(None): + >>> # Loading a huggingface dataset returns an instance of DatasetDict + >>> # which contains the dataset splits (usually: train, valid, test) + >>> imdb = load_dataset('imdb') + >>> >>> print(imdb.keys()) dict_keys(['train', 'test', 'unsupervised']) >>> From 67d202a4272bf0d425291f13399565885bcbb995 Mon Sep 17 00:00:00 2001 From: mariosasko Date: Thu, 1 Apr 2021 15:22:30 +0200 Subject: [PATCH 13/15] Redirect to devnull --- docs/source/walkthrough.rst | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/source/walkthrough.rst b/docs/source/walkthrough.rst index c14ae032..fa346a61 100644 --- a/docs/source/walkthrough.rst +++ b/docs/source/walkthrough.rst @@ -73,12 +73,14 @@ Converting a dataset from 🤗 datasets into Podium requires some work from your .. 
code-block:: python - >>> from datasets import load_dataset + >>> import os >>> from contextlib import redirect_stdout >>> from pprint import pprint >>> + >>> from datasets import load_dataset + >>> >>> # Silence download logs - >>> with redirect_stdout(None): + >>> with redirect_stdout(open(os.devnull, "w")): >>> # Loading a huggingface dataset returns an instance of DatasetDict >>> # which contains the dataset splits (usually: train, valid, test) >>> imdb = load_dataset('imdb') From 8964fa84a21821158f1752f342478ff2e6f80bb0 Mon Sep 17 00:00:00 2001 From: mariosasko Date: Thu, 1 Apr 2021 15:27:37 +0200 Subject: [PATCH 14/15] Update notebooks --- docs/source/notebooks/walkthrough.ipynb | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/source/notebooks/walkthrough.ipynb b/docs/source/notebooks/walkthrough.ipynb index 4e4838da..46991ef4 100644 --- a/docs/source/notebooks/walkthrough.ipynb +++ b/docs/source/notebooks/walkthrough.ipynb @@ -158,12 +158,14 @@ } ], "source": [ - "from datasets import load_dataset\n", + "import os\n", "from contextlib import redirect_stdout\n", "from pprint import pprint\n", "\n", + "from datasets import load_dataset\n", + "\n", "# Silence download logs\n", - "with redirect_stdout(None):\n", + "with redirect_stdout(open(os.devnull, \"w\")):\n", " # Loading a huggingface dataset returns an instance of DatasetDict\n", " # which contains the dataset splits (usually: train, valid, test) \n", " imdb = load_dataset('imdb')\n", From ce1b17f5761a22e75d1b77c4a02f00f79099aefe Mon Sep 17 00:00:00 2001 From: mariosasko Date: Fri, 2 Apr 2021 01:22:54 +0200 Subject: [PATCH 15/15] Fix multiprocessing --- docs/source/notebooks/sample_dataset.csv | 3 +++ docs/source/notebooks/walkthrough.ipynb | 14 ++++++-------- docs/source/scripts/check_notebooks.py | 21 ++++++++++++++++----- docs/source/walkthrough.rst | 14 ++++++-------- 4 files changed, 31 insertions(+), 21 deletions(-) create mode 100644 
docs/source/notebooks/sample_dataset.csv diff --git a/docs/source/notebooks/sample_dataset.csv b/docs/source/notebooks/sample_dataset.csv new file mode 100644 index 00000000..7827a95f --- /dev/null +++ b/docs/source/notebooks/sample_dataset.csv @@ -0,0 +1,3 @@ +text,label +Absorbing character study .,positive +Amazingly lame .,negative diff --git a/docs/source/notebooks/walkthrough.ipynb b/docs/source/notebooks/walkthrough.ipynb index 46991ef4..8ff1658d 100644 --- a/docs/source/notebooks/walkthrough.ipynb +++ b/docs/source/notebooks/walkthrough.ipynb @@ -149,7 +149,8 @@ "data": { "text/plain": [ "dict_keys(['train', 'test', 'unsupervised'])\n", - "{'text': Value(dtype='string', id=None), 'label': ClassLabel(num_classes=2, names=['neg', 'pos'], names_file=None, id=None)}" + "{'label': ClassLabel(num_classes=2, names=['neg', 'pos'], names_file=None, id=None),\n", + " 'text': Value(dtype='string', id=None)}" ] }, "execution_count": null, @@ -159,22 +160,19 @@ ], "source": [ "import os\n", - "from contextlib import redirect_stdout\n", "from pprint import pprint\n", "\n", "from datasets import load_dataset\n", "\n", - "# Silence download logs\n", - "with redirect_stdout(open(os.devnull, \"w\")):\n", - " # Loading a huggingface dataset returns an instance of DatasetDict\n", - " # which contains the dataset splits (usually: train, valid, test) \n", - " imdb = load_dataset('imdb')\n", + "# Loading a huggingface dataset returns an instance of DatasetDict\n", + "# which contains the dataset splits (usually: train, valid, test) \n", + "imdb = load_dataset('imdb')\n", "\n", "print(imdb.keys())\n", "\n", "# Each dataset has a set of features which need to be mapped\n", "# to Podium Fields.\n", - "print(imdb['train'].features)" + "pprint(imdb['train'].features)" ] }, { diff --git a/docs/source/scripts/check_notebooks.py b/docs/source/scripts/check_notebooks.py index 648ca6df..8671a0c8 100644 --- a/docs/source/scripts/check_notebooks.py +++ 
b/docs/source/scripts/check_notebooks.py @@ -1,6 +1,7 @@ import argparse import copy import os +import re import shutil import string import subprocess @@ -19,11 +20,21 @@ INSTALL_SST_COMMAND = "python -c \"from podium.datasets import SST; SST.get_dataset_splits()\"" TRANS_TABLE = str.maketrans(dict.fromkeys(string.whitespace)) +_re_pip_install = re.compile(r"!\s*(pip\s+install\s+[^\\\"]*)") +_re_python = re.compile(r"!\s*(python[^\\\"]*)") + + +def init(notebook_paths): + all_commands = [] + for notebook_path in notebook_paths: + with open(notebook_path, encoding="utf-8") as f: + notebook_raw = f.read() + commands = _re_pip_install.findall(notebook_raw) + _re_python.findall(notebook_raw) + all_commands.extend(commands) -def inject_shared_download(): delim = "&" if os.name == "nt" else ";" subprocess.call( - delim.join([INSTALL_SOURCE_VERSION_COMMAND[4:], INSTALL_SST_COMMAND]), + delim.join([*all_commands, INSTALL_SOURCE_VERSION_COMMAND[4:], INSTALL_SST_COMMAND]), shell=True, cwd=Path(NOTEBOOKS_PATH).absolute(), stdout=subprocess.DEVNULL, @@ -138,7 +149,7 @@ def check_notebook_output(notebook_path, env="python3", ignore_whitespace=False) notebook_paths = [ notebook_path for notebook_path in Path(NOTEBOOKS_PATH).rglob("*.ipynb") - if not notebook_path.name.endswith("-checkpoint.ipynb") + if not (notebook_path.name.endswith("-checkpoint.ipynb") or notebook_path.parts[-2] == "examples") ] snap_before_exec = list(Path(NOTEBOOKS_PATH).iterdir()) @@ -150,8 +161,8 @@ def check_notebook_output(notebook_path, env="python3", ignore_whitespace=False) report = check_notebook_output(notebook_path, env=args.env, ignore_whitespace=args.ignore_whitespace) reports.append(report) else: - # predownload datasets/vectorizers to prevent parallel download - inject_shared_download() + # install packages and predownload datasets/vectorizers to prevent parallel download + init(notebook_paths) with multiprocess.Pool(num_proc) as pool: reports = pool.map(partial(check_notebook_output, 
env=args.env, ignore_whitespace=args.ignore_whitespace), notebook_paths) diff --git a/docs/source/walkthrough.rst b/docs/source/walkthrough.rst index fa346a61..cc573c28 100644 --- a/docs/source/walkthrough.rst +++ b/docs/source/walkthrough.rst @@ -74,24 +74,22 @@ Converting a dataset from 🤗 datasets into Podium requires some work from your .. code-block:: python >>> import os - >>> from contextlib import redirect_stdout >>> from pprint import pprint >>> >>> from datasets import load_dataset >>> - >>> # Silence download logs - >>> with redirect_stdout(open(os.devnull, "w")): - >>> # Loading a huggingface dataset returns an instance of DatasetDict - >>> # which contains the dataset splits (usually: train, valid, test) - >>> imdb = load_dataset('imdb') + >>> # Loading a huggingface dataset returns an instance of DatasetDict + >>> # which contains the dataset splits (usually: train, valid, test) + >>> imdb = load_dataset('imdb') >>> >>> print(imdb.keys()) dict_keys(['train', 'test', 'unsupervised']) >>> >>> # Each dataset has a set of features which need to be mapped >>> # to Podium Fields. - >>> print(imdb['train'].features) - {'text': Value(dtype='string', id=None), 'label': ClassLabel(num_classes=2, names=['neg', 'pos'], names_file=None, id=None)} + >>> pprint(imdb['train'].features) + {'label': ClassLabel(num_classes=2, names=['neg', 'pos'], names_file=None, id=None), + 'text': Value(dtype='string', id=None)} As is the case with loading your custom dataset, ``features`` of 🤗 datasets need to be mapped to Podium Fields in order to direct the data flow for preprocessing.