From 51eb9e7e85776566a501e4a63da68e2fc10b7822 Mon Sep 17 00:00:00 2001 From: mariosasko Date: Sat, 27 Mar 2021 13:48:13 +0100 Subject: [PATCH 01/15] Fix references in colab --- docs/source/scripts/convert_doc_to_notebooks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/scripts/convert_doc_to_notebooks.py b/docs/source/scripts/convert_doc_to_notebooks.py index 1feb0731..913c406c 100644 --- a/docs/source/scripts/convert_doc_to_notebooks.py +++ b/docs/source/scripts/convert_doc_to_notebooks.py @@ -273,7 +273,7 @@ def convert_math(text): def convert_anchor(text): """ Convert text to an anchor that can be used in the notebook.""" anchor_name = _re_anchor_section.search(text).groups()[0] - return f"" + return f"" ################################### From b1303b7b3f53d9d8c9c0c6b93d7c91adea66cf16 Mon Sep 17 00:00:00 2001 From: mariosasko Date: Wed, 31 Mar 2021 15:57:36 +0200 Subject: [PATCH 02/15] Add check_notebooks script --- .github/workflows/scheduled.yml | 28 ++ README.md | 2 +- docs/source/installation.md | 2 +- docs/source/notebooks/advanced.ipynb | 443 +++++++++++------- docs/source/scripts/check_notebooks.py | 106 +++++ .../scripts/convert_doc_to_notebooks.py | 2 +- 6 files changed, 409 insertions(+), 174 deletions(-) create mode 100644 .github/workflows/scheduled.yml create mode 100644 docs/source/scripts/check_notebooks.py diff --git a/.github/workflows/scheduled.yml b/.github/workflows/scheduled.yml new file mode 100644 index 00000000..9f394b8c --- /dev/null +++ b/.github/workflows/scheduled.yml @@ -0,0 +1,28 @@ +name: Scheduled jobs + +on: + repository_dispatch: + schedule: + - cron: "0 0 * * *" + +jobs: + check_notebooks: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: [3.6] + + steps: + - uses: actions/checkout@v2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip 
install --upgrade pip + pip install + - name: Execute notebooks + run: | + cd docs/source/scripts + python check_notebooks.py --num_proc auto diff --git a/README.md b/README.md index 84ea9eef..e7489d16 100644 --- a/README.md +++ b/README.md @@ -207,7 +207,7 @@ Example({ }) ``` -For a more interactive introduction, check out the quickstart on Google Colab: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/takelab/podium/blob/master/docs/source/notebooks/quickstart.ipynb) +For a more interactive introduction, check out the quickstart on Google Colab: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/TakeLab/podium/blob/master/docs/source/notebooks/quickstart.ipynb) More complex examples can be found in our [examples folder](./examples). diff --git a/docs/source/installation.md b/docs/source/installation.md index 7c67d2fb..bcc7ab55 100644 --- a/docs/source/installation.md +++ b/docs/source/installation.md @@ -23,5 +23,5 @@ Coming soon! ## Installing from source To install from source via terminal: -1. Clone the repository: `git clone git@github.com:takelab/podium.git && cd podium` +1. Clone the repository: `git clone git@github.com:TakeLab/podium.git && cd podium` 2. 
Install podium: `pip install .` diff --git a/docs/source/notebooks/advanced.ipynb b/docs/source/notebooks/advanced.ipynb index c9f7d874..73484cb5 100644 --- a/docs/source/notebooks/advanced.ipynb +++ b/docs/source/notebooks/advanced.ipynb @@ -2,14 +2,117 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Collecting git+https://github.com/TakeLab/podium.git\n", + " Cloning https://github.com/TakeLab/podium.git to c:\\users\\mario\\appdata\\local\\temp\\pip-req-build-ixj3hts4\n", + "Requirement already satisfied: dill in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from podium==1.1.1) (0.3.3)\n", + "Requirement already satisfied: nltk>=3.0 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from podium==1.1.1) (3.5)\n", + "Requirement already satisfied: paramiko in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from podium==1.1.1) (2.7.2)\n", + "Requirement already satisfied: requests in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from podium==1.1.1) (2.24.0)\n", + "Requirement already satisfied: scikit-learn in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from podium==1.1.1) (0.23.2)\n", + "Requirement already satisfied: tqdm in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from podium==1.1.1) (4.49.0)\n", + "Requirement already satisfied: numpy<=1.19 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from podium==1.1.1) (1.19.0)\n", + "Requirement already satisfied: pandas<1.2.0 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from podium==1.1.1) (1.1.2)\n", + "Requirement already satisfied: scipy<1.6.0 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from podium==1.1.1) (1.5.2)\n", + "Requirement already satisfied: 
dataclasses in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from podium==1.1.1) (0.7)\n", + "Requirement already satisfied: regex in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from nltk>=3.0->podium==1.1.1) (2020.11.13)\n", + "Requirement already satisfied: joblib in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from nltk>=3.0->podium==1.1.1) (0.16.0)\n", + "Requirement already satisfied: click in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from nltk>=3.0->podium==1.1.1) (7.1.2)\n", + "Requirement already satisfied: pytz>=2017.2 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from pandas<1.2.0->podium==1.1.1) (2020.1)\n", + "Requirement already satisfied: python-dateutil>=2.7.3 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from pandas<1.2.0->podium==1.1.1) (2.8.1)\n", + "Requirement already satisfied: six>=1.5 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from python-dateutil>=2.7.3->pandas<1.2.0->podium==1.1.1) (1.15.0)\n", + "Requirement already satisfied: pynacl>=1.0.1 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from paramiko->podium==1.1.1) (1.4.0)\n", + "Requirement already satisfied: bcrypt>=3.1.3 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from paramiko->podium==1.1.1) (3.2.0)\n", + "Requirement already satisfied: cryptography>=2.5 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from paramiko->podium==1.1.1) (3.1.1)\n", + "Requirement already satisfied: cffi>=1.1 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from bcrypt>=3.1.3->paramiko->podium==1.1.1) (1.14.3)\n", + "Requirement already satisfied: pycparser in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from cffi>=1.1->bcrypt>=3.1.3->paramiko->podium==1.1.1) (2.20)\n", + "Requirement already satisfied: 
urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from requests->podium==1.1.1) (1.25.10)\n", + "Requirement already satisfied: idna<3,>=2.5 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from requests->podium==1.1.1) (2.10)\n", + "Requirement already satisfied: certifi>=2017.4.17 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from requests->podium==1.1.1) (2020.6.20)\n", + "Requirement already satisfied: chardet<4,>=3.0.2 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from requests->podium==1.1.1) (3.0.4)\n", + "Requirement already satisfied: threadpoolctl>=2.0.0 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from scikit-learn->podium==1.1.1) (2.1.0)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " Running command git clone -q https://github.com/TakeLab/podium.git 'C:\\Users\\Mario\\AppData\\Local\\Temp\\pip-req-build-ixj3hts4'\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: transformers in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (4.4.0.dev0)\n", + "Requirement already satisfied: spacy in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (2.2.3)\n", + "Requirement already satisfied: requests<3.0.0,>=2.13.0 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy) (2.24.0)\n", + "Requirement already satisfied: plac<1.2.0,>=0.9.6 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy) (0.9.6)\n", + "Requirement already satisfied: wasabi<1.1.0,>=0.4.0 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy) (0.8.2)\n", + "Requirement already satisfied: numpy>=1.15.0 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy) (1.19.0)\n", + "Requirement already satisfied: 
blis<0.5.0,>=0.4.0 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy) (0.4.1)\n", + "Requirement already satisfied: cymem<2.1.0,>=2.0.2 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy) (2.0.5)\n", + "Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy) (0.28.0)\n", + "Requirement already satisfied: thinc<7.4.0,>=7.3.0 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy) (7.3.1)\n", + "Requirement already satisfied: preshed<3.1.0,>=3.0.2 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy) (3.0.5)\n", + "Requirement already satisfied: catalogue<1.1.0,>=0.0.7 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy) (1.0.0)\n", + "Requirement already satisfied: setuptools in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy) (52.0.0.post20210125)\n", + "Requirement already satisfied: srsly<1.1.0,>=0.1.0 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy) (1.0.5)\n", + "Requirement already satisfied: importlib-metadata>=0.20 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from catalogue<1.1.0,>=0.0.7->spacy) (2.0.0)\n", + "Requirement already satisfied: zipp>=0.5 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from importlib-metadata>=0.20->catalogue<1.1.0,>=0.0.7->spacy) (3.2.0)\n", + "Requirement already satisfied: idna<3,>=2.5 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from requests<3.0.0,>=2.13.0->spacy) (2.10)\n", + "Requirement already satisfied: certifi>=2017.4.17 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from requests<3.0.0,>=2.13.0->spacy) (2020.6.20)\n", + "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in 
c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from requests<3.0.0,>=2.13.0->spacy) (1.25.10)\n", + "Requirement already satisfied: chardet<4,>=3.0.2 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from requests<3.0.0,>=2.13.0->spacy) (3.0.4)\n", + "Requirement already satisfied: tqdm<5.0.0,>=4.10.0 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from thinc<7.4.0,>=7.3.0->spacy) (4.49.0)\n", + "Requirement already satisfied: packaging in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from transformers) (20.4)\n", + "Requirement already satisfied: filelock in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from transformers) (3.0.12)\n", + "Requirement already satisfied: regex!=2019.12.17 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from transformers) (2020.11.13)\n", + "Requirement already satisfied: sacremoses in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from transformers) (0.0.43)\n", + "Requirement already satisfied: dataclasses in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from transformers) (0.7)\n", + "Requirement already satisfied: tokenizers<0.11,>=0.10.1 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from transformers) (0.10.1)\n", + "Requirement already satisfied: six in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from packaging->transformers) (1.15.0)\n", + "Requirement already satisfied: pyparsing>=2.0.2 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from packaging->transformers) (2.4.7)\n", + "Requirement already satisfied: click in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from sacremoses->transformers) (7.1.2)\n", + "Requirement already satisfied: joblib in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from sacremoses->transformers) (0.16.0)\n", + 
"Collecting en_core_web_sm==2.2.5\n", + " Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz (12.0 MB)\n", + "Requirement already satisfied: spacy>=2.2.2 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from en_core_web_sm==2.2.5) (2.2.3)\n", + "Requirement already satisfied: cymem<2.1.0,>=2.0.2 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (2.0.5)\n", + "Requirement already satisfied: setuptools in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (52.0.0.post20210125)\n", + "Requirement already satisfied: srsly<1.1.0,>=0.1.0 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (1.0.5)\n", + "Requirement already satisfied: catalogue<1.1.0,>=0.0.7 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (1.0.0)\n", + "Requirement already satisfied: preshed<3.1.0,>=3.0.2 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (3.0.5)\n", + "Requirement already satisfied: thinc<7.4.0,>=7.3.0 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (7.3.1)\n", + "Requirement already satisfied: blis<0.5.0,>=0.4.0 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (0.4.1)\n", + "Requirement already satisfied: wasabi<1.1.0,>=0.4.0 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (0.8.2)\n", + "Requirement already satisfied: numpy>=1.15.0 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (1.19.0)\n", + "Requirement already satisfied: 
murmurhash<1.1.0,>=0.28.0 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (0.28.0)\n", + "Requirement already satisfied: plac<1.2.0,>=0.9.6 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (0.9.6)\n", + "Requirement already satisfied: requests<3.0.0,>=2.13.0 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (2.24.0)\n", + "Requirement already satisfied: importlib-metadata>=0.20 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from catalogue<1.1.0,>=0.0.7->spacy>=2.2.2->en_core_web_sm==2.2.5) (2.0.0)\n", + "Requirement already satisfied: zipp>=0.5 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from importlib-metadata>=0.20->catalogue<1.1.0,>=0.0.7->spacy>=2.2.2->en_core_web_sm==2.2.5) (3.2.0)\n", + "Requirement already satisfied: idna<3,>=2.5 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from requests<3.0.0,>=2.13.0->spacy>=2.2.2->en_core_web_sm==2.2.5) (2.10)\n", + "Requirement already satisfied: chardet<4,>=3.0.2 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from requests<3.0.0,>=2.13.0->spacy>=2.2.2->en_core_web_sm==2.2.5) (3.0.4)\n", + "Requirement already satisfied: certifi>=2017.4.17 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from requests<3.0.0,>=2.13.0->spacy>=2.2.2->en_core_web_sm==2.2.5) (2020.6.20)\n", + "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from requests<3.0.0,>=2.13.0->spacy>=2.2.2->en_core_web_sm==2.2.5) (1.25.10)\n", + "Requirement already satisfied: tqdm<5.0.0,>=4.10.0 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from thinc<7.4.0,>=7.3.0->spacy>=2.2.2->en_core_web_sm==2.2.5) (4.49.0)\n", + "[+] Download and 
installation successful\n", + "You can now load the model via spacy.load('en_core_web_sm')\n" + ] + } + ], "source": [ "# Podium installation\n", - "! pip install podium-nlp\n", + "# ! pip install podium-nlp\n", "# To install from source instead of the last release, comment the command above and uncomment the following one.\n", - "# ! pip install git+https://github.com/takelab/podium\n", + "! pip install git+https://github.com/TakeLab/podium.git\n", "\n", "# Additional dependencies required to run this notebook:\n", "! pip install transformers spacy\n", @@ -34,7 +137,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -67,19 +170,16 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "None ['A', 'slick', ',', 'engrossing', 'melodrama', '.']\n", - "None positive" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "None ['A', 'slick', ',', 'engrossing', 'melodrama', '.']\n", + "None positive\n" + ] } ], "source": [ @@ -98,18 +198,15 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "('A slick , engrossing melodrama .', ['A', 'slick', ',', 'engrossing', 'melodrama', '.'])" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "('A slick , engrossing melodrama .', ['A', 'slick', ',', 'engrossing', 'melodrama', '.'])\n" + ] } ], "source": [ @@ -169,7 +266,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -187,7 +284,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -206,7 +303,7 @@ }, 
{ "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -242,7 +339,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -274,7 +371,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -297,18 +394,15 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "('a slick , engrossing melodrama .', ['a', 'slick', 'engrossing', 'melodrama'])" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "('a slick , engrossing melodrama .', ['a', 'slick', 'engrossing', 'melodrama'])\n" + ] } ], "source": [ @@ -367,18 +461,15 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] } ], "source": [ @@ -403,19 +494,16 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "\n", - "True" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "True\n" + ] } ], "source": [ @@ -435,18 +523,15 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "(None, ['', 'A', 'slick', ',', 'engrossing', 'melodrama', '.'])" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "(None, ['', 'A', 
'slick', ',', 'engrossing', 'melodrama', '.'])\n" + ] } ], "source": [ @@ -498,18 +583,15 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "(None, ['a', 'slick', ',', 'eng', '##ross', '##ing', 'mel', '##od', '##rama', '.'])" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "(None, ['a', 'slick', ',', 'eng', '##ross', '##ing', 'mel', '##od', '##rama', '.'])\n" + ] } ], "source": [ @@ -545,19 +627,16 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "(None, ['A', 'slick', ',', 'engrossing', 'melodrama', '.'])\n", - "(None, ['A', ' ', 's', 'l', 'i', 'c', 'k', ' ', ',', ' ', 'e', 'n', 'g', 'r', 'o', 's', 's', 'i', 'n', 'g', ' ', 'm', 'e', 'l', 'o', 'd', 'r', 'a', 'm', 'a', ' ', '.'])" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "(None, ['A', 'slick', ',', 'engrossing', 'melodrama', '.'])\n", + "(None, ['A', ' ', 's', 'l', 'i', 'c', 'k', ' ', ',', ' ', 'e', 'n', 'g', 'r', 'o', 's', 's', 'i', 'n', 'g', ' ', 'm', 'e', 'l', 'o', 'd', 'r', 'a', 'm', 'a', ' ', '.'])\n" + ] } ], "source": [ @@ -597,19 +676,16 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "(None, ['A', 'slick', ',', 'engrossing', 'melodrama', '.'])\n", - "(None, ['DET', 'ADJ', 'PUNCT', 'VERB', 'NOUN', 'PUNCT'])" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "(None, ['A', 'slick', ',', 'engrossing', 'melodrama', '.'])\n", + "(None, ['DET', 'ADJ', 'PUNCT', 'VERB', 'NOUN', 'PUNCT'])\n" + ] } ], "source": [ @@ -673,18 +749,15 @@ 
}, { "cell_type": "code", - "execution_count": null, + "execution_count": 18, "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "0.5 0.3 0.2" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "0.5 0.3 0.2\n" + ] } ], "source": [ @@ -704,20 +777,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "{'negative': 0.47803468208092487, 'positive': 0.5219653179190752}\n", - "{'negative': 0.48458574181117536, 'positive': 0.5154142581888247}\n", - "{'negative': 0.46965317919075145, 'positive': 0.5303468208092486}" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "{'negative': 0.47803468208092487, 'positive': 0.5219653179190752}\n", + "{'negative': 0.48458574181117536, 'positive': 0.5154142581888247}\n", + "{'negative': 0.46965317919075145, 'positive': 0.5303468208092486}\n" + ] } ], "source": [ @@ -742,18 +812,15 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 20, "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "0.5 0.3 0.2" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "0.5 0.3 0.2\n" + ] } ], "source": [ @@ -771,20 +838,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 21, "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "{'negative': 0.47832369942196534, 'positive': 0.5216763005780347}\n", - "{'negative': 0.47832369942196534, 'positive': 0.5216763005780347}\n", - "{'negative': 0.47832369942196534, 'positive': 0.5216763005780347}" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + 
"{'negative': 0.47832369942196534, 'positive': 0.5216763005780347}\n", + "{'negative': 0.47832369942196534, 'positive': 0.5216763005780347}\n", + "{'negative': 0.47832369942196534, 'positive': 0.5216763005780347}\n" + ] } ], "source": [ @@ -822,20 +886,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 22, "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "['positive', 'negative']\n", - "['positive', 'negative']\n", - "31920 = 25000 + 6920" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|█████████████████████████████████████████████████████████████████████████████| 84.1M/84.1M [00:27<00:00, 3.04MB/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['positive', 'negative']\n", + "['positive', 'negative']\n", + "31920 = 25000 + 6920\n" + ] } ], "source": [ @@ -888,7 +956,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 23, "metadata": {}, "outputs": [], "source": [ @@ -919,19 +987,16 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 24, "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "For Iterator, padding = 148141 out of 281696 = 52.588961149608096%\n", - "For BucketIterator, padding = 2125 out of 135680 = 1.5661851415094339%" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "For Iterator, padding = 145749 out of 279304 = 52.18%\n", + "For BucketIterator, padding = 2125 out of 135680 = 1.57%\n" + ] } ], "source": [ @@ -1011,7 +1076,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 25, "metadata": {}, "outputs": [ { @@ -1070,10 +1135,11 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 26, "metadata": {}, "outputs": [ { +<<<<<<< HEAD "data": { "text/plain": [ 
"Example({\n", @@ -1085,6 +1151,16 @@ "execution_count": null, "metadata": {}, "output_type": "execute_result" +======= + "name": "stdout", + "output_type": "stream", + "text": [ + "Example({\n", + " text: (None, ['A', 'slick', ',', 'engrossing', 'melodrama', '.']),\n", + " label: (None, 'positive')\n", + "})\n" + ] +>>>>>>> Add check_notebooks script } ], "source": [ @@ -1116,10 +1192,11 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 27, "metadata": {}, "outputs": [ { +<<<<<<< HEAD "data": { "text/plain": [ "[[ 14 1144 9 2955 8 27 4 2956 3752 10 149 62 0 64\n", @@ -1130,6 +1207,15 @@ "execution_count": null, "metadata": {}, "output_type": "execute_result" +======= + "name": "stdout", + "output_type": "stream", + "text": [ + "[[ 14 1057 10 2580 8 28 4 3334 3335 9 154 68 0 67\n", + " 5 11 81 9 274 8 83 6 4683 74 2901 38 1410 2581\n", + " 3 0 2102 0 49 870 0 2]]\n" + ] +>>>>>>> Add check_notebooks script } ], "source": [ @@ -1156,19 +1242,16 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 28, "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "True\n", - "True" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "True\n", + "True\n" + ] } ], "source": [ @@ -1188,7 +1271,25 @@ ] } ], - "metadata": {}, + "metadata": { + "kernelspec": { + "display_name": "test-podium", + "language": "python", + "name": "test-podium" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.12" + } + }, "nbformat": 4, "nbformat_minor": 4 } diff --git a/docs/source/scripts/check_notebooks.py b/docs/source/scripts/check_notebooks.py new file mode 100644 index 00000000..55833d23 --- /dev/null +++ 
b/docs/source/scripts/check_notebooks.py @@ -0,0 +1,106 @@ +import argparse +import multiprocessing +import textwrap +from functools import partial +from pathlib import Path + +import nbformat +from nbconvert.preprocessors import CellExecutionError, ExecutePreprocessor + + +NOTEBOOKS_PATH = "../notebooks" +INSTALL_RELEASE_VERSION_COMMAND = "! pip install podium-nlp\n" +INSTALL_SOURCE_VERSION_COMMAND = "# ! pip install git+https://github.com/TakeLab/podium.git\n" + + +def replace_install_release_with_source(nb): + cell = nb.cells[0] + # sanity check + assert cell["cell_type"] == "code" + assert isinstance(cell.source, list) + + irv_idx = cell["source"].index(INSTALL_RELEASE_VERSION_COMMAND) + cell["source"][irv_idx] = "# " + cell["source"][irv_idx] + + isv_idx = cell["source"].index(INSTALL_SOURCE_VERSION_COMMAND) + cell["source"][isv_idx] = cell["source"][isv_idx][cell["source"][isv_idx].index("!"):] + + +def check_notebook_output(notebook_path, env="python3"): + with open(notebook_path, encoding="utf-8") as f: + nb = nbformat.read(f, as_version=4) + + original_nb = nb.copy() + ep = ExecutePreprocessor(kernel_name=env) + new_nb = nb + replace_install_release_with_source(new_nb) + try: + ep.preprocess(new_nb, {"metadata": {"path": str(Path(notebook_path).parent)}}) + except CellExecutionError: + print(f"Error happened while executing the notebook {notebook_path.name}") + raise + + report = [] + assert len(original_nb["cells"]) == len(new_nb["cells"]) + for i, (original_cell, new_cell) in enumerate(zip(original_nb["cells"], new_nb["cell"])): + # consider only cells with code + if original_cell["cell_type"] != "code": + continue + + # sanity check + assert isinstance(original_cell["source"], list) + # skip cells with commands + for line in original_cell["source"]: + if line.strip().startswith(("!", "%")): + continue + + # sanity check + assert isinstance(original_cell["outputs"]["data"]["text/plain"], list) + original_cell_stdout = 
"".join(original_cell["outputs"]["data"]["text/plain"]) + + new_cell_stdout = "".join([ + new_cell_output["text"] + for new_cell_output in new_cell["outputs"] if new_cell_output["name"] == "stdout" + ]) + + if original_cell_stdout != new_cell_stdout: + report.append(i, original_cell_stdout, new_cell_stdout) + + return notebook_path.name, report + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--env", help="kernel that executes the notebook") + parser.add_argument("--num_proc", help="number of processes for parallel execution") + args = parser.parse_args() + + if args.num_proc is None: + num_proc = 1 + elif args.num_proc == "auto": + num_proc = multiprocessing.cpu_count() + else: + num_proc = int(args.num_proc) + + notebook_paths = [notebook_path for notebook_path in Path(NOTEBOOKS_PATH).rglob("*.ipynb")] + num_proc = min(min(num_proc, multiprocessing.cpu_count()), len(notebook_paths)) + if num_proc == 1: + reports = [] + for notebook_path in notebook_paths: + report = check_notebook_output(notebook_path, env=args.env) + reports.append(report) + else: + with multiprocessing.Pool(num_proc) as pool: + reports = pool.map(partial(check_notebook_output, env=args.env), notebook_paths) + + if any(report for _, report in reports): + reports_str = "\n\n".join([ + f"In notebook {notebook}:\n" + textwrap.indent( + "\n".join( + f"Original output:\n{original_output}\nAfter execution:\n{new_output}" + for original_output, new_output in report), " " * 4) + for notebook, report in reports + ]) + raise Exception( + "❌❌ Found mismatches in the outputs of the notebooks:\n\n" + reports_str + ) diff --git a/docs/source/scripts/convert_doc_to_notebooks.py b/docs/source/scripts/convert_doc_to_notebooks.py index 913c406c..dd870b94 100644 --- a/docs/source/scripts/convert_doc_to_notebooks.py +++ b/docs/source/scripts/convert_doc_to_notebooks.py @@ -428,7 +428,7 @@ def rm_first_line(text): INSTALL_CODE = """# Podium installation ! 
pip install podium-nlp # To install from source instead of the last release, comment the command above and uncomment the following one. -# ! pip install git+https://github.com/takelab/podium +# ! pip install git+https://github.com/TakeLab/podium.git """ ADDITIONAL_DEPS = { From 977af00dce88699a4df1f3e075b0bb8eae10ce5c Mon Sep 17 00:00:00 2001 From: mariosasko Date: Wed, 31 Mar 2021 19:49:45 +0200 Subject: [PATCH 03/15] Fixes --- docs/source/notebooks/advanced.ipynb | 210 +++++----------------- docs/source/notebooks/preprocessing.ipynb | 2 +- docs/source/notebooks/walkthrough.ipynb | 4 +- docs/source/scripts/check_notebooks.py | 26 +-- 4 files changed, 61 insertions(+), 181 deletions(-) diff --git a/docs/source/notebooks/advanced.ipynb b/docs/source/notebooks/advanced.ipynb index 73484cb5..9a0b0710 100644 --- a/docs/source/notebooks/advanced.ipynb +++ b/docs/source/notebooks/advanced.ipynb @@ -2,117 +2,14 @@ "cells": [ { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Collecting git+https://github.com/TakeLab/podium.git\n", - " Cloning https://github.com/TakeLab/podium.git to c:\\users\\mario\\appdata\\local\\temp\\pip-req-build-ixj3hts4\n", - "Requirement already satisfied: dill in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from podium==1.1.1) (0.3.3)\n", - "Requirement already satisfied: nltk>=3.0 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from podium==1.1.1) (3.5)\n", - "Requirement already satisfied: paramiko in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from podium==1.1.1) (2.7.2)\n", - "Requirement already satisfied: requests in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from podium==1.1.1) (2.24.0)\n", - "Requirement already satisfied: scikit-learn in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from podium==1.1.1) 
(0.23.2)\n", - "Requirement already satisfied: tqdm in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from podium==1.1.1) (4.49.0)\n", - "Requirement already satisfied: numpy<=1.19 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from podium==1.1.1) (1.19.0)\n", - "Requirement already satisfied: pandas<1.2.0 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from podium==1.1.1) (1.1.2)\n", - "Requirement already satisfied: scipy<1.6.0 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from podium==1.1.1) (1.5.2)\n", - "Requirement already satisfied: dataclasses in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from podium==1.1.1) (0.7)\n", - "Requirement already satisfied: regex in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from nltk>=3.0->podium==1.1.1) (2020.11.13)\n", - "Requirement already satisfied: joblib in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from nltk>=3.0->podium==1.1.1) (0.16.0)\n", - "Requirement already satisfied: click in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from nltk>=3.0->podium==1.1.1) (7.1.2)\n", - "Requirement already satisfied: pytz>=2017.2 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from pandas<1.2.0->podium==1.1.1) (2020.1)\n", - "Requirement already satisfied: python-dateutil>=2.7.3 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from pandas<1.2.0->podium==1.1.1) (2.8.1)\n", - "Requirement already satisfied: six>=1.5 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from python-dateutil>=2.7.3->pandas<1.2.0->podium==1.1.1) (1.15.0)\n", - "Requirement already satisfied: pynacl>=1.0.1 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from paramiko->podium==1.1.1) (1.4.0)\n", - "Requirement already satisfied: bcrypt>=3.1.3 in 
c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from paramiko->podium==1.1.1) (3.2.0)\n", - "Requirement already satisfied: cryptography>=2.5 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from paramiko->podium==1.1.1) (3.1.1)\n", - "Requirement already satisfied: cffi>=1.1 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from bcrypt>=3.1.3->paramiko->podium==1.1.1) (1.14.3)\n", - "Requirement already satisfied: pycparser in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from cffi>=1.1->bcrypt>=3.1.3->paramiko->podium==1.1.1) (2.20)\n", - "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from requests->podium==1.1.1) (1.25.10)\n", - "Requirement already satisfied: idna<3,>=2.5 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from requests->podium==1.1.1) (2.10)\n", - "Requirement already satisfied: certifi>=2017.4.17 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from requests->podium==1.1.1) (2020.6.20)\n", - "Requirement already satisfied: chardet<4,>=3.0.2 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from requests->podium==1.1.1) (3.0.4)\n", - "Requirement already satisfied: threadpoolctl>=2.0.0 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from scikit-learn->podium==1.1.1) (2.1.0)\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - " Running command git clone -q https://github.com/TakeLab/podium.git 'C:\\Users\\Mario\\AppData\\Local\\Temp\\pip-req-build-ixj3hts4'\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Requirement already satisfied: transformers in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (4.4.0.dev0)\n", - "Requirement already satisfied: spacy in 
c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (2.2.3)\n", - "Requirement already satisfied: requests<3.0.0,>=2.13.0 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy) (2.24.0)\n", - "Requirement already satisfied: plac<1.2.0,>=0.9.6 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy) (0.9.6)\n", - "Requirement already satisfied: wasabi<1.1.0,>=0.4.0 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy) (0.8.2)\n", - "Requirement already satisfied: numpy>=1.15.0 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy) (1.19.0)\n", - "Requirement already satisfied: blis<0.5.0,>=0.4.0 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy) (0.4.1)\n", - "Requirement already satisfied: cymem<2.1.0,>=2.0.2 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy) (2.0.5)\n", - "Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy) (0.28.0)\n", - "Requirement already satisfied: thinc<7.4.0,>=7.3.0 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy) (7.3.1)\n", - "Requirement already satisfied: preshed<3.1.0,>=3.0.2 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy) (3.0.5)\n", - "Requirement already satisfied: catalogue<1.1.0,>=0.0.7 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy) (1.0.0)\n", - "Requirement already satisfied: setuptools in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy) (52.0.0.post20210125)\n", - "Requirement already satisfied: srsly<1.1.0,>=0.1.0 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy) (1.0.5)\n", - "Requirement already satisfied: importlib-metadata>=0.20 in 
c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from catalogue<1.1.0,>=0.0.7->spacy) (2.0.0)\n", - "Requirement already satisfied: zipp>=0.5 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from importlib-metadata>=0.20->catalogue<1.1.0,>=0.0.7->spacy) (3.2.0)\n", - "Requirement already satisfied: idna<3,>=2.5 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from requests<3.0.0,>=2.13.0->spacy) (2.10)\n", - "Requirement already satisfied: certifi>=2017.4.17 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from requests<3.0.0,>=2.13.0->spacy) (2020.6.20)\n", - "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from requests<3.0.0,>=2.13.0->spacy) (1.25.10)\n", - "Requirement already satisfied: chardet<4,>=3.0.2 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from requests<3.0.0,>=2.13.0->spacy) (3.0.4)\n", - "Requirement already satisfied: tqdm<5.0.0,>=4.10.0 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from thinc<7.4.0,>=7.3.0->spacy) (4.49.0)\n", - "Requirement already satisfied: packaging in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from transformers) (20.4)\n", - "Requirement already satisfied: filelock in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from transformers) (3.0.12)\n", - "Requirement already satisfied: regex!=2019.12.17 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from transformers) (2020.11.13)\n", - "Requirement already satisfied: sacremoses in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from transformers) (0.0.43)\n", - "Requirement already satisfied: dataclasses in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from transformers) (0.7)\n", - "Requirement already satisfied: tokenizers<0.11,>=0.10.1 in 
c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from transformers) (0.10.1)\n", - "Requirement already satisfied: six in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from packaging->transformers) (1.15.0)\n", - "Requirement already satisfied: pyparsing>=2.0.2 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from packaging->transformers) (2.4.7)\n", - "Requirement already satisfied: click in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from sacremoses->transformers) (7.1.2)\n", - "Requirement already satisfied: joblib in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from sacremoses->transformers) (0.16.0)\n", - "Collecting en_core_web_sm==2.2.5\n", - " Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz (12.0 MB)\n", - "Requirement already satisfied: spacy>=2.2.2 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from en_core_web_sm==2.2.5) (2.2.3)\n", - "Requirement already satisfied: cymem<2.1.0,>=2.0.2 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (2.0.5)\n", - "Requirement already satisfied: setuptools in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (52.0.0.post20210125)\n", - "Requirement already satisfied: srsly<1.1.0,>=0.1.0 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (1.0.5)\n", - "Requirement already satisfied: catalogue<1.1.0,>=0.0.7 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (1.0.0)\n", - "Requirement already satisfied: preshed<3.1.0,>=3.0.2 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (3.0.5)\n", - "Requirement already 
satisfied: thinc<7.4.0,>=7.3.0 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (7.3.1)\n", - "Requirement already satisfied: blis<0.5.0,>=0.4.0 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (0.4.1)\n", - "Requirement already satisfied: wasabi<1.1.0,>=0.4.0 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (0.8.2)\n", - "Requirement already satisfied: numpy>=1.15.0 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (1.19.0)\n", - "Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (0.28.0)\n", - "Requirement already satisfied: plac<1.2.0,>=0.9.6 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (0.9.6)\n", - "Requirement already satisfied: requests<3.0.0,>=2.13.0 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (2.24.0)\n", - "Requirement already satisfied: importlib-metadata>=0.20 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from catalogue<1.1.0,>=0.0.7->spacy>=2.2.2->en_core_web_sm==2.2.5) (2.0.0)\n", - "Requirement already satisfied: zipp>=0.5 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from importlib-metadata>=0.20->catalogue<1.1.0,>=0.0.7->spacy>=2.2.2->en_core_web_sm==2.2.5) (3.2.0)\n", - "Requirement already satisfied: idna<3,>=2.5 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from requests<3.0.0,>=2.13.0->spacy>=2.2.2->en_core_web_sm==2.2.5) (2.10)\n", - "Requirement already satisfied: chardet<4,>=3.0.2 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from 
requests<3.0.0,>=2.13.0->spacy>=2.2.2->en_core_web_sm==2.2.5) (3.0.4)\n", - "Requirement already satisfied: certifi>=2017.4.17 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from requests<3.0.0,>=2.13.0->spacy>=2.2.2->en_core_web_sm==2.2.5) (2020.6.20)\n", - "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from requests<3.0.0,>=2.13.0->spacy>=2.2.2->en_core_web_sm==2.2.5) (1.25.10)\n", - "Requirement already satisfied: tqdm<5.0.0,>=4.10.0 in c:\\users\\mario\\anaconda3\\envs\\test-podium\\lib\\site-packages (from thinc<7.4.0,>=7.3.0->spacy>=2.2.2->en_core_web_sm==2.2.5) (4.49.0)\n", - "[+] Download and installation successful\n", - "You can now load the model via spacy.load('en_core_web_sm')\n" - ] - } - ], + "outputs": [], "source": [ "# Podium installation\n", - "# ! pip install podium-nlp\n", + "! pip install podium-nlp\n", "# To install from source instead of the last release, comment the command above and uncomment the following one.\n", - "! pip install git+https://github.com/TakeLab/podium.git\n", + "# ! pip install git+https://github.com/TakeLab/podium.git\n", "\n", "# Additional dependencies required to run this notebook:\n", "! 
pip install transformers spacy\n", @@ -137,21 +34,18 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 1, "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "Example({\n", - " text: (None, ['A', 'slick', ',', 'engrossing', 'melodrama', '.']),\n", - " label: (None, 'positive')\n", - "})" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "Example({\n", + " text: (None, ['A', 'slick', ',', 'engrossing', 'melodrama', '.']),\n", + " label: (None, 'positive')\n", + "})\n" + ] } ], "source": [ @@ -170,7 +64,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -198,7 +92,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -266,7 +160,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -284,7 +178,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -303,7 +197,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -339,7 +233,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -371,7 +265,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -394,7 +288,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -431,7 +325,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "" + "" ] }, { @@ -461,7 +355,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -494,7 +388,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 11, 
"metadata": {}, "outputs": [ { @@ -523,7 +417,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -583,13 +477,28 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 13, "metadata": {}, "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "66267ed5720f410faf47766dea3dbad6", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, { "name": "stdout", "output_type": "stream", "text": [ + "\n", "(None, ['a', 'slick', ',', 'eng', '##ross', '##ing', 'mel', '##od', '##rama', '.'])\n" ] } @@ -889,13 +798,6 @@ "execution_count": 22, "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|█████████████████████████████████████████████████████████████████████████████| 84.1M/84.1M [00:27<00:00, 3.04MB/s]\n" - ] - }, { "name": "stdout", "output_type": "stream", @@ -1139,19 +1041,6 @@ "metadata": {}, "outputs": [ { -<<<<<<< HEAD - "data": { - "text/plain": [ - "Example({\n", - " text: (None, ['A', 'slick', ',', 'engrossing', 'melodrama', '.']),\n", - " label: (None, 'positive')\n", - "})" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" -======= "name": "stdout", "output_type": "stream", "text": [ @@ -1160,7 +1049,6 @@ " label: (None, 'positive')\n", "})\n" ] ->>>>>>> Add check_notebooks script } ], "source": [ @@ -1196,7 +1084,6 @@ "metadata": {}, "outputs": [ { -<<<<<<< HEAD "data": { "text/plain": [ "[[ 14 1144 9 2955 8 27 4 2956 3752 10 149 62 0 64\n", @@ -1207,15 +1094,6 @@ "execution_count": null, "metadata": {}, "output_type": "execute_result" -======= - "name": "stdout", - "output_type": "stream", - "text": [ - "[[ 14 1057 10 2580 8 28 4 3334 3335 9 154 68 0 67\n", - " 5 11 81 9 274 
8 83 6 4683 74 2901 38 1410 2581\n", - " 3 0 2102 0 49 870 0 2]]\n" - ] ->>>>>>> Add check_notebooks script } ], "source": [ @@ -1273,9 +1151,9 @@ ], "metadata": { "kernelspec": { - "display_name": "test-podium", + "display_name": "Python 3", "language": "python", - "name": "test-podium" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -1287,7 +1165,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.12" + "version": "3.7.9" } }, "nbformat": 4, diff --git a/docs/source/notebooks/preprocessing.ipynb b/docs/source/notebooks/preprocessing.ipynb index 53e59961..1ade9a99 100644 --- a/docs/source/notebooks/preprocessing.ipynb +++ b/docs/source/notebooks/preprocessing.ipynb @@ -9,7 +9,7 @@ "# Podium installation\n", "! pip install podium-nlp\n", "# To install from source instead of the last release, comment the command above and uncomment the following one.\n", - "# ! pip install git+https://github.com/takelab/podium\n", + "# ! pip install git+https://github.com/TakeLab/podium.git\n", "\n", "# Additional dependencies required to run this notebook:\n", "! pip install sacremoses clean-text spacy truecase https://github.com/LIAAD/yake/archive/v0.4.2.tar.gz\n", diff --git a/docs/source/notebooks/walkthrough.ipynb b/docs/source/notebooks/walkthrough.ipynb index f78d4d45..d6944d23 100644 --- a/docs/source/notebooks/walkthrough.ipynb +++ b/docs/source/notebooks/walkthrough.ipynb @@ -9,7 +9,7 @@ "# Podium installation\n", "! pip install podium-nlp\n", "# To install from source instead of the last release, comment the command above and uncomment the following one.\n", - "# ! pip install git+https://github.com/takelab/podium\n", + "# ! pip install git+https://github.com/TakeLab/podium.git\n", "\n", "# Additional dependencies required to run this notebook:\n", "! 
pip install datasets spacy\n", @@ -49,7 +49,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "" + "" ] }, { diff --git a/docs/source/scripts/check_notebooks.py b/docs/source/scripts/check_notebooks.py index 55833d23..941037ba 100644 --- a/docs/source/scripts/check_notebooks.py +++ b/docs/source/scripts/check_notebooks.py @@ -1,4 +1,5 @@ import argparse +import copy import multiprocessing import textwrap from functools import partial @@ -14,25 +15,25 @@ def replace_install_release_with_source(nb): - cell = nb.cells[0] + cell = nb["cells"][0] # sanity check assert cell["cell_type"] == "code" - assert isinstance(cell.source, list) + assert isinstance(cell["source"], str) - irv_idx = cell["source"].index(INSTALL_RELEASE_VERSION_COMMAND) - cell["source"][irv_idx] = "# " + cell["source"][irv_idx] + assert INSTALL_RELEASE_VERSION_COMMAND in cell["source"] + cell["source"] = cell["source"].replace(INSTALL_RELEASE_VERSION_COMMAND, "# " + INSTALL_RELEASE_VERSION_COMMAND) - isv_idx = cell["source"].index(INSTALL_SOURCE_VERSION_COMMAND) - cell["source"][isv_idx] = cell["source"][isv_idx][cell["source"][isv_idx].index("!"):] + assert INSTALL_SOURCE_VERSION_COMMAND in cell["source"] + cell["source"] = cell["source"].replace(INSTALL_SOURCE_VERSION_COMMAND, INSTALL_SOURCE_VERSION_COMMAND[2:]) def check_notebook_output(notebook_path, env="python3"): with open(notebook_path, encoding="utf-8") as f: nb = nbformat.read(f, as_version=4) - original_nb = nb.copy() + original_nb = nb ep = ExecutePreprocessor(kernel_name=env) - new_nb = nb + new_nb = copy.deepcopy(nb) replace_install_release_with_source(new_nb) try: ep.preprocess(new_nb, {"metadata": {"path": str(Path(notebook_path).parent)}}) @@ -40,6 +41,7 @@ def check_notebook_output(notebook_path, env="python3"): print(f"Error happened while executing the notebook {notebook_path.name}") raise + print(new_nb) report = [] assert len(original_nb["cells"]) == len(new_nb["cells"]) for i, (original_cell, new_cell) in 
enumerate(zip(original_nb["cells"], new_nb["cell"])): @@ -48,9 +50,9 @@ def check_notebook_output(notebook_path, env="python3"): continue # sanity check - assert isinstance(original_cell["source"], list) + assert isinstance(original_cell["source"], str) # skip cells with commands - for line in original_cell["source"]: + for line in original_cell["source"].splitlines(): if line.strip().startswith(("!", "%")): continue @@ -71,7 +73,7 @@ def check_notebook_output(notebook_path, env="python3"): if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("--env", help="kernel that executes the notebook") + parser.add_argument("--env", default="python3", help="kernel that executes the notebook") parser.add_argument("--num_proc", help="number of processes for parallel execution") args = parser.parse_args() @@ -102,5 +104,5 @@ def check_notebook_output(notebook_path, env="python3"): for notebook, report in reports ]) raise Exception( - "❌❌ Found mismatches in the outputs of the notebooks:\n\n" + reports_str + "❌❌ Mismatches found in the outputs of the notebooks:\n\n" + reports_str ) From 0d03cd093bf16da861650cfd8a4605406f8c3014 Mon Sep 17 00:00:00 2001 From: mariosasko Date: Thu, 1 Apr 2021 02:18:26 +0200 Subject: [PATCH 04/15] Add check notebooks output script --- .github/workflows/scheduled.yml | 11 +- docs/source/advanced.rst | 11 +- docs/source/notebooks/advanced.ipynb | 366 ++++++++++-------- docs/source/notebooks/preprocessing.ipynb | 7 +- docs/source/notebooks/quickstart.ipynb | 8 +- docs/source/notebooks/walkthrough.ipynb | 52 ++- docs/source/preprocessing.rst | 2 +- docs/source/quickstart.rst | 6 +- docs/source/scripts/check_notebooks.py | 91 +++-- .../scripts/convert_doc_to_notebooks.py | 13 +- docs/source/scripts/requirements.txt | 4 + docs/source/walkthrough.rst | 40 +- podium/datasets/dataset.py | 2 +- podium/field.py | 2 +- setup.py | 1 - 15 files changed, 348 insertions(+), 268 deletions(-) create mode 100644 
docs/source/scripts/requirements.txt diff --git a/.github/workflows/scheduled.yml index 9f394b8c..856fa93a 100644 --- a/.github/workflows/scheduled.yml +++ b/.github/workflows/scheduled.yml @@ -1,6 +1,9 @@ name: Scheduled jobs on: + pull_request: + branches: + - master repository_dispatch: schedule: - cron: "0 0 * * *" @@ -11,6 +14,9 @@ jobs: check_notebooks: runs-on: ubuntu-latest strategy: matrix: python-version: [3.6] + defaults: + run: + working-directory: docs/source/scripts steps: - uses: actions/checkout@v2 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v2 with: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | python -m pip install --upgrade pip - pip install + pip install -r requirements.txt - name: Execute notebooks run: | - cd docs/source/scripts - python check_notebooks.py --num_proc auto + python check_notebooks.py --num_proc auto --ignore_whitespace diff --git a/docs/source/advanced.rst index 25a939eb..9572daa5 100644 --- a/docs/source/advanced.rst +++ b/docs/source/advanced.rst @@ -379,7 +379,9 @@ For a simple example, we will take a look at the built-in SST and IMDB datasets: >>> from podium import Field, LabelField, Vocab >>> # Load the datasets >>> imdb_train, imdb_test = IMDB.get_dataset_splits() + >>> imdb_train.finalize_fields() >>> sst_train, sst_dev, sst_test = SST.get_dataset_splits() + >>> sst_train.finalize_fields() >>> >>> # Luckily, both label vocabularies are already equal >>> print(imdb_train.field('label').vocab.itos) @@ -415,7 +417,8 @@ For this reason, usage of :class:`podium.datasets.BucketIterator` is recommended >>> label = LabelField(name='label') >>> fields = {'text': text, 'label': label} >>> - >>> train, valid, test = SST.get_dataset_splits(fields=fields) + >>> sst_train, sst_valid, sst_test = SST.get_dataset_splits(fields=fields) + >>> sst_train.finalize_fields() >>> >>> # Define the iterators and our sort key >>> from podium import Iterator, BucketIterator @@ -423,14 +426,14 @@ For this reason, usage of :class:`podium.datasets.BucketIterator` is
recommended >>> # Use the text Field >>> raw, tokenized = instance.text >>> return len(tokenized) - >>> bucket_iter = BucketIterator(train, batch_size=32, bucket_sort_key=instance_length) + >>> bucket_iter = BucketIterator(sst_train, batch_size=32, bucket_sort_key=instance_length) The ``bucket_sort_key`` function defines how the instances in the dataset should be sorted. The method accepts an instance of the dataset, and should return a value which will be used as a sort key in the ``BucketIterator``. It might be interesting (and surprising) to see how much space (and time) do we earn by bucketing. We will define a naive iterator on the same dataset and measure the total amount of padding used when iterating over a dataset. .. code-block:: python >>> import numpy as np - >>> vanilla_iter = Iterator(train, batch_size=32) + >>> vanilla_iter = Iterator(sst_train, batch_size=32) >>> >>> def count_padding(batch, padding_idx): >>> return np.count_nonzero(batch == padding_idx) @@ -518,7 +521,7 @@ Each ``Dataset`` instance in the SST dataset splits contains ``Field``\s and a ` >>> import pickle >>> >>> cache_dir = Path('cache') - >>> cache_dir.mkdir() + >>> cache_dir.mkdir(exist_ok=True) >>> >>> dataset_store_path = cache_dir.joinpath('sst_preprocessed.pkl') >>> diff --git a/docs/source/notebooks/advanced.ipynb b/docs/source/notebooks/advanced.ipynb index 9a0b0710..c5eac847 100644 --- a/docs/source/notebooks/advanced.ipynb +++ b/docs/source/notebooks/advanced.ipynb @@ -34,18 +34,21 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "Example({\n", - " text: (None, ['A', 'slick', ',', 'engrossing', 'melodrama', '.']),\n", - " label: (None, 'positive')\n", - "})\n" - ] + "data": { + "text/plain": [ + "Example({\n", + " text: (None, ['A', 'slick', ',', 'engrossing', 'melodrama', '.']),\n", + " label: (None, 'positive')\n", + "})" + ] + }, + 
"execution_count": null, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ @@ -64,16 +67,19 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "None ['A', 'slick', ',', 'engrossing', 'melodrama', '.']\n", - "None positive\n" - ] + "data": { + "text/plain": [ + "None ['A', 'slick', ',', 'engrossing', 'melodrama', '.']\n", + "None positive" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ @@ -92,15 +98,18 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "('A slick , engrossing melodrama .', ['A', 'slick', ',', 'engrossing', 'melodrama', '.'])\n" - ] + "data": { + "text/plain": [ + "('A slick , engrossing melodrama .', ['A', 'slick', ',', 'engrossing', 'melodrama', '.'])" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ @@ -125,7 +134,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "" + "" ] }, { @@ -160,7 +169,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -178,7 +187,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -197,7 +206,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -233,7 +242,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -265,7 +274,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -288,15 +297,18 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, 
"metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "('a slick , engrossing melodrama .', ['a', 'slick', 'engrossing', 'melodrama'])\n" - ] + "data": { + "text/plain": [ + "('a slick , engrossing melodrama .', ['a', 'slick', 'engrossing', 'melodrama'])" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ @@ -355,15 +367,18 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ @@ -388,16 +403,19 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "True\n" - ] + "data": { + "text/plain": [ + "\n", + "True" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ @@ -417,15 +435,18 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "(None, ['', 'A', 'slick', ',', 'engrossing', 'melodrama', '.'])\n" - ] + "data": { + "text/plain": [ + "(None, ['', 'A', 'slick', ',', 'engrossing', 'melodrama', '.'])" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ @@ -456,7 +477,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "" + "" ] }, { @@ -477,30 +498,18 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "metadata": {}, "outputs": [ { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "66267ed5720f410faf47766dea3dbad6", - "version_major": 2, - "version_minor": 0 - }, "text/plain": [ - 
"HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…" + "(None, ['a', 'slick', ',', 'eng', '##ross', '##ing', 'mel', '##od', '##rama', '.'])" ] }, + "execution_count": null, "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "(None, ['a', 'slick', ',', 'eng', '##ross', '##ing', 'mel', '##od', '##rama', '.'])\n" - ] + "output_type": "execute_result" } ], "source": [ @@ -536,16 +545,19 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": null, "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "(None, ['A', 'slick', ',', 'engrossing', 'melodrama', '.'])\n", - "(None, ['A', ' ', 's', 'l', 'i', 'c', 'k', ' ', ',', ' ', 'e', 'n', 'g', 'r', 'o', 's', 's', 'i', 'n', 'g', ' ', 'm', 'e', 'l', 'o', 'd', 'r', 'a', 'm', 'a', ' ', '.'])\n" - ] + "data": { + "text/plain": [ + "(None, ['A', 'slick', ',', 'engrossing', 'melodrama', '.'])\n", + "(None, ['A', ' ', 's', 'l', 'i', 'c', 'k', ' ', ',', ' ', 'e', 'n', 'g', 'r', 'o', 's', 's', 'i', 'n', 'g', ' ', 'm', 'e', 'l', 'o', 'd', 'r', 'a', 'm', 'a', ' ', '.'])" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ @@ -585,16 +597,19 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": null, "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "(None, ['A', 'slick', ',', 'engrossing', 'melodrama', '.'])\n", - "(None, ['DET', 'ADJ', 'PUNCT', 'VERB', 'NOUN', 'PUNCT'])\n" - ] + "data": { + "text/plain": [ + "(None, ['A', 'slick', ',', 'engrossing', 'melodrama', '.'])\n", + "(None, ['DET', 'ADJ', 'PUNCT', 'VERB', 'NOUN', 'PUNCT'])" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ @@ -658,15 +673,18 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 
null, "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "0.5 0.3 0.2\n" - ] + "data": { + "text/plain": [ + "0.5 0.3 0.2" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ @@ -686,17 +704,20 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": null, "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'negative': 0.47803468208092487, 'positive': 0.5219653179190752}\n", - "{'negative': 0.48458574181117536, 'positive': 0.5154142581888247}\n", - "{'negative': 0.46965317919075145, 'positive': 0.5303468208092486}\n" - ] + "data": { + "text/plain": [ + "{'negative': 0.47803468208092487, 'positive': 0.5219653179190752}\n", + "{'negative': 0.48458574181117536, 'positive': 0.5154142581888247}\n", + "{'negative': 0.46965317919075145, 'positive': 0.5303468208092486}" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ @@ -721,15 +742,18 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": null, "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "0.5 0.3 0.2\n" - ] + "data": { + "text/plain": [ + "0.5 0.3 0.2" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ @@ -747,17 +771,20 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": null, "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'negative': 0.47832369942196534, 'positive': 0.5216763005780347}\n", - "{'negative': 0.47832369942196534, 'positive': 0.5216763005780347}\n", - "{'negative': 0.47832369942196534, 'positive': 0.5216763005780347}\n" - ] + "data": { + "text/plain": [ + "{'negative': 0.47832369942196534, 'positive': 0.5216763005780347}\n", + "{'negative': 0.47832369942196534, 'positive': 0.5216763005780347}\n", + "{'negative': 
0.47832369942196534, 'positive': 0.5216763005780347}" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ @@ -771,7 +798,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "" + "" ] }, { @@ -795,17 +822,20 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": null, "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "['positive', 'negative']\n", - "['positive', 'negative']\n", - "31920 = 25000 + 6920\n" - ] + "data": { + "text/plain": [ + "['positive', 'negative']\n", + "['positive', 'negative']\n", + "31920 = 25000 + 6920" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ @@ -813,7 +843,9 @@ "from podium import Field, LabelField, Vocab\n", "# Load the datasets\n", "imdb_train, imdb_test = IMDB.get_dataset_splits()\n", + "imdb_train.finalize_fields()\n", "sst_train, sst_dev, sst_test = SST.get_dataset_splits()\n", + "sst_train.finalize_fields()\n", "\n", "# Luckily, both label vocabularies are already equal\n", "print(imdb_train.field('label').vocab.itos)\n", @@ -837,7 +869,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "" + "" ] }, { @@ -858,7 +890,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -869,7 +901,8 @@ "label = LabelField(name='label')\n", "fields = {'text': text, 'label': label}\n", "\n", - "train, valid, test = SST.get_dataset_splits(fields=fields)\n", + "sst_train, sst_valid, sst_test = SST.get_dataset_splits(fields=fields)\n", + "sst_train.finalize_fields()\n", "\n", "# Define the iterators and our sort key\n", "from podium import Iterator, BucketIterator\n", @@ -877,7 +910,7 @@ " # Use the text Field\n", " raw, tokenized = instance.text\n", " return len(tokenized)\n", - "bucket_iter = BucketIterator(train, batch_size=32, bucket_sort_key=instance_length)" + "bucket_iter = 
BucketIterator(sst_train, batch_size=32, bucket_sort_key=instance_length)" ] }, { @@ -889,21 +922,24 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": null, "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "For Iterator, padding = 145749 out of 279304 = 52.18%\n", - "For BucketIterator, padding = 2125 out of 135680 = 1.57%\n" - ] + "data": { + "text/plain": [ + "For Iterator, padding = 148141 out of 281696 = 52.588961149608096%\n", + "For BucketIterator, padding = 2125 out of 135680 = 1.5661851415094339%" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ "import numpy as np\n", - "vanilla_iter = Iterator(train, batch_size=32)\n", + "vanilla_iter = Iterator(sst_train, batch_size=32)\n", "\n", "def count_padding(batch, padding_idx):\n", " return np.count_nonzero(batch == padding_idx)\n", @@ -978,7 +1014,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -1037,18 +1073,21 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": null, "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "Example({\n", - " text: (None, ['A', 'slick', ',', 'engrossing', 'melodrama', '.']),\n", - " label: (None, 'positive')\n", - "})\n" - ] + "data": { + "text/plain": [ + "Example({\n", + " text: (None, ['A', 'slick', ',', 'engrossing', 'melodrama', '.']),\n", + " label: (None, 'positive')\n", + "})" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ @@ -1056,7 +1095,7 @@ "import pickle\n", "\n", "cache_dir = Path('cache')\n", - "cache_dir.mkdir()\n", + "cache_dir.mkdir(exist_ok=True)\n", "\n", "dataset_store_path = cache_dir.joinpath('sst_preprocessed.pkl')\n", "\n", @@ -1080,7 +1119,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": null, "metadata": {}, "outputs": [ 
{ @@ -1120,16 +1159,19 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": null, "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "True\n", - "True\n" - ] + "data": { + "text/plain": [ + "True\n", + "True" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ @@ -1149,25 +1191,7 @@ ] } ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.9" - } - }, + "metadata": {}, "nbformat": 4, "nbformat_minor": 4 } diff --git a/docs/source/notebooks/preprocessing.ipynb b/docs/source/notebooks/preprocessing.ipynb index 1ade9a99..609c259d 100644 --- a/docs/source/notebooks/preprocessing.ipynb +++ b/docs/source/notebooks/preprocessing.ipynb @@ -12,8 +12,9 @@ "# ! pip install git+https://github.com/TakeLab/podium.git\n", "\n", "# Additional dependencies required to run this notebook:\n", - "! pip install sacremoses clean-text spacy truecase https://github.com/LIAAD/yake/archive/v0.4.2.tar.gz\n", - "! python -m spacy download en_core_web_sm" + "! pip install sacremoses clean-text spacy spacy-lookups-data truecase https://github.com/LIAAD/yake/archive/v0.4.2.tar.gz\n", + "! python -m spacy download en_core_web_sm\n", + "! 
python -m nltk.downloader stopwords" ] }, { @@ -387,7 +388,7 @@ { "data": { "text/plain": [ - "(None, [opinion', 'exciting', 'funny', 'movie'])" + "(None, ['opinion', 'exciting', 'funny', 'movie'])" ] }, "execution_count": null, diff --git a/docs/source/notebooks/quickstart.ipynb b/docs/source/notebooks/quickstart.ipynb index 7d23db64..6522f6f2 100644 --- a/docs/source/notebooks/quickstart.ipynb +++ b/docs/source/notebooks/quickstart.ipynb @@ -9,7 +9,7 @@ "# Podium installation\n", "! pip install podium-nlp\n", "# To install from source instead of the last release, comment the command above and uncomment the following one.\n", - "# ! pip install git+https://github.com/takelab/podium" + "# ! pip install git+https://github.com/TakeLab/podium.git" ] }, { @@ -131,9 +131,9 @@ "data": { "text/plain": [ "Example({\n", - " input_text: (None, ['Amazingly', 'lame', '.']),\n", - " input_chars: (None, ['A', 'm', 'a', 'z', 'i', 'n', 'g', 'l', 'y', ' ', 'l', 'a', 'm', 'e', ' ', '.']),\n", - " target: (None, 'negative')\n", + " input_text: (None, ['Amazingly', 'lame', '.']),\n", + " input_chars: (None, ['A', 'm', 'a', 'z', 'i', 'n', 'g', 'l', 'y', ' ', 'l', 'a', 'm', 'e', ' ', '.']),\n", + " target: (None, 'negative')\n", "})" ] }, diff --git a/docs/source/notebooks/walkthrough.ipynb b/docs/source/notebooks/walkthrough.ipynb index d6944d23..3cfb3902 100644 --- a/docs/source/notebooks/walkthrough.ipynb +++ b/docs/source/notebooks/walkthrough.ipynb @@ -104,7 +104,7 @@ ], "source": [ "from podium.datasets import SST\n", - "sst_train, sst_dev, sst_test = SST.get_dataset_splits() # doctest:+ELLIPSIS\n", + "sst_train, sst_dev, sst_test = SST.get_dataset_splits()\n", "sst_train.finalize_fields()\n", "print(sst_train)\n", "print(sst_train[222]) # A short example" @@ -121,7 +121,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "" + "" ] }, { @@ -149,8 +149,7 @@ "data": { "text/plain": [ "dict_keys(['train', 'test', 'unsupervised'])\n", - "{'label': ClassLabel(num_classes=2, 
names=['neg', 'pos'], names_file=None, id=None),\n", - " 'text': Value(dtype='string', id=None)}" + "{'text': Value(dtype='string', id=None), 'label': ClassLabel(num_classes=2, names=['neg', 'pos'], names_file=None, id=None)}" ] }, "execution_count": null, @@ -188,16 +187,15 @@ { "data": { "text/plain": [ - "{'label': LabelField({\n", - " name: 'label',\n", - " keep_raw: False,\n", - " is_target: True\n", - "}),\n", - " 'text': Field({\n", - " name: 'text',\n", - " keep_raw: False,\n", - " is_target: False,\n", - " vocab: Vocab({specials: ('', ''), eager: False, is_finalized: True, size: 280619})\n", + "{'text': Field({\n", + " name: 'text',\n", + " keep_raw: False,\n", + " is_target: False,\n", + " vocab: Vocab({specials: ('', ''), eager: False, is_finalized: True, size: 280619})\n", + " }), 'label': LabelField({\n", + " name: 'label',\n", + " keep_raw: False,\n", + " is_target: True\n", "})}" ] }, @@ -213,7 +211,7 @@ "imdb_train, imdb_test, imdb_unsupervised = HF.from_dataset_dict(imdb).values()\n", "imdb_train.finalize_fields()\n", "\n", - "print(imdb_train.field_dict())" + "print(imdb_train.field_dict)" ] }, { @@ -266,7 +264,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "" + "" ] }, { @@ -432,7 +430,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "" + "" ] }, { @@ -494,7 +492,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "" + "" ] }, { @@ -682,7 +680,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "" + "" ] }, { @@ -817,7 +815,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "" + "" ] }, { @@ -994,7 +992,7 @@ { "data": { "text/plain": [ - "For vocabulary of size: 21701 loaded embedding matrix of shape: (21701, 300)\n", + "For vocabulary of size: 21701 loaded embedding matrix of shape: (16284, 300)\n", "Vector for sport: [ 0.34566 0.15934 0.48444 -0.13693 0.18737 0.2678\n", " -0.39159 0.4931 -0.76111 -1.4586 0.41475 0.55837\n", " ...\n", @@ -1083,12 +1081,12 @@ "data": { "text/plain": [ " (6920, 
4998)\n", - "(0, 2111) 0.617113703893198\n", - "(0, 549) 0.5208201737884445\n", - "(0, 499) 0.5116152860290002\n", - "(0, 19) 0.2515101839877878\n", - "(0, 1) 0.12681755258500052\n", - "(0, 0) 0.08262419651916046" + " (0, 2111) 0.617113703893198\n", + " (0, 549) 0.5208201737884445\n", + " (0, 499) 0.5116152860290002\n", + " (0, 19) 0.2515101839877878\n", + " (0, 1) 0.12681755258500052\n", + " (0, 0) 0.08262419651916046" ] }, "execution_count": null, diff --git a/docs/source/preprocessing.rst b/docs/source/preprocessing.rst index 58e2c86c..c0f5a3bd 100644 --- a/docs/source/preprocessing.rst +++ b/docs/source/preprocessing.rst @@ -159,7 +159,7 @@ Stopword removal >>> raw_text = None >>> tokenized_text = ['in', 'my', 'opinion', 'an', 'exciting', 'and', 'funny', 'movie'] >>> print(remove_stopwords_hook(raw_text, tokenized_text)) - (None, [opinion', 'exciting', 'funny', 'movie']) + (None, ['opinion', 'exciting', 'funny', 'movie']) Keyword extraction ------------------ diff --git a/docs/source/quickstart.rst b/docs/source/quickstart.rst index 715f3452..0466b095 100644 --- a/docs/source/quickstart.rst +++ b/docs/source/quickstart.rst @@ -69,9 +69,9 @@ You might wonder, why not simply use the input column names from the header to s >>> dataset_with_chars.finalize_fields() >>> print(dataset_with_chars[1]) Example({ - input_text: (None, ['Amazingly', 'lame', '.']), - input_chars: (None, ['A', 'm', 'a', 'z', 'i', 'n', 'g', 'l', 'y', ' ', 'l', 'a', 'm', 'e', ' ', '.']), - target: (None, 'negative') + input_text: (None, ['Amazingly', 'lame', '.']), + input_chars: (None, ['A', 'm', 'a', 'z', 'i', 'n', 'g', 'l', 'y', ' ', 'l', 'a', 'm', 'e', ' ', '.']), + target: (None, 'negative') }) You might wonder what the ``None``\s we've been seeing represent. For each Field, we store raw and processed data as a tuple. The first element of the tuple is reserved for raw data, by default blank to preserve memory. 
For a detailed overview of the Field constructor arguments and how to use them, check :ref:`fields`. diff --git a/docs/source/scripts/check_notebooks.py b/docs/source/scripts/check_notebooks.py index 941037ba..e8e54566 100644 --- a/docs/source/scripts/check_notebooks.py +++ b/docs/source/scripts/check_notebooks.py @@ -1,17 +1,33 @@ import argparse import copy -import multiprocessing +import multiprocess +import string import textwrap from functools import partial from pathlib import Path import nbformat -from nbconvert.preprocessors import CellExecutionError, ExecutePreprocessor +from nbconvert.preprocessors import ExecutePreprocessor NOTEBOOKS_PATH = "../notebooks" -INSTALL_RELEASE_VERSION_COMMAND = "! pip install podium-nlp\n" -INSTALL_SOURCE_VERSION_COMMAND = "# ! pip install git+https://github.com/TakeLab/podium.git\n" +INSTALL_RELEASE_VERSION_COMMAND = "! pip install podium-nlp" +INSTALL_SOURCE_VERSION_COMMAND = "# ! pip install git+https://github.com/TakeLab/podium.git" +TRANS_TABLE = str.maketrans(dict.fromkeys(string.whitespace)) + + +def print_notebook_name_with_error(func): + def wrapper(*args, **kwargs): + if args: + notebook_path = args[0] + else: + notebook_path = kwargs.pop("self") + try: + return func(*args, **kwargs) + except Exception as err: + print(f"Error in notebook {Path(notebook_path).name}:\n{err}") + raise + return wrapper def replace_install_release_with_source(nb): @@ -27,26 +43,26 @@ def replace_install_release_with_source(nb): cell["source"] = cell["source"].replace(INSTALL_SOURCE_VERSION_COMMAND, INSTALL_SOURCE_VERSION_COMMAND[2:]) -def check_notebook_output(notebook_path, env="python3"): +@print_notebook_name_with_error +def check_notebook_output(notebook_path, env="python3", ignore_whitespace=False): with open(notebook_path, encoding="utf-8") as f: nb = nbformat.read(f, as_version=4) original_nb = nb - ep = ExecutePreprocessor(kernel_name=env) new_nb = copy.deepcopy(nb) replace_install_release_with_source(new_nb) - try: - 
ep.preprocess(new_nb, {"metadata": {"path": str(Path(notebook_path).parent)}}) - except CellExecutionError: - print(f"Error happened while executing the notebook {notebook_path.name}") - raise - print(new_nb) - report = [] + ep = ExecutePreprocessor(kernel_name=env) + + print(str(Path(notebook_path).parent)) + ep.preprocess(new_nb, {"metadata": {"path": str(Path(notebook_path).parent)}}) + assert len(original_nb["cells"]) == len(new_nb["cells"]) - for i, (original_cell, new_cell) in enumerate(zip(original_nb["cells"], new_nb["cell"])): + + report = [] + for i, (original_cell, new_cell) in enumerate(zip(original_nb["cells"], new_nb["cells"])): # consider only cells with code - if original_cell["cell_type"] != "code": + if original_cell["cell_type"] != "code" or original_cell["outputs"] == [] or original_cell["metadata"].get("elippsis"): continue # sanity check @@ -57,16 +73,27 @@ def check_notebook_output(notebook_path, env="python3"): continue # sanity check - assert isinstance(original_cell["outputs"]["data"]["text/plain"], list) - original_cell_stdout = "".join(original_cell["outputs"]["data"]["text/plain"]) + assert len(original_cell["outputs"]) == 1 + original_cell_stdout = original_cell["outputs"][0]["data"]["text/plain"] + assert isinstance(original_cell_stdout, str) new_cell_stdout = "".join([ new_cell_output["text"] for new_cell_output in new_cell["outputs"] if new_cell_output["name"] == "stdout" ]) + original_cell_stdout_ = original_cell_stdout + new_cell_stdout_ = new_cell_stdout + + if ignore_whitespace: + original_cell = original_cell_stdout.translate(TRANS_TABLE) + new_cell_stdout = new_cell_stdout.translate(TRANS_TABLE) + else: + if new_cell_stdout[-1] == "\n" and original_cell_stdout[-1] != "\n": + original_cell_stdout += "\n" + if original_cell_stdout != new_cell_stdout: - report.append(i, original_cell_stdout, new_cell_stdout) + report.append((i, original_cell_stdout_, new_cell_stdout_)) return notebook_path.name, report @@ -75,33 +102,45 @@ def 
check_notebook_output(notebook_path, env="python3"): parser = argparse.ArgumentParser() parser.add_argument("--env", default="python3", help="kernel that executes the notebook") parser.add_argument("--num_proc", help="number of processes for parallel execution") + parser.add_argument("--ignore_whitespace", action="store_true", help="ignore whitespace when comparing cell outputs") args = parser.parse_args() if args.num_proc is None: num_proc = 1 elif args.num_proc == "auto": - num_proc = multiprocessing.cpu_count() + num_proc = multiprocess.cpu_count() else: num_proc = int(args.num_proc) - notebook_paths = [notebook_path for notebook_path in Path(NOTEBOOKS_PATH).rglob("*.ipynb")] - num_proc = min(min(num_proc, multiprocessing.cpu_count()), len(notebook_paths)) + notebook_paths = [ + notebook_path + for notebook_path in Path(NOTEBOOKS_PATH).rglob("*.ipynb") + if not notebook_path.name.endswith("-checkpoint.ipynb") + ] + # print(notebook_paths) + # exit() + # notebook_paths = [notebook_paths[2]] + # print(notebook_paths) + num_proc = min(min(num_proc, multiprocess.cpu_count()), len(notebook_paths)) if num_proc == 1: reports = [] for notebook_path in notebook_paths: - report = check_notebook_output(notebook_path, env=args.env) + report = check_notebook_output(notebook_path, env=args.env, ignore_whitespace=args.ignore_whitespace) reports.append(report) else: - with multiprocessing.Pool(num_proc) as pool: - reports = pool.map(partial(check_notebook_output, env=args.env), notebook_paths) + with multiprocess.Pool(num_proc) as pool: + reports = pool.map(partial(check_notebook_output, env=args.env, ignore_whitespace=args.ignore_whitespace), notebook_paths) if any(report for _, report in reports): reports_str = "\n\n".join([ f"In notebook {notebook}:\n" + textwrap.indent( "\n".join( + f"Cell {i}\n" + "=" * len(f"Cell {i}") + "\n" + f"Original output:\n{original_output}\nAfter execution:\n{new_output}" - for original_output, new_output in report), " " * 4) - for notebook, 
report in reports + for i, original_output, new_output in report), + " " * 4, + ) + for notebook, report in reports ]) raise Exception( "❌❌ Mismatches found in the outputs of the notebooks:\n\n" + reports_str diff --git a/docs/source/scripts/convert_doc_to_notebooks.py b/docs/source/scripts/convert_doc_to_notebooks.py index dd870b94..e6f91e6c 100644 --- a/docs/source/scripts/convert_doc_to_notebooks.py +++ b/docs/source/scripts/convert_doc_to_notebooks.py @@ -357,6 +357,8 @@ def split_frameworks(code): # Matches any doctest pattern. _re_doctest = re.compile(r"^(>>>|\.\.\.)") +# Re pattern that matches doctest options in code blocks. +_re_doctest_option = re.compile(r'#\s*doctest:\s*([^\n\'"]*)$', re.MULTILINE) def parse_code_and_output(code): @@ -395,10 +397,16 @@ def code_cell(code, output=None): if output is None or len(output) == 0: outputs = [] else: + metadata = {} + for m in _re_doctest_option.finditer(code): + group = m.group(1) + if group == "+ELIPPSIS": + metadata["elippsis"] = True + code.replace(group, "") outputs = [nbformat.notebooknode.NotebookNode({ 'data': {'text/plain': output}, 'execution_count': None, - 'metadata': {}, + 'metadata': metadata, 'output_type': 'execute_result' })] return nbformat.notebooknode.NotebookNode( @@ -440,8 +448,9 @@ def rm_first_line(text): ), "preprocessing.rst": textwrap.dedent( """\ - ! pip install sacremoses clean-text spacy truecase https://github.com/LIAAD/yake/archive/v0.4.2.tar.gz + ! pip install sacremoses clean-text spacy spacy-lookups-data truecase https://github.com/LIAAD/yake/archive/v0.4.2.tar.gz ! python -m spacy download en_core_web_sm + ! 
python -m nltk.downloader stopwords """ ), "walkthrough.rst": textwrap.dedent( diff --git a/docs/source/scripts/requirements.txt b/docs/source/scripts/requirements.txt new file mode 100644 index 00000000..5623fe18 --- /dev/null +++ b/docs/source/scripts/requirements.txt @@ -0,0 +1,4 @@ +ipykernel +nbformat +nbconvert +multiprocess \ No newline at end of file diff --git a/docs/source/walkthrough.rst b/docs/source/walkthrough.rst index fa2150f4..5d7089be 100644 --- a/docs/source/walkthrough.rst +++ b/docs/source/walkthrough.rst @@ -33,7 +33,7 @@ One built-in dataset available in Podium is the `Stanford Sentiment Treebank >> from podium.datasets import SST - >>> sst_train, sst_dev, sst_test = SST.get_dataset_splits() # doctest:+ELLIPSIS + >>> sst_train, sst_dev, sst_test = SST.get_dataset_splits() >>> sst_train.finalize_fields() >>> print(sst_train) SST({ @@ -84,8 +84,7 @@ Converting a dataset from 🤗 datasets into Podium requires some work from your >>> # Each dataset has a set of features which need to be mapped >>> # to Podium Fields. >>> print(imdb['train'].features) - {'label': ClassLabel(num_classes=2, names=['neg', 'pos'], names_file=None, id=None), - 'text': Value(dtype='string', id=None)} + {'text': Value(dtype='string', id=None), 'label': ClassLabel(num_classes=2, names=['neg', 'pos'], names_file=None, id=None)} As is the case with loading your custom dataset, ``features`` of 🤗 datasets need to be mapped to Podium Fields in order to direct the data flow for preprocessing. 
@@ -99,17 +98,16 @@ Datasets from 🤗 need to either (1) be wrapped them in :class:`podium.datasets >>> imdb_train, imdb_test, imdb_unsupervised = HF.from_dataset_dict(imdb).values() >>> imdb_train.finalize_fields() >>> - >>> print(imdb_train.field_dict()) - {'label': LabelField({ - name: 'label', - keep_raw: False, - is_target: True - }), - 'text': Field({ - name: 'text', - keep_raw: False, - is_target: False, - vocab: Vocab({specials: ('', ''), eager: False, is_finalized: True, size: 280619}) + >>> print(imdb_train.field_dict) + {'text': Field({ + name: 'text', + keep_raw: False, + is_target: False, + vocab: Vocab({specials: ('', ''), eager: False, is_finalized: True, size: 280619}) + }), 'label': LabelField({ + name: 'label', + keep_raw: False, + is_target: True })} .. note:: @@ -509,7 +507,7 @@ The output of the function call is a numpy matrix of word embeddings which you c >>> glove = GloVe() >>> embeddings = glove.load_vocab(vocab) >>> print(f"For vocabulary of size: {len(vocab)} loaded embedding matrix of shape: {embeddings.shape}") - For vocabulary of size: 21701 loaded embedding matrix of shape: (21701, 300) + For vocabulary of size: 21701 loaded embedding matrix of shape: (16284, 300) >>> # We can obtain vectors for a single word (given the word is loaded) like this: >>> word = "sport" >>> print(f"Vector for {word}: {glove.token_to_vector(word)}") @@ -555,12 +553,12 @@ Now our vectorizer has seen the dataset as well as the vocabulary and has all th >>> print(type(tfidf_batch), tfidf_batch.shape) (6920, 4998) >>> print(tfidf_batch[222]) - (0, 2111) 0.617113703893198 - (0, 549) 0.5208201737884445 - (0, 499) 0.5116152860290002 - (0, 19) 0.2515101839877878 - (0, 1) 0.12681755258500052 - (0, 0) 0.08262419651916046 + (0, 2111) 0.617113703893198 + (0, 549) 0.5208201737884445 + (0, 499) 0.5116152860290002 + (0, 19) 0.2515101839877878 + (0, 1) 0.12681755258500052 + (0, 0) 0.08262419651916046 The Tf-Idf counts are highly sparse since not all words from the vocabulary 
are present in every instance. To reduce the memory footprint of count-based numericalization, we store the values in a `SciPy `__ `sparse matrix `__, which can be used in various `scikit-learn `__ models. diff --git a/podium/datasets/dataset.py b/podium/datasets/dataset.py index f26b3a6e..ab681adf 100644 --- a/podium/datasets/dataset.py +++ b/podium/datasets/dataset.py @@ -278,7 +278,7 @@ def shuffled(self) -> "DatasetBase": return self[shuffled_indices] def __repr__(self): - fields_str = ",\n".join(textwrap.indent(repr(f), " " * 8) for f in self.fields) + fields_str = ",\n".join(textwrap.indent(repr(f), " " * 4) for f in self.fields) fields_str = f"[\n{fields_str}\n \n]" attrs = {"size": len(self), "fields": fields_str} return repr_type_and_attrs(self, attrs, with_newlines=True, repr_values=False) diff --git a/podium/field.py b/podium/field.py index cd946577..fc554412 100644 --- a/podium/field.py +++ b/podium/field.py @@ -954,7 +954,7 @@ def remove_pretokenize_hooks(self): def __repr__(self): fields_str = ",\n".join( - textwrap.indent(repr(f), " " * 8) for f in self._output_fields + textwrap.indent(repr(f), " " * 4) for f in self._output_fields ) fields_str = f"[\n{fields_str}\n \n]" attrs = {"fields": fields_str} diff --git a/setup.py b/setup.py index 9b643cad..ef4dc1f0 100644 --- a/setup.py +++ b/setup.py @@ -97,7 +97,6 @@ def _get_version(): 'sphinx_rtd_theme', 'sphinx-copybutton', 'recommonmark', - 'nbformat', 'datasets', ] From 5fc112d94efabe43bd5bf30c251a48018dd5d1fe Mon Sep 17 00:00:00 2001 From: mariosasko Date: Thu, 1 Apr 2021 02:20:00 +0200 Subject: [PATCH 05/15] Remove comments --- docs/source/scripts/check_notebooks.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/docs/source/scripts/check_notebooks.py b/docs/source/scripts/check_notebooks.py index e8e54566..34556eca 100644 --- a/docs/source/scripts/check_notebooks.py +++ b/docs/source/scripts/check_notebooks.py @@ -53,8 +53,6 @@ def check_notebook_output(notebook_path, 
env="python3", ignore_whitespace=False) replace_install_release_with_source(new_nb) ep = ExecutePreprocessor(kernel_name=env) - - print(str(Path(notebook_path).parent)) ep.preprocess(new_nb, {"metadata": {"path": str(Path(notebook_path).parent)}}) assert len(original_nb["cells"]) == len(new_nb["cells"]) @@ -117,10 +115,7 @@ def check_notebook_output(notebook_path, env="python3", ignore_whitespace=False) for notebook_path in Path(NOTEBOOKS_PATH).rglob("*.ipynb") if not notebook_path.name.endswith("-checkpoint.ipynb") ] - # print(notebook_paths) - # exit() - # notebook_paths = [notebook_paths[2]] - # print(notebook_paths) + num_proc = min(min(num_proc, multiprocess.cpu_count()), len(notebook_paths)) if num_proc == 1: reports = [] From b3d74cc9d58fd0a649297c5ec1e608c981dc5d2c Mon Sep 17 00:00:00 2001 From: mariosasko Date: Thu, 1 Apr 2021 02:22:22 +0200 Subject: [PATCH 06/15] CI fix --- .github/workflows/scheduled.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/scheduled.yml b/.github/workflows/scheduled.yml index 856fa93a..cb6be8d8 100644 --- a/.github/workflows/scheduled.yml +++ b/.github/workflows/scheduled.yml @@ -16,7 +16,7 @@ jobs: python-version: [3.6] defaults: run: - working-direcory: docs/source/scripts + working-directory: docs/source/scripts steps: - uses: actions/checkout@v2 From d13a9c0268c2efeac95cc9b5309c54477e877c2e Mon Sep 17 00:00:00 2001 From: mariosasko Date: Thu, 1 Apr 2021 02:23:31 +0200 Subject: [PATCH 07/15] CI fix #2 --- .github/workflows/scheduled.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/scheduled.yml b/.github/workflows/scheduled.yml index cb6be8d8..941a3d26 100644 --- a/.github/workflows/scheduled.yml +++ b/.github/workflows/scheduled.yml @@ -27,7 +27,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install --r requirements.txt + pip install -r requirements.txt - name: Execute notebooks run: | python 
check_notebooks.py --num_proc auto --ignore_whitespace From 187e30525dca701dd7a5d2a5545b435461328952 Mon Sep 17 00:00:00 2001 From: mariosasko Date: Thu, 1 Apr 2021 12:59:49 +0200 Subject: [PATCH 08/15] Inject SST --- docs/source/advanced.rst | 4 ++-- docs/source/notebooks/advanced.ipynb | 4 ++-- docs/source/scripts/check_notebooks.py | 33 ++++++++++++++++++++++++++ docs/source/scripts/requirements.txt | 1 + podium/datasets/arrow.py | 6 ++--- podium/datasets/impl/conllu_dataset.py | 3 ++- podium/datasets/impl/imdb.py | 4 +++- podium/datasets/impl/snli.py | 2 +- podium/datasets/impl/sst.py | 2 +- podium/vectorizers/vectorizer.py | 6 +++-- 10 files changed, 52 insertions(+), 13 deletions(-) diff --git a/docs/source/advanced.rst b/docs/source/advanced.rst index 9572daa5..705df020 100644 --- a/docs/source/advanced.rst +++ b/docs/source/advanced.rst @@ -417,8 +417,8 @@ For this reason, usage of :class:`podium.datasets.BucketIterator` is recommended >>> label = LabelField(name='label') >>> fields = {'text': text, 'label': label} >>> - >>> sst_train, sst_valid, sst_test = SST.get_dataset_splits(fields=fields) - >>> sst_train.finalize_fields() + >>> train, valid, test = SST.get_dataset_splits(fields=fields) + >>> train.finalize_fields() >>> >>> # Define the iterators and our sort key >>> from podium import Iterator, BucketIterator diff --git a/docs/source/notebooks/advanced.ipynb b/docs/source/notebooks/advanced.ipynb index c5eac847..b5dd9b4c 100644 --- a/docs/source/notebooks/advanced.ipynb +++ b/docs/source/notebooks/advanced.ipynb @@ -901,8 +901,8 @@ "label = LabelField(name='label')\n", "fields = {'text': text, 'label': label}\n", "\n", - "sst_train, sst_valid, sst_test = SST.get_dataset_splits(fields=fields)\n", - "sst_train.finalize_fields()\n", + "train, valid, test = SST.get_dataset_splits(fields=fields)\n", + "train.finalize_fields()\n", "\n", "# Define the iterators and our sort key\n", "from podium import Iterator, BucketIterator\n", diff --git 
a/docs/source/scripts/check_notebooks.py b/docs/source/scripts/check_notebooks.py index 34556eca..92fd08b2 100644 --- a/docs/source/scripts/check_notebooks.py +++ b/docs/source/scripts/check_notebooks.py @@ -1,7 +1,10 @@ import argparse import copy import multiprocess +import os +import shutil import string +import subprocess import textwrap from functools import partial from pathlib import Path @@ -13,9 +16,30 @@ NOTEBOOKS_PATH = "../notebooks" INSTALL_RELEASE_VERSION_COMMAND = "! pip install podium-nlp" INSTALL_SOURCE_VERSION_COMMAND = "# ! pip install git+https://github.com/TakeLab/podium.git" +INSTALL_SST_COMMAND = "python -c \"from podium.datasets import SST; SST.get_dataset_splits()\"" TRANS_TABLE = str.maketrans(dict.fromkeys(string.whitespace)) +def inject_sst(): + delim = "&" if os.name == "nt" else ";" + subprocess.call( + delim.join([INSTALL_SOURCE_VERSION_COMMAND[4:], INSTALL_SST_COMMAND]), + shell=True, + cwd=Path(NOTEBOOKS_PATH).absolute(), + stdout=subprocess.DEVNULL, + stderr=subprocess.STDOUT, + ) + + +def cleanup(snap_before_exec, snap_after_exec): + created_paths = set(snap_after_exec) - set(snap_before_exec) + for path in created_paths: + if path.is_dir(): + shutil.rmtree(path) + else: + path.unlink() + + def print_notebook_name_with_error(func): def wrapper(*args, **kwargs): if args: @@ -101,6 +125,7 @@ def check_notebook_output(notebook_path, env="python3", ignore_whitespace=False) parser.add_argument("--env", default="python3", help="kernel that executes the notebook") parser.add_argument("--num_proc", help="number of processes for parallel execution") parser.add_argument("--ignore_whitespace", action="store_true", help="ignore whitespace when comparing cell outputs") + parser.add_argument("--keep_artifacts", action="store_true", help="save files/directories created during execution") args = parser.parse_args() if args.num_proc is None: @@ -116,6 +141,8 @@ def check_notebook_output(notebook_path, env="python3", ignore_whitespace=False) if not 
notebook_path.name.endswith("-checkpoint.ipynb") ] + snap_before_exec = list(Path(NOTEBOOKS_PATH).iterdir()) + num_proc = min(min(num_proc, multiprocess.cpu_count()), len(notebook_paths)) if num_proc == 1: reports = [] @@ -123,9 +150,15 @@ def check_notebook_output(notebook_path, env="python3", ignore_whitespace=False) report = check_notebook_output(notebook_path, env=args.env, ignore_whitespace=args.ignore_whitespace) reports.append(report) else: + # inject the SST dataset to prevent parallel download + inject_sst() with multiprocess.Pool(num_proc) as pool: reports = pool.map(partial(check_notebook_output, env=args.env, ignore_whitespace=args.ignore_whitespace), notebook_paths) + if args.keep_artefacts is False: + snap_after_exec = list(Path(NOTEBOOKS_PATH).iterdir()) + cleanup(snap_before_exec, snap_after_exec) + if any(report for _, report in reports): reports_str = "\n\n".join([ f"In notebook {notebook}:\n" + textwrap.indent( diff --git a/docs/source/scripts/requirements.txt b/docs/source/scripts/requirements.txt index 5623fe18..a5fc2b3c 100644 --- a/docs/source/scripts/requirements.txt +++ b/docs/source/scripts/requirements.txt @@ -1,4 +1,5 @@ ipykernel +ipywidgets nbformat nbconvert multiprocess \ No newline at end of file diff --git a/podium/datasets/arrow.py b/podium/datasets/arrow.py index b69b20b5..0cceb874 100644 --- a/podium/datasets/arrow.py +++ b/podium/datasets/arrow.py @@ -299,7 +299,7 @@ def from_tabular_file( format = format.lower() csv_reader_params = {} if csv_reader_params is None else csv_reader_params - with open(os.path.expanduser(path), encoding="utf8") as f: + with open(os.path.expanduser(path), encoding="utf-8") as f: if format in {"csv", "tsv"}: delimiter = "," if format == "csv" else "\t" reader = csv.reader(f, delimiter=delimiter, **csv_reader_params) @@ -542,7 +542,7 @@ def load_cache(cache_path) -> "DiskBackedDataset": """ # load fields fields_file_path = os.path.join(cache_path, CACHE_FIELDS_FILENAME) - with open(fields_file_path, 
"rb") as fields_cache_file: + with open(os.path.expanduser(fields_file_path), "rb") as fields_cache_file: fields = pickle.load(fields_cache_file) # load dataset as memory mapped arrow table @@ -587,7 +587,7 @@ def dump_cache(self, cache_path: Optional[str] = None) -> str: # pickle fields cache_fields_path = os.path.join(cache_path, CACHE_FIELDS_FILENAME) - with open(cache_fields_path, "wb") as fields_cache_file: + with open(os.path.expanduser(cache_fields_path), "wb") as fields_cache_file: pickle.dump(self.fields, fields_cache_file) # dump table diff --git a/podium/datasets/impl/conllu_dataset.py b/podium/datasets/impl/conllu_dataset.py index 6a377e28..5872e7c4 100644 --- a/podium/datasets/impl/conllu_dataset.py +++ b/podium/datasets/impl/conllu_dataset.py @@ -2,6 +2,7 @@ Module contains the CoNLL-U dataset. """ import collections +import os from podium.datasets import Dataset from podium.datasets.example_factory import ExampleFactory @@ -87,7 +88,7 @@ def safe_conllu_parse(in_file): example_factory = ExampleFactory(fields) examples = [] - with open(file_path, encoding="utf-8") as in_file: + with open(os.path.expanduser(file_path), encoding="utf-8") as in_file: for tokenlist in safe_conllu_parse(in_file): example_dict = collections.defaultdict(lambda: []) for token in tokenlist: diff --git a/podium/datasets/impl/imdb.py b/podium/datasets/impl/imdb.py index 500cd563..17950c49 100644 --- a/podium/datasets/impl/imdb.py +++ b/podium/datasets/impl/imdb.py @@ -153,7 +153,9 @@ def _create_labeled_examples(dir_path, label, fields): ] examples = [] for file_path in files_list: - with open(file=os.path.join(dir_path, file_path), encoding="utf8") as fpr: + with open( + os.path.expanduser(os.path.join(dir_path, file_path)), encoding="utf-8" + ) as fpr: data = {IMDB.TEXT_FIELD_NAME: fpr.read(), IMDB.LABEL_FIELD_NAME: label} examples.append(example_factory.from_dict(data)) return examples diff --git a/podium/datasets/impl/snli.py b/podium/datasets/impl/snli.py index 
bebc522e..52503052 100644 --- a/podium/datasets/impl/snli.py +++ b/podium/datasets/impl/snli.py @@ -99,7 +99,7 @@ def _create_examples(file_path, fields): example_factory = ExampleFactory(fields) examples = [] - with open(file=file_path, encoding="utf8") as in_file: + with open(os.path.expanduser(file_path), encoding="utf-8") as in_file: for line in in_file: examples.append(example_factory.from_json(line)) return examples diff --git a/podium/datasets/impl/sst.py b/podium/datasets/impl/sst.py index e0a40f45..f6200f25 100644 --- a/podium/datasets/impl/sst.py +++ b/podium/datasets/impl/sst.py @@ -123,7 +123,7 @@ def label_trf(label): return label_to_string_map[label] examples = [] - with open(file=file_path, encoding="utf8") as fpr: + with open(os.path.expanduser(file_path), encoding="utf-8") as fpr: for line in fpr: example = example_factory.from_fields_tree( diff --git a/podium/vectorizers/vectorizer.py b/podium/vectorizers/vectorizer.py index 2f7b1e1a..982e1f20 100644 --- a/podium/vectorizers/vectorizer.py +++ b/podium/vectorizers/vectorizer.py @@ -298,7 +298,7 @@ def _cache_vectors(self): """ Method for caching loaded vectors to cache_dir. 
""" - with open(self._cache_path, "wb") as cache_file: + with open(os.path.expanduser(self._cache_path), "wb") as cache_file: for word in self._vectors: vector_values_string = " ".join(map(str, self._vectors[word])) cache_file.write(f"{word} {vector_values_string}\n".encode("utf-8")) @@ -362,7 +362,9 @@ def _load_vectors(self, vocab=None): vocab = set(vocab) open_mode, split_delimiter = ("rb", b" ") if self._binary else ("r", " ") - with open(curr_path, open_mode, encoding=self._encoding) as vector_file: + with open( + os.path.expanduser(curr_path), open_mode, encoding=self._encoding + ) as vector_file: vectors_loaded = 0 header_lines = 0 From 41fd885d3c0de451ecba71a18c6ee411a6f62ba2 Mon Sep 17 00:00:00 2001 From: mariosasko Date: Thu, 1 Apr 2021 13:13:00 +0200 Subject: [PATCH 09/15] Small fix --- docs/source/scripts/check_notebooks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/scripts/check_notebooks.py b/docs/source/scripts/check_notebooks.py index 92fd08b2..6020388c 100644 --- a/docs/source/scripts/check_notebooks.py +++ b/docs/source/scripts/check_notebooks.py @@ -101,7 +101,7 @@ def check_notebook_output(notebook_path, env="python3", ignore_whitespace=False) new_cell_stdout = "".join([ new_cell_output["text"] - for new_cell_output in new_cell["outputs"] if new_cell_output["name"] == "stdout" + for new_cell_output in new_cell["outputs"] if new_cell_output["output_type"] == "stream" and new_cell_output["name"] == "stdout" ]) original_cell_stdout_ = original_cell_stdout From c4c650f5669e0415140fe660e1dab64a3a35a0a6 Mon Sep 17 00:00:00 2001 From: mariosasko Date: Thu, 1 Apr 2021 13:20:26 +0200 Subject: [PATCH 10/15] Fix argparse argument name --- docs/source/scripts/check_notebooks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/scripts/check_notebooks.py b/docs/source/scripts/check_notebooks.py index 6020388c..ab4442c6 100644 --- a/docs/source/scripts/check_notebooks.py +++ 
b/docs/source/scripts/check_notebooks.py @@ -155,7 +155,7 @@ def check_notebook_output(notebook_path, env="python3", ignore_whitespace=False) with multiprocess.Pool(num_proc) as pool: reports = pool.map(partial(check_notebook_output, env=args.env, ignore_whitespace=args.ignore_whitespace), notebook_paths) - if args.keep_artefacts is False: + if args.keep_artifacts is False: snap_after_exec = list(Path(NOTEBOOKS_PATH).iterdir()) cleanup(snap_before_exec, snap_after_exec) From 79868fb47fea58ac30eed77362c96acc99def059 Mon Sep 17 00:00:00 2001 From: mariosasko Date: Thu, 1 Apr 2021 15:04:30 +0200 Subject: [PATCH 11/15] Fix comparison --- docs/source/scripts/check_notebooks.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/source/scripts/check_notebooks.py b/docs/source/scripts/check_notebooks.py index ab4442c6..648ca6df 100644 --- a/docs/source/scripts/check_notebooks.py +++ b/docs/source/scripts/check_notebooks.py @@ -1,6 +1,5 @@ import argparse import copy -import multiprocess import os import shutil import string @@ -9,6 +8,7 @@ from functools import partial from pathlib import Path +import multiprocess import nbformat from nbconvert.preprocessors import ExecutePreprocessor @@ -20,7 +20,7 @@ TRANS_TABLE = str.maketrans(dict.fromkeys(string.whitespace)) -def inject_sst(): +def inject_shared_download(): delim = "&" if os.name == "nt" else ";" subprocess.call( delim.join([INSTALL_SOURCE_VERSION_COMMAND[4:], INSTALL_SST_COMMAND]), @@ -108,7 +108,7 @@ def check_notebook_output(notebook_path, env="python3", ignore_whitespace=False) new_cell_stdout_ = new_cell_stdout if ignore_whitespace: - original_cell = original_cell_stdout.translate(TRANS_TABLE) + original_cell_stdout = original_cell_stdout.translate(TRANS_TABLE) new_cell_stdout = new_cell_stdout.translate(TRANS_TABLE) else: if new_cell_stdout[-1] == "\n" and original_cell_stdout[-1] != "\n": @@ -150,8 +150,8 @@ def check_notebook_output(notebook_path, env="python3", 
ignore_whitespace=False) report = check_notebook_output(notebook_path, env=args.env, ignore_whitespace=args.ignore_whitespace) reports.append(report) else: - # inject the SST dataset to prevent parallel download - inject_sst() + # predownload datasets/vectorizers to prevent parallel download + inject_shared_download() with multiprocess.Pool(num_proc) as pool: reports = pool.map(partial(check_notebook_output, env=args.env, ignore_whitespace=args.ignore_whitespace), notebook_paths) @@ -168,7 +168,7 @@ def check_notebook_output(notebook_path, env="python3", ignore_whitespace=False) for i, original_output, new_output in report), " " * 4, ) - for notebook, report in reports + for notebook, report in reports if len(report) > 0 ]) raise Exception( "❌❌ Mismatches found in the outputs of the notebooks:\n\n" + reports_str From 4ba16f808e2bd33df37f9f7720d9b07089890239 Mon Sep 17 00:00:00 2001 From: mariosasko Date: Thu, 1 Apr 2021 15:06:57 +0200 Subject: [PATCH 12/15] Redirect datasets output --- docs/source/notebooks/walkthrough.ipynb | 13 +++++++++---- docs/source/walkthrough.rst | 13 +++++++++---- 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/docs/source/notebooks/walkthrough.ipynb b/docs/source/notebooks/walkthrough.ipynb index 3cfb3902..4e4838da 100644 --- a/docs/source/notebooks/walkthrough.ipynb +++ b/docs/source/notebooks/walkthrough.ipynb @@ -158,11 +158,16 @@ } ], "source": [ - "import datasets\n", + "from datasets import load_dataset\n", + "from contextlib import redirect_stdout\n", "from pprint import pprint\n", - "# Loading a huggingface dataset returns an instance of DatasetDict\n", - "# which contains the dataset splits (usually: train, valid, test) \n", - "imdb = datasets.load_dataset('imdb')\n", + "\n", + "# Silence download logs\n", + "with redirect_stdout(None):\n", + " # Loading a huggingface dataset returns an instance of DatasetDict\n", + " # which contains the dataset splits (usually: train, valid, test) \n", + " imdb = 
load_dataset('imdb')\n", + "\n", "print(imdb.keys())\n", "\n", "# Each dataset has a set of features which need to be mapped\n", diff --git a/docs/source/walkthrough.rst b/docs/source/walkthrough.rst index 5d7089be..c14ae032 100644 --- a/docs/source/walkthrough.rst +++ b/docs/source/walkthrough.rst @@ -73,11 +73,16 @@ Converting a dataset from 🤗 datasets into Podium requires some work from your .. code-block:: python - >>> import datasets + >>> from datasets import load_dataset + >>> from contextlib import redirect_stdout >>> from pprint import pprint - >>> # Loading a huggingface dataset returns an instance of DatasetDict - >>> # which contains the dataset splits (usually: train, valid, test) - >>> imdb = datasets.load_dataset('imdb') + >>> + >>> # Silence download logs + >>> with redirect_stdout(None): + >>> # Loading a huggingface dataset returns an instance of DatasetDict + >>> # which contains the dataset splits (usually: train, valid, test) + >>> imdb = load_dataset('imdb') + >>> >>> print(imdb.keys()) dict_keys(['train', 'test', 'unsupervised']) >>> From 67d202a4272bf0d425291f13399565885bcbb995 Mon Sep 17 00:00:00 2001 From: mariosasko Date: Thu, 1 Apr 2021 15:22:30 +0200 Subject: [PATCH 13/15] Redirect to devnull --- docs/source/walkthrough.rst | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/source/walkthrough.rst b/docs/source/walkthrough.rst index c14ae032..fa346a61 100644 --- a/docs/source/walkthrough.rst +++ b/docs/source/walkthrough.rst @@ -73,12 +73,14 @@ Converting a dataset from 🤗 datasets into Podium requires some work from your .. 
code-block:: python - >>> from datasets import load_dataset + >>> import os >>> from contextlib import redirect_stdout >>> from pprint import pprint >>> + >>> from datasets import load_dataset + >>> >>> # Silence download logs - >>> with redirect_stdout(None): + >>> with redirect_stdout(open(os.devnull, "w")): >>> # Loading a huggingface dataset returns an instance of DatasetDict >>> # which contains the dataset splits (usually: train, valid, test) >>> imdb = load_dataset('imdb') From 8964fa84a21821158f1752f342478ff2e6f80bb0 Mon Sep 17 00:00:00 2001 From: mariosasko Date: Thu, 1 Apr 2021 15:27:37 +0200 Subject: [PATCH 14/15] Update notebooks --- docs/source/notebooks/walkthrough.ipynb | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/source/notebooks/walkthrough.ipynb b/docs/source/notebooks/walkthrough.ipynb index 4e4838da..46991ef4 100644 --- a/docs/source/notebooks/walkthrough.ipynb +++ b/docs/source/notebooks/walkthrough.ipynb @@ -158,12 +158,14 @@ } ], "source": [ - "from datasets import load_dataset\n", + "import os\n", "from contextlib import redirect_stdout\n", "from pprint import pprint\n", "\n", + "from datasets import load_dataset\n", + "\n", "# Silence download logs\n", - "with redirect_stdout(None):\n", + "with redirect_stdout(open(os.devnull, \"w\")):\n", " # Loading a huggingface dataset returns an instance of DatasetDict\n", " # which contains the dataset splits (usually: train, valid, test) \n", " imdb = load_dataset('imdb')\n", From ce1b17f5761a22e75d1b77c4a02f00f79099aefe Mon Sep 17 00:00:00 2001 From: mariosasko Date: Fri, 2 Apr 2021 01:22:54 +0200 Subject: [PATCH 15/15] Fix multiprocessing --- docs/source/notebooks/sample_dataset.csv | 3 +++ docs/source/notebooks/walkthrough.ipynb | 14 ++++++-------- docs/source/scripts/check_notebooks.py | 21 ++++++++++++++++----- docs/source/walkthrough.rst | 14 ++++++-------- 4 files changed, 31 insertions(+), 21 deletions(-) create mode 100644 
docs/source/notebooks/sample_dataset.csv diff --git a/docs/source/notebooks/sample_dataset.csv b/docs/source/notebooks/sample_dataset.csv new file mode 100644 index 00000000..7827a95f --- /dev/null +++ b/docs/source/notebooks/sample_dataset.csv @@ -0,0 +1,3 @@ +text,label +Absorbing character study .,positive +Amazingly lame .,negative diff --git a/docs/source/notebooks/walkthrough.ipynb b/docs/source/notebooks/walkthrough.ipynb index 46991ef4..8ff1658d 100644 --- a/docs/source/notebooks/walkthrough.ipynb +++ b/docs/source/notebooks/walkthrough.ipynb @@ -149,7 +149,8 @@ "data": { "text/plain": [ "dict_keys(['train', 'test', 'unsupervised'])\n", - "{'text': Value(dtype='string', id=None), 'label': ClassLabel(num_classes=2, names=['neg', 'pos'], names_file=None, id=None)}" + "{'label': ClassLabel(num_classes=2, names=['neg', 'pos'], names_file=None, id=None),\n", + " 'text': Value(dtype='string', id=None)}" ] }, "execution_count": null, @@ -159,22 +160,19 @@ ], "source": [ "import os\n", - "from contextlib import redirect_stdout\n", "from pprint import pprint\n", "\n", "from datasets import load_dataset\n", "\n", - "# Silence download logs\n", - "with redirect_stdout(open(os.devnull, \"w\")):\n", - " # Loading a huggingface dataset returns an instance of DatasetDict\n", - " # which contains the dataset splits (usually: train, valid, test) \n", - " imdb = load_dataset('imdb')\n", + "# Loading a huggingface dataset returns an instance of DatasetDict\n", + "# which contains the dataset splits (usually: train, valid, test) \n", + "imdb = load_dataset('imdb')\n", "\n", "print(imdb.keys())\n", "\n", "# Each dataset has a set of features which need to be mapped\n", "# to Podium Fields.\n", - "print(imdb['train'].features)" + "pprint(imdb['train'].features)" ] }, { diff --git a/docs/source/scripts/check_notebooks.py b/docs/source/scripts/check_notebooks.py index 648ca6df..8671a0c8 100644 --- a/docs/source/scripts/check_notebooks.py +++ 
b/docs/source/scripts/check_notebooks.py @@ -1,6 +1,7 @@ import argparse import copy import os +import re import shutil import string import subprocess @@ -19,11 +20,21 @@ INSTALL_SST_COMMAND = "python -c \"from podium.datasets import SST; SST.get_dataset_splits()\"" TRANS_TABLE = str.maketrans(dict.fromkeys(string.whitespace)) +_re_pip_install = re.compile(r"!\s*(pip\s+install\s+[^\\\"]*)") +_re_python = re.compile(r"!\s*(python[^\\\"]*)") + + +def init(notebook_paths): + all_commands = [] + for notebook_path in notebook_paths: + with open(notebook_path, encoding="utf-8") as f: + notebook_raw = f.read() + commands = _re_pip_install.findall(notebook_raw) + _re_python.findall(notebook_raw) + all_commands.extend(commands) -def inject_shared_download(): delim = "&" if os.name == "nt" else ";" subprocess.call( - delim.join([INSTALL_SOURCE_VERSION_COMMAND[4:], INSTALL_SST_COMMAND]), + delim.join([*all_commands, INSTALL_SOURCE_VERSION_COMMAND[4:], INSTALL_SST_COMMAND]), shell=True, cwd=Path(NOTEBOOKS_PATH).absolute(), stdout=subprocess.DEVNULL, @@ -138,7 +149,7 @@ def check_notebook_output(notebook_path, env="python3", ignore_whitespace=False) notebook_paths = [ notebook_path for notebook_path in Path(NOTEBOOKS_PATH).rglob("*.ipynb") - if not notebook_path.name.endswith("-checkpoint.ipynb") + if not (notebook_path.name.endswith("-checkpoint.ipynb") or notebook_path.parts[-2] == "examples") ] snap_before_exec = list(Path(NOTEBOOKS_PATH).iterdir()) @@ -150,8 +161,8 @@ def check_notebook_output(notebook_path, env="python3", ignore_whitespace=False) report = check_notebook_output(notebook_path, env=args.env, ignore_whitespace=args.ignore_whitespace) reports.append(report) else: - # predownload datasets/vectorizers to prevent parallel download - inject_shared_download() + # install packages and predownload datasets/vectorizers to prevent parallel download + init(notebook_paths) with multiprocess.Pool(num_proc) as pool: reports = pool.map(partial(check_notebook_output, 
env=args.env, ignore_whitespace=args.ignore_whitespace), notebook_paths) diff --git a/docs/source/walkthrough.rst b/docs/source/walkthrough.rst index fa346a61..cc573c28 100644 --- a/docs/source/walkthrough.rst +++ b/docs/source/walkthrough.rst @@ -74,24 +74,22 @@ Converting a dataset from 🤗 datasets into Podium requires some work from your .. code-block:: python >>> import os - >>> from contextlib import redirect_stdout >>> from pprint import pprint >>> >>> from datasets import load_dataset >>> - >>> # Silence download logs - >>> with redirect_stdout(open(os.devnull, "w")): - >>> # Loading a huggingface dataset returns an instance of DatasetDict - >>> # which contains the dataset splits (usually: train, valid, test) - >>> imdb = load_dataset('imdb') + >>> # Loading a huggingface dataset returns an instance of DatasetDict + >>> # which contains the dataset splits (usually: train, valid, test) + >>> imdb = load_dataset('imdb') >>> >>> print(imdb.keys()) dict_keys(['train', 'test', 'unsupervised']) >>> >>> # Each dataset has a set of features which need to be mapped >>> # to Podium Fields. - >>> print(imdb['train'].features) - {'text': Value(dtype='string', id=None), 'label': ClassLabel(num_classes=2, names=['neg', 'pos'], names_file=None, id=None)} + >>> pprint(imdb['train'].features) + {'label': ClassLabel(num_classes=2, names=['neg', 'pos'], names_file=None, id=None), + 'text': Value(dtype='string', id=None)} As is the case with loading your custom dataset, ``features`` of 🤗 datasets need to be mapped to Podium Fields in order to direct the data flow for preprocessing.