diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 2ad8e665..d1258617 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -75,7 +75,10 @@ jobs: pip install nemo-toolkit[asr,nlp]==1.23.0 pip install nemo_text_processing pip install -r requirements/huggingface.txt + pip install certifi # this is needed to avoid problems with certificates [CORAAL] + export SSL_CERT_FILE=$(python -m certifi) python -m pip cache purge + - name: Run all tests env: @@ -83,6 +86,9 @@ jobs: AWS_ACCESS_KEY: ${{ secrets.AWS_ACCESS_KEY }} CLEAN_UP_TMP_PATH: 1 run: | + wget https://uit.stanford.edu/sites/default/files/2023/10/11/incommon-rsa-ca2.pem # downloading cert manually [for CORAAL] + sudo cp incommon-rsa-ca2.pem /usr/local/share/ca-certificates/incommon-rsa-server-ca-2.crt # [cert for CORAAL] + sudo update-ca-certificates # [cert for CORAAL] set -o pipefail # this will make sure next line returns non-0 exit code if tests fail python -m pytest tests/ --junitxml=pytest.xml --ignore=tests/test_tts_sdp_end_to_end.py --cov-report=term-missing:skip-covered --cov=sdp --durations=30 -rs | tee pytest-coverage.txt diff --git a/dataset_configs/english/coraal/config.yaml b/dataset_configs/english/coraal/config.yaml index ec908c59..b2293dd2 100644 --- a/dataset_configs/english/coraal/config.yaml +++ b/dataset_configs/english/coraal/config.yaml @@ -18,7 +18,7 @@ documentation: | This config performs the following data processing. 1. Downloads CORAAL data based on the - `official file list <http://lingtools.uoregon.edu/coraal/coraal_download_list.txt>`_. + `official file list <https://lingtools.uoregon.edu/coraal/coraal_download_list.txt>`_. #Official mirror link There are a couple of errors in the links there, which are fixed in our code. 2. Drops all utterances which contain only pauses. Set ``drop_pauses=False`` to undo. 3. 
Groups all consecutive segments from the same speaker until 20 seconds duration diff --git a/docs/src/conf.py b/docs/src/conf.py index 9a5fb330..29269d37 100644 --- a/docs/src/conf.py +++ b/docs/src/conf.py @@ -189,3 +189,8 @@ def setup(app): ] # nitpick_ignore_regex = [('py:class', '*')] +# Temporary: skip link checking for the CORAAL download list URL +linkcheck_ignore = [ + r'https://lingtools\.uoregon\.edu/coraal/coraal_download_list\.txt', +] +# https://lingtools.uoregon.edu/coraal/coraal_download_list.txt \ No newline at end of file diff --git a/requirements/main.txt b/requirements/main.txt index d133867a..74ce0255 100644 --- a/requirements/main.txt +++ b/requirements/main.txt @@ -18,7 +18,7 @@ python-docx pydub dask distributed - +jiwer>=3.1.0,<4.0.0 # toloka-kit # Temporarily disabled due to Toloka's technical pause; keep as reference for past and future API support # for some processers, additionally https://github.com/NVIDIA/NeMo is required # for some processers, additionally nemo_text_processing is required diff --git a/sdp/processors/datasets/coraal/create_initial_manifest.py b/sdp/processors/datasets/coraal/create_initial_manifest.py index 16aa166a..3dcd0c2f 100644 --- a/sdp/processors/datasets/coraal/create_initial_manifest.py +++ b/sdp/processors/datasets/coraal/create_initial_manifest.py @@ -31,15 +31,15 @@ def get_coraal_url_list(): There are a few mistakes in the official url list that are fixed here. Can be overridden by tests to select a subset of urls. 
""" - dataset_url = "http://lingtools.uoregon.edu/coraal/coraal_download_list.txt" + dataset_url = "https://lingtools.uoregon.edu/coraal/coraal_download_list.txt" urls = [] for file_url in urllib.request.urlopen(dataset_url): file_url = file_url.decode('utf-8').strip() # fixing known errors in the urls - if file_url == 'http://lingtools.uoregon.edu/coraal/les/2021.07/LES_metadata_2018.10.06.txt': - file_url = 'http://lingtools.uoregon.edu/coraal/les/2021.07/LES_metadata_2021.07.txt' - if file_url == 'http://lingtools.uoregon.edu/coraal/vld/2021.07/VLD_metadata_2018.10.06.txt': - file_url = 'http://lingtools.uoregon.edu/coraal/vld/2021.07/VLD_metadata_2021.07.txt' + if file_url == 'https://lingtools.uoregon.edu/coraal/les/2021.07/LES_metadata_2018.10.06.txt': + file_url = 'https://lingtools.uoregon.edu/coraal/les/2021.07/LES_metadata_2021.07.txt' + if file_url == 'https://lingtools.uoregon.edu/coraal/vld/2021.07/VLD_metadata_2018.10.06.txt': + file_url = 'https://lingtools.uoregon.edu/coraal/vld/2021.07/VLD_metadata_2021.07.txt' urls.append(file_url) return urls