From 19af5879cfdbf953a0de77384016b054adb4c279 Mon Sep 17 00:00:00 2001 From: George Zelenfroind Date: Mon, 30 Jun 2025 00:08:33 -0700 Subject: [PATCH 01/11] fix tests (failed because of certs, and fixed jiwer version) Signed-off-by: George Zelenfroind --- .github/workflows/tests.yml | 6 ++++++ requirements/main.txt | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 2ad8e665..f1e4860a 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -75,7 +75,10 @@ jobs: pip install nemo-toolkit[asr,nlp]==1.23.0 pip install nemo_text_processing pip install -r requirements/huggingface.txt + pip install certifi + export SSL_CERT_FILE=$(python -m certifi) python -m pip cache purge + - name: Run all tests env: @@ -83,6 +86,9 @@ jobs: AWS_ACCESS_KEY: ${{ secrets.AWS_ACCESS_KEY }} CLEAN_UP_TMP_PATH: 1 run: | + wget https://uit.stanford.edu/sites/default/files/2023/10/11/incommon-rsa-ca2.pem + sudo cp incommon-rsa-ca2.pem /usr/local/share/ca-certificates/incommon-rsa-server-ca-2.crt + sudo update-ca-certificates set -o pipefail # this will make sure next line returns non-0 exit code if tests fail python -m pytest tests/ --junitxml=pytest.xml --ignore=tests/test_tts_sdp_end_to_end.py --cov-report=term-missing:skip-covered --cov=sdp --durations=30 -rs | tee pytest-coverage.txt diff --git a/requirements/main.txt b/requirements/main.txt index d133867a..74ce0255 100644 --- a/requirements/main.txt +++ b/requirements/main.txt @@ -18,7 +18,7 @@ python-docx pydub dask distributed - +jiwer>=3.1.0,<4.0.0 # toloka-kit # Temporarily disabled due to Toloka's technical pause; keep as reference for past and future API support # for some processers, additionally https://github.com/NVIDIA/NeMo is required # for some processers, additionally nemo_text_processing is required From 8fcdd13d458d4b81b3726142fc488a662bbc2cdc Mon Sep 17 00:00:00 2001 From: George Zelenfroind Date: Wed, 2 Jul 2025 06:33:19 
-0700 Subject: [PATCH 02/11] add comments and address doc problems Signed-off-by: George Zelenfroind --- .github/workflows/doc-build.yml | 5 +++++ .github/workflows/tests.yml | 8 ++++---- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/.github/workflows/doc-build.yml b/.github/workflows/doc-build.yml index 694cc7f8..663f1dbd 100644 --- a/.github/workflows/doc-build.yml +++ b/.github/workflows/doc-build.yml @@ -34,6 +34,11 @@ jobs: run: | python -m pip install --upgrade pip pip install -r requirements/docs.txt + - name: Get Certificate for COORAL + run: | + wget https://uit.stanford.edu/sites/default/files/2023/10/11/incommon-rsa-ca2.pem #downloading cert manually [for CORAL] + sudo cp incommon-rsa-ca2.pem /usr/local/share/ca-certificates/incommon-rsa-server-ca-2.crt # [cert for CORAL] + sudo update-ca-certificates # [cert for CORAL] - name: Build docs with sphinx run: | cd docs && make clean && make html diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index f1e4860a..d1258617 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -75,7 +75,7 @@ jobs: pip install nemo-toolkit[asr,nlp]==1.23.0 pip install nemo_text_processing pip install -r requirements/huggingface.txt - pip install certifi + pip install certifi #this needed to avoid problems with certificates [COORAL] export SSL_CERT_FILE=$(python -m certifi) python -m pip cache purge @@ -86,9 +86,9 @@ jobs: AWS_ACCESS_KEY: ${{ secrets.AWS_ACCESS_KEY }} CLEAN_UP_TMP_PATH: 1 run: | - wget https://uit.stanford.edu/sites/default/files/2023/10/11/incommon-rsa-ca2.pem - sudo cp incommon-rsa-ca2.pem /usr/local/share/ca-certificates/incommon-rsa-server-ca-2.crt - sudo update-ca-certificates + wget https://uit.stanford.edu/sites/default/files/2023/10/11/incommon-rsa-ca2.pem #downloading cert manually [for CORAL] + sudo cp incommon-rsa-ca2.pem /usr/local/share/ca-certificates/incommon-rsa-server-ca-2.crt # [cert for CORAL] + sudo update-ca-certificates # [cert for 
CORAL] set -o pipefail # this will make sure next line returns non-0 exit code if tests fail python -m pytest tests/ --junitxml=pytest.xml --ignore=tests/test_tts_sdp_end_to_end.py --cov-report=term-missing:skip-covered --cov=sdp --durations=30 -rs | tee pytest-coverage.txt From b69b2289e868440d6e94209a2782ec4c5a109f94 Mon Sep 17 00:00:00 2001 From: George Zelenfroind Date: Wed, 2 Jul 2025 08:33:46 -0700 Subject: [PATCH 03/11] update link in docs to mirror Signed-off-by: George Zelenfroind --- .github/workflows/doc-build.yml | 5 ----- dataset_configs/english/coraal/config.yaml | 2 +- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/.github/workflows/doc-build.yml b/.github/workflows/doc-build.yml index 663f1dbd..694cc7f8 100644 --- a/.github/workflows/doc-build.yml +++ b/.github/workflows/doc-build.yml @@ -34,11 +34,6 @@ jobs: run: | python -m pip install --upgrade pip pip install -r requirements/docs.txt - - name: Get Certificate for COORAL - run: | - wget https://uit.stanford.edu/sites/default/files/2023/10/11/incommon-rsa-ca2.pem #downloading cert manually [for CORAL] - sudo cp incommon-rsa-ca2.pem /usr/local/share/ca-certificates/incommon-rsa-server-ca-2.crt # [cert for CORAL] - sudo update-ca-certificates # [cert for CORAL] - name: Build docs with sphinx run: | cd docs && make clean && make html diff --git a/dataset_configs/english/coraal/config.yaml b/dataset_configs/english/coraal/config.yaml index ec908c59..5d106083 100644 --- a/dataset_configs/english/coraal/config.yaml +++ b/dataset_configs/english/coraal/config.yaml @@ -18,7 +18,7 @@ documentation: | This config performs the following data processing. 1. Downloads CORAAL data based on the - `official file list `_. + `official file list `_. #Official mirror link There are a couple of errors in the links there, which are fixed in our code. 2. Drops all utterances which contain only pauses. Set ``drop_pauses=False`` to undo. 3. 
Groups all consecutive segments from the same speaker until 20 seconds duration From 556138978c4c3a6257a96a024ec7d9a5b393960f Mon Sep 17 00:00:00 2001 From: George Zelenfroind Date: Wed, 2 Jul 2025 09:07:07 -0700 Subject: [PATCH 04/11] switch to https Signed-off-by: George Zelenfroind --- dataset_configs/english/coraal/config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dataset_configs/english/coraal/config.yaml b/dataset_configs/english/coraal/config.yaml index 5d106083..b2293dd2 100644 --- a/dataset_configs/english/coraal/config.yaml +++ b/dataset_configs/english/coraal/config.yaml @@ -18,7 +18,7 @@ documentation: | This config performs the following data processing. 1. Downloads CORAAL data based on the - `official file list `_. #Official mirror link + `official file list `_. #Official mirror link There are a couple of errors in the links there, which are fixed in our code. 2. Drops all utterances which contain only pauses. Set ``drop_pauses=False`` to undo. 3. 
Groups all consecutive segments from the same speaker until 20 seconds duration From 2b98e423c4b47fa3a884a9a513b208883e95028a Mon Sep 17 00:00:00 2001 From: George Zelenfroind Date: Wed, 2 Jul 2025 09:17:06 -0700 Subject: [PATCH 05/11] add cert manually Signed-off-by: George Zelenfroind --- .github/workflows/doc-build.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/doc-build.yml b/.github/workflows/doc-build.yml index 694cc7f8..4ae6346f 100644 --- a/.github/workflows/doc-build.yml +++ b/.github/workflows/doc-build.yml @@ -36,6 +36,9 @@ jobs: pip install -r requirements/docs.txt - name: Build docs with sphinx run: | + wget https://uit.stanford.edu/sites/default/files/2023/10/11/incommon-rsa-ca2.pem #downloading cert manually [for CORAL] + sudo cp incommon-rsa-ca2.pem /usr/local/share/ca-certificates/incommon-rsa-server-ca-2.crt # [cert for CORAL] + sudo update-ca-certificates # [cert for CORAL] cd docs && make clean && make html - name: Upload artifact From 80b5014ea1f5f65ad075fe9d2940b607ff902e12 Mon Sep 17 00:00:00 2001 From: George Zelenfroind Date: Wed, 2 Jul 2025 09:39:16 -0700 Subject: [PATCH 06/11] update all links in the file Signed-off-by: George Zelenfroind --- .../datasets/coraal/create_initial_manifest.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sdp/processors/datasets/coraal/create_initial_manifest.py b/sdp/processors/datasets/coraal/create_initial_manifest.py index 16aa166a..3dcd0c2f 100644 --- a/sdp/processors/datasets/coraal/create_initial_manifest.py +++ b/sdp/processors/datasets/coraal/create_initial_manifest.py @@ -31,15 +31,15 @@ def get_coraal_url_list(): There are a few mistakes in the official url list that are fixed here. Can be overridden by tests to select a subset of urls. 
""" - dataset_url = "http://lingtools.uoregon.edu/coraal/coraal_download_list.txt" + dataset_url = "https://lingtools.uoregon.edu/coraal/coraal_download_list.txt" urls = [] for file_url in urllib.request.urlopen(dataset_url): file_url = file_url.decode('utf-8').strip() # fixing known errors in the urls - if file_url == 'http://lingtools.uoregon.edu/coraal/les/2021.07/LES_metadata_2018.10.06.txt': - file_url = 'http://lingtools.uoregon.edu/coraal/les/2021.07/LES_metadata_2021.07.txt' - if file_url == 'http://lingtools.uoregon.edu/coraal/vld/2021.07/VLD_metadata_2018.10.06.txt': - file_url = 'http://lingtools.uoregon.edu/coraal/vld/2021.07/VLD_metadata_2021.07.txt' + if file_url == 'https://lingtools.uoregon.edu/coraal/les/2021.07/LES_metadata_2018.10.06.txt': + file_url = 'https://lingtools.uoregon.edu/coraal/les/2021.07/LES_metadata_2021.07.txt' + if file_url == 'https://lingtools.uoregon.edu/coraal/vld/2021.07/VLD_metadata_2018.10.06.txt': + file_url = 'https://lingtools.uoregon.edu/coraal/vld/2021.07/VLD_metadata_2021.07.txt' urls.append(file_url) return urls From b3d853f7955624a1704c0a29511bd4b15e737615 Mon Sep 17 00:00:00 2001 From: George Zelenfroind Date: Wed, 2 Jul 2025 09:55:24 -0700 Subject: [PATCH 07/11] cert manual download to correct file Signed-off-by: George Zelenfroind --- .github/workflows/doc-build.yml | 4 ++-- .github/workflows/tests.yml | 3 +++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/workflows/doc-build.yml b/.github/workflows/doc-build.yml index 4ae6346f..c860e1a2 100644 --- a/.github/workflows/doc-build.yml +++ b/.github/workflows/doc-build.yml @@ -36,8 +36,8 @@ jobs: pip install -r requirements/docs.txt - name: Build docs with sphinx run: | - wget https://uit.stanford.edu/sites/default/files/2023/10/11/incommon-rsa-ca2.pem #downloading cert manually [for CORAL] - sudo cp incommon-rsa-ca2.pem /usr/local/share/ca-certificates/incommon-rsa-server-ca-2.crt # [cert for CORAL] + wget -q 
http://crt.usertrust.com/InCommonRSAServerCA_2.crt -O incommon-rsa-server-ca-2.pem # [cert for CORAL] + sudo cp incommon-rsa-server-ca-2.pem /usr/local/share/ca-certificates/incommon-rsa-server-ca-2.crt # [cert for CORAL] sudo update-ca-certificates # [cert for CORAL] cd docs && make clean && make html diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index d1258617..01454301 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -29,6 +29,9 @@ jobs: # we are being quite strict here, but hopefully that will not be too inconvenient - name: Checking that documentation builds with no warnings and all links are working run: | + wget -q http://crt.usertrust.com/InCommonRSAServerCA_2.crt -O incommon-rsa-server-ca-2.pem #[cert for CORAL] + sudo cp incommon-rsa-server-ca-2.pem /usr/local/share/ca-certificates/incommon-rsa-server-ca-2.crt #[cert for CORAL] + sudo update-ca-certificates # [cert for CORAL] cd docs && make clean && make html SPHINXOPTS="-b linkcheck -W --keep-going -n" no-nemo-tests: From 3522a893028a0fd939dc5d6d39db8ade23baf33f Mon Sep 17 00:00:00 2001 From: George Zelenfroind Date: Wed, 2 Jul 2025 10:02:29 -0700 Subject: [PATCH 08/11] download another cert Signed-off-by: George Zelenfroind --- .github/workflows/doc-build.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/doc-build.yml b/.github/workflows/doc-build.yml index c860e1a2..4ae6346f 100644 --- a/.github/workflows/doc-build.yml +++ b/.github/workflows/doc-build.yml @@ -36,8 +36,8 @@ jobs: pip install -r requirements/docs.txt - name: Build docs with sphinx run: | - wget -q http://crt.usertrust.com/InCommonRSAServerCA_2.crt -O incommon-rsa-server-ca-2.pem # [cert for CORAL] - sudo cp incommon-rsa-server-ca-2.pem /usr/local/share/ca-certificates/incommon-rsa-server-ca-2.crt # [cert for CORAL] + wget https://uit.stanford.edu/sites/default/files/2023/10/11/incommon-rsa-ca2.pem #downloading cert manually [for CORAL] + sudo 
cp incommon-rsa-ca2.pem /usr/local/share/ca-certificates/incommon-rsa-server-ca-2.crt # [cert for CORAL] sudo update-ca-certificates # [cert for CORAL] cd docs && make clean && make html From 3a71240bc7f8dcda7a3ca4785e14556826e7070e Mon Sep 17 00:00:00 2001 From: George Zelenfroind Date: Wed, 2 Jul 2025 10:17:13 -0700 Subject: [PATCH 09/11] update all certs Signed-off-by: George Zelenfroind --- .github/workflows/doc-build.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/doc-build.yml b/.github/workflows/doc-build.yml index 4ae6346f..c37b935c 100644 --- a/.github/workflows/doc-build.yml +++ b/.github/workflows/doc-build.yml @@ -36,9 +36,9 @@ jobs: pip install -r requirements/docs.txt - name: Build docs with sphinx run: | - wget https://uit.stanford.edu/sites/default/files/2023/10/11/incommon-rsa-ca2.pem #downloading cert manually [for CORAL] - sudo cp incommon-rsa-ca2.pem /usr/local/share/ca-certificates/incommon-rsa-server-ca-2.crt # [cert for CORAL] - sudo update-ca-certificates # [cert for CORAL] + sudo apt-get update + sudo apt-get install -y ca-certificates + sudo update-ca-certificates cd docs && make clean && make html - name: Upload artifact From 5d6d594ef473c0c6c3f5c88c8ebb5bdca01ca764 Mon Sep 17 00:00:00 2001 From: George Zelenfroind Date: Wed, 2 Jul 2025 10:52:43 -0700 Subject: [PATCH 10/11] add ignoring of coraal link check Signed-off-by: George Zelenfroind --- .github/workflows/doc-build.yml | 3 --- .github/workflows/tests.yml | 3 --- docs/src/conf.py | 4 ++++ 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/.github/workflows/doc-build.yml b/.github/workflows/doc-build.yml index c37b935c..694cc7f8 100644 --- a/.github/workflows/doc-build.yml +++ b/.github/workflows/doc-build.yml @@ -36,9 +36,6 @@ jobs: pip install -r requirements/docs.txt - name: Build docs with sphinx run: | - sudo apt-get update - sudo apt-get install -y ca-certificates - sudo update-ca-certificates cd docs && make clean && 
make html - name: Upload artifact diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 01454301..d1258617 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -29,6 +29,3 @@ jobs: # we are being quite strict here, but hopefully that will not be too inconvenient - name: Checking that documentation builds with no warnings and all links are working run: | - wget -q http://crt.usertrust.com/InCommonRSAServerCA_2.crt -O incommon-rsa-server-ca-2.pem #[cert for CORAL] - sudo cp incommon-rsa-server-ca-2.pem /usr/local/share/ca-certificates/incommon-rsa-server-ca-2.crt #[cert for CORAL] - sudo update-ca-certificates # [cert for CORAL] cd docs && make clean && make html SPHINXOPTS="-b linkcheck -W --keep-going -n" no-nemo-tests: diff --git a/docs/src/conf.py b/docs/src/conf.py index 9a5fb330..7edcbf53 100644 --- a/docs/src/conf.py +++ b/docs/src/conf.py @@ -189,3 +189,7 @@ def setup(app): ] # nitpick_ignore_regex = [('py:class', '*')] +#adding this especially for coraal, temporary +linkcheck_ignore = [ + r'http://lingtools\.uoregon\.edu/coraal/coraal_download_list\.txt', +] \ No newline at end of file From 31c80dd271fb22b91c2225dbeec723fa42959877 Mon Sep 17 00:00:00 2001 From: George Zelenfroind Date: Wed, 2 Jul 2025 11:02:22 -0700 Subject: [PATCH 11/11] http to https MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: George Zelenfroind --- docs/src/conf.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/src/conf.py b/docs/src/conf.py index 7edcbf53..29269d37 100644 --- a/docs/src/conf.py +++ b/docs/src/conf.py @@ -191,5 +191,6 @@ def setup(app): #adding this especially for coraal, temporary linkcheck_ignore = [ - r'http://lingtools\.uoregon\.edu/coraal/coraal_download_list\.txt', -] \ No newline at end of file + r'https://lingtools\.uoregon\.edu/coraal/coraal_download_list\.txt', +] +# 
https://lingtools.uoregon.edu/coraal/coraal_download_list.txt \ No newline at end of file