From 2ef44f8d7758b846dc6e66cb620b370fccc6b818 Mon Sep 17 00:00:00 2001 From: Ryan Date: Tue, 22 Apr 2025 11:10:25 -0700 Subject: [PATCH 1/6] Add processors for downloading HiFiTTS2 dataset Signed-off-by: Ryan --- .../english/hifitts2/config_22khz.yaml | 60 ++++++++ .../english/hifitts2/config_44khz.yaml | 60 ++++++++ docs/src/sdp/existing_configs.rst | 13 ++ sdp/processors/__init__.py | 2 + sdp/processors/datasets/hifitts2/__init__.py | 0 .../datasets/hifitts2/download_dataset.py | 134 ++++++++++++++++++ .../hifitts2/remove_failed_chapters.py | 58 ++++++++ 7 files changed, 327 insertions(+) create mode 100644 dataset_configs/english/hifitts2/config_22khz.yaml create mode 100644 dataset_configs/english/hifitts2/config_44khz.yaml create mode 100644 sdp/processors/datasets/hifitts2/__init__.py create mode 100644 sdp/processors/datasets/hifitts2/download_dataset.py create mode 100644 sdp/processors/datasets/hifitts2/remove_failed_chapters.py diff --git a/dataset_configs/english/hifitts2/config_22khz.yaml b/dataset_configs/english/hifitts2/config_22khz.yaml new file mode 100644 index 00000000..e7d5eed7 --- /dev/null +++ b/dataset_configs/english/hifitts2/config_22khz.yaml @@ -0,0 +1,60 @@ +documentation: | + HiFiTTS-2 22kHz + ################## + + This config can be used to download the audio data for HiFiTTS-2 22kHz. + + 1. Downloads HiFiTTS-2 audio from LibriVox. + 2. Outputs a new manifest in which LibriVox audiobook chapters which could not be downloaded (e.g. because they + were removed from the website) are removed. + + **Required arguments**. + + * **workspace_dir**: specify the workspace folder where all audio files and manifests will be stored. + + Note that you can customize any part of this config either directly or from command-line. + + **Output format**. + + This config outputs 2 manifest files: + + * ``${workspace_dir}/errors.json`` - entries from the input chapters file which failed to download from LibriVox. 
+ * ``${workspace_dir}/manifest_22khz_filtered`` - input manifest file without utterances from failed chapters. + +processors_to_run: all +workspace_dir: ??? +manifest_filename: manifest_22khz.json +output_filename: manifest_22khz_filtered.json +chapter_filename: chapters_22khz.json +error_filename: errors_22khz.json +sample_rate: 22050 +delete_chapter_files: true +exit_on_error: false +use_dask: false +max_workers: 8 +chunksize: 50 + +input_manifest_file: ${workspace_dir}/${manifest_filename} +output_manifest_file: ${workspace_dir}/${output_filename} +chapter_file: ${workspace_dir}/${chapter_filename} +error_file: ${workspace_dir}/${error_filename} +audio_dir: ${workspace_dir}/audio +chapter_dir: ${workspace_dir}/chapters + +processors: + - _target_: sdp.processors.DownloadHiFiTTS2 + audio_dir: ${audio_dir} + chapter_dir: ${chapter_dir} + sample_rate: ${sample_rate} + delete_chapter_files: ${delete_chapter_files} + exit_on_error: ${exit_on_error} + input_manifest_file: ${chapter_file} + output_manifest_file: ${error_file} + use_dask: ${use_dask} + max_workers: ${max_workers} + chunksize: ${chunksize} + + - _target_: sdp.processors.RemovedFailedChapters + input_manifest_file: ${input_manifest_file} + output_manifest_file: ${output_manifest_file} + error_file: ${error_file} diff --git a/dataset_configs/english/hifitts2/config_44khz.yaml b/dataset_configs/english/hifitts2/config_44khz.yaml new file mode 100644 index 00000000..acb55585 --- /dev/null +++ b/dataset_configs/english/hifitts2/config_44khz.yaml @@ -0,0 +1,60 @@ +documentation: | + HiFiTTS-2 44kHz + ################## + + This config can be used to download the audio data for HiFiTTS-2 44kHz. + + 1. Downloads HiFiTTS-2 audio from LibriVox. + 2. Outputs a new manifest in which LibriVox audiobook chapters which could not be downloaded (e.g. because they + were removed from the website) are removed. + + **Required arguments**. 
+ + * **workspace_dir**: specify the workspace folder where all audio files and manifests will be stored. + + Note that you can customize any part of this config either directly or from command-line. + + **Output format**. + + This config outputs 2 manifest files: + + * ``${workspace_dir}/errors.json`` - entries from the input chapters file which failed to download from LibriVox. + * ``${workspace_dir}/manifest_44khz_filtered`` - input manifest file without utterances from failed chapters. + +processors_to_run: all +workspace_dir: ??? +manifest_filename: manifest_44khz.json +output_filename: manifest_44khz_filtered.json +chapter_filename: chapters_44khz.json +error_filename: errors_44khz.json +sample_rate: 44100 +delete_chapter_files: true +exit_on_error: false +use_dask: false +max_workers: 8 +chunksize: 50 + +input_manifest_file: ${workspace_dir}/${manifest_filename} +output_manifest_file: ${workspace_dir}/${output_filename} +chapter_file: ${workspace_dir}/${chapter_filename} +error_file: ${workspace_dir}/${error_filename} +audio_dir: ${workspace_dir}/audio +chapter_dir: ${workspace_dir}/chapters + +processors: + - _target_: sdp.processors.DownloadHiFiTTS2 + audio_dir: ${audio_dir} + chapter_dir: ${chapter_dir} + sample_rate: ${sample_rate} + delete_chapter_files: ${delete_chapter_files} + exit_on_error: ${exit_on_error} + input_manifest_file: ${chapter_file} + output_manifest_file: ${error_file} + use_dask: ${use_dask} + max_workers: ${max_workers} + chunksize: ${chunksize} + + - _target_: sdp.processors.RemovedFailedChapters + input_manifest_file: ${input_manifest_file} + output_manifest_file: ${output_manifest_file} + error_file: ${error_file} diff --git a/docs/src/sdp/existing_configs.rst b/docs/src/sdp/existing_configs.rst index f8265816..652e633c 100644 --- a/docs/src/sdp/existing_configs.rst +++ b/docs/src/sdp/existing_configs.rst @@ -373,3 +373,16 @@ Armenian Toloka config-docs/armenian/toloka/pipeline_validate_answers 
config-docs/armenian/toloka/pipeline_get_final_res +HiFiTTS-2 +~~~~~~~~~~~~~~~~~~~~~~~ + +**Dataset link:** TODO + +`config `__ | +:doc:`documentation ` + +.. toctree:: + :hidden: + + config-docs/english/hifitts2/config_22khz + config-docs/english/hifitts2/config_44khz diff --git a/sdp/processors/__init__.py b/sdp/processors/__init__.py index 6788c88f..e71b7dff 100644 --- a/sdp/processors/__init__.py +++ b/sdp/processors/__init__.py @@ -24,6 +24,8 @@ from sdp.processors.datasets.fleurs.create_initial_manifest import ( CreateInitialManifestFleurs, ) +from sdp.processors.datasets.hifitts2.download_dataset import DownloadHiFiTTS2 +from sdp.processors.datasets.hifitts2.remove_failed_chapters import RemovedFailedChapters from sdp.processors.datasets.uzbekvoice.create_initial_manifest import ( CreateInitialManifestUzbekvoice, ) diff --git a/sdp/processors/datasets/hifitts2/__init__.py b/sdp/processors/datasets/hifitts2/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/sdp/processors/datasets/hifitts2/download_dataset.py b/sdp/processors/datasets/hifitts2/download_dataset.py new file mode 100644 index 00000000..0c7e0b61 --- /dev/null +++ b/sdp/processors/datasets/hifitts2/download_dataset.py @@ -0,0 +1,134 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import json +import librosa +from pathlib import Path +import soundfile as sf +import time +import urllib.error +import urllib.request + +from sdp.processors.base_processor import BaseParallelProcessor, DataEntry + + +class DownloadHiFiTTS2(BaseParallelProcessor): + """ + Downloads HiFiTTS-2 dataset to local machine. Unsegmented audiobook chapters are first downloaded at + 48 kHz from LibriVox. Each chapter is then split into segmented utterance files based on precomputed + offsets and durations. + + To reduce disk use, the chapter files can be optionally deleted after they are segmented. + + Metadata for chapters which fail to download due to network errors are stored in an output manifest file, + which can be given as input to this processor to attempt the downloads again. + + Args: + audio_dir (str): Root directory where utterance files will be saved. + chapter_dir (str): Root directory where audiobook chapter files will be saved. + sample_rate (int): Sample rate to use for utterance files. + delete_chapter_files (bool): Whether to delete each chapter file after it is done being processed. + exit_on_error (bool): Whether to terminate the entire processor script if a single chapter download fails. + num_retries (int): Number of times to retry chapter download after encountering intermittent HTTP errors. + + Returns: + Utterance files are stored under 'audio_dir' and chapter files are downloaded under 'chapter_dir'. + + If exit_on_error is False, then an output manifest will be saved with manifest entries that fail to downlaod, + with error information stored under the 'error_code' and 'error_reason' fields. 
+ """ + + def __init__( + self, + audio_dir: str, + chapter_dir: str, + sample_rate: int = 22050, + delete_chapter_files: bool = True, + exit_on_error: bool = False, + num_retries: int = 5, + **kwargs, + ): + super().__init__(**kwargs) + self.audio_dir = Path(audio_dir) + self.chapter_dir = Path(chapter_dir) + self.sample_rate = sample_rate + self.delete_chapter_files = delete_chapter_files + self.exit_on_error = exit_on_error + self.num_retries = num_retries + + def prepare(self): + # Create output directory structure + with open(self.input_manifest_file, "rt", encoding="utf-8") as fin: + dirs = set() + for line in fin: + row = json.loads(line) + audio_filepath = Path(row["utterances"][0]["audio_filepath"]) + chapter_dir = audio_filepath.parent + dirs.add(chapter_dir) + + for dir in dirs: + audio_dir = self.audio_dir / dir + chapter_dir = self.chapter_dir / dir + audio_dir.mkdir(exist_ok=True, parents=True) + chapter_dir.mkdir(exist_ok=True, parents=True) + + return + + def process_dataset_entry(self, data_entry): + url = data_entry["url"] + chapter_filepath = data_entry["chapter_filepath"] + utterances = data_entry["utterances"] + + chapter_path = self.chapter_dir / chapter_filepath + for i in range(1, self.num_retries + 1): + try: + urllib.request.urlretrieve(url=url, filename=chapter_path) + break + except (urllib.error.HTTPError, urllib.error.URLError) as http_error: + error_msg = f"Encountered HTTP error when downloading {url}: {http_error}" + print(error_msg) + if str(http_error.code).startswith("5") and i < self.num_retries: + print(f"Retry {i} for url {url}") + time.sleep(10) + continue + + if self.exit_on_error: + raise RuntimeError(error_msg) + + error_data = { + "url": url, + "chapter_filepath": chapter_filepath, + "error_code": http_error.code, + "error_reason": http_error.reason, + "utterances": utterances, + } + return [DataEntry(data=error_data)] + + chapter_audio, sr = librosa.load(path=chapter_path, sr=self.sample_rate) + + for utt in utterances: 
+ audio_filepath = utt["audio_filepath"] + audio_path = self.audio_dir / audio_filepath + offset = utt["offset"] + dur = utt["duration"] + start_sample = librosa.time_to_samples(offset, sr=sr) + end_sample = librosa.time_to_samples(offset + dur, sr=sr) + audio = chapter_audio[start_sample:end_sample] + sf.write(file=audio_path, data=audio, samplerate=int(sr)) + + if self.delete_chapter_files: + chapter_path.unlink() + + return [] diff --git a/sdp/processors/datasets/hifitts2/remove_failed_chapters.py b/sdp/processors/datasets/hifitts2/remove_failed_chapters.py new file mode 100644 index 00000000..7d6eaf07 --- /dev/null +++ b/sdp/processors/datasets/hifitts2/remove_failed_chapters.py @@ -0,0 +1,58 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import json +from pathlib import Path +from tqdm import tqdm + +from sdp.processors.base_processor import BaseProcessor +from sdp.utils.common import load_manifest + + +class RemovedFailedChapters(BaseProcessor): + """ + Removes all utterances in the input chapter file from the input manifest. This processor is expected to be + run using the file output by the DownloadHiFiTTS2 containing failed chapter downloads. + + Args: + error_file (str): Path to file with chapter download errors. + + Returns: + This outputs a manifest which is the same as its input manifest but with utterances in 'error_file' removed. 
+ """ + + def __init__( + self, + error_file: str, + **kwargs, + ): + super().__init__(**kwargs) + self.error_file = Path(error_file) + + def process(self): + chapter_rows = load_manifest(self.error_file) + audio_files_to_remove = set() + for chapter_row in chapter_rows: + for utt_list in chapter_row["utterances"]: + audio_files_to_remove.add(utt_list["audio_filepath"]) + + rows = load_manifest(Path(self.input_manifest_file)) + with open(self.output_manifest_file, "w", encoding="utf-8") as output_f: + for row in tqdm(rows): + if row["audio_filepath"] in audio_files_to_remove: + continue + + output_line = f"{json.dumps(row, ensure_ascii=False)}\n" + output_f.write(output_line) From 2f63993022c78232bc7390e0140d95badb05f165 Mon Sep 17 00:00:00 2001 From: Ryan Date: Mon, 12 May 2025 10:51:14 -0700 Subject: [PATCH 2/6] Add bandwidth estimation processor from HiFiTTS-2 Signed-off-by: Ryan --- .../english/hifitts2/config_22khz.yaml | 14 ++-- .../english/hifitts2/config_44khz.yaml | 14 ++-- .../english/hifitts2/config_bandwidth.yaml | 44 +++++++++++ docs/src/sdp/api.rst | 14 ++++ sdp/processors/__init__.py | 1 + .../datasets/hifitts2/download_dataset.py | 5 +- sdp/processors/nemo/estimate_bandwidth.py | 77 +++++++++++++++++++ .../prepare_hifitts2_data.py | 71 +++++++++++++++++ tests/test_cfg_end_to_end_tests.py | 26 ++++++- 9 files changed, 248 insertions(+), 18 deletions(-) create mode 100644 dataset_configs/english/hifitts2/config_bandwidth.yaml create mode 100644 sdp/processors/nemo/estimate_bandwidth.py create mode 100644 tests/prepare_test_data/prepare_hifitts2_data.py diff --git a/dataset_configs/english/hifitts2/config_22khz.yaml b/dataset_configs/english/hifitts2/config_22khz.yaml index e7d5eed7..09954bfe 100644 --- a/dataset_configs/english/hifitts2/config_22khz.yaml +++ b/dataset_configs/english/hifitts2/config_22khz.yaml @@ -19,14 +19,16 @@ documentation: | This config outputs 2 manifest files: * ``${workspace_dir}/errors.json`` - entries from the input chapters 
file which failed to download from LibriVox. - * ``${workspace_dir}/manifest_22khz_filtered`` - input manifest file without utterances from failed chapters. + * ``${workspace_dir}/manifest_filtered_22khz`` - input manifest file without utterances from failed chapters. processors_to_run: all workspace_dir: ??? manifest_filename: manifest_22khz.json -output_filename: manifest_22khz_filtered.json +output_filename: manifest_filtered_22khz.json chapter_filename: chapters_22khz.json error_filename: errors_22khz.json +audio_dir_name: audio_22khz +chapter_audio_dir_name: chapters sample_rate: 22050 delete_chapter_files: true exit_on_error: false @@ -35,11 +37,11 @@ max_workers: 8 chunksize: 50 input_manifest_file: ${workspace_dir}/${manifest_filename} -output_manifest_file: ${workspace_dir}/${output_filename} chapter_file: ${workspace_dir}/${chapter_filename} error_file: ${workspace_dir}/${error_filename} -audio_dir: ${workspace_dir}/audio -chapter_dir: ${workspace_dir}/chapters +audio_dir: ${workspace_dir}/${audio_dir_name} +chapter_dir: ${workspace_dir}/${chapter_audio_dir_name} +final_manifest: ${workspace_dir}/${output_filename} processors: - _target_: sdp.processors.DownloadHiFiTTS2 @@ -56,5 +58,5 @@ processors: - _target_: sdp.processors.RemovedFailedChapters input_manifest_file: ${input_manifest_file} - output_manifest_file: ${output_manifest_file} + output_manifest_file: ${final_manifest} error_file: ${error_file} diff --git a/dataset_configs/english/hifitts2/config_44khz.yaml b/dataset_configs/english/hifitts2/config_44khz.yaml index acb55585..33d79f98 100644 --- a/dataset_configs/english/hifitts2/config_44khz.yaml +++ b/dataset_configs/english/hifitts2/config_44khz.yaml @@ -19,14 +19,16 @@ documentation: | This config outputs 2 manifest files: * ``${workspace_dir}/errors.json`` - entries from the input chapters file which failed to download from LibriVox. 
- * ``${workspace_dir}/manifest_44khz_filtered`` - input manifest file without utterances from failed chapters. + * ``${workspace_dir}/manifest_filtered_44khz`` - input manifest file without utterances from failed chapters. processors_to_run: all workspace_dir: ??? manifest_filename: manifest_44khz.json -output_filename: manifest_44khz_filtered.json +output_filename: manifest_filtered_44khz.json chapter_filename: chapters_44khz.json error_filename: errors_44khz.json +audio_dir_name: audio_44khz +chapter_audio_dir_name: chapters sample_rate: 44100 delete_chapter_files: true exit_on_error: false @@ -35,11 +37,11 @@ max_workers: 8 chunksize: 50 input_manifest_file: ${workspace_dir}/${manifest_filename} -output_manifest_file: ${workspace_dir}/${output_filename} chapter_file: ${workspace_dir}/${chapter_filename} error_file: ${workspace_dir}/${error_filename} -audio_dir: ${workspace_dir}/audio -chapter_dir: ${workspace_dir}/chapters +audio_dir: ${workspace_dir}/${audio_dir_name} +chapter_dir: ${workspace_dir}/${chapter_audio_dir_name} +final_manifest: ${workspace_dir}/${output_filename} processors: - _target_: sdp.processors.DownloadHiFiTTS2 @@ -56,5 +58,5 @@ processors: - _target_: sdp.processors.RemovedFailedChapters input_manifest_file: ${input_manifest_file} - output_manifest_file: ${output_manifest_file} + output_manifest_file: ${final_manifest} error_file: ${error_file} diff --git a/dataset_configs/english/hifitts2/config_bandwidth.yaml b/dataset_configs/english/hifitts2/config_bandwidth.yaml new file mode 100644 index 00000000..59e1062e --- /dev/null +++ b/dataset_configs/english/hifitts2/config_bandwidth.yaml @@ -0,0 +1,44 @@ +documentation: | + HiFiTTS2 Bandwidth Estimation + ################## + + This config contains the bandwidth estimation code used for HiFiTTS and HiFiTTS-2. + This config can be used to estimate bandwidth for any dataset. 
For HiFiTTS-2 bandwidth + was estimated using the first 30 seconds of every audiobook chapter, but the estimate is still + reasonably accurate if run over a shorter duration or with individual utterances. + + **Required arguments**. + + * **workspace_dir**: The workspace folder where all audio files and manifests are stored. + * **audio_dir_name**: Folder in workspace containing audio files to estimate bandwidth of. + * **input_manifest_filename**: Manifest file in workspace containing relative paths to audio. + + **Output format**. + + This config outputs a single manifest with the following field(s): + + * **bandwidth (int)**: Estimated bandwidth of the audio file. + +processors_to_run: all +workspace_dir: ??? +audio_dir_name: ??? +input_manifest_filename: ??? +output_manifest_filename: manifest_bandwidth.json +audio_key: audio_filepath +use_dask: false +max_workers: 1 +chunksize: 1 + +input_manifest_file: ${workspace_dir}/${input_manifest_filename} +final_manifest: ${workspace_dir}/${output_manifest_filename} +audio_dir: ${workspace_dir}/${audio_dir_name} + +processors: + - _target_: sdp.processors.EstimateBandwidth + input_manifest_file: ${input_manifest_file} + output_manifest_file: ${final_manifest} + audio_dir: ${audio_dir} + input_audio_key: ${audio_key} + use_dask: ${use_dask} + max_workers: ${max_workers} + chunksize: ${chunksize} diff --git a/docs/src/sdp/api.rst b/docs/src/sdp/api.rst index 4700f860..bfbac5ad 100644 --- a/docs/src/sdp/api.rst +++ b/docs/src/sdp/api.rst @@ -116,6 +116,17 @@ HuggingFace Datasets .. autodata:: sdp.processors.CreateInitialManifestHuggingFace :annotation: + +HiFiTTS-2 +'''''''''''''''''''' + +.. autodata:: sdp.processors.DownloadHiFiTTS2 + :annotation: + +.. autodata:: sdp.processors.RemovedFailedChapters + :annotation: + + Lhotse processors ################# @@ -145,6 +156,9 @@ used in the downstream processing for additional enhancement or filtering. .. autodata:: sdp.processors.ASRTransformers :annotation: +.. 
autodata:: sdp.processors.EstimateBandwidth + :annotation: + Text-only processors #################### diff --git a/sdp/processors/__init__.py b/sdp/processors/__init__.py index e71b7dff..fac13976 100644 --- a/sdp/processors/__init__.py +++ b/sdp/processors/__init__.py @@ -126,6 +126,7 @@ MakeLettersUppercaseAfterPeriod, ) from sdp.processors.nemo.asr_inference import ASRInference +from sdp.processors.nemo.estimate_bandwidth import EstimateBandwidth from sdp.processors.nemo.pc_inference import PCInference from sdp.processors.toloka.accept_if import AcceptIfWERLess from sdp.processors.toloka.create_pool import CreateTolokaPool diff --git a/sdp/processors/datasets/hifitts2/download_dataset.py b/sdp/processors/datasets/hifitts2/download_dataset.py index 0c7e0b61..74c91bb9 100644 --- a/sdp/processors/datasets/hifitts2/download_dataset.py +++ b/sdp/processors/datasets/hifitts2/download_dataset.py @@ -21,6 +21,7 @@ import urllib.error import urllib.request +from sdp.logging import logger from sdp.processors.base_processor import BaseParallelProcessor, DataEntry @@ -98,9 +99,9 @@ def process_dataset_entry(self, data_entry): break except (urllib.error.HTTPError, urllib.error.URLError) as http_error: error_msg = f"Encountered HTTP error when downloading {url}: {http_error}" - print(error_msg) + logger.warning(error_msg) if str(http_error.code).startswith("5") and i < self.num_retries: - print(f"Retry {i} for url {url}") + logger.info(f"Retry {i} for url {url}") time.sleep(10) continue diff --git a/sdp/processors/nemo/estimate_bandwidth.py b/sdp/processors/nemo/estimate_bandwidth.py new file mode 100644 index 00000000..149ff0ba --- /dev/null +++ b/sdp/processors/nemo/estimate_bandwidth.py @@ -0,0 +1,77 @@ +import librosa +import numpy as np +from pathlib import Path + +from sdp.processors.base_processor import BaseParallelProcessor, DataEntry + + +class EstimateBandwidth(BaseParallelProcessor): + """Adds estimated bandwidth to each utterance in the input manifest file. 
+ + Args: + audio_dir (str): Root directory where audio files are stored. + input_audio_key (str): Manifest key with relative audio paths. + output_bandwidth_key (str): Manifest key to store estimated bandwidth in. + max_seconds (float): The maximum length of audio to use for bandwidth estimation. + By default, uses the first 30 seconds. + sample_rate (int): Sample rate to resample audio to before doing bandwidth estimation. + Defaults to 44100, upsampling the input audio as needed. + n_fft (int): Number of FFT bins to use for bandwidth estimation. Defaults to 512. + hop_length (int): Audio frame hop length to use for bandwidth estimation. + Defaults to 441, corresponding to 0.01 seconds for 44100 sample rate. + top_db (float): top_db threshold to use for bandwidth estimation. + frequency_threshold (float): Bandwidth estimation finds the highest frequency with mean power spectrum that is + within 'frequency_threshold' dB of its peak power. Defaults to -50 dB. + + Returns: + This processor estimates the bandwidth of the audio file in the`input_audio_key` field and saves the estimate + in the output_bandwidth_key` field. 
+ """ + + def __init__( + self, + audio_dir: str, + input_audio_key: str = "audio_filepath", + output_bandwidth_key: str = "bandwidth", + max_seconds: float = 30.0, + sample_rate: int = 44100, + n_fft: int = 512, + hop_length: int = 441, + top_db: float = 100.0, + frequency_threshold: float = -50.0, + **kwargs, + ): + super().__init__(**kwargs) + self.audio_directory = Path(audio_dir) + self.input_audio_key = input_audio_key + self.output_bandwidth_key = output_bandwidth_key + self.max_seconds = max_seconds + self.sample_rate = sample_rate + self.n_fft = n_fft + self.hop_length = hop_length + self.top_db = top_db + self.frequency_threshold = frequency_threshold + + def _estimate_bandwidth(self, audio, sample_rate): + spec = librosa.stft(y=audio, n_fft=self.n_fft, hop_length=self.hop_length, window="blackmanharris") + power_spec = np.abs(spec) ** 2 + power_spec = np.mean(power_spec, axis=1) + power_spec = librosa.power_to_db(power_spec, ref=self.n_fft, top_db=self.top_db) + + bandwidth = 0 + peak = np.max(power_spec) + freq_width = sample_rate / self.n_fft + for idx in range(len(power_spec) - 1, -1, -1): + if power_spec[idx] - peak > self.frequency_threshold: + bandwidth = idx * freq_width + break + + return bandwidth + + def process_dataset_entry(self, data_entry): + audio_filename = data_entry[self.input_audio_key] + audio_file = self.audio_directory / audio_filename + audio, sr = librosa.load(path=audio_file, sr=self.sample_rate, duration=self.max_seconds) + bandwidth = self._estimate_bandwidth(audio=audio, sample_rate=sr) + data_entry[self.output_bandwidth_key] = int(bandwidth) + return [DataEntry(data=data_entry)] diff --git a/tests/prepare_test_data/prepare_hifitts2_data.py b/tests/prepare_test_data/prepare_hifitts2_data.py new file mode 100644 index 00000000..93e07132 --- /dev/null +++ b/tests/prepare_test_data/prepare_hifitts2_data.py @@ -0,0 +1,71 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Copies HiFiTTS-2 manifests and audio into a new directory with fewer entries.""" + +import argparse +import json +import os +from pathlib import Path +import shutil + +if __name__ == "__main__": + parser = argparse.ArgumentParser("Preparing HiFiTTS-2 test data") + parser.add_argument( + "--workspace_folder", required=True, type=Path, help="Path to workspace where dataset was downloaded." + ) + parser.add_argument( + "--audio_folder", default="audio_22khz", type=Path, required=False, help="Name of root folder with audio." + ) + parser.add_argument("--test_data_folder", required=True, type=Path, help="Where to place the prepared data") + parser.add_argument( + "--manifest_filename", default="manifest_22khz.json", type=str, required=False, help="Name of manifest manifest." + ) + parser.add_argument( + "--chapters_filename", default="chapters_22khz.json", type=str, required=False, help="Name of chapter manifest." + ) + parser.add_argument( + "--error_filename", default="errors_22khz.json", type=str, required=False, help="Name of chapter error manifest." 
+ ) + parser.add_argument("--num_entries", default=20, type=int, help="How many entries to keep from each manifest") + + args = parser.parse_args() + + files_to_copy = [args.manifest_filename, args.chapters_filename, args.error_filename] + + os.makedirs(args.test_data_folder, exist_ok=True) + # Copy manifest files + for filename in files_to_copy: + input_path = args.workspace_folder / filename + output_path = args.test_data_folder / filename + with open(input_path, "r", encoding="utf-8") as input_f: + with open(output_path, "w", encoding="utf-8") as output_f: + for i, line in enumerate(input_f): + if i >= args.num_entries: + break + output_f.write(line) + + # Copy audio + input_audio_dir = args.workspace_folder / args.audio_folder + output_audio_dir = args.test_data_folder / args.audio_folder + with open(args.manifest_filename, "r", encoding="utf-8") as input_f: + for i, line in enumerate(input_f): + if i >= args.num_entries: + break + row = json.loads(line) + audio_filepath = row["audio_filepath"] + input_path = input_audio_dir / audio_filepath + output_path = output_audio_dir / audio_filepath + output_path.parent.mkdir(exist_ok=True, parents=True) + shutil.copy(src=input_path, dst=output_path) \ No newline at end of file diff --git a/tests/test_cfg_end_to_end_tests.py b/tests/test_cfg_end_to_end_tests.py index db0425e7..e52678a7 100644 --- a/tests/test_cfg_end_to_end_tests.py +++ b/tests/test_cfg_end_to_end_tests.py @@ -36,8 +36,8 @@ class TestCase: """Class for keeping track of test cases.""" config_path: str data_check_fn: Callable - # Fields in the manifest to ignore (can be set when non-deterministic processor was used) reference_manifest_filename: str = "test_data_reference.json" + # Fields in the manifest to ignore (can be set when non-deterministic processor was used) fields_to_ignore: List[str] = field(default_factory=list) processors_to_run: str = "" @@ -232,19 +232,29 @@ def get_test_cases() -> List[Tuple[str, Callable]]: 
data_check_fn=partial(data_check_fn_generic, file_name="everyayah.hf") ), TestCase( - config_path=f"{DATASET_CONFIGS_ROOT}/armenian/toloka/pipeline_start.yaml", + config_path=f"{DATASET_CONFIGS_ROOT}/armenian/toloka/pipeline_start.yaml", data_check_fn=data_check_fn_armenian_toloka_pipeline_start, fields_to_ignore=['source_filepath'], processors_to_run="2:14", reference_manifest_filename="pipeline_start/test_data_reference.json" ), TestCase( - config_path=f"{DATASET_CONFIGS_ROOT}/armenian/toloka/pipeline_get_final_res.yaml", + config_path=f"{DATASET_CONFIGS_ROOT}/armenian/toloka/pipeline_get_final_res.yaml", data_check_fn=data_check_fn_armenian_toloka_pipeline_get_final_res, reference_manifest_filename="pipeline_get_final_res/test_data_reference.json", fields_to_ignore=['audio_filepath', 'duration'], processors_to_run="1:6" - ) + ), + TestCase( + config_path=f"{DATASET_CONFIGS_ROOT}/english/hifitts2/config_22khz.yaml", + data_check_fn=partial(data_check_fn_generic, file_name="manifest_filtered_22khz.json"), + processors_to_run="1:2" + ), + TestCase( + config_path=f"{DATASET_CONFIGS_ROOT}/english/hifitts2/config_bandwidth.yaml", + data_check_fn=partial(data_check_fn_generic, file_name="manifest_22khz.json"), + reference_manifest_filename="test_data_reference_bandwidth.json", + ), ] def get_test_names(): @@ -357,6 +367,14 @@ def test_configs(setup_data, tmp_path): # Set input_manifest_file for ASRFileCheck to use the existing manifest.json cfg.processors[1].input_manifest_file = (data_dir / "pipeline_get_final_res" / "manifest.json").as_posix() + if "english/hifitts2/config_22khz" in config_path: + cfg.processors[1].input_manifest_file = (data_dir / "manifest_22khz.json").as_posix() + cfg.processors[1].error_file = (data_dir / "errors_22khz.json").as_posix() + + if "english/hifitts2/config_bandwidth" in config_path: + cfg.processors[0].audio_dir = (data_dir / "audio_22khz").as_posix() + cfg.processors[0].input_manifest_file = (data_dir / 
"manifest_22khz.json").as_posix() + run_processors(cfg) # additionally, let's test that final generated manifest matches the # reference file (ignoring the file paths and additional fields explicitly specified to ignore) From aa643f85ad2a6ece3acd7a3c56cc43ddfe17f2e6 Mon Sep 17 00:00:00 2001 From: Ryan Date: Mon, 12 May 2025 16:48:22 -0700 Subject: [PATCH 3/6] Fix bandwidth documentation Signed-off-by: Ryan --- dataset_configs/english/hifitts2/config_22khz.yaml | 2 +- .../english/hifitts2/config_bandwidth.yaml | 4 ++-- docs/src/sdp/existing_configs.rst | 12 ++++++++++-- 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/dataset_configs/english/hifitts2/config_22khz.yaml b/dataset_configs/english/hifitts2/config_22khz.yaml index 09954bfe..93506582 100644 --- a/dataset_configs/english/hifitts2/config_22khz.yaml +++ b/dataset_configs/english/hifitts2/config_22khz.yaml @@ -1,6 +1,6 @@ documentation: | HiFiTTS-2 22kHz - ################## + ############### This config can be used to download the audio data for HiFiTTS-2 22kHz. diff --git a/dataset_configs/english/hifitts2/config_bandwidth.yaml b/dataset_configs/english/hifitts2/config_bandwidth.yaml index 59e1062e..15c219bf 100644 --- a/dataset_configs/english/hifitts2/config_bandwidth.yaml +++ b/dataset_configs/english/hifitts2/config_bandwidth.yaml @@ -1,6 +1,6 @@ documentation: | - HiFiTTS2 Bandwidth Estimation - ################## + HiFiTTS-2 Bandwidth Estimation + ############################## This config contains the bandwidth estimation code used for HiFiTTS and HiFiTTS-2. This config can be used to estimate bandwidth for any dataset. 
For HiFiTTS-2 bandwidth diff --git a/docs/src/sdp/existing_configs.rst b/docs/src/sdp/existing_configs.rst index 493b7233..5be69922 100644 --- a/docs/src/sdp/existing_configs.rst +++ b/docs/src/sdp/existing_configs.rst @@ -391,11 +391,19 @@ HiFiTTS-2 **Dataset link:** TODO -`config `__ | -:doc:`documentation ` +* **22kHz**: + `config `__ | + :doc:`documentation ` +* **44kHz**: + `config `__ | + :doc:`documentation ` +* **Bandwidth Estimation**: + `config `__ | + :doc:`documentation ` .. toctree:: :hidden: config-docs/english/hifitts2/config_22khz config-docs/english/hifitts2/config_44khz + config-docs/english/hifitts2/config_bandwidth From 64fb25ea5037b66768d42c271a960311c40a1e24 Mon Sep 17 00:00:00 2001 From: Ryan Date: Tue, 27 May 2025 12:05:58 -0700 Subject: [PATCH 4/6] Fix tests, add example to docstrings Signed-off-by: Ryan --- sdp/processors/datasets/hifitts2/download_dataset.py | 10 ++++++++++ .../datasets/hifitts2/remove_failed_chapters.py | 8 ++++++++ sdp/processors/nemo/estimate_bandwidth.py | 12 +++++++++++- tests/prepare_test_data/prepare_hifitts2_data.py | 3 ++- tests/test_cfg_end_to_end_tests.py | 3 ++- 5 files changed, 33 insertions(+), 3 deletions(-) diff --git a/sdp/processors/datasets/hifitts2/download_dataset.py b/sdp/processors/datasets/hifitts2/download_dataset.py index 74c91bb9..b1982319 100644 --- a/sdp/processors/datasets/hifitts2/download_dataset.py +++ b/sdp/processors/datasets/hifitts2/download_dataset.py @@ -49,6 +49,16 @@ class DownloadHiFiTTS2(BaseParallelProcessor): If exit_on_error is False, then an output manifest will be saved with manifest entries that fail to download, with error information stored under the 'error_code' and 'error_reason' fields. + + Example: + .. 
code-block:: yaml + + - _target_: sdp.processors.DownloadHiFiTTS2 + input_manifest_file: ${workspace_dir}/manifest_22khz.json + output_manifest_file: ${workspace_dir}/errors_22khz.json + audio_dir: ${workspace_dir}/audio_22khz + chapter_dir: ${workspace_dir}/chapters + max_workers: 8 """ def __init__( diff --git a/sdp/processors/datasets/hifitts2/remove_failed_chapters.py b/sdp/processors/datasets/hifitts2/remove_failed_chapters.py index 7d6eaf07..b4cd5a8b 100644 --- a/sdp/processors/datasets/hifitts2/remove_failed_chapters.py +++ b/sdp/processors/datasets/hifitts2/remove_failed_chapters.py @@ -31,6 +31,14 @@ class RemovedFailedChapters(BaseProcessor): Returns: This outputs a manifest which is the same as its input manifest but with utterances in 'error_file' removed. + + Example: + .. code-block:: yaml + + - _target_: sdp.processors.RemovedFailedChapters + input_manifest_file: ${workspace_dir}/manifest_22khz.json + output_manifest_file: ${workspace_dir}/manifest_filtered_22khz.json + error_file: ${workspace_dir}/errors_22khz.json """ def __init__( diff --git a/sdp/processors/nemo/estimate_bandwidth.py b/sdp/processors/nemo/estimate_bandwidth.py index 149ff0ba..38b261e7 100644 --- a/sdp/processors/nemo/estimate_bandwidth.py +++ b/sdp/processors/nemo/estimate_bandwidth.py @@ -6,7 +6,8 @@ class EstimateBandwidth(BaseParallelProcessor): - """Adds estimated bandwidth to each utterance in the input manifest file. + """ + Adds estimated bandwidth to each utterance in the input manifest file. Args: audio_dir (str): Root directory where audio files are stored. @@ -26,6 +27,15 @@ class EstimateBandwidth(BaseParallelProcessor): Returns: This processor estimates the bandwidth of the audio file in the `input_audio_key` field and saves the estimate in the `output_bandwidth_key` field. + + Example: + .. 
code-block:: yaml + + - _target_: sdp.processors.EstimateBandwidth + input_manifest_file: ${workspace_dir}/manifest.json + output_manifest_file: ${workspace_dir}/manifest_bandwidth.json + audio_dir: ${workspace_dir}/audio_22khz + max_workers: 8 """ def __init__( diff --git a/tests/prepare_test_data/prepare_hifitts2_data.py b/tests/prepare_test_data/prepare_hifitts2_data.py index 93e07132..9a83fad1 100644 --- a/tests/prepare_test_data/prepare_hifitts2_data.py +++ b/tests/prepare_test_data/prepare_hifitts2_data.py @@ -57,9 +57,10 @@ output_f.write(line) # Copy audio + manifest_path = args.test_data_folder / args.manifest_filename input_audio_dir = args.workspace_folder / args.audio_folder output_audio_dir = args.test_data_folder / args.audio_folder - with open(args.manifest_filename, "r", encoding="utf-8") as input_f: + with open(manifest_path, "r", encoding="utf-8") as input_f: for i, line in enumerate(input_f): if i >= args.num_entries: break diff --git a/tests/test_cfg_end_to_end_tests.py b/tests/test_cfg_end_to_end_tests.py index e52678a7..214096ee 100644 --- a/tests/test_cfg_end_to_end_tests.py +++ b/tests/test_cfg_end_to_end_tests.py @@ -247,7 +247,7 @@ def get_test_cases() -> List[Tuple[str, Callable]]: ), TestCase( config_path=f"{DATASET_CONFIGS_ROOT}/english/hifitts2/config_22khz.yaml", - data_check_fn=partial(data_check_fn_generic, file_name="manifest_filtered_22khz.json"), + data_check_fn=partial(data_check_fn_generic, file_name="manifest_22khz.json"), processors_to_run="1:2" ), TestCase( @@ -257,6 +257,7 @@ def get_test_cases() -> List[Tuple[str, Callable]]: ), ] + def get_test_names(): config_names = [ Path(t.config_path).parent.relative_to(DATASET_CONFIGS_ROOT).as_posix() for t in get_test_cases() From 2fd724ed42aafa847578ecd595c50852f4498b62 Mon Sep 17 00:00:00 2001 From: Ryan Date: Mon, 2 Jun 2025 09:49:42 -0700 Subject: [PATCH 5/6] Fix exception handling for URLError Signed-off-by: Ryan --- sdp/processors/datasets/hifitts2/download_dataset.py | 6 
++++-- tests/test_cfg_end_to_end_tests.py | 1 - 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/sdp/processors/datasets/hifitts2/download_dataset.py b/sdp/processors/datasets/hifitts2/download_dataset.py index b1982319..6e965e32 100644 --- a/sdp/processors/datasets/hifitts2/download_dataset.py +++ b/sdp/processors/datasets/hifitts2/download_dataset.py @@ -110,7 +110,9 @@ def process_dataset_entry(self, data_entry): except (urllib.error.HTTPError, urllib.error.URLError) as http_error: error_msg = f"Encountered HTTP error when downloading {url}: {http_error}" logger.warning(error_msg) - if str(http_error.code).startswith("5") and i < self.num_retries: + + error_code = getattr(http_error, "code", 0) + if (not error_code or str(error_code).startswith("5")) and i < self.num_retries: logger.info(f"Retry {i} for url {url}") time.sleep(10) continue @@ -121,7 +123,7 @@ def process_dataset_entry(self, data_entry): error_data = { "url": url, "chapter_filepath": chapter_filepath, - "error_code": http_error.code, + "error_code": error_code, "error_reason": http_error.reason, "utterances": utterances, } diff --git a/tests/test_cfg_end_to_end_tests.py b/tests/test_cfg_end_to_end_tests.py index 214096ee..562fc0c2 100644 --- a/tests/test_cfg_end_to_end_tests.py +++ b/tests/test_cfg_end_to_end_tests.py @@ -257,7 +257,6 @@ def get_test_cases() -> List[Tuple[str, Callable]]: ), ] - def get_test_names(): config_names = [ Path(t.config_path).parent.relative_to(DATASET_CONFIGS_ROOT).as_posix() for t in get_test_cases() From ed4124fae6cdbb530147ab77e9fbf11e35d2db16 Mon Sep 17 00:00:00 2001 From: Ryan Date: Tue, 3 Jun 2025 09:30:36 -0700 Subject: [PATCH 6/6] Add 44kHz config test Signed-off-by: Ryan --- tests/test_cfg_end_to_end_tests.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tests/test_cfg_end_to_end_tests.py b/tests/test_cfg_end_to_end_tests.py index 562fc0c2..9d860ce9 100644 --- a/tests/test_cfg_end_to_end_tests.py +++ 
b/tests/test_cfg_end_to_end_tests.py @@ -250,6 +250,11 @@ def get_test_cases() -> List[Tuple[str, Callable]]: data_check_fn=partial(data_check_fn_generic, file_name="manifest_22khz.json"), processors_to_run="1:2" ), + TestCase( + config_path=f"{DATASET_CONFIGS_ROOT}/english/hifitts2/config_44khz.yaml", + data_check_fn=partial(data_check_fn_generic, file_name="manifest_44khz.json"), + processors_to_run="1:2" + ), TestCase( config_path=f"{DATASET_CONFIGS_ROOT}/english/hifitts2/config_bandwidth.yaml", data_check_fn=partial(data_check_fn_generic, file_name="manifest_22khz.json"), @@ -371,6 +376,10 @@ def test_configs(setup_data, tmp_path): cfg.processors[1].input_manifest_file = (data_dir / "manifest_22khz.json").as_posix() cfg.processors[1].error_file = (data_dir / "errors_22khz.json").as_posix() + if "english/hifitts2/config_44khz" in config_path: + cfg.processors[1].input_manifest_file = (data_dir / "manifest_44khz.json").as_posix() + cfg.processors[1].error_file = (data_dir / "errors_44khz.json").as_posix() + if "english/hifitts2/config_bandwidth" in config_path: cfg.processors[0].audio_dir = (data_dir / "audio_22khz").as_posix() cfg.processors[0].input_manifest_file = (data_dir / "manifest_22khz.json").as_posix()