NVIDIA · karpnv · Jun 4, 2025 · Apr 22, 2025 · May 12, 2025 · May 12, 2025
diff --git a/dataset_configs/english/hifitts2/config_22khz.yaml b/dataset_configs/english/hifitts2/config_22khz.yaml
@@ -0,0 +1,62 @@
+documentation: |
+  HiFiTTS-2 22kHz
+  ###############
+
+  This config can be used to download the audio data for HiFiTTS-2 22kHz.
+
+  1. Downloads HiFiTTS-2 audio from LibriVox.
+  2. Outputs a new manifest in which LibriVox audiobook chapters which could not be downloaded (e.g. because they
+     were removed from the website) are removed.
+
+  **Required arguments**.
+
+  * **workspace_dir**: specify the workspace folder where all audio files and manifests will be stored.
+
+  Note that you can customize any part of this config either directly or from command-line.
+
+  **Output format**.
+
+  This config outputs 2 manifest files:
+
+  * ``${workspace_dir}/errors_22khz.json`` - entries from the input chapters file which failed to download from LibriVox.
+  * ``${workspace_dir}/manifest_filtered_22khz.json`` - input manifest file without utterances from failed chapters.
+
+processors_to_run: all
+workspace_dir: ???
+manifest_filename: manifest_22khz.json
+output_filename: manifest_filtered_22khz.json
+chapter_filename: chapters_22khz.json
+error_filename: errors_22khz.json
+audio_dir_name: audio_22khz
+chapter_audio_dir_name: chapters
+sample_rate: 22050
+delete_chapter_files: true
+exit_on_error: false
+use_dask: false
+max_workers: 8
+chunksize: 50
+
+input_manifest_file: ${workspace_dir}/${manifest_filename}
+chapter_file: ${workspace_dir}/${chapter_filename}
+error_file: ${workspace_dir}/${error_filename}
+audio_dir: ${workspace_dir}/${audio_dir_name}
+chapter_dir: ${workspace_dir}/${chapter_audio_dir_name}
+final_manifest: ${workspace_dir}/${output_filename}
+
+processors:
+  - _target_: sdp.processors.DownloadHiFiTTS2
+    audio_dir: ${audio_dir}
+    chapter_dir: ${chapter_dir}
+    sample_rate: ${sample_rate}
+    delete_chapter_files: ${delete_chapter_files}
+    exit_on_error: ${exit_on_error}
+    input_manifest_file: ${chapter_file}
+    output_manifest_file: ${error_file}
+    use_dask: ${use_dask}
+    max_workers: ${max_workers}
+    chunksize: ${chunksize}
+
+  - _target_: sdp.processors.RemovedFailedChapters
+    input_manifest_file: ${input_manifest_file}
+    output_manifest_file: ${final_manifest}
+    error_file: ${error_file}
diff --git a/dataset_configs/english/hifitts2/config_44khz.yaml b/dataset_configs/english/hifitts2/config_44khz.yaml
@@ -0,0 +1,62 @@
+documentation: |
+  HiFiTTS-2 44kHz
+  ##################
+
+  This config can be used to download the audio data for HiFiTTS-2 44kHz.
+
+  1. Downloads HiFiTTS-2 audio from LibriVox.
+  2. Outputs a new manifest in which LibriVox audiobook chapters which could not be downloaded (e.g. because they
+     were removed from the website) are removed.
+
+  **Required arguments**.
+
+  * **workspace_dir**: specify the workspace folder where all audio files and manifests will be stored.
+
+  Note that you can customize any part of this config either directly or from command-line.
+
+  **Output format**.
+
+  This config outputs 2 manifest files:
+
+  * ``${workspace_dir}/errors_44khz.json`` - entries from the input chapters file which failed to download from LibriVox.
+  * ``${workspace_dir}/manifest_filtered_44khz.json`` - input manifest file without utterances from failed chapters.
+
+processors_to_run: all
+workspace_dir: ???
+manifest_filename: manifest_44khz.json
+output_filename: manifest_filtered_44khz.json
+chapter_filename: chapters_44khz.json
+error_filename: errors_44khz.json
+audio_dir_name: audio_44khz
+chapter_audio_dir_name: chapters
+sample_rate: 44100
+delete_chapter_files: true
+exit_on_error: false
+use_dask: false
+max_workers: 8
+chunksize: 50
+
+input_manifest_file: ${workspace_dir}/${manifest_filename}
+chapter_file: ${workspace_dir}/${chapter_filename}
+error_file: ${workspace_dir}/${error_filename}
+audio_dir: ${workspace_dir}/${audio_dir_name}
+chapter_dir: ${workspace_dir}/${chapter_audio_dir_name}
+final_manifest: ${workspace_dir}/${output_filename}
+
+processors:
+  - _target_: sdp.processors.DownloadHiFiTTS2
+    audio_dir: ${audio_dir}
+    chapter_dir: ${chapter_dir}
+    sample_rate: ${sample_rate}
+    delete_chapter_files: ${delete_chapter_files}
+    exit_on_error: ${exit_on_error}
+    input_manifest_file: ${chapter_file}
+    output_manifest_file: ${error_file}
+    use_dask: ${use_dask}
+    max_workers: ${max_workers}
+    chunksize: ${chunksize}
+
+  - _target_: sdp.processors.RemovedFailedChapters
+    input_manifest_file: ${input_manifest_file}
+    output_manifest_file: ${final_manifest}
+    error_file: ${error_file}
diff --git a/dataset_configs/english/hifitts2/config_bandwidth.yaml b/dataset_configs/english/hifitts2/config_bandwidth.yaml
@@ -0,0 +1,44 @@
+documentation: |
+  HiFiTTS-2 Bandwidth Estimation
+  ##############################
+
+  This config contains the bandwidth estimation code used for HiFiTTS and HiFiTTS-2.
+  This config can be used to estimate bandwidth for any dataset. For HiFiTTS-2, bandwidth
+  was estimated using the first 30 seconds of every audiobook chapter, but the estimate is still
+  reasonably accurate if run over a shorter duration or with individual utterances.
+
+  **Required arguments**.
+
+  * **workspace_dir**: The workspace folder where all audio files and manifests are stored.
+  * **audio_dir_name**: Name of the folder in the workspace containing the audio files to estimate bandwidth of.
+  * **input_manifest_filename**: Manifest file in workspace containing relative paths to audio.
+
+  **Output format**.
+
+  This config outputs a single manifest with the following field(s):
+
+  * **bandwidth (int)**: Estimated bandwidth of the audio file.
+
+processors_to_run: all
+workspace_dir: ???
+audio_dir_name: ???
+input_manifest_filename: ???
+output_manifest_filename: manifest_bandwidth.json
+audio_key: audio_filepath
+use_dask: false
+max_workers: 1
+chunksize: 1
+
+input_manifest_file: ${workspace_dir}/${input_manifest_filename}
+final_manifest: ${workspace_dir}/${output_manifest_filename}
+audio_dir: ${workspace_dir}/${audio_dir_name}
+
+processors:
+  - _target_: sdp.processors.EstimateBandwidth
+    input_manifest_file: ${input_manifest_file}
+    output_manifest_file: ${final_manifest}
+    audio_dir: ${audio_dir}
+    input_audio_key: ${audio_key}
+    use_dask: ${use_dask}
+    max_workers: ${max_workers}
+    chunksize: ${chunksize}
diff --git a/docs/src/sdp/api.rst b/docs/src/sdp/api.rst
@@ -116,12 +116,24 @@ HuggingFace Datasets
 .. autodata:: sdp.processors.CreateInitialManifestHuggingFace
    :annotation:
 
+
 YTC Datasets
 ''''''''''''
 
 .. autodata:: sdp.processors.datasets.ytc.create_initial_manifest.CreateInitialManifestYTC
    :annotation:
 
+
+HiFiTTS-2
+''''''''''''''''''''
+
+.. autodata:: sdp.processors.DownloadHiFiTTS2
+   :annotation:
+
+.. autodata:: sdp.processors.RemovedFailedChapters
+   :annotation:
+
+
 Lhotse processors
 #################
 
@@ -151,6 +163,9 @@ used in the downstream processing for additional enhancement or filtering.
 .. autodata:: sdp.processors.ASRTransformers
    :annotation:
 
+.. autodata:: sdp.processors.EstimateBandwidth
+   :annotation:
+
 .. autodata:: sdp.processors.tts.pyannote.PyAnnoteDiarizationAndOverlapDetection
    :annotation:
 
@@ -166,7 +181,6 @@ used in the downstream processing for additional enhancement or filtering.
 .. autodata:: sdp.processors.tts.metrics.BandwidthEstimationProcessor
    :annotation:
 
-
 Text-only processors
 ####################
 

diff --git a/docs/src/sdp/existing_configs.rst b/docs/src/sdp/existing_configs.rst
@@ -366,6 +366,13 @@ Armenian Toloka
    `config <https://github.com/NVIDIA/NeMo-speech-data-processor/blob/main/dataset_configs/armenian/toloka/pipeline_get_final_res.yaml>`__ |
    :doc:`documentation <config-docs/armenian/toloka/pipeline_get_final_res>`
 
+.. toctree::
+   :hidden:
+
+   config-docs/armenian/toloka/pipeline_start
+   config-docs/armenian/toloka/pipeline_validate_answers
+   config-docs/armenian/toloka/pipeline_get_final_res
+
 YouTube Commons (YTC)
 ~~~~~~~~~~~~~~~~~~~~~~
 
@@ -377,8 +384,26 @@ YouTube Commons (YTC)
 .. toctree::
    :hidden:
 
-   config-docs/armenian/toloka/pipeline_start
-   config-docs/armenian/toloka/pipeline_validate_answers
-   config-docs/armenian/toloka/pipeline_get_final_res
-
    config-docs/tts/ytc/config
+
+HiFiTTS-2
+~~~~~~~~~~~~~~~~~~~~~~~
+
+**Dataset link:** TODO
+
+* **22kHz**:
+   `config <https://github.com/NVIDIA/NeMo-speech-data-processor/blob/main/dataset_configs/english/hifitts2/config_22khz.yaml>`__ |
+   :doc:`documentation <config-docs/english/hifitts2/config_22khz>`
+* **44kHz**:
+   `config <https://github.com/NVIDIA/NeMo-speech-data-processor/blob/main/dataset_configs/english/hifitts2/config_44khz.yaml>`__ |
+   :doc:`documentation <config-docs/english/hifitts2/config_44khz>`
+* **Bandwidth Estimation**:
+   `config <https://github.com/NVIDIA/NeMo-speech-data-processor/blob/main/dataset_configs/english/hifitts2/config_bandwidth.yaml>`__ |
+   :doc:`documentation <config-docs/english/hifitts2/config_bandwidth>`
+
+.. toctree::
+   :hidden:
+
+   config-docs/english/hifitts2/config_22khz
+   config-docs/english/hifitts2/config_44khz
+   config-docs/english/hifitts2/config_bandwidth
diff --git a/sdp/processors/__init__.py b/sdp/processors/__init__.py
@@ -24,6 +24,8 @@
 from sdp.processors.datasets.fleurs.create_initial_manifest import (
     CreateInitialManifestFleurs,
 )
+from sdp.processors.datasets.hifitts2.download_dataset import DownloadHiFiTTS2
+from sdp.processors.datasets.hifitts2.remove_failed_chapters import RemovedFailedChapters
 from sdp.processors.datasets.uzbekvoice.create_initial_manifest import (
     CreateInitialManifestUzbekvoice,
 )
@@ -127,6 +129,7 @@
     MakeLettersUppercaseAfterPeriod,
 )
 from sdp.processors.nemo.asr_inference import ASRInference
+from sdp.processors.nemo.estimate_bandwidth import EstimateBandwidth
 from sdp.processors.nemo.pc_inference import PCInference
 from sdp.processors.toloka.accept_if import AcceptIfWERLess
 from sdp.processors.toloka.create_pool import CreateTolokaPool

diff --git a/sdp/processors/datasets/hifitts2/__init__.py b/sdp/processors/datasets/hifitts2/__init__.py