From 2ef44f8d7758b846dc6e66cb620b370fccc6b818 Mon Sep 17 00:00:00 2001
From: Ryan <rlangman@nvidia.com>
Date: Tue, 22 Apr 2025 11:10:25 -0700
Subject: [PATCH 1/6] Add processors for downloading HiFiTTS2 dataset

Signed-off-by: Ryan <rlangman@nvidia.com>
---
 .../english/hifitts2/config_22khz.yaml        |  60 ++++++++
 .../english/hifitts2/config_44khz.yaml        |  60 ++++++++
 docs/src/sdp/existing_configs.rst             |  13 ++
 sdp/processors/__init__.py                    |   2 +
 sdp/processors/datasets/hifitts2/__init__.py  |   0
 .../datasets/hifitts2/download_dataset.py     | 134 ++++++++++++++++++
 .../hifitts2/remove_failed_chapters.py        |  58 ++++++++
 7 files changed, 327 insertions(+)
 create mode 100644 dataset_configs/english/hifitts2/config_22khz.yaml
 create mode 100644 dataset_configs/english/hifitts2/config_44khz.yaml
 create mode 100644 sdp/processors/datasets/hifitts2/__init__.py
 create mode 100644 sdp/processors/datasets/hifitts2/download_dataset.py
 create mode 100644 sdp/processors/datasets/hifitts2/remove_failed_chapters.py

diff --git a/dataset_configs/english/hifitts2/config_22khz.yaml b/dataset_configs/english/hifitts2/config_22khz.yaml
new file mode 100644
index 00000000..e7d5eed7
--- /dev/null
+++ b/dataset_configs/english/hifitts2/config_22khz.yaml
@@ -0,0 +1,60 @@
+documentation: |
+  HiFiTTS-2 22kHz
+  ##################
+  
+  This config can be used to download the audio data for HiFiTTS-2 22kHz.
+  
+  1. Downloads HiFiTTS-2 audio from LibriVox.
+  2. Outputs a new manifest in which LibriVox audiobook chapters which could not be downloaded (e.g. because they
+     were removed from the website) are removed.
+
+  **Required arguments**.
+
+  * **workspace_dir**: specify the workspace folder where all audio files and manifests will be stored.
+
+  Note that you can customize any part of this config either directly or from command-line.
+ 
+  **Output format**.
+
+  This config outputs 2 manifest files:
+
+  * ``${workspace_dir}/errors.json`` - entries from the input chapters file which failed to download from LibriVox.
+  * ``${workspace_dir}/manifest_22khz_filtered`` - input manifest file without utterances from failed chapters.
+
+processors_to_run: all
+workspace_dir: ???
+manifest_filename: manifest_22khz.json
+output_filename: manifest_22khz_filtered.json
+chapter_filename: chapters_22khz.json
+error_filename: errors_22khz.json
+sample_rate: 22050
+delete_chapter_files: true
+exit_on_error: false
+use_dask: false
+max_workers: 8
+chunksize: 50
+
+input_manifest_file: ${workspace_dir}/${manifest_filename}
+output_manifest_file: ${workspace_dir}/${output_filename}
+chapter_file: ${workspace_dir}/${chapter_filename}
+error_file: ${workspace_dir}/${error_filename}
+audio_dir: ${workspace_dir}/audio
+chapter_dir: ${workspace_dir}/chapters
+
+processors:
+  - _target_: sdp.processors.DownloadHiFiTTS2
+    audio_dir: ${audio_dir}
+    chapter_dir: ${chapter_dir}
+    sample_rate: ${sample_rate}
+    delete_chapter_files: ${delete_chapter_files}
+    exit_on_error: ${exit_on_error}
+    input_manifest_file: ${chapter_file}
+    output_manifest_file: ${error_file}
+    use_dask: ${use_dask}
+    max_workers: ${max_workers}
+    chunksize: ${chunksize}
+
+  - _target_: sdp.processors.RemovedFailedChapters
+    input_manifest_file: ${input_manifest_file}
+    output_manifest_file: ${output_manifest_file}
+    error_file: ${error_file}
diff --git a/dataset_configs/english/hifitts2/config_44khz.yaml b/dataset_configs/english/hifitts2/config_44khz.yaml
new file mode 100644
index 00000000..acb55585
--- /dev/null
+++ b/dataset_configs/english/hifitts2/config_44khz.yaml
@@ -0,0 +1,60 @@
+documentation: |
+  HiFiTTS-2 44kHz
+  ##################
+  
+  This config can be used to download the audio data for HiFiTTS-2 44kHz.
+  
+  1. Downloads HiFiTTS-2 audio from LibriVox.
+  2. Outputs a new manifest in which LibriVox audiobook chapters which could not be downloaded (e.g. because they
+     were removed from the website) are removed.
+
+  **Required arguments**.
+
+  * **workspace_dir**: specify the workspace folder where all audio files and manifests will be stored.
+
+  Note that you can customize any part of this config either directly or from command-line.
+ 
+  **Output format**.
+
+  This config outputs 2 manifest files:
+
+  * ``${workspace_dir}/errors.json`` - entries from the input chapters file which failed to download from LibriVox.
+  * ``${workspace_dir}/manifest_44khz_filtered`` - input manifest file without utterances from failed chapters.
+
+processors_to_run: all
+workspace_dir: ???
+manifest_filename: manifest_44khz.json
+output_filename: manifest_44khz_filtered.json
+chapter_filename: chapters_44khz.json
+error_filename: errors_44khz.json
+sample_rate: 44100
+delete_chapter_files: true
+exit_on_error: false
+use_dask: false
+max_workers: 8
+chunksize: 50
+
+input_manifest_file: ${workspace_dir}/${manifest_filename}
+output_manifest_file: ${workspace_dir}/${output_filename}
+chapter_file: ${workspace_dir}/${chapter_filename}
+error_file: ${workspace_dir}/${error_filename}
+audio_dir: ${workspace_dir}/audio
+chapter_dir: ${workspace_dir}/chapters
+
+processors:
+  - _target_: sdp.processors.DownloadHiFiTTS2
+    audio_dir: ${audio_dir}
+    chapter_dir: ${chapter_dir}
+    sample_rate: ${sample_rate}
+    delete_chapter_files: ${delete_chapter_files}
+    exit_on_error: ${exit_on_error}
+    input_manifest_file: ${chapter_file}
+    output_manifest_file: ${error_file}
+    use_dask: ${use_dask}
+    max_workers: ${max_workers}
+    chunksize: ${chunksize}
+
+  - _target_: sdp.processors.RemovedFailedChapters
+    input_manifest_file: ${input_manifest_file}
+    output_manifest_file: ${output_manifest_file}
+    error_file: ${error_file}
diff --git a/docs/src/sdp/existing_configs.rst b/docs/src/sdp/existing_configs.rst
index f8265816..652e633c 100644
--- a/docs/src/sdp/existing_configs.rst
+++ b/docs/src/sdp/existing_configs.rst
@@ -373,3 +373,16 @@ Armenian Toloka
    config-docs/armenian/toloka/pipeline_validate_answers
    config-docs/armenian/toloka/pipeline_get_final_res
 
+HiFiTTS-2
+~~~~~~~~~~~~~~~~~~~~~~~
+
+**Dataset link:** TODO
+
+`config <https://github.com/NVIDIA/NeMo-speech-data-processor/blob/main/dataset_configs/english/hifitts2/config_22khz.yaml>`__ |
+:doc:`documentation <config-docs/english/hifitts2/config_22khz>`
+
+.. toctree::
+   :hidden:
+
+   config-docs/english/hifitts2/config_22khz
+   config-docs/english/hifitts2/config_44khz
diff --git a/sdp/processors/__init__.py b/sdp/processors/__init__.py
index 6788c88f..e71b7dff 100644
--- a/sdp/processors/__init__.py
+++ b/sdp/processors/__init__.py
@@ -24,6 +24,8 @@
 from sdp.processors.datasets.fleurs.create_initial_manifest import (
     CreateInitialManifestFleurs,
 )
+from sdp.processors.datasets.hifitts2.download_dataset import DownloadHiFiTTS2
+from sdp.processors.datasets.hifitts2.remove_failed_chapters import RemovedFailedChapters
 from sdp.processors.datasets.uzbekvoice.create_initial_manifest import (
     CreateInitialManifestUzbekvoice,
 )
diff --git a/sdp/processors/datasets/hifitts2/__init__.py b/sdp/processors/datasets/hifitts2/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/sdp/processors/datasets/hifitts2/download_dataset.py b/sdp/processors/datasets/hifitts2/download_dataset.py
new file mode 100644
index 00000000..0c7e0b61
--- /dev/null
+++ b/sdp/processors/datasets/hifitts2/download_dataset.py
@@ -0,0 +1,134 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import json
+import librosa
+from pathlib import Path
+import soundfile as sf
+import time
+import urllib.error
+import urllib.request
+
+from sdp.processors.base_processor import BaseParallelProcessor, DataEntry
+
+
+class DownloadHiFiTTS2(BaseParallelProcessor):
+    """
+    Downloads HiFiTTS-2 dataset to local machine. Unsegmented audiobook chapters are first downloaded at
+    48 kHz from LibriVox. Each chapter is then split into segmented utterance files based on precomputed
+    offsets and durations.
+
+    To reduce disk use, the chapter files can be optionally deleted after they are segmented.
+
+    Metadata for chapters which fail to download due to network errors are stored in an output manifest file,
+    which can be given as input to this processor to attempt the downloads again.
+
+    Args:
+        audio_dir (str): Root directory where utterance files will be saved.
+        chapter_dir (str): Root directory where audiobook chapter files will be saved.
+        sample_rate (int): Sample rate to use for utterance files.
+        delete_chapter_files (bool): Whether to delete each chapter file after it is done being processed.
+        exit_on_error (bool): Whether to terminate the entire processor script if a single chapter download fails.
+        num_retries (int): Number of times to retry chapter download after encountering intermittent HTTP errors.
+
+    Returns:
+        Utterance files are stored under 'audio_dir' and chapter files are downloaded under 'chapter_dir'.
+        
+        If exit_on_error is False, then an output manifest will be saved with manifest entries that fail to download,
+        with error information stored under the 'error_code' and 'error_reason' fields.
+    """
+
+    def __init__(
+        self,
+        audio_dir: str,
+        chapter_dir: str,
+        sample_rate: int = 22050,
+        delete_chapter_files: bool = True,
+        exit_on_error: bool = False,
+        num_retries: int = 5,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.audio_dir = Path(audio_dir)
+        self.chapter_dir = Path(chapter_dir)
+        self.sample_rate = sample_rate
+        self.delete_chapter_files = delete_chapter_files
+        self.exit_on_error = exit_on_error
+        self.num_retries = num_retries
+
+    def prepare(self):
+        # Create output directory structure
+        with open(self.input_manifest_file, "rt", encoding="utf-8") as fin:
+            dirs = set()
+            for line in fin:
+                row = json.loads(line)
+                audio_filepath = Path(row["utterances"][0]["audio_filepath"])
+                chapter_dir = audio_filepath.parent
+                dirs.add(chapter_dir)
+
+        for dir in dirs:
+            audio_dir = self.audio_dir / dir
+            chapter_dir = self.chapter_dir / dir
+            audio_dir.mkdir(exist_ok=True, parents=True)
+            chapter_dir.mkdir(exist_ok=True, parents=True)
+
+        return
+
+    def process_dataset_entry(self, data_entry):
+        url = data_entry["url"]
+        chapter_filepath = data_entry["chapter_filepath"]
+        utterances = data_entry["utterances"]
+
+        chapter_path = self.chapter_dir / chapter_filepath
+        for i in range(1, self.num_retries + 1):
+            try:
+                urllib.request.urlretrieve(url=url, filename=chapter_path)
+                break
+            except (urllib.error.HTTPError, urllib.error.URLError) as http_error:
+                error_msg = f"Encountered HTTP error when downloading {url}: {http_error}"
+                print(error_msg)
+                if str(http_error.code).startswith("5") and i < self.num_retries:
+                    print(f"Retry {i} for url {url}")
+                    time.sleep(10)
+                    continue
+
+                if self.exit_on_error:
+                    raise RuntimeError(error_msg)
+
+                error_data = {
+                    "url": url,
+                    "chapter_filepath": chapter_filepath,
+                    "error_code": http_error.code,
+                    "error_reason": http_error.reason,
+                    "utterances": utterances,
+                }
+                return [DataEntry(data=error_data)]
+
+        chapter_audio, sr = librosa.load(path=chapter_path, sr=self.sample_rate)
+
+        for utt in utterances:
+            audio_filepath = utt["audio_filepath"]
+            audio_path = self.audio_dir / audio_filepath
+            offset = utt["offset"]
+            dur = utt["duration"]
+            start_sample = librosa.time_to_samples(offset, sr=sr)
+            end_sample = librosa.time_to_samples(offset + dur, sr=sr)
+            audio = chapter_audio[start_sample:end_sample]
+            sf.write(file=audio_path, data=audio, samplerate=int(sr))
+
+        if self.delete_chapter_files:
+            chapter_path.unlink()
+
+        return []
diff --git a/sdp/processors/datasets/hifitts2/remove_failed_chapters.py b/sdp/processors/datasets/hifitts2/remove_failed_chapters.py
new file mode 100644
index 00000000..7d6eaf07
--- /dev/null
+++ b/sdp/processors/datasets/hifitts2/remove_failed_chapters.py
@@ -0,0 +1,58 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import json
+from pathlib import Path
+from tqdm import tqdm
+
+from sdp.processors.base_processor import BaseProcessor
+from sdp.utils.common import load_manifest
+
+
+class RemovedFailedChapters(BaseProcessor):
+    """
+    Removes all utterances in the input chapter file from the input manifest. This processor is expected to be
+    run using the file output by the DownloadHiFiTTS2 processor, which contains the failed chapter downloads.
+
+    Args:
+        error_file (str): Path to file with chapter download errors.
+
+    Returns:
+        This outputs a manifest which is the same as its input manifest but with utterances in 'error_file' removed.
+    """
+
+    def __init__(
+        self,
+        error_file: str,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.error_file = Path(error_file)
+
+    def process(self):
+        chapter_rows = load_manifest(self.error_file)
+        audio_files_to_remove = set()
+        for chapter_row in chapter_rows:
+            for utt_list in chapter_row["utterances"]:
+                audio_files_to_remove.add(utt_list["audio_filepath"])
+
+        rows = load_manifest(Path(self.input_manifest_file))
+        with open(self.output_manifest_file, "w", encoding="utf-8") as output_f:
+            for row in tqdm(rows):
+                if row["audio_filepath"] in audio_files_to_remove:
+                    continue
+
+                output_line = f"{json.dumps(row, ensure_ascii=False)}\n"
+                output_f.write(output_line)

From 2f63993022c78232bc7390e0140d95badb05f165 Mon Sep 17 00:00:00 2001
From: Ryan <rlangman@nvidia.com>
Date: Mon, 12 May 2025 10:51:14 -0700
Subject: [PATCH 2/6] Add bandwidth estimation processor from HiFiTTS-2

Signed-off-by: Ryan <rlangman@nvidia.com>
---
 .../english/hifitts2/config_22khz.yaml        | 14 ++--
 .../english/hifitts2/config_44khz.yaml        | 14 ++--
 .../english/hifitts2/config_bandwidth.yaml    | 44 +++++++++++
 docs/src/sdp/api.rst                          | 14 ++++
 sdp/processors/__init__.py                    |  1 +
 .../datasets/hifitts2/download_dataset.py     |  5 +-
 sdp/processors/nemo/estimate_bandwidth.py     | 77 +++++++++++++++++++
 .../prepare_hifitts2_data.py                  | 71 +++++++++++++++++
 tests/test_cfg_end_to_end_tests.py            | 26 ++++++-
 9 files changed, 248 insertions(+), 18 deletions(-)
 create mode 100644 dataset_configs/english/hifitts2/config_bandwidth.yaml
 create mode 100644 sdp/processors/nemo/estimate_bandwidth.py
 create mode 100644 tests/prepare_test_data/prepare_hifitts2_data.py

diff --git a/dataset_configs/english/hifitts2/config_22khz.yaml b/dataset_configs/english/hifitts2/config_22khz.yaml
index e7d5eed7..09954bfe 100644
--- a/dataset_configs/english/hifitts2/config_22khz.yaml
+++ b/dataset_configs/english/hifitts2/config_22khz.yaml
@@ -19,14 +19,16 @@ documentation: |
   This config outputs 2 manifest files:
 
   * ``${workspace_dir}/errors.json`` - entries from the input chapters file which failed to download from LibriVox.
-  * ``${workspace_dir}/manifest_22khz_filtered`` - input manifest file without utterances from failed chapters.
+  * ``${workspace_dir}/manifest_filtered_22khz.json`` - input manifest file without utterances from failed chapters.
 
 processors_to_run: all
 workspace_dir: ???
 manifest_filename: manifest_22khz.json
-output_filename: manifest_22khz_filtered.json
+output_filename: manifest_filtered_22khz.json
 chapter_filename: chapters_22khz.json
 error_filename: errors_22khz.json
+audio_dir_name: audio_22khz
+chapter_audio_dir_name: chapters
 sample_rate: 22050
 delete_chapter_files: true
 exit_on_error: false
@@ -35,11 +37,11 @@ max_workers: 8
 chunksize: 50
 
 input_manifest_file: ${workspace_dir}/${manifest_filename}
-output_manifest_file: ${workspace_dir}/${output_filename}
 chapter_file: ${workspace_dir}/${chapter_filename}
 error_file: ${workspace_dir}/${error_filename}
-audio_dir: ${workspace_dir}/audio
-chapter_dir: ${workspace_dir}/chapters
+audio_dir: ${workspace_dir}/${audio_dir_name}
+chapter_dir: ${workspace_dir}/${chapter_audio_dir_name}
+final_manifest: ${workspace_dir}/${output_filename}
 
 processors:
   - _target_: sdp.processors.DownloadHiFiTTS2
@@ -56,5 +58,5 @@ processors:
 
   - _target_: sdp.processors.RemovedFailedChapters
     input_manifest_file: ${input_manifest_file}
-    output_manifest_file: ${output_manifest_file}
+    output_manifest_file: ${final_manifest}
     error_file: ${error_file}
diff --git a/dataset_configs/english/hifitts2/config_44khz.yaml b/dataset_configs/english/hifitts2/config_44khz.yaml
index acb55585..33d79f98 100644
--- a/dataset_configs/english/hifitts2/config_44khz.yaml
+++ b/dataset_configs/english/hifitts2/config_44khz.yaml
@@ -19,14 +19,16 @@ documentation: |
   This config outputs 2 manifest files:
 
   * ``${workspace_dir}/errors.json`` - entries from the input chapters file which failed to download from LibriVox.
-  * ``${workspace_dir}/manifest_44khz_filtered`` - input manifest file without utterances from failed chapters.
+  * ``${workspace_dir}/manifest_filtered_44khz.json`` - input manifest file without utterances from failed chapters.
 
 processors_to_run: all
 workspace_dir: ???
 manifest_filename: manifest_44khz.json
-output_filename: manifest_44khz_filtered.json
+output_filename: manifest_filtered_44khz.json
 chapter_filename: chapters_44khz.json
 error_filename: errors_44khz.json
+audio_dir_name: audio_44khz
+chapter_audio_dir_name: chapters
 sample_rate: 44100
 delete_chapter_files: true
 exit_on_error: false
@@ -35,11 +37,11 @@ max_workers: 8
 chunksize: 50
 
 input_manifest_file: ${workspace_dir}/${manifest_filename}
-output_manifest_file: ${workspace_dir}/${output_filename}
 chapter_file: ${workspace_dir}/${chapter_filename}
 error_file: ${workspace_dir}/${error_filename}
-audio_dir: ${workspace_dir}/audio
-chapter_dir: ${workspace_dir}/chapters
+audio_dir: ${workspace_dir}/${audio_dir_name}
+chapter_dir: ${workspace_dir}/${chapter_audio_dir_name}
+final_manifest: ${workspace_dir}/${output_filename}
 
 processors:
   - _target_: sdp.processors.DownloadHiFiTTS2
@@ -56,5 +58,5 @@ processors:
 
   - _target_: sdp.processors.RemovedFailedChapters
     input_manifest_file: ${input_manifest_file}
-    output_manifest_file: ${output_manifest_file}
+    output_manifest_file: ${final_manifest}
     error_file: ${error_file}
diff --git a/dataset_configs/english/hifitts2/config_bandwidth.yaml b/dataset_configs/english/hifitts2/config_bandwidth.yaml
new file mode 100644
index 00000000..59e1062e
--- /dev/null
+++ b/dataset_configs/english/hifitts2/config_bandwidth.yaml
@@ -0,0 +1,44 @@
+documentation: |
+  HiFiTTS-2 Bandwidth Estimation
+  ##############################
+  
+  This config contains the bandwidth estimation code used for HiFiTTS and HiFiTTS-2.
+  This config can be used to estimate bandwidth for any dataset. For HiFiTTS-2 bandwidth
+  was estimated using the first 30 seconds of every audiobook chapter, but the estimate is still
+  reasonably accurate if run over a shorter duration or with individual utterances.
+
+  **Required arguments**.
+
+  * **workspace_dir**: The workspace folder where all audio files and manifests are stored.
+  * **audio_dir_name**: Name of the folder in the workspace containing the audio files to estimate bandwidth of.
+  * **input_manifest_filename**: Manifest file in workspace containing relative paths to audio.
+ 
+  **Output format**.
+  
+  This config outputs a single manifest with the following field(s):
+
+  * **bandwidth (int)**: Estimated bandwidth of the audio file.
+
+processors_to_run: all
+workspace_dir: ???
+audio_dir_name: ???
+input_manifest_filename: ???
+output_manifest_filename: manifest_bandwidth.json
+audio_key: audio_filepath
+use_dask: false
+max_workers: 1
+chunksize: 1
+
+input_manifest_file: ${workspace_dir}/${input_manifest_filename}
+final_manifest: ${workspace_dir}/${output_manifest_filename}
+audio_dir: ${workspace_dir}/${audio_dir_name}
+
+processors:
+  - _target_: sdp.processors.EstimateBandwidth
+    input_manifest_file: ${input_manifest_file}
+    output_manifest_file: ${final_manifest}
+    audio_dir: ${audio_dir}
+    input_audio_key: ${audio_key}
+    use_dask: ${use_dask}
+    max_workers: ${max_workers}
+    chunksize: ${chunksize}
diff --git a/docs/src/sdp/api.rst b/docs/src/sdp/api.rst
index 4700f860..bfbac5ad 100644
--- a/docs/src/sdp/api.rst
+++ b/docs/src/sdp/api.rst
@@ -116,6 +116,17 @@ HuggingFace Datasets
 .. autodata:: sdp.processors.CreateInitialManifestHuggingFace
    :annotation:
 
+
+HiFiTTS-2
+''''''''''''''''''''
+
+.. autodata:: sdp.processors.DownloadHiFiTTS2
+   :annotation:
+
+.. autodata:: sdp.processors.RemovedFailedChapters
+   :annotation:
+
+
 Lhotse processors
 #################
 
@@ -145,6 +156,9 @@ used in the downstream processing for additional enhancement or filtering.
 .. autodata:: sdp.processors.ASRTransformers
    :annotation:
 
+.. autodata:: sdp.processors.EstimateBandwidth
+   :annotation:
+
 Text-only processors
 ####################
 
diff --git a/sdp/processors/__init__.py b/sdp/processors/__init__.py
index e71b7dff..fac13976 100644
--- a/sdp/processors/__init__.py
+++ b/sdp/processors/__init__.py
@@ -126,6 +126,7 @@
     MakeLettersUppercaseAfterPeriod,
 )
 from sdp.processors.nemo.asr_inference import ASRInference
+from sdp.processors.nemo.estimate_bandwidth import EstimateBandwidth
 from sdp.processors.nemo.pc_inference import PCInference
 from sdp.processors.toloka.accept_if import AcceptIfWERLess
 from sdp.processors.toloka.create_pool import CreateTolokaPool
diff --git a/sdp/processors/datasets/hifitts2/download_dataset.py b/sdp/processors/datasets/hifitts2/download_dataset.py
index 0c7e0b61..74c91bb9 100644
--- a/sdp/processors/datasets/hifitts2/download_dataset.py
+++ b/sdp/processors/datasets/hifitts2/download_dataset.py
@@ -21,6 +21,7 @@
 import urllib.error
 import urllib.request
 
+from sdp.logging import logger
 from sdp.processors.base_processor import BaseParallelProcessor, DataEntry
 
 
@@ -98,19 +99,22 @@ def process_dataset_entry(self, data_entry):
                 break
             except (urllib.error.HTTPError, urllib.error.URLError) as http_error:
                 error_msg = f"Encountered HTTP error when downloading {url}: {http_error}"
-                print(error_msg)
-                if str(http_error.code).startswith("5") and i < self.num_retries:
-                    print(f"Retry {i} for url {url}")
+                logger.warning(error_msg)
+                # A plain URLError (e.g. DNS or connection failure) carries no HTTP status code.
+                error_code = getattr(http_error, "code", None)
+                if str(error_code).startswith("5") and i < self.num_retries:
+                    logger.info(f"Retry {i} for url {url}")
                     time.sleep(10)
                     continue
 
                 if self.exit_on_error:
                     raise RuntimeError(error_msg)
 
                 error_data = {
                     "url": url,
                     "chapter_filepath": chapter_filepath,
-                    "error_code": http_error.code,
-                    "error_reason": http_error.reason,
+                    "error_code": error_code,
+                    # URLError.reason may be an exception object, which is not JSON serializable.
+                    "error_reason": str(http_error.reason),
                     "utterances": utterances,
                 }
diff --git a/sdp/processors/nemo/estimate_bandwidth.py b/sdp/processors/nemo/estimate_bandwidth.py
new file mode 100644
index 00000000..149ff0ba
--- /dev/null
+++ b/sdp/processors/nemo/estimate_bandwidth.py
@@ -0,0 +1,77 @@
+import librosa
+import numpy as np
+from pathlib import Path
+
+from sdp.processors.base_processor import BaseParallelProcessor, DataEntry
+
+
+class EstimateBandwidth(BaseParallelProcessor):
+    """Adds estimated bandwidth to each utterance in the input manifest file.
+
+    Args:
+        audio_dir (str): Root directory where audio files are stored.
+        input_audio_key (str): Manifest key with relative audio paths.
+        output_bandwidth_key (str): Manifest key to store estimated bandwidth in.
+        max_seconds (float): The maximum length of audio to use for bandwidth estimation.
+            By default, uses the first 30 seconds.
+        sample_rate (int): Sample rate to resample audio to before doing bandwidth estimation.
+            Defaults to 44100, upsampling the input audio as needed.
+        n_fft (int): Number of FFT bins to use for bandwidth estimation. Defaults to 512.
+        hop_length (int): Audio frame hop length to use for bandwidth estimation.
+            Defaults to 441, corresponding to 0.01 seconds for 44100 sample rate.
+        top_db (float): top_db threshold to use for bandwidth estimation.
+        frequency_threshold (float): Bandwidth estimation finds the highest frequency with mean power spectrum that is
+            within 'frequency_threshold' dB of its peak power. Defaults to -50 dB.
+
+    Returns:
+        This processor estimates the bandwidth of the audio file in the `input_audio_key` field and saves the estimate
+            in the `output_bandwidth_key` field.
+    """
+
+    def __init__(
+        self,
+        audio_dir: str,
+        input_audio_key: str = "audio_filepath",
+        output_bandwidth_key: str = "bandwidth",
+        max_seconds: float = 30.0,
+        sample_rate: int = 44100,
+        n_fft: int = 512,
+        hop_length: int = 441,
+        top_db: float = 100.0,
+        frequency_threshold: float = -50.0,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.audio_directory = Path(audio_dir)
+        self.input_audio_key = input_audio_key
+        self.output_bandwidth_key = output_bandwidth_key
+        self.max_seconds = max_seconds
+        self.sample_rate = sample_rate
+        self.n_fft = n_fft
+        self.hop_length = hop_length
+        self.top_db = top_db
+        self.frequency_threshold = frequency_threshold
+
+    def _estimate_bandwidth(self, audio, sample_rate):
+        spec = librosa.stft(y=audio, n_fft=self.n_fft, hop_length=self.hop_length, window="blackmanharris")
+        power_spec = np.abs(spec) ** 2
+        power_spec = np.mean(power_spec, axis=1)
+        power_spec = librosa.power_to_db(power_spec, ref=self.n_fft, top_db=self.top_db)
+
+        bandwidth = 0
+        peak = np.max(power_spec)
+        freq_width = sample_rate / self.n_fft
+        for idx in range(len(power_spec) - 1, -1, -1):
+            if power_spec[idx] - peak > self.frequency_threshold:
+                bandwidth = idx * freq_width
+                break
+
+        return bandwidth
+
+    def process_dataset_entry(self, data_entry):
+        audio_filename = data_entry[self.input_audio_key]
+        audio_file = self.audio_directory / audio_filename
+        audio, sr = librosa.load(path=audio_file, sr=self.sample_rate, duration=self.max_seconds)
+        bandwidth = self._estimate_bandwidth(audio=audio, sample_rate=sr)
+        data_entry[self.output_bandwidth_key] = int(bandwidth)
+        return [DataEntry(data=data_entry)]
diff --git a/tests/prepare_test_data/prepare_hifitts2_data.py b/tests/prepare_test_data/prepare_hifitts2_data.py
new file mode 100644
index 00000000..93e07132
--- /dev/null
+++ b/tests/prepare_test_data/prepare_hifitts2_data.py
@@ -0,0 +1,71 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Copies HiFiTTS-2 manifests and audio into a new directory with fewer entries."""
+
+import argparse
+import json
+import os
+from pathlib import Path
+import shutil
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser("Preparing HiFiTTS-2 test data")
+    parser.add_argument(
+        "--workspace_folder", required=True, type=Path, help="Path to workspace where dataset was downloaded."
+    )
+    parser.add_argument(
+        "--audio_folder", default="audio_22khz", type=Path, required=False, help="Name of root folder with audio."
+    )
+    parser.add_argument("--test_data_folder", required=True, type=Path, help="Where to place the prepared data")
+    parser.add_argument(
+        "--manifest_filename", default="manifest_22khz.json", type=str, required=False, help="Name of manifest manifest."
+    )
+    parser.add_argument(
+        "--chapters_filename", default="chapters_22khz.json", type=str, required=False, help="Name of chapter manifest."
+    )
+    parser.add_argument(
+        "--error_filename", default="errors_22khz.json", type=str, required=False, help="Name of chapter error manifest."
+    )
+    parser.add_argument("--num_entries", default=20, type=int, help="How many entries to keep from each manifest")
+
+    args = parser.parse_args()
+
+    files_to_copy = [args.manifest_filename, args.chapters_filename, args.error_filename]
+
+    os.makedirs(args.test_data_folder, exist_ok=True)
+    # Copy manifest files, keeping only the first num_entries lines of each
+    for filename in files_to_copy:
+        input_path = args.workspace_folder / filename
+        output_path = args.test_data_folder / filename
+        with open(input_path, "r", encoding="utf-8") as input_f:
+            with open(output_path, "w", encoding="utf-8") as output_f:
+                for i, line in enumerate(input_f):
+                    if i >= args.num_entries:
+                        break
+                    output_f.write(line)
+
+    # Copy the audio referenced by the first num_entries rows of the workspace manifest
+    input_audio_dir = args.workspace_folder / args.audio_folder
+    output_audio_dir = args.test_data_folder / args.audio_folder
+    with open(args.workspace_folder / args.manifest_filename, "r", encoding="utf-8") as input_f:
+        for i, line in enumerate(input_f):
+            if i >= args.num_entries:
+                break
+            row = json.loads(line)
+            audio_filepath = row["audio_filepath"]
+            input_path = input_audio_dir / audio_filepath
+            output_path = output_audio_dir / audio_filepath
+            output_path.parent.mkdir(exist_ok=True, parents=True)
+            shutil.copy(src=input_path, dst=output_path)
\ No newline at end of file
diff --git a/tests/test_cfg_end_to_end_tests.py b/tests/test_cfg_end_to_end_tests.py
index db0425e7..e52678a7 100644
--- a/tests/test_cfg_end_to_end_tests.py
+++ b/tests/test_cfg_end_to_end_tests.py
@@ -36,8 +36,8 @@ class TestCase:
     """Class for keeping track of test cases."""
     config_path: str
     data_check_fn: Callable
-    # Fields in the manifest to ignore (can be set when non-deterministic processor was used)
     reference_manifest_filename: str = "test_data_reference.json"
+    # Fields in the manifest to ignore (can be set when non-deterministic processor was used)
     fields_to_ignore: List[str] = field(default_factory=list)
     processors_to_run: str = ""
 
@@ -232,19 +232,29 @@ def get_test_cases() -> List[Tuple[str, Callable]]:
             data_check_fn=partial(data_check_fn_generic, file_name="everyayah.hf")
         ),
         TestCase(
-            config_path=f"{DATASET_CONFIGS_ROOT}/armenian/toloka/pipeline_start.yaml", 
+            config_path=f"{DATASET_CONFIGS_ROOT}/armenian/toloka/pipeline_start.yaml",
             data_check_fn=data_check_fn_armenian_toloka_pipeline_start,
             fields_to_ignore=['source_filepath'],
             processors_to_run="2:14",
             reference_manifest_filename="pipeline_start/test_data_reference.json"
         ),
         TestCase(
-            config_path=f"{DATASET_CONFIGS_ROOT}/armenian/toloka/pipeline_get_final_res.yaml", 
+            config_path=f"{DATASET_CONFIGS_ROOT}/armenian/toloka/pipeline_get_final_res.yaml",
             data_check_fn=data_check_fn_armenian_toloka_pipeline_get_final_res,
             reference_manifest_filename="pipeline_get_final_res/test_data_reference.json",
             fields_to_ignore=['audio_filepath', 'duration'],
             processors_to_run="1:6"
-        )
+        ),
+        TestCase(
+            config_path=f"{DATASET_CONFIGS_ROOT}/english/hifitts2/config_22khz.yaml",
+            data_check_fn=partial(data_check_fn_generic, file_name="manifest_filtered_22khz.json"),
+            processors_to_run="1:2"
+        ),
+        TestCase(
+            config_path=f"{DATASET_CONFIGS_ROOT}/english/hifitts2/config_bandwidth.yaml",
+            data_check_fn=partial(data_check_fn_generic, file_name="manifest_22khz.json"),
+            reference_manifest_filename="test_data_reference_bandwidth.json",
+        ),
     ]
 
 def get_test_names():
@@ -357,6 +367,14 @@ def test_configs(setup_data, tmp_path):
         # Set input_manifest_file for ASRFileCheck to use the existing manifest.json
         cfg.processors[1].input_manifest_file = (data_dir / "pipeline_get_final_res" / "manifest.json").as_posix()
 
+    if "english/hifitts2/config_22khz" in config_path:
+        cfg.processors[1].input_manifest_file = (data_dir / "manifest_22khz.json").as_posix()
+        cfg.processors[1].error_file = (data_dir / "errors_22khz.json").as_posix()
+
+    if "english/hifitts2/config_bandwidth" in config_path:
+        cfg.processors[0].audio_dir = (data_dir / "audio_22khz").as_posix()
+        cfg.processors[0].input_manifest_file = (data_dir / "manifest_22khz.json").as_posix()
+
     run_processors(cfg)
     # additionally, let's test that final generated manifest matches the
     # reference file (ignoring the file paths and additional fields explicitly specified to ignore)

From aa643f85ad2a6ece3acd7a3c56cc43ddfe17f2e6 Mon Sep 17 00:00:00 2001
From: Ryan <rlangman@nvidia.com>
Date: Mon, 12 May 2025 16:48:22 -0700
Subject: [PATCH 3/6] Fix bandwidth documentation

Signed-off-by: Ryan <rlangman@nvidia.com>
---
 dataset_configs/english/hifitts2/config_22khz.yaml   |  2 +-
 .../english/hifitts2/config_bandwidth.yaml           |  4 ++--
 docs/src/sdp/existing_configs.rst                    | 12 ++++++++++--
 3 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/dataset_configs/english/hifitts2/config_22khz.yaml b/dataset_configs/english/hifitts2/config_22khz.yaml
index 09954bfe..93506582 100644
--- a/dataset_configs/english/hifitts2/config_22khz.yaml
+++ b/dataset_configs/english/hifitts2/config_22khz.yaml
@@ -1,6 +1,6 @@
 documentation: |
   HiFiTTS-2 22kHz
-  ##################
+  ###############
   
   This config can be used to download the audio data for HiFiTTS-2 22kHz.
   
diff --git a/dataset_configs/english/hifitts2/config_bandwidth.yaml b/dataset_configs/english/hifitts2/config_bandwidth.yaml
index 59e1062e..15c219bf 100644
--- a/dataset_configs/english/hifitts2/config_bandwidth.yaml
+++ b/dataset_configs/english/hifitts2/config_bandwidth.yaml
@@ -1,6 +1,6 @@
 documentation: |
-  HiFiTTS2 Bandwidth Estimation
-  ##################
+  HiFiTTS-2 Bandwidth Estimation
+  ##############################
   
   This config contains the bandwidth estimation code used for HiFiTTS and HiFiTTS-2.
   This config can be used to estimate bandwidth for any dataset. For HiFiTTS-2 bandwidth
diff --git a/docs/src/sdp/existing_configs.rst b/docs/src/sdp/existing_configs.rst
index 493b7233..5be69922 100644
--- a/docs/src/sdp/existing_configs.rst
+++ b/docs/src/sdp/existing_configs.rst
@@ -391,11 +391,19 @@ HiFiTTS-2
 
 **Dataset link:** TODO
 
-`config <https://github.com/NVIDIA/NeMo-speech-data-processor/blob/main/dataset_configs/arabic/everyayah/config.yaml>`__ |
-:doc:`documentation <config-docs/arabic/everyayah/config>`
+* **22kHz**:
+   `config <https://github.com/NVIDIA/NeMo-speech-data-processor/blob/main/dataset_configs/english/hifitts2/config_22khz.yaml>`__ |
+   :doc:`documentation <config-docs/english/hifitts2/config_22khz>`
+* **44kHz**:
+   `config <https://github.com/NVIDIA/NeMo-speech-data-processor/blob/main/dataset_configs/english/hifitts2/config_44khz.yaml>`__ |
+   :doc:`documentation <config-docs/english/hifitts2/config_44khz>`
+* **Bandwidth Estimation**:
+   `config <https://github.com/NVIDIA/NeMo-speech-data-processor/blob/main/dataset_configs/english/hifitts2/config_bandwidth.yaml>`__ |
+   :doc:`documentation <config-docs/english/hifitts2/config_bandwidth>`
 
 .. toctree::
    :hidden:
 
    config-docs/english/hifitts2/config_22khz
    config-docs/english/hifitts2/config_44khz
+   config-docs/english/hifitts2/config_bandwidth

From 64fb25ea5037b66768d42c271a960311c40a1e24 Mon Sep 17 00:00:00 2001
From: Ryan <rlangman@nvidia.com>
Date: Tue, 27 May 2025 12:05:58 -0700
Subject: [PATCH 4/6] Fix tests, add example to docstrings

Signed-off-by: Ryan <rlangman@nvidia.com>
---
 sdp/processors/datasets/hifitts2/download_dataset.py | 10 ++++++++++
 .../datasets/hifitts2/remove_failed_chapters.py      |  8 ++++++++
 sdp/processors/nemo/estimate_bandwidth.py            | 12 +++++++++++-
 tests/prepare_test_data/prepare_hifitts2_data.py     |  3 ++-
 tests/test_cfg_end_to_end_tests.py                   |  3 ++-
 5 files changed, 33 insertions(+), 3 deletions(-)

diff --git a/sdp/processors/datasets/hifitts2/download_dataset.py b/sdp/processors/datasets/hifitts2/download_dataset.py
index 74c91bb9..b1982319 100644
--- a/sdp/processors/datasets/hifitts2/download_dataset.py
+++ b/sdp/processors/datasets/hifitts2/download_dataset.py
@@ -49,6 +49,16 @@ class DownloadHiFiTTS2(BaseParallelProcessor):
         
         If exit_on_error is False, then an output manifest will be saved with manifest entries that fail to downlaod,
         with error information stored under the 'error_code' and 'error_reason' fields.
+
+    Example:
+        .. code-block:: yaml
+
+            - _target_: sdp.processors.DownloadHiFiTTS2
+              input_manifest_file: ${workspace_dir}/manifest_22khz.json
+              output_manifest_file: ${workspace_dir}/errors_22khz.json
+              audio_dir: ${workspace_dir}/audio_22khz
+              chapter_dir: ${workspace_dir}/chapters
+              max_workers: 8
     """
 
     def __init__(
diff --git a/sdp/processors/datasets/hifitts2/remove_failed_chapters.py b/sdp/processors/datasets/hifitts2/remove_failed_chapters.py
index 7d6eaf07..b4cd5a8b 100644
--- a/sdp/processors/datasets/hifitts2/remove_failed_chapters.py
+++ b/sdp/processors/datasets/hifitts2/remove_failed_chapters.py
@@ -31,6 +31,14 @@ class RemovedFailedChapters(BaseProcessor):
 
     Returns:
         This outputs a manifest which is the same as its input manifest but with utterances in 'error_file' removed.
+
+    Example:
+        .. code-block:: yaml
+
+            - _target_: sdp.processors.RemovedFailedChapters
+              input_manifest_file: ${workspace_dir}/manifest_22khz.json
+              output_manifest_file: ${workspace_dir}/manifest_filtered_22khz.json
+              error_file: ${workspace_dir}/errors_22khz.json
     """
 
     def __init__(
diff --git a/sdp/processors/nemo/estimate_bandwidth.py b/sdp/processors/nemo/estimate_bandwidth.py
index 149ff0ba..38b261e7 100644
--- a/sdp/processors/nemo/estimate_bandwidth.py
+++ b/sdp/processors/nemo/estimate_bandwidth.py
@@ -6,7 +6,8 @@
 
 
 class EstimateBandwidth(BaseParallelProcessor):
-    """Adds estimated bandwidth to each utterance in the input manifest file.
+    """
+    Adds estimated bandwidth to each utterance in the input manifest file.
 
     Args:
         audio_dir (str): Root directory where audio files are stored.
@@ -26,6 +27,15 @@ class EstimateBandwidth(BaseParallelProcessor):
     Returns:
         This processor estimates the bandwidth of the audio file in the`input_audio_key` field and saves the estimate
             in the output_bandwidth_key` field.
+
+    Example:
+        .. code-block:: yaml
+
+            - _target_: sdp.processors.EstimateBandwidth
+              input_manifest_file: ${workspace_dir}/manifest.json
+              output_manifest_file: ${workspace_dir}/manifest_bandwidth.json
+              audio_dir: ${workspace_dir}/audio_22khz
+              max_workers: 8
     """
 
     def __init__(
diff --git a/tests/prepare_test_data/prepare_hifitts2_data.py b/tests/prepare_test_data/prepare_hifitts2_data.py
index 93e07132..9a83fad1 100644
--- a/tests/prepare_test_data/prepare_hifitts2_data.py
+++ b/tests/prepare_test_data/prepare_hifitts2_data.py
@@ -57,9 +57,10 @@
                     output_f.write(line)
 
     # Copy audio
+    manifest_path = args.test_data_folder / args.manifest_filename
     input_audio_dir = args.workspace_folder / args.audio_folder
     output_audio_dir = args.test_data_folder / args.audio_folder
-    with open(args.manifest_filename, "r", encoding="utf-8") as input_f:
+    with open(manifest_path, "r", encoding="utf-8") as input_f:
         for i, line in enumerate(input_f):
             if i >= args.num_entries:
                 break
diff --git a/tests/test_cfg_end_to_end_tests.py b/tests/test_cfg_end_to_end_tests.py
index e52678a7..214096ee 100644
--- a/tests/test_cfg_end_to_end_tests.py
+++ b/tests/test_cfg_end_to_end_tests.py
@@ -247,7 +247,7 @@ def get_test_cases() -> List[Tuple[str, Callable]]:
         ),
         TestCase(
             config_path=f"{DATASET_CONFIGS_ROOT}/english/hifitts2/config_22khz.yaml",
-            data_check_fn=partial(data_check_fn_generic, file_name="manifest_filtered_22khz.json"),
+            data_check_fn=partial(data_check_fn_generic, file_name="manifest_22khz.json"),
             processors_to_run="1:2"
         ),
         TestCase(
@@ -257,6 +257,7 @@ def get_test_cases() -> List[Tuple[str, Callable]]:
         ),
     ]
 
+
 def get_test_names():
     config_names = [
         Path(t.config_path).parent.relative_to(DATASET_CONFIGS_ROOT).as_posix() for t in get_test_cases()

From 2fd724ed42aafa847578ecd595c50852f4498b62 Mon Sep 17 00:00:00 2001
From: Ryan <rlangman@nvidia.com>
Date: Mon, 2 Jun 2025 09:49:42 -0700
Subject: [PATCH 5/6] Fix exception handling for URLError

Signed-off-by: Ryan <rlangman@nvidia.com>
---
 sdp/processors/datasets/hifitts2/download_dataset.py | 6 ++++--
 tests/test_cfg_end_to_end_tests.py                   | 1 -
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/sdp/processors/datasets/hifitts2/download_dataset.py b/sdp/processors/datasets/hifitts2/download_dataset.py
index b1982319..6e965e32 100644
--- a/sdp/processors/datasets/hifitts2/download_dataset.py
+++ b/sdp/processors/datasets/hifitts2/download_dataset.py
@@ -110,7 +110,9 @@ def process_dataset_entry(self, data_entry):
             except (urllib.error.HTTPError, urllib.error.URLError) as http_error:
                 error_msg = f"Encountered HTTP error when downloading {url}: {http_error}"
                 logger.warning(error_msg)
-                if str(http_error.code).startswith("5") and i < self.num_retries:
+
+                error_code = getattr(http_error, "code", 0)
+                if (not error_code or str(error_code).startswith("5")) and i < self.num_retries:
                     logger.info(f"Retry {i} for url {url}")
                     time.sleep(10)
                     continue
@@ -121,7 +123,7 @@ def process_dataset_entry(self, data_entry):
                 error_data = {
                     "url": url,
                     "chapter_filepath": chapter_filepath,
-                    "error_code": http_error.code,
+                    "error_code": error_code,
                     "error_reason": http_error.reason,
                     "utterances": utterances,
                 }
diff --git a/tests/test_cfg_end_to_end_tests.py b/tests/test_cfg_end_to_end_tests.py
index 214096ee..562fc0c2 100644
--- a/tests/test_cfg_end_to_end_tests.py
+++ b/tests/test_cfg_end_to_end_tests.py
@@ -257,7 +257,6 @@ def get_test_cases() -> List[Tuple[str, Callable]]:
         ),
     ]
 
-
 def get_test_names():
     config_names = [
         Path(t.config_path).parent.relative_to(DATASET_CONFIGS_ROOT).as_posix() for t in get_test_cases()

From ed4124fae6cdbb530147ab77e9fbf11e35d2db16 Mon Sep 17 00:00:00 2001
From: Ryan <rlangman@nvidia.com>
Date: Tue, 3 Jun 2025 09:30:36 -0700
Subject: [PATCH 6/6] Add 44kHz config test

Signed-off-by: Ryan <rlangman@nvidia.com>
---
 tests/test_cfg_end_to_end_tests.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/tests/test_cfg_end_to_end_tests.py b/tests/test_cfg_end_to_end_tests.py
index 562fc0c2..9d860ce9 100644
--- a/tests/test_cfg_end_to_end_tests.py
+++ b/tests/test_cfg_end_to_end_tests.py
@@ -250,6 +250,11 @@ def get_test_cases() -> List[Tuple[str, Callable]]:
             data_check_fn=partial(data_check_fn_generic, file_name="manifest_22khz.json"),
             processors_to_run="1:2"
         ),
+        TestCase(
+            config_path=f"{DATASET_CONFIGS_ROOT}/english/hifitts2/config_44khz.yaml",
+            data_check_fn=partial(data_check_fn_generic, file_name="manifest_44khz.json"),
+            processors_to_run="1:2"
+        ),
         TestCase(
             config_path=f"{DATASET_CONFIGS_ROOT}/english/hifitts2/config_bandwidth.yaml",
             data_check_fn=partial(data_check_fn_generic, file_name="manifest_22khz.json"),
@@ -371,6 +376,10 @@ def test_configs(setup_data, tmp_path):
         cfg.processors[1].input_manifest_file = (data_dir / "manifest_22khz.json").as_posix()
         cfg.processors[1].error_file = (data_dir / "errors_22khz.json").as_posix()
 
+    if "english/hifitts2/config_44khz" in config_path:
+        cfg.processors[1].input_manifest_file = (data_dir / "manifest_44khz.json").as_posix()
+        cfg.processors[1].error_file = (data_dir / "errors_44khz.json").as_posix()
+
     if "english/hifitts2/config_bandwidth" in config_path:
         cfg.processors[0].audio_dir = (data_dir / "audio_22khz").as_posix()
         cfg.processors[0].input_manifest_file = (data_dir / "manifest_22khz.json").as_posix()