diff --git a/dataset_configs/english/hifitts2/config_22khz.yaml b/dataset_configs/english/hifitts2/config_22khz.yaml new file mode 100644 index 00000000..93506582 --- /dev/null +++ b/dataset_configs/english/hifitts2/config_22khz.yaml @@ -0,0 +1,62 @@ +documentation: | + HiFiTTS-2 22kHz + ############### + + This config can be used to download the audio data for HiFiTTS-2 22kHz. + + 1. Downloads HiFiTTS-2 audio from LibriVox. + 2. Outputs a new manifest in which LibriVox audiobook chapters which could not be downloaded (e.g. because they + were removed from the website) are removed. + + **Required arguments**. + + * **workspace_dir**: specify the workspace folder where all audio files and manifests will be stored. + + Note that you can customize any part of this config either directly or from command-line. + + **Output format**. + + This config outputs 2 manifest files: + + * ``${workspace_dir}/errors_22khz.json`` - entries from the input chapters file which failed to download from LibriVox. + * ``${workspace_dir}/manifest_filtered_22khz.json`` - input manifest file without utterances from failed chapters. + +processors_to_run: all +workspace_dir: ??? 
+manifest_filename: manifest_22khz.json +output_filename: manifest_filtered_22khz.json +chapter_filename: chapters_22khz.json +error_filename: errors_22khz.json +audio_dir_name: audio_22khz +chapter_audio_dir_name: chapters +sample_rate: 22050 +delete_chapter_files: true +exit_on_error: false +use_dask: false +max_workers: 8 +chunksize: 50 + +input_manifest_file: ${workspace_dir}/${manifest_filename} +chapter_file: ${workspace_dir}/${chapter_filename} +error_file: ${workspace_dir}/${error_filename} +audio_dir: ${workspace_dir}/${audio_dir_name} +chapter_dir: ${workspace_dir}/${chapter_audio_dir_name} +final_manifest: ${workspace_dir}/${output_filename} + +processors: + - _target_: sdp.processors.DownloadHiFiTTS2 + audio_dir: ${audio_dir} + chapter_dir: ${chapter_dir} + sample_rate: ${sample_rate} + delete_chapter_files: ${delete_chapter_files} + exit_on_error: ${exit_on_error} + input_manifest_file: ${chapter_file} + output_manifest_file: ${error_file} + use_dask: ${use_dask} + max_workers: ${max_workers} + chunksize: ${chunksize} + + - _target_: sdp.processors.RemovedFailedChapters + input_manifest_file: ${input_manifest_file} + output_manifest_file: ${final_manifest} + error_file: ${error_file} diff --git a/dataset_configs/english/hifitts2/config_44khz.yaml b/dataset_configs/english/hifitts2/config_44khz.yaml new file mode 100644 index 00000000..33d79f98 --- /dev/null +++ b/dataset_configs/english/hifitts2/config_44khz.yaml @@ -0,0 +1,62 @@ +documentation: | + HiFiTTS-2 44kHz + ################## + + This config can be used to download the audio data for HiFiTTS-2 44kHz. + + 1. Downloads HiFiTTS-2 audio from LibriVox. + 2. Outputs a new manifest in which LibriVox audiobook chapters which could not be downloaded (e.g. because they + were removed from the website) are removed. + + **Required arguments**. + + * **workspace_dir**: specify the workspace folder where all audio files and manifests will be stored. 
+ + Note that you can customize any part of this config either directly or from command-line. + + **Output format**. + + This config outputs 2 manifest files: + + * ``${workspace_dir}/errors_44khz.json`` - entries from the input chapters file which failed to download from LibriVox. + * ``${workspace_dir}/manifest_filtered_44khz.json`` - input manifest file without utterances from failed chapters. + +processors_to_run: all +workspace_dir: ??? +manifest_filename: manifest_44khz.json +output_filename: manifest_filtered_44khz.json +chapter_filename: chapters_44khz.json +error_filename: errors_44khz.json +audio_dir_name: audio_44khz +chapter_audio_dir_name: chapters +sample_rate: 44100 +delete_chapter_files: true +exit_on_error: false +use_dask: false +max_workers: 8 +chunksize: 50 + +input_manifest_file: ${workspace_dir}/${manifest_filename} +chapter_file: ${workspace_dir}/${chapter_filename} +error_file: ${workspace_dir}/${error_filename} +audio_dir: ${workspace_dir}/${audio_dir_name} +chapter_dir: ${workspace_dir}/${chapter_audio_dir_name} +final_manifest: ${workspace_dir}/${output_filename} + +processors: + - _target_: sdp.processors.DownloadHiFiTTS2 + audio_dir: ${audio_dir} + chapter_dir: ${chapter_dir} + sample_rate: ${sample_rate} + delete_chapter_files: ${delete_chapter_files} + exit_on_error: ${exit_on_error} + input_manifest_file: ${chapter_file} + output_manifest_file: ${error_file} + use_dask: ${use_dask} + max_workers: ${max_workers} + chunksize: ${chunksize} + + - _target_: sdp.processors.RemovedFailedChapters + input_manifest_file: ${input_manifest_file} + output_manifest_file: ${final_manifest} + error_file: ${error_file} diff --git a/dataset_configs/english/hifitts2/config_bandwidth.yaml b/dataset_configs/english/hifitts2/config_bandwidth.yaml new file mode 100644 index 00000000..15c219bf --- /dev/null +++ b/dataset_configs/english/hifitts2/config_bandwidth.yaml @@ -0,0 +1,44 @@ +documentation: | + HiFiTTS-2 Bandwidth Estimation + ############################## + 
+ This config contains the bandwidth estimation code used for HiFiTTS and HiFiTTS-2. + This config can be used to estimate bandwidth for any dataset. For HiFiTTS-2 bandwidth + was estimated using the first 30 seconds of every audiobook chapter, but the estimate is still + reasonably accurate if run over a shorter duration or with individual utterances. + + **Required arguments**. + + * **workspace_dir**: The workspace folder where all audio files and manifests are stored. + * **audio_dir_name**: Name of the folder in workspace containing audio files to estimate bandwidth of. + * **input_manifest_filename**: Manifest file in workspace containing relative paths to audio. + + **Output format**. + + This config outputs a single manifest with the following field(s): + + * **bandwidth (int)**: Estimated bandwidth of the audio file. + +processors_to_run: all +workspace_dir: ??? +audio_dir_name: ??? +input_manifest_filename: ??? +output_manifest_filename: manifest_bandwidth.json +audio_key: audio_filepath +use_dask: false +max_workers: 1 +chunksize: 1 + +input_manifest_file: ${workspace_dir}/${input_manifest_filename} +final_manifest: ${workspace_dir}/${output_manifest_filename} +audio_dir: ${workspace_dir}/${audio_dir_name} + +processors: + - _target_: sdp.processors.EstimateBandwidth + input_manifest_file: ${input_manifest_file} + output_manifest_file: ${final_manifest} + audio_dir: ${audio_dir} + input_audio_key: ${audio_key} + use_dask: ${use_dask} + max_workers: ${max_workers} + chunksize: ${chunksize} diff --git a/docs/src/sdp/api.rst b/docs/src/sdp/api.rst index c285c3b8..bfa2bc62 100644 --- a/docs/src/sdp/api.rst +++ b/docs/src/sdp/api.rst @@ -116,12 +116,24 @@ HuggingFace Datasets .. autodata:: sdp.processors.CreateInitialManifestHuggingFace :annotation: + YTC Datasets '''''''''''' .. autodata:: sdp.processors.datasets.ytc.create_initial_manifest.CreateInitialManifestYTC :annotation: + +HiFiTTS-2 +'''''''''''''''''''' + +.. 
autodata:: sdp.processors.DownloadHiFiTTS2 + :annotation: + +.. autodata:: sdp.processors.RemovedFailedChapters + :annotation: + + Lhotse processors ################# @@ -151,6 +163,9 @@ used in the downstream processing for additional enhancement or filtering. .. autodata:: sdp.processors.ASRTransformers :annotation: +.. autodata:: sdp.processors.EstimateBandwidth + :annotation: + .. autodata:: sdp.processors.tts.pyannote.PyAnnoteDiarizationAndOverlapDetection :annotation: @@ -166,7 +181,6 @@ used in the downstream processing for additional enhancement or filtering. .. autodata:: sdp.processors.tts.metrics.BandwidthEstimationProcessor :annotation: - Text-only processors #################### diff --git a/docs/src/sdp/existing_configs.rst b/docs/src/sdp/existing_configs.rst index 233a05dd..5be69922 100644 --- a/docs/src/sdp/existing_configs.rst +++ b/docs/src/sdp/existing_configs.rst @@ -366,6 +366,13 @@ Armenian Toloka `config `__ | :doc:`documentation ` +.. toctree:: + :hidden: + + config-docs/armenian/toloka/pipeline_start + config-docs/armenian/toloka/pipeline_validate_answers + config-docs/armenian/toloka/pipeline_get_final_res + YouTube Commons (YTC) ~~~~~~~~~~~~~~~~~~~~~~ @@ -377,8 +384,26 @@ YouTube Commons (YTC) .. toctree:: :hidden: - config-docs/armenian/toloka/pipeline_start - config-docs/armenian/toloka/pipeline_validate_answers - config-docs/armenian/toloka/pipeline_get_final_res - config-docs/tts/ytc/config + +HiFiTTS-2 +~~~~~~~~~~~~~~~~~~~~~~~ + +**Dataset link:** TODO + +* **22kHz**: + `config `__ | + :doc:`documentation ` +* **44kHz**: + `config `__ | + :doc:`documentation ` +* **Bandwidth Estimation**: + `config `__ | + :doc:`documentation ` + +.. 
toctree:: + :hidden: + + config-docs/english/hifitts2/config_22khz + config-docs/english/hifitts2/config_44khz + config-docs/english/hifitts2/config_bandwidth diff --git a/sdp/processors/__init__.py b/sdp/processors/__init__.py index df860331..c3ff70b6 100644 --- a/sdp/processors/__init__.py +++ b/sdp/processors/__init__.py @@ -24,6 +24,8 @@ from sdp.processors.datasets.fleurs.create_initial_manifest import ( CreateInitialManifestFleurs, ) +from sdp.processors.datasets.hifitts2.download_dataset import DownloadHiFiTTS2 +from sdp.processors.datasets.hifitts2.remove_failed_chapters import RemovedFailedChapters from sdp.processors.datasets.uzbekvoice.create_initial_manifest import ( CreateInitialManifestUzbekvoice, ) @@ -127,6 +129,7 @@ MakeLettersUppercaseAfterPeriod, ) from sdp.processors.nemo.asr_inference import ASRInference +from sdp.processors.nemo.estimate_bandwidth import EstimateBandwidth from sdp.processors.nemo.pc_inference import PCInference from sdp.processors.toloka.accept_if import AcceptIfWERLess from sdp.processors.toloka.create_pool import CreateTolokaPool diff --git a/sdp/processors/datasets/hifitts2/__init__.py b/sdp/processors/datasets/hifitts2/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/sdp/processors/datasets/hifitts2/download_dataset.py b/sdp/processors/datasets/hifitts2/download_dataset.py new file mode 100644 index 00000000..6e965e32 --- /dev/null +++ b/sdp/processors/datasets/hifitts2/download_dataset.py @@ -0,0 +1,147 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
class DownloadHiFiTTS2(BaseParallelProcessor):
    """
    Downloads the HiFiTTS-2 dataset to the local machine.

    Unsegmented audiobook chapters are first downloaded at 48 kHz from LibriVox. Each chapter is then
    split into segmented utterance files based on precomputed offsets and durations.

    To reduce disk use, the chapter files can be optionally deleted after they are segmented.

    Metadata for chapters which fail to download due to network errors are stored in an output manifest file,
    which can be given as input to this processor to attempt the downloads again.

    Each input manifest entry describes one chapter and must provide the ``url``, ``chapter_filepath`` and
    ``utterances`` fields; every utterance must provide ``audio_filepath``, ``offset`` and ``duration``.

    Args:
        audio_dir (str): Root directory where utterance files will be saved.
        chapter_dir (str): Root directory where audiobook chapter files will be saved.
        sample_rate (int): Sample rate to use for utterance files.
        delete_chapter_files (bool): Whether to delete each chapter file after it is done being processed.
        exit_on_error (bool): Whether to terminate the entire processor script if a single chapter download fails.
        num_retries (int): Maximum number of download attempts per chapter when encountering intermittent
            HTTP errors. At least one attempt is always made.

    Returns:
        Utterance files are stored under 'audio_dir' and chapter files are downloaded under 'chapter_dir'.

        If exit_on_error is False, then an output manifest will be saved with manifest entries that fail to download,
        with error information stored under the 'error_code' and 'error_reason' fields.

    Example:
        .. code-block:: yaml

            - _target_: sdp.processors.DownloadHiFiTTS2
              input_manifest_file: ${workspace_dir}/chapters_22khz.json
              output_manifest_file: ${workspace_dir}/errors_22khz.json
              audio_dir: ${workspace_dir}/audio_22khz
              chapter_dir: ${workspace_dir}/chapters
              max_workers: 8
    """

    def __init__(
        self,
        audio_dir: str,
        chapter_dir: str,
        sample_rate: int = 22050,
        delete_chapter_files: bool = True,
        exit_on_error: bool = False,
        num_retries: int = 5,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.audio_dir = Path(audio_dir)
        self.chapter_dir = Path(chapter_dir)
        self.sample_rate = sample_rate
        self.delete_chapter_files = delete_chapter_files
        self.exit_on_error = exit_on_error
        self.num_retries = num_retries

    def prepare(self):
        """Creates the output directory structure before any chapter is processed."""
        relative_dirs = set()
        with open(self.input_manifest_file, "rt", encoding="utf-8") as fin:
            for line in fin:
                row = json.loads(line)
                # NOTE(review): only the first utterance is inspected, which assumes every utterance of a
                # chapter (and the chapter file itself) lives in the same relative directory - confirm.
                first_audio_filepath = Path(row["utterances"][0]["audio_filepath"])
                relative_dirs.add(first_audio_filepath.parent)

        for relative_dir in relative_dirs:
            (self.audio_dir / relative_dir).mkdir(exist_ok=True, parents=True)
            (self.chapter_dir / relative_dir).mkdir(exist_ok=True, parents=True)

    def _download_chapter(self, url, chapter_path):
        """Downloads 'url' to 'chapter_path'; returns None on success or the final download exception."""
        # Guarantee at least one attempt so that a non-positive 'num_retries' cannot skip the download.
        max_attempts = max(1, self.num_retries)
        for attempt in range(1, max_attempts + 1):
            try:
                urllib.request.urlretrieve(url=url, filename=chapter_path)
                return None
            # HTTPError is a subclass of URLError, so this handles both.
            except urllib.error.URLError as http_error:
                logger.warning(f"Encountered HTTP error when downloading {url}: {http_error}")

                # Plain URLError has no status code; treat that, like 5xx responses, as retryable.
                error_code = getattr(http_error, "code", 0)
                is_retryable = not error_code or str(error_code).startswith("5")
                if is_retryable and attempt < max_attempts:
                    logger.info(f"Retry {attempt} for url {url}")
                    time.sleep(10)
                    continue

                return http_error

        return None

    def process_dataset_entry(self, data_entry):
        """
        Downloads one chapter and writes each of its utterances to its own audio file.

        Args:
            data_entry (dict): Chapter entry with the 'url', 'chapter_filepath' and 'utterances' fields.

        Returns:
            An empty list on success. If the download fails and 'exit_on_error' is False, a single
            DataEntry that repeats the chapter metadata with 'error_code' and 'error_reason' added.

        Raises:
            RuntimeError: If the download fails and 'exit_on_error' is True.
        """
        url = data_entry["url"]
        chapter_filepath = data_entry["chapter_filepath"]
        utterances = data_entry["utterances"]

        chapter_path = self.chapter_dir / chapter_filepath
        download_error = self._download_chapter(url=url, chapter_path=chapter_path)
        if download_error is not None:
            if self.exit_on_error:
                error_msg = f"Encountered HTTP error when downloading {url}: {download_error}"
                raise RuntimeError(error_msg) from download_error

            error_data = {
                "url": url,
                "chapter_filepath": chapter_filepath,
                "error_code": getattr(download_error, "code", 0),
                # URLError.reason may be an exception instance instead of a string; store its text so
                # the entry holds only plain values (presumably required when the manifest is written).
                "error_reason": str(download_error.reason),
                "utterances": utterances,
            }
            return [DataEntry(data=error_data)]

        chapter_audio, sr = librosa.load(path=chapter_path, sr=self.sample_rate)

        for utt in utterances:
            audio_path = self.audio_dir / utt["audio_filepath"]
            offset = utt["offset"]
            dur = utt["duration"]
            start_sample = librosa.time_to_samples(offset, sr=sr)
            end_sample = librosa.time_to_samples(offset + dur, sr=sr)
            audio = chapter_audio[start_sample:end_sample]
            sf.write(file=audio_path, data=audio, samplerate=int(sr))

        if self.delete_chapter_files:
            chapter_path.unlink()

        return []
class RemovedFailedChapters(BaseProcessor):
    """
    Filters the input manifest by dropping every utterance that is listed in the chapter error file.

    This processor is expected to be run using the file output by the DownloadHiFiTTS2 processor,
    which contains the chapters that failed to download.

    Args:
        error_file (str): Path to file with chapter download errors.

    Returns:
        This outputs a manifest which is the same as its input manifest but with utterances in 'error_file' removed.

    Example:
        .. code-block:: yaml

            - _target_: sdp.processors.RemovedFailedChapters
              input_manifest_file: ${workspace_dir}/manifest_22khz.json
              output_manifest_file: ${workspace_dir}/manifest_filtered_22khz.json
              error_file: ${workspace_dir}/errors_22khz.json
    """

    def __init__(self, error_file: str, **kwargs):
        super().__init__(**kwargs)
        self.error_file = Path(error_file)

    def process(self):
        """Writes every input manifest row whose 'audio_filepath' is not part of a failed chapter."""
        # Gather the audio paths of all utterances belonging to failed chapters.
        failed_audio_files = {
            utterance["audio_filepath"]
            for failed_chapter in load_manifest(self.error_file)
            for utterance in failed_chapter["utterances"]
        }

        manifest_entries = load_manifest(Path(self.input_manifest_file))
        with open(self.output_manifest_file, "w", encoding="utf-8") as fout:
            for entry in tqdm(manifest_entries):
                if entry["audio_filepath"] not in failed_audio_files:
                    fout.write(f"{json.dumps(entry, ensure_ascii=False)}\n")
class EstimateBandwidth(BaseParallelProcessor):
    """
    Estimates the bandwidth of each audio file in the input manifest and stores it in the output manifest.

    Args:
        audio_dir (str): Root directory where audio files are stored.
        input_audio_key (str): Manifest key with relative audio paths.
        output_bandwidth_key (str): Manifest key to store estimated bandwidth in.
        max_seconds (float): The maximum length of audio to use for bandwidth estimation.
            By default, uses the first 30 seconds.
        sample_rate (int): Sample rate to resample audio to before doing bandwidth estimation.
            Defaults to 44100, upsampling the input audio as needed.
        n_fft (int): FFT size to use for bandwidth estimation. Defaults to 512.
        hop_length (int): Audio frame hop length to use for bandwidth estimation.
            Defaults to 441, corresponding to 0.01 seconds for 44100 sample rate.
        top_db (float): ``top_db`` threshold passed to ``librosa.power_to_db``. Defaults to 100 dB.
        frequency_threshold (float): Bandwidth estimation finds the highest frequency with mean power spectrum
            that is within 'frequency_threshold' dB of its peak power. Defaults to -50 dB.

    Returns:
        This processor estimates the bandwidth of the audio file in the ``input_audio_key`` field and saves
        the estimate in the ``output_bandwidth_key`` field.

    Example:
        .. code-block:: yaml

            - _target_: sdp.processors.EstimateBandwidth
              input_manifest_file: ${workspace_dir}/manifest.json
              output_manifest_file: ${workspace_dir}/manifest_bandwidth.json
              audio_dir: ${workspace_dir}/audio_22khz
              max_workers: 8
    """

    def __init__(
        self,
        audio_dir: str,
        input_audio_key: str = "audio_filepath",
        output_bandwidth_key: str = "bandwidth",
        max_seconds: float = 30.0,
        sample_rate: int = 44100,
        n_fft: int = 512,
        hop_length: int = 441,
        top_db: float = 100.0,
        frequency_threshold: float = -50.0,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.audio_directory = Path(audio_dir)
        self.input_audio_key = input_audio_key
        self.output_bandwidth_key = output_bandwidth_key
        self.max_seconds = max_seconds
        self.sample_rate = sample_rate
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.top_db = top_db
        self.frequency_threshold = frequency_threshold

    def _estimate_bandwidth(self, audio, sample_rate):
        """Returns the highest frequency whose mean power is within the threshold of the peak power."""
        magnitude = np.abs(
            librosa.stft(y=audio, n_fft=self.n_fft, hop_length=self.hop_length, window="blackmanharris")
        )
        # Average the power over all frames, then convert the per-bin mean to dB.
        mean_power = np.mean(magnitude**2, axis=1)
        mean_power_db = librosa.power_to_db(mean_power, ref=self.n_fft, top_db=self.top_db)
        peak_db = np.max(mean_power_db)

        # Scan from the highest frequency bin downwards and stop at the first bin above the threshold.
        highest_bin = next(
            (
                bin_idx
                for bin_idx in reversed(range(len(mean_power_db)))
                if mean_power_db[bin_idx] - peak_db > self.frequency_threshold
            ),
            None,
        )
        if highest_bin is None:
            return 0

        bin_width_hz = sample_rate / self.n_fft
        return highest_bin * bin_width_hz

    def process_dataset_entry(self, data_entry):
        """Loads the entry's audio, estimates its bandwidth and stores it as an int in the entry."""
        audio_path = self.audio_directory / data_entry[self.input_audio_key]
        samples, loaded_rate = librosa.load(path=audio_path, sr=self.sample_rate, duration=self.max_seconds)
        estimate = self._estimate_bandwidth(audio=samples, sample_rate=loaded_rate)
        data_entry[self.output_bandwidth_key] = int(estimate)
        return [DataEntry(data=data_entry)]
"""Copies HiFiTTS-2 manifests and audio into a new directory with fewer entries."""

import argparse
import json
import os
import shutil
from itertools import islice
from pathlib import Path


def _parse_args(argv=None):
    """Parses the command-line arguments (reads ``sys.argv`` when 'argv' is None)."""
    # Pass the text as 'description'; the first positional argument of ArgumentParser is 'prog'.
    parser = argparse.ArgumentParser(description="Preparing HiFiTTS-2 test data")
    parser.add_argument(
        "--workspace_folder", required=True, type=Path, help="Path to workspace where dataset was downloaded."
    )
    parser.add_argument(
        "--audio_folder", default="audio_22khz", type=Path, required=False, help="Name of root folder with audio."
    )
    parser.add_argument("--test_data_folder", required=True, type=Path, help="Where to place the prepared data")
    parser.add_argument(
        "--manifest_filename", default="manifest_22khz.json", type=str, required=False, help="Name of manifest."
    )
    parser.add_argument(
        "--chapters_filename", default="chapters_22khz.json", type=str, required=False, help="Name of chapter manifest."
    )
    parser.add_argument(
        "--error_filename", default="errors_22khz.json", type=str, required=False, help="Name of chapter error manifest."
    )
    parser.add_argument("--num_entries", default=20, type=int, help="How many entries to keep from each manifest")
    return parser.parse_args(argv)


def _copy_first_lines(input_path, output_path, num_lines):
    """Copies at most 'num_lines' leading lines of 'input_path' into 'output_path'."""
    with open(input_path, "r", encoding="utf-8") as input_f, open(output_path, "w", encoding="utf-8") as output_f:
        # A non-positive count copies nothing, matching the behaviour of an index-based cut-off.
        output_f.writelines(islice(input_f, max(num_lines, 0)))


def _copy_audio(manifest_path, input_audio_dir, output_audio_dir):
    """Copies every audio file referenced by 'manifest_path', preserving relative paths."""
    with open(manifest_path, "r", encoding="utf-8") as input_f:
        for line in input_f:
            if not line.strip():
                continue
            audio_filepath = json.loads(line)["audio_filepath"]
            output_path = output_audio_dir / audio_filepath
            output_path.parent.mkdir(exist_ok=True, parents=True)
            shutil.copy(src=input_audio_dir / audio_filepath, dst=output_path)


def main(argv=None):
    """Creates the reduced test data set described by the command-line arguments."""
    args = _parse_args(argv)

    os.makedirs(args.test_data_folder, exist_ok=True)
    # Copy manifest files
    for filename in (args.manifest_filename, args.chapters_filename, args.error_filename):
        _copy_first_lines(
            input_path=args.workspace_folder / filename,
            output_path=args.test_data_folder / filename,
            num_lines=args.num_entries,
        )

    # Copy audio. The copied manifest is already truncated, so every entry in it is used.
    _copy_audio(
        manifest_path=args.test_data_folder / args.manifest_filename,
        input_audio_dir=args.workspace_folder / args.audio_folder,
        output_audio_dir=args.test_data_folder / args.audio_folder,
    )


if __name__ == "__main__":
    main()
List[Tuple[str, Callable]]: data_check_fn=partial(data_check_fn_generic, file_name="everyayah.hf") ), TestCase( - config_path=f"{DATASET_CONFIGS_ROOT}/armenian/toloka/pipeline_start.yaml", + config_path=f"{DATASET_CONFIGS_ROOT}/armenian/toloka/pipeline_start.yaml", data_check_fn=data_check_fn_armenian_toloka_pipeline_start, fields_to_ignore=['source_filepath'], processors_to_run="2:14", reference_manifest_filename="pipeline_start/test_data_reference.json" ), TestCase( - config_path=f"{DATASET_CONFIGS_ROOT}/armenian/toloka/pipeline_get_final_res.yaml", + config_path=f"{DATASET_CONFIGS_ROOT}/armenian/toloka/pipeline_get_final_res.yaml", data_check_fn=data_check_fn_armenian_toloka_pipeline_get_final_res, reference_manifest_filename="pipeline_get_final_res/test_data_reference.json", fields_to_ignore=['audio_filepath', 'duration'], processors_to_run="1:6" - ) + ), + TestCase( + config_path=f"{DATASET_CONFIGS_ROOT}/english/hifitts2/config_22khz.yaml", + data_check_fn=partial(data_check_fn_generic, file_name="manifest_22khz.json"), + processors_to_run="1:2" + ), + TestCase( + config_path=f"{DATASET_CONFIGS_ROOT}/english/hifitts2/config_44khz.yaml", + data_check_fn=partial(data_check_fn_generic, file_name="manifest_44khz.json"), + processors_to_run="1:2" + ), + TestCase( + config_path=f"{DATASET_CONFIGS_ROOT}/english/hifitts2/config_bandwidth.yaml", + data_check_fn=partial(data_check_fn_generic, file_name="manifest_22khz.json"), + reference_manifest_filename="test_data_reference_bandwidth.json", + ), ] def get_test_names(): @@ -357,6 +372,18 @@ def test_configs(setup_data, tmp_path): # Set input_manifest_file for ASRFileCheck to use the existing manifest.json cfg.processors[1].input_manifest_file = (data_dir / "pipeline_get_final_res" / "manifest.json").as_posix() + if "english/hifitts2/config_22khz" in config_path: + cfg.processors[1].input_manifest_file = (data_dir / "manifest_22khz.json").as_posix() + cfg.processors[1].error_file = (data_dir / 
"errors_22khz.json").as_posix() + + if "english/hifitts2/config_44khz" in config_path: + cfg.processors[1].input_manifest_file = (data_dir / "manifest_44khz.json").as_posix() + cfg.processors[1].error_file = (data_dir / "errors_44khz.json").as_posix() + + if "english/hifitts2/config_bandwidth" in config_path: + cfg.processors[0].audio_dir = (data_dir / "audio_22khz").as_posix() + cfg.processors[0].input_manifest_file = (data_dir / "manifest_22khz.json").as_posix() + run_processors(cfg) # additionally, let's test that final generated manifest matches the # reference file (ignoring the file paths and additional fields explicitly specified to ignore)