From 242d993318c8f2d2bcf0c9028d4fe1d08198607e Mon Sep 17 00:00:00 2001 From: Ryan Date: Fri, 13 Jun 2025 15:26:45 -0700 Subject: [PATCH 1/2] Validate chapter duration in HiFiTTS-2 download Signed-off-by: Ryan --- .../datasets/hifitts2/download_dataset.py | 36 +++++++++++++++---- 1 file changed, 29 insertions(+), 7 deletions(-) diff --git a/sdp/processors/datasets/hifitts2/download_dataset.py b/sdp/processors/datasets/hifitts2/download_dataset.py index 6e965e32..493fdf97 100644 --- a/sdp/processors/datasets/hifitts2/download_dataset.py +++ b/sdp/processors/datasets/hifitts2/download_dataset.py @@ -46,7 +46,7 @@ class DownloadHiFiTTS2(BaseParallelProcessor): Returns: Utterance files are stored under 'audio_dir' and chapter files are downloaded under 'chapter_dir'. - + If exit_on_error is False, then an output manifest will be saved with manifest entries that fail to downlaod, with error information stored under the 'error_code' and 'error_reason' fields. @@ -107,12 +107,11 @@ def process_dataset_entry(self, data_entry): try: urllib.request.urlretrieve(url=url, filename=chapter_path) break - except (urllib.error.HTTPError, urllib.error.URLError) as http_error: - error_msg = f"Encountered HTTP error when downloading {url}: {http_error}" + except Exception as ex: + error_msg = f"Encountered exception when downloading {url}: {ex}" logger.warning(error_msg) - error_code = getattr(http_error, "code", 0) - if (not error_code or str(error_code).startswith("5")) and i < self.num_retries: + if i < self.num_retries: logger.info(f"Retry {i} for url {url}") time.sleep(10) continue @@ -120,16 +119,39 @@ def process_dataset_entry(self, data_entry): if self.exit_on_error: raise RuntimeError(error_msg) + if isinstance(ex, urllib.error.URLError): + error_reason = ex.reason + else: + error_reason = repr(ex) + error_data = { "url": url, "chapter_filepath": chapter_filepath, - "error_code": error_code, - "error_reason": http_error.reason, + "error_reason": error_reason, "utterances": 
utterances, } return [DataEntry(data=error_data)] chapter_audio, sr = librosa.load(path=chapter_path, sr=self.sample_rate) + chapter_duration = librosa.get_duration(y=chapter_audio, sr=sr) + + original_duration = data_entry["duration"] + duration_diff = abs(chapter_duration - original_duration) + if duration_diff > 0.1: + error_msg = f"Duration mismatch for {url}: original duration={original_duration}; " \ + f"downloaded duration={round(chapter_duration, 2)}" + logger.warning(error_msg) + + if self.exit_on_error: + raise RuntimeError(error_msg) + + error_data = { + "url": url, + "chapter_filepath": chapter_filepath, + "error_reason": error_msg, + "utterances": utterances, + } + return [DataEntry(data=error_data)] for utt in utterances: audio_filepath = utt["audio_filepath"] From 96e1becf775057def91ac7887074c1d52228bbde Mon Sep 17 00:00:00 2001 From: George Zelenfroind Date: Tue, 17 Jun 2025 11:36:38 -0700 Subject: [PATCH 2/2] Update Dockerfile Signed-off-by: George Zelenfroind --- docker/Dockerfile | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 3dc68216..e224faf9 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -21,6 +21,9 @@ RUN apt-get update \ # Update pip RUN pip install --upgrade pip +# Install typing-extensions manually +RUN pip install typing-extensions + # Clone the NeMo SDP repository COPY . /src/NeMo-speech-data-processor RUN rm -rf /src/NeMo-speech-data-processor/.git @@ -34,4 +37,4 @@ RUN find requirements/ -name "*.txt" -exec pip install -r {} \; WORKDIR /src/NeMo-speech-data-processor # Set up entrypoint -CMD ["bash"] \ No newline at end of file +CMD ["bash"]