NVIDIA · Jorjeous · May 8, 2025 · Feb 26, 2025 · Apr 25, 2025 · Apr 28, 2025
diff --git a/.github/workflows/docker_pull.yml b/.github/workflows/docker_pull.yml
@@ -38,4 +38,9 @@ jobs:
         name: test-results
         path: |
           pytest.xml
-          coverage.xml
+          coverage.xml
+
+    # Run even when an earlier step failed, otherwise images/containers are left behind.
+    - name: Docker cleanup
+      if: always()
+      run: docker system prune -af
diff --git a/.github/workflows/docker_tts_sdp_test.yml b/.github/workflows/docker_tts_sdp_test.yml
@@ -0,0 +1,59 @@
+name: SDP TTS Docker Build and Test
+
+on:
+  pull_request:
+    branches: [ "main" ]
+  workflow_dispatch:
+
+permissions:
+  contents: read
+
+jobs:
+  build-and-test:
+    runs-on: ubuntu-latest
+
+    steps:
+    - name: Checkout code
+      uses: actions/checkout@v4
+
+    - name: Set up Docker Buildx
+      uses: docker/setup-buildx-action@v3
+
+    - name: Build Docker image
+      run: |
+        docker build -t sdp-test-image:${{ github.sha }} -f docker/Dockerfile.tts_sdp .
+
+    # Secrets are exported to the step environment and forwarded with `-e NAME`
+    # (no value), so their values never appear on the docker command line.
+    - name: Run sdp tts tests
+      env:
+        AWS_SECRET_KEY: ${{ secrets.AWS_SECRET_KEY }}
+        AWS_ACCESS_KEY: ${{ secrets.AWS_ACCESS_KEY }}
+        HF_SECRET_KEY: ${{ secrets.HF_SECRET_KEY }}
+        CLEAN_UP_TMP_PATH: "1"
+      run: |
+        docker run --rm \
+          -v "${{ github.workspace }}:/workspace" \
+          -w /workspace \
+          --shm-size=4g \
+          -e AWS_SECRET_KEY \
+          -e AWS_ACCESS_KEY \
+          -e HF_SECRET_KEY \
+          -e CLEAN_UP_TMP_PATH \
+          sdp-test-image:${{ github.sha }} \
+          bash -c "python -m pytest tests/test_tts_sdp_end_to_end.py -v --junitxml=pytest.xml"
+
+    # pytest.xml is written by --junitxml above into the bind-mounted workspace.
+    - name: Get test results
+      if: always()
+      uses: actions/upload-artifact@v4
+      with:
+        name: test-results
+        path: |
+          pytest.xml
+          coverage.xml
+
+    # Run even when the tests failed, otherwise the built image is left behind.
+    - name: Docker cleanup
+      if: always()
+      run: docker system prune -af
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -84,7 +84,14 @@ jobs:
         CLEAN_UP_TMP_PATH: 1
       run: |
         set -o pipefail # this will make sure next line returns non-0 exit code if tests fail
-        python -m pytest tests/ --junitxml=pytest.xml --cov-report=term-missing:skip-covered --cov=sdp --durations=30 -rs | tee pytest-coverage.txt
+        python -m pytest tests/ \
+          --junitxml=pytest.xml \
+          --ignore=tests/test_tts_sdp_end_to_end.py \
+          --cov-report=term-missing:skip-covered \
+          --cov=sdp \
+          --durations=30 \
+          -rs \
+          | tee pytest-coverage.txt
 
 
 # TODO: add some way to see if e2e tests were skipped

diff --git a/dataset_configs/arabic/readme.md b/dataset_configs/arabic/readme.md
@@ -2,4 +2,5 @@
 
 This folder is designated for Arabic speech processing configuration files will be added soon. It is associated with a forthcoming paper, which will detail the work done within this project.
 
-Note: This folder is a work in progress.
+
+Note: This folder is a work in progress.
diff --git a/dataset_configs/tts/ytc/config.yaml b/dataset_configs/tts/ytc/config.yaml
@@ -0,0 +1,114 @@
+documentation: |
+  TTS data processing pipeline
+  ############################
+
+  This pipeline processes YouTube Commons (YTC) data for text-to-speech (TTS) training.
+
+  The pipeline performs the following steps:
+
+  1. Creates initial manifest by resampling audio to 16kHz mono WAV format
+  2. Runs speaker diarization and overlap detection using pyannote
+  3. Splits long audio segments
+  4. Aligns text and audio using NeMo ASR models
+  5. Joins split audio metadata back together
+  6. Merges alignment and diarization information
+  7. Performs inverse text normalization
+  8. Calculates audio quality metrics using TorchSQUIM
+  9. Estimates audio bandwidth
+  10. Prepares TTS segments
+
+  Required inputs:
+
+  - input_manifest_file: Path to input manifest json file
+
+    - manifest must contain "audio_filepath" and "audio_item_id" fields
+    - example: ``{"audio_filepath": "path/to/raw/audio/file.wav", "audio_item_id": "some_unique_id"}``
+
+  - hf_token: HuggingFace token for pyannote access
+  - data_split: Data split name (train/dev/test)
+  - workspace_dir: Directory for intermediate files
+  - language_short: 2-letter language code
+  - nemo_path: Path to NeMo installation
+  - final_manifest: Path for final output manifest
+
+processors_to_run: all
+data_split: ???
+workspace_dir: /tmp
+language_short: ???
+input_manifest_file: ???
+final_manifest: ???
+# NOTE(review): not referenced by any processor below - confirm it is still needed
+nemo_path: ???
+resampled_audio_dir: /tmp/audio_resampled
+hf_token: ???
+# Forwarded as max_length (diarization) and suggested_max_len (splitting)
+max_segment_length: 40
+device: cuda
+
+processors:
+  # Step 1: initial manifest, audio resampled to 16 kHz mono WAV
+  - _target_: sdp.processors.datasets.ytc.create_initial_manifest.CreateInitialManifestYTC
+    input_manifest_file: ${input_manifest_file}
+    output_manifest_file: "${workspace_dir}/tts_processed/${data_split}_manifest_initial.json"
+    resampled_audio_dir: ${resampled_audio_dir}
+    input_format: "wav"
+    target_sample_rate: 16000
+    target_format: "wav"
+    target_nchannels: 1
+
+  # Step 2: speaker diarization and overlap detection (pyannote)
+  - _target_: sdp.processors.tts.pyannote.PyAnnoteDiarizationAndOverlapDetection
+    hf_token: ${hf_token}
+    input_manifest_file: "${workspace_dir}/tts_processed/${data_split}_manifest_initial.json"
+    output_manifest_file: "${workspace_dir}/tts_processed/${data_split}_manifest_diarized.json"
+    max_length: ${max_segment_length}
+    device: ${device}
+
+  # Step 3: split long audio segments
+  - _target_: sdp.processors.tts.split.SplitLongAudio
+    input_manifest_file: "${workspace_dir}/tts_processed/${data_split}_manifest_diarized.json"
+    output_manifest_file: "${workspace_dir}/tts_processed/${data_split}_manifest_split.json"
+    suggested_max_len: ${max_segment_length}
+    min_pause_len: 1
+
+  # Step 4: align text and audio with a NeMo ASR model
+  - _target_: sdp.processors.tts.nemo_asr_align.NeMoASRAligner
+    input_manifest_file: "${workspace_dir}/tts_processed/${data_split}_manifest_split.json"
+    output_manifest_file: "${workspace_dir}/tts_processed/${data_split}_manifest_aligned.json"
+    parakeet: true
+    ctc: false
+    batch_size: 16
+    device: ${device}
+
+  # Step 5: join split audio metadata back together
+  - _target_: sdp.processors.tts.split.JoinSplitAudioMetadata
+    input_manifest_file: "${workspace_dir}/tts_processed/${data_split}_manifest_aligned.json"
+    output_manifest_file: "${workspace_dir}/tts_processed/${data_split}_manifest_joined.json"
+
+  # Step 6: merge alignment and diarization information
+  - _target_: sdp.processors.tts.merge_alignment_diarization.MergeAlignmentDiarization
+    input_manifest_file: "${workspace_dir}/tts_processed/${data_split}_manifest_joined.json"
+    output_manifest_file: "${workspace_dir}/tts_processed/${data_split}_manifest_merged.json"
+
+  # Step 7: inverse text normalization
+  - _target_: sdp.processors.tts.text.InverseTextNormalizationProcessor
+    input_manifest_file: "${workspace_dir}/tts_processed/${data_split}_manifest_merged.json"
+    output_manifest_file: "${workspace_dir}/tts_processed/${data_split}_manifest_ITN.json"
+    language: ${language_short}
+
+  # Step 8: audio quality metrics (TorchSQUIM)
+  - _target_: sdp.processors.tts.metrics.TorchSquimObjectiveQualityMetricsProcessor
+    input_manifest_file: "${workspace_dir}/tts_processed/${data_split}_manifest_ITN.json"
+    output_manifest_file: "${workspace_dir}/tts_processed/${data_split}_manifest_squim.json"
+    device: ${device}
+
+  # Step 9: audio bandwidth estimation
+  - _target_: sdp.processors.tts.metrics.BandwidthEstimationProcessor
+    input_manifest_file: "${workspace_dir}/tts_processed/${data_split}_manifest_squim.json"
+    output_manifest_file: "${workspace_dir}/tts_processed/${data_split}_manifest_bandwidth.json"
+
+  # Step 10: prepare the final TTS segments
+  - _target_: sdp.processors.tts.prepare_tts_segments.PrepareTTSSegmentsProcessor
+    input_manifest_file: "${workspace_dir}/tts_processed/${data_split}_manifest_bandwidth.json"
+    output_manifest_file: "${final_manifest}"
+    terminal_punct_marks: ".!?"
diff --git a/docker/Dockerfile b/docker/Dockerfile
@@ -22,24 +22,13 @@ RUN apt-get update \
 RUN pip install --upgrade pip
 
-# Clone the NeMo SDP repository
-WORKDIR /src
-RUN git clone https://github.com/NVIDIA/NeMo-speech-data-processor.git
+# Copy the NeMo SDP repository from the build context (git metadata is not needed in the image)
+COPY . /src/NeMo-speech-data-processor
+RUN rm -rf /src/NeMo-speech-data-processor/.git
 
 WORKDIR /src/NeMo-speech-data-processor
 #need to install numpy before reqs, even thougth it present in reqs (cause it requred to install [python-sox], otherwise we face an error)
 RUN pip install numpy 
 RUN find requirements/ -name "*.txt" -exec pip install -r {} \;
-RUN wget https://github.com/state-spaces/mamba/releases/download/v2.2.2/mamba_ssm-2.2.2+cu118torch2.1cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
-RUN pip install mamba_ssm-2.2.2+cu118torch2.1cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
-
-
-# Update NeMo to allow timestamps
-RUN git clone https://github.com/NVIDIA/NeMo.git /src/NeMo/
-WORKDIR /src/NeMo
-RUN git reset --hard 0547550ad803fce1e4a019f92e9c59f4c902e7e0
-RUN ./reinstall.sh
-RUN pip install python-swiftclient ffmpeg-python
-
 # Set working directory back to NeMo-speech-data-processor
 WORKDIR /src/NeMo-speech-data-processor
 

diff --git a/docker/Dockerfile.tts_sdp b/docker/Dockerfile.tts_sdp
@@ -0,0 +1,45 @@
+FROM pytorch/pytorch:2.4.1-cuda12.1-cudnn9-devel
+
+ARG DEBIAN_FRONTEND=noninteractive
+
+ENV TZ=America/Los_Angeles
+
+# Install basics; remove the apt package lists in the same layer to keep the image small
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    build-essential \
+    bzip2 \
+    ca-certificates \
+    libsox-fmt-mp3 \
+    cmake \
+    curl \
+    ffmpeg \
+    g++ \
+    sox \
+    unzip \
+    vim \
+    wget \
+    && rm -rf /var/lib/apt/lists/*
+
+# Update pip
+RUN pip install --upgrade pip
+
+# Link all cudnn .so libraries for runtime
+RUN ln -s /opt/conda/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn*.h /usr/include/
+RUN mkdir -p /usr/local/cuda/lib64
+RUN ln -s /opt/conda/lib/python3.11/site-packages/nvidia/cudnn/lib/libcudnn*.so* /usr/local/cuda/lib64/
+ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
+
+# Copy NeMo SDP from the build context (git metadata is not needed in the image)
+WORKDIR /src
+COPY . /src/NeMo-speech-data-processor
+RUN rm -rf /src/NeMo-speech-data-processor/.git
+
+# Install requirements; the working directory stays at the SDP checkout afterwards
+WORKDIR /src/NeMo-speech-data-processor
+RUN pip install -r requirements/main.txt
+RUN pip install -r requirements/tts.txt
+RUN pip install flash-attn --no-build-isolation
+RUN pip install https://github.com/LahiLuk/YouTokenToMe/archive/master.zip
+# Requirement specs with extras are quoted so the shell never treats [...] as a glob
+RUN pip install megatron-core "transformer_engine[pytorch]"
+RUN pip install "nemo_toolkit[all]==2.1.0"
diff --git a/docs/src/conf.py b/docs/src/conf.py
@@ -28,10 +28,29 @@
 
 templates_path = ["_templates"]
 
 autodoc_mock_imports = [
+    # TTS pipeline
+    "pyannote.audio",
+    "pyannote.audio.pipelines.utils.hook",
+    "whisperx.audio",
+    "whisperx.vad",
+    "torch",
+    "torchaudio",
+    "nemo",
+    "nemo.collections",
+    "nemo.collections.asr",
+    "nemo_text_processing.inverse_text_normalization.inverse_normalize",
+    "librosa",
+    "numpy",
+    "tqdm",
     "soundfile",
+    "ndjson",
+    "boto3",
+    "webvtt_py",
+    "python_docx",
     "webvtt",
-    "docx"
+    "docx",
+    "pyannote",
 ]
 
 _skipped_autodoc_mock_imports = []

diff --git a/docs/src/sdp/api.rst b/docs/src/sdp/api.rst
@@ -116,6 +116,12 @@ HuggingFace Datasets
 .. autodata:: sdp.processors.CreateInitialManifestHuggingFace
    :annotation:
 
+YTC Datasets
+''''''''''''
+
+.. autodata:: sdp.processors.CreateInitialManifestYTC
+   :annotation:
+
 Lhotse processors
 #################
 
@@ -145,6 +151,21 @@ used in the downstream processing for additional enhancement or filtering.
 .. autodata:: sdp.processors.ASRTransformers
    :annotation:
 
+.. autodata:: sdp.processors.tts.pyannote.PyAnnoteDiarizationAndOverlapDetection
+   :annotation:
+
+.. autodata:: sdp.processors.tts.nemo_asr_align.NeMoASRAligner
+   :annotation:
+
+.. autodata:: sdp.processors.tts.text.InverseTextNormalizationProcessor
+   :annotation:
+
+.. autodata:: sdp.processors.tts.metrics.TorchSquimObjectiveQualityMetricsProcessor
+   :annotation:
+
+.. autodata:: sdp.processors.tts.metrics.BandwidthEstimationProcessor
+   :annotation:
+
 Text-only processors
 ####################
 
@@ -331,6 +352,18 @@ Miscellaneous
 .. autodata:: sdp.processors.CreateCombinedManifests
    :annotation:
 
+.. autodata:: sdp.processors.tts.split.SplitLongAudio
+   :annotation:
+
+.. autodata:: sdp.processors.tts.split.JoinSplitAudioMetadata
+   :annotation:
+
+.. autodata:: sdp.processors.tts.merge_alignment_diarization.MergeAlignmentDiarization
+   :annotation:
+
+.. autodata:: sdp.processors.tts.prepare_tts_segments.PrepareTTSSegmentsProcessor
+   :annotation:
+
 .. _sdp-base-classes:
 
 Base classes

diff --git a/docs/src/sdp/existing_configs.rst b/docs/src/sdp/existing_configs.rst
@@ -366,10 +366,22 @@ Armenian Toloka
    `config <https://github.com/NVIDIA/NeMo-speech-data-processor/blob/main/dataset_configs/armenian/toloka/pipeline_get_final_res.yaml>`__ |
    :doc:`documentation <config-docs/armenian/toloka/pipeline_get_final_res>`
 
 .. toctree::
    :hidden:
 
    config-docs/armenian/toloka/pipeline_start
    config-docs/armenian/toloka/pipeline_validate_answers
    config-docs/armenian/toloka/pipeline_get_final_res
 
+YouTube Commons (YTC)
+~~~~~~~~~~~~~~~~~~~~~
+
+**Dataset link:** https://huggingface.co/datasets/PleIAs/YouTube-Commons
+
+`config <https://github.com/NVIDIA/NeMo-speech-data-processor/blob/main/dataset_configs/tts/ytc/config.yaml>`__ |
+:doc:`documentation <config-docs/tts/ytc/config>`
+
+.. toctree::
+   :hidden:
+
+   config-docs/tts/ytc/config
diff --git a/requirements/tts.txt b/requirements/tts.txt
@@ -0,0 +1,9 @@
+# Extra dependencies for the TTS pipeline.
+# docker/Dockerfile.tts_sdp installs these right after requirements/main.txt.
+ndjson
+transformers
+accelerate
+torchaudio
+pyannote-audio
+ffmpeg-python
+whisperx==3.3.1
diff --git a/sdp/processors/__init__.py b/sdp/processors/__init__.py
@@ -64,6 +64,9 @@
 from sdp.processors.datasets.voxpopuli.normalize_from_non_pc_text import (
     NormalizeFromNonPCTextVoxpopuli,
 )
+from sdp.processors.datasets.ytc.create_initial_manifest import (
+    CreateInitialManifestYTC,
+)
 from sdp.processors.huggingface.speech_recognition import ASRTransformers
 from sdp.processors.huggingface.create_initial_manifest import CreateInitialManifestHuggingFace
 

diff --git a/sdp/processors/datasets/ytc/__init__.py b/sdp/processors/datasets/ytc/__init__.py
Original file line number	Diff line number	Diff line change
Expand Up		@@ -2,4 +2,5 @@

		This folder is designated for Arabic speech processing configuration files will be added soon. It is associated with a forthcoming paper, which will detail the work done within this project.

		Note: This folder is a work in progress.

		Note: This folder is a work in progress.