From 89c7400ddb00dd9b7325819b7190cce69dc20a00 Mon Sep 17 00:00:00 2001
From: Nune <ntadevosyan@nvidia.com>
Date: Wed, 14 May 2025 14:02:24 +0400
Subject: [PATCH 01/36] Ipl processors

Signed-off-by: Nune <ntadevosyan@nvidia.com>
---
 sdp/processors/IPL/__init__.py       |   0
 sdp/processors/IPL/ipl_processors.py | 338 +++++++++++++++++++++++++++
 sdp/processors/IPL/smth.py           | 258 ++++++++++++++++++++
 3 files changed, 596 insertions(+)
 create mode 100644 sdp/processors/IPL/__init__.py
 create mode 100644 sdp/processors/IPL/ipl_processors.py
 create mode 100644 sdp/processors/IPL/smth.py

diff --git a/sdp/processors/IPL/__init__.py b/sdp/processors/IPL/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/sdp/processors/IPL/ipl_processors.py b/sdp/processors/IPL/ipl_processors.py
new file mode 100644
index 00000000..6fed2702
--- /dev/null
+++ b/sdp/processors/IPL/ipl_processors.py
@@ -0,0 +1,338 @@
+# Standard library imports
+import os
+import subprocess
+from pathlib import Path
+from typing import Dict, List, Optional, Tuple
+
+# Third-party imports
+from omegaconf import DictConfig, OmegaConf, open_dict
+import logging
+import json
+# Local imports
+from sdp.processors.base_processor import BaseProcessor
+
+
+class TrainingCommandGenerator(BaseProcessor):
+    """
+    A processor that generates training commands for NeMo models with support for both local and cluster configurations.
+    Handles manifest file updates and tarred audio filepath management for training datasets.
+
+    Args:
+        training_config_local (str): Path to the local machine configuration file
+        training_config_cluster (str): Path to the cluster configuration file
+        training_script_path (str): Path to the training script relative to nemo_directory
+        nemo_directory (str): Base directory for NeMo framework
+        new_manifest_files (List[str], optional): New manifest files to add to the training configuration
+        new_tarred_audio_filepaths (List[str], optional): New tarred audio filepaths to add to the training configuration
+        **kwargs: Additional arguments passed to the parent BaseProcessor class
+    """
+
+    def __init__(
+        self,
+        training_config_local: str,      # Local machine config path
+        training_config_cluster: str,    # Cluster config path
+        training_script_path: str,       # Path to training script
+        nemo_directory: str,             # Base directory for NeMo
+        new_manifest_files: Optional[List[str]] = None,  # New manifest files to add
+        new_tarred_audio_filepaths: Optional[List[str]] = None,  # New tarred audio paths
+        **kwargs
+    ):
+        """Load the local config and keep the paths and options that process() reads."""
+        super().__init__(**kwargs)
+        # Only the local config is parsed here; the cluster config is stored as a path
+        self.training_config_local = OmegaConf.load(training_config_local)
+        self.training_config_cluster = training_config_cluster
+        self.training_script_path = os.path.join(nemo_directory, training_script_path)
+        self.nemo_directory = nemo_directory
+        self.new_manifest_files = new_manifest_files
+        self.new_tarred_audio_filepaths = new_tarred_audio_filepaths
+
+    def process(self) -> str:
+        """
+        Build the training command, adding dataset overrides when new manifests were given.
+
+        Without ``new_manifest_files`` the command only references the script and the
+        cluster config. Otherwise ``update_training_sets`` combines the new paths with the
+        ones in the local config and the result is passed on as overrides.
+
+        Returns:
+            str: The command produced by ``get_execution_script``
+        """
+        # Arguments shared by both variants of the command
+        script_kwargs = {
+            "cluster_script_path": self.training_script_path,
+            "local_config": self.training_config_local,
+            "cluster_config_path": self.training_config_cluster,
+        }
+
+        # New manifests supplied: merge them with the configured training sets
+        if self.new_manifest_files is not None:
+            manifest_override, tarred_override = self.update_training_sets(
+                config=self.training_config_local,
+                updated_manifest_filepaths=self.new_manifest_files,
+                updated_tarred_audio_filepaths=self.new_tarred_audio_filepaths,
+            )
+            script_kwargs["updated_manifest_filepaths"] = manifest_override
+            script_kwargs["updated_tarred_filepaths"] = tarred_override
+
+        return self.get_execution_script(**script_kwargs)
+
+    def get_execution_script(
+        self,
+        cluster_script_path: str,
+        local_config: DictConfig,
+        cluster_config_path: str,
+        updated_manifest_filepaths: Optional[str] = None,
+        updated_tarred_filepaths: Optional[str] = None
+    ) -> str:
+        """
+        Build the shell command that launches training and write it to the output manifest.
+
+        Args:
+            cluster_script_path (str): Path to the training script; the command cd's into its directory
+            local_config (DictConfig): Local config; only exp_manager.create_wandb_logger is read here
+            cluster_config_path (str): Config file path, split into --config-path and --config-name
+            updated_manifest_filepaths (str, optional): Value for the model.train_ds.manifest_filepath override
+            updated_tarred_filepaths (str, optional): Value for the model.train_ds.tarred_audio_filepaths override
+        Returns:
+            str: The command; it is also dumped as {"training_command": cmd} to self.output_manifest_file
+        Raises:
+            ValueError: If create_wandb_logger is enabled but no WANDB key is set in the environment
+        """
+        # The key is only checked for presence; it is not inserted into the command
+        wandb_key = os.environ.get("WANDB_API_KEY") or os.environ.get("WANDB") or os.environ.get("WANDB_KEY", "")
+        if not wandb_key:
+            logging.warning("WANDB key not found in environment variables. WANDB logging will not work.")
+            # A missing key is fatal only when the exp_manager config enables WANDB logging
+            if local_config.get('exp_manager', {}).get('create_wandb_logger', False):
+                raise ValueError(
+                    "WANDB key is required for logging but was not found in environment variables. "
+                    "Please set WANDB_API_KEY to enable WANDB logging."
+                )
+
+        # Base command: show GPU status, cd to the script's directory, run it with the cluster config
+        config_path = os.path.dirname(cluster_config_path)
+        config_name = os.path.basename(cluster_config_path)
+        cmd = (
+            "nvidia-smi && "
+            f"cd {os.path.dirname(cluster_script_path)} && "
+            f"python -u -B {os.path.basename(cluster_script_path)} "
+            f"--config-path {config_path} --config-name \"{config_name}\""
+        )
+
+        # Append the dataset overrides only when a non-empty value was passed
+        if updated_manifest_filepaths:
+            cmd += f" model.train_ds.manifest_filepath={updated_manifest_filepaths}"
+        if updated_tarred_filepaths:
+            cmd += f" model.train_ds.tarred_audio_filepaths={updated_tarred_filepaths}"
+        output_data = {"training_command": cmd}
+        with open(self.output_manifest_file, 'w') as f:
+            json.dump(output_data, f, indent=4)
+        return cmd
+    
+    def get_transcribed_names(self, manifest_filepaths: List[str], is_tarred: bool=False) -> List[List[str]]:
+        """
+        Map every manifest path to the path of its transcribed counterpart in the same directory.
+        The use case is for non AIStore datasets
+
+        Args:
+            manifest_filepaths (list of str): A list of manifest file paths.
+            is_tarred (bool): If True the new name is 'transcribed_' + the original filename,
+                otherwise it is always 'transcribed_manifest.json'.
+
+        Returns:
+            list of list of str: One single-item list per input path, holding the new path.
+        Example:
+            >>> manifest_filepaths = ["/path/to/manifest_1.json", "/path/to/manifest_2.json"]
+            >>> get_transcribed_names(manifest_filepaths, is_tarred=True)
+            [
+                ["/path/to/transcribed_manifest_1.json"],
+                ["/path/to/transcribed_manifest_2.json"]
+            ]
+            >>> get_transcribed_names(manifest_filepaths)
+            [
+                ["/path/to/transcribed_manifest.json"],
+                ["/path/to/transcribed_manifest.json"]
+            ]
+        """
+        def _transcribed_path(original_path: str) -> str:
+            # Keep the directory, swap only the filename
+            parent_dir, base_name = os.path.split(original_path)
+            if is_tarred:
+                return os.path.join(parent_dir, f"transcribed_{base_name}")
+            return os.path.join(parent_dir, "transcribed_manifest.json")
+
+        # Wrap each result in its own single-item list
+        return [[_transcribed_path(entry)] for entry in manifest_filepaths]
+
+    def update_training_sets(
+        self,
+        config: DictConfig,
+        updated_manifest_filepaths: List[str],
+        updated_tarred_audio_filepaths: Optional[List[str]] = None
+    ) -> Tuple[str, Optional[str]]:
+        """
+        Combine the pseudo-labeled manifests (and, for tarred data, the new tarred audio paths)
+        with the training paths already listed in ``config.model.train_ds``.
+
+        Args:
+            config (DictConfig): Training config; ``model.train_ds`` is read but not modified
+            updated_manifest_filepaths (List[str]): Manifest paths, renamed via ``get_transcribed_names``
+            updated_tarred_audio_filepaths (Optional[List[str]]): New tarred audio paths; required if is_tarred
+
+        Returns:
+            Tuple[str, Optional[str]]: ``str()`` of the merged manifest list and of the merged tarred list,
+                formatted for Omegaconf (", " replaced by ","); the second item is None without tarred paths
+
+        Raises:
+            ValueError: If the dataset is tarred but no new tarred audio paths were given
+        """
+        is_tarred = config.model.train_ds.get("is_tarred", False)
+        logging.debug("updated_manifest_filepaths %s", updated_manifest_filepaths)
+        updated_manifest_filepaths = self.get_transcribed_names(updated_manifest_filepaths, is_tarred=is_tarred)
+        manifest_filepath = config.model.train_ds.manifest_filepath
+        if updated_tarred_audio_filepaths:
+            updated_tarred_audio_filepaths = [[path] for path in updated_tarred_audio_filepaths]
+
+        if is_tarred:
+            if not updated_tarred_audio_filepaths:
+                # Without this guard the code below fails on None.append(...) / None += ...
+                raise ValueError("updated_tarred_audio_filepaths is required when train_ds.is_tarred is True")
+            tarred_audio_filepaths = config.model.train_ds.tarred_audio_filepaths
+            if isinstance(tarred_audio_filepaths, str):
+                updated_tarred_audio_filepaths.append([tarred_audio_filepaths])
+                updated_manifest_filepaths.append([manifest_filepath])
+            else:
+                updated_tarred_audio_filepaths += tarred_audio_filepaths
+                updated_manifest_filepaths += manifest_filepath
+        else:
+            use_lhotse = config.model.train_ds.get("use_lhotse", False)
+            if not use_lhotse:
+                # Without lhotse the new manifests are flattened to a plain list of paths
+                updated_manifest_filepaths = [item for sublist in updated_manifest_filepaths for item in sublist]
+            if isinstance(manifest_filepath, str):
+                updated_manifest_filepaths.append([manifest_filepath] if use_lhotse else manifest_filepath)
+            else:
+                updated_manifest_filepaths += manifest_filepath
+
+        return (
+            str(updated_manifest_filepaths).replace(", ", ","),
+            str(updated_tarred_audio_filepaths).replace(", ", ",") if updated_tarred_audio_filepaths else None,
+        )
+
+
+class InferenceCommandGenerator(BaseProcessor):
+    """
+    A processor that generates inference commands for pseudo-labeling.
+
+    Args:
+        nemo_directory (str): Base directory for NeMo framework
+        inference_config_paths (List[str]): Paths to the inference configuration files; one inference
+            command is generated per file
+        manifests (List[str]): Paths to the manifest files; their directories are passed as prediction dirs
+        p_cache (float): What part of pseudo-labels to update
+        num_gpus (int): Number of GPUs to use
+        is_tarred (bool): Whether the audio is tarred
+        first_run (bool): Whether this is the first run of pseudo-labeling
+        **kwargs: Additional arguments passed to the parent BaseProcessor class
+    """
+
+    def __init__(
+        self,
+        nemo_directory: str,
+        inference_config_paths: List[str],
+        manifests: List[str],
+        p_cache: float,
+        num_gpus: int,
+        is_tarred: bool = False,
+        first_run: bool = False,
+        **kwargs
+    ):
+        """Store the inference settings; remaining keyword arguments go to BaseProcessor."""
+        super().__init__(**kwargs)
+        # The inference script location is fixed relative to the NeMo checkout
+        self.inference_script_path = os.path.join(nemo_directory, "examples/asr/transcribe_speech_parallel.py")
+        self.inference_config_paths = inference_config_paths
+        self.nemo_directory = nemo_directory
+        self.first_run = first_run
+        self.manifests = manifests
+        self.p_cache = p_cache
+        self.num_gpus = num_gpus
+        self.is_tarred = is_tarred
+
+    def process(self):
+        """
+        Build the pseudo-labeling shell command and write it to the output manifest.
+
+        The command is a chain of steps joined with " && ". When ``self.first_run`` is set the
+        chain starts with a full pass:
+            1. inference with ``predict_ds.shuffle=False`` for every inference config,
+            2. ``write_transcribed_files.py --full_pass`` on the manifest directories,
+            3. ``update_inference_config.py`` with ``self.p_cache`` and ``self.num_gpus``.
+        Every run then ends with inference using ``predict_ds.shuffle=True`` followed by
+        ``write_transcribed_files.py`` without ``--full_pass``. ``--is_tarred`` is added to the
+        write steps when ``self.is_tarred`` is set.
+
+        The command is also dumped as ``{"inference_command": cmd}`` to ``self.output_manifest_file``.
+
+        Returns:
+            str: The constructed pseudo-labeling command.
+        """
+        prediction_directories_str = " ".join([os.path.dirname(path) for path in self.manifests])
+        inference_config_paths_str = " ".join(self.inference_config_paths)
+        write_transcription_path = os.path.join(self.nemo_directory, "scripts/pseudo_labeling/write_transcribed_files.py")
+        update_inference_config_path = os.path.join(self.nemo_directory, "scripts/pseudo_labeling/update_inference_config.py")
+        is_tarred_flag = " --is_tarred" if self.is_tarred else ""
+
+        # Collect the steps and join them once, so the command never starts with a dangling
+        # "&&" when the first-run steps are skipped.
+        steps = []
+        if self.first_run:
+            # First run only: full pass over the data, then update the inference configs
+            steps.append(self.get_pl_inference_command(self.inference_config_paths, shuffle=False))
+            steps.append(
+                f"python {write_transcription_path} "
+                f"--prediction_filepaths {prediction_directories_str} --full_pass{is_tarred_flag}"
+            )
+            steps.append(
+                f"python {update_inference_config_path} "
+                f"--inference_configs {inference_config_paths_str} --p_cache {self.p_cache} --num_gpus {self.num_gpus}"
+            )
+
+        # Steps that are part of every run
+        steps.append(self.get_pl_inference_command(self.inference_config_paths, shuffle=True))
+        steps.append(
+            f"python {write_transcription_path} "
+            f"--prediction_filepaths {prediction_directories_str}{is_tarred_flag}"
+        )
+        cmd = " && ".join(steps)
+
+        # Persist the command as JSON in the processor's output manifest
+        output_data = {"inference_command": cmd}
+        with open(self.output_manifest_file, 'w') as f:
+            json.dump(output_data, f, indent=4)
+
+        return cmd
+
+
+    def get_pl_inference_command(self, inference_configs, shuffle=None):
+        """
+        Chain one ``transcribe_speech_parallel.py`` invocation per inference config.
+
+        Args:
+            inference_configs (list): Configuration file paths, one command each.
+            shuffle (bool, optional): When not None, appended as ``predict_ds.shuffle=<value>``.
+
+        Returns:
+            str: The per-config commands joined with " && ".
+        """
+        # The override is identical for every config, so build it once
+        shuffle_override = "" if shuffle is None else f" predict_ds.shuffle={shuffle}"
+        per_config_commands = [
+            f"python {self.inference_script_path} "
+            f"--config-path {os.path.dirname(cfg)} --config-name {os.path.basename(cfg)}"
+            f"{shuffle_override}"
+            for cfg in inference_configs
+        ]
+        return " && ".join(per_config_commands)
+    
\ No newline at end of file
diff --git a/sdp/processors/IPL/smth.py b/sdp/processors/IPL/smth.py
new file mode 100644
index 00000000..b3d823c6
--- /dev/null
+++ b/sdp/processors/IPL/smth.py
@@ -0,0 +1,258 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import glob
+import json
+import os
+from filelock import FileLock
+from typing import List
+
+import torch.distributed as dist
+
+from nemo.utils import logging
+
+
+def create_transcribed_shard_manifests(prediction_filepaths: List[str]) -> List[str]:
+    """
+    Creates transcribed shard manifest files by processing predictions and organizing them by shard ID.
+
+    This function reads a `predictions_all.json` file from each given directory, organizes the data by
+    shard IDs, and writes the entries to separate shard manifest files. For each shard, the `pred_text`
+    field is updated as the main transcription (`text`), and the original transcription (`text`) is
+    stored as `orig_text`.
+
+    Args:
+        prediction_filepaths (List[str]): A list of file paths to directories containing
+            `predictions_all.json` files with prediction data, including shard IDs.
+
+    Returns:
+        List[str]: A list of file paths to the combined manifest files (`transcribed_manifest__OP_0..CL_.json`)
+        created for each directory.
+    """
+    all_manifest_filepaths = []
+    for prediction_filepath in prediction_filepaths:
+        max_shard_id = 0
+        shard_data = {}
+        full_path = os.path.join(prediction_filepath, "predictions_all.json")
+        with open(full_path, 'r') as f:
+            for line in f.readlines():
+                data_entry = json.loads(line)
+                shard_id = data_entry.get("shard_id")
+                if max_shard_id < shard_id:
+                    max_shard_id = shard_id
+                if shard_id not in shard_data:
+                    shard_data[shard_id] = []
+                shard_data[shard_id].append(data_entry)
+        for shard_id, entries in shard_data.items():
+            output_filename = os.path.join(prediction_filepath, f"transcribed_manifest_{shard_id}.json")
+            with open(output_filename, 'w') as f:
+                for data_entry in entries:
+                    if data_entry['audio_filepath'].endswith(".wav"):
+                        if 'text' in data_entry:
+                            data_entry['orig_text'] = data_entry.pop('text')
+                        data_entry['text'] = data_entry.pop('pred_text')
+                        json.dump(data_entry, f, ensure_ascii=False)
+                        f.write("\n")
+        shard_manifest_filepath = os.path.join(
+            prediction_filepath, f"transcribed_manifest__OP_0..{max_shard_id}_CL_.json"
+        )
+        all_manifest_filepaths.append(shard_manifest_filepath)
+    return all_manifest_filepaths
+
+
+def create_transcribed_manifests(prediction_filepaths: List[str]) -> List[str]:
+    """
+    Creates updated transcribed manifest files by processing predictions.
+
+    This function reads prediction files (`predictions_all.json`) from the provided directories,
+    updates the transcription data by renaming the `pred_text` field to `text`, and stores the
+    original `text` field as `orig_text`. The updated data is written to new transcribed manifest
+    files (`transcribed_manifest.json`) in each directory.
+
+    Args:
+        prediction_filepaths (List[str]): A list of file paths to directories containing
+            prediction files (`predictions_all.json`).
+
+    Returns:
+        List[str]: A list of file paths to the newly created transcribed manifest files
+        (`transcribed_manifest.json`).
+    """
+    all_manifest_filepaths = []
+    for prediction_filepath in prediction_filepaths:
+        prediction_name = os.path.join(prediction_filepath, "predictions_all.json")
+        transcripted_name = os.path.join(prediction_filepath, f"transcribed_manifest.json")
+
+        # Open and read the original predictions_all.json file
+        with open(transcripted_name, 'w', encoding='utf-8') as f:
+            with open(prediction_name, 'r', encoding='utf-8') as pred_f:
+
+                for line in pred_f.readlines():
+                    data_entry = json.loads(line)
+                    if 'text' in data_entry:
+                        data_entry['orig_text'] = data_entry.pop('text')
+                    data_entry['text'] = data_entry.pop('pred_text')
+                    json.dump(data_entry, f, ensure_ascii=False)
+                    f.write("\n")
+            # Append the path of the new manifest file to the list
+            all_manifest_filepaths.append(transcripted_name)
+
+    return all_manifest_filepaths
+
+
+def write_sampled_shard_transcriptions(manifest_filepaths: List[str]) -> List[List[str]]:
+    """
+    Updates transcriptions by merging predicted shard data and transcribed manifest data.
+    This function processes prediction and transcribed manifest files, merges them
+    by matching the shard_id and audio file paths. For each shard, the corresponding
+    data entries are written to a new file.
+    Args:
+        manifest_filepaths (List[str]): A list of file paths to directories containing
+            prediction and transcribed manifest files.
+    Returns:
+        List[List[str]]: A list of lists containing the file paths to the generated
+            transcribed shard manifest files.
+    """
+    all_manifest_filepaths = []
+
+    # Process each prediction directory
+    for prediction_filepath in manifest_filepaths:
+        predicted_shard_data = {}
+        # Collect entries from prediction files based on shard id
+        prediction_path = os.path.join(prediction_filepath, "predictions_all.json")
+        with open(prediction_path, 'r') as f:
+            for line in f:
+                data_entry = json.loads(line)
+                shard_id = data_entry.get("shard_id")
+                audio_filepath = data_entry['audio_filepath']
+                predicted_shard_data.setdefault(shard_id, {})[audio_filepath] = data_entry
+    max_shard_id = 0
+    for full_path in glob.glob(os.path.join(prediction_filepath, f"transcribed_manifest_[0-9]*.json")):
+        all_data_entries = []
+        with open(full_path, 'r') as f:
+            for line in f:
+                data_entry = json.loads(line)
+                shard_id = data_entry.get("shard_id")
+                max_shard_id = max(max_shard_id, shard_id)
+                all_data_entries.append(data_entry)
+        # Write the merged data to a new manifest file keeping new transcriptions
+        output_filename = os.path.join(prediction_filepath, f"transcribed_manifest_{shard_id}.json")
+        with open(output_filename, 'w') as f:
+            for data_entry in all_data_entries:
+                audio_filepath = data_entry['audio_filepath']
+                # Escape duplicated audio files that end with *dup
+                if audio_filepath.endswith(".wav"):
+                    if shard_id in predicted_shard_data and audio_filepath in predicted_shard_data[shard_id]:
+                        predicted_data_entry = predicted_shard_data[shard_id][audio_filepath]
+                        if 'text' in predicted_data_entry:
+                            predicted_data_entry['orig_text'] = predicted_data_entry.pop('text')
+                        if "pred_text" in predicted_data_entry:
+                            predicted_data_entry['text'] = predicted_data_entry.pop('pred_text')
+                        json.dump(predicted_data_entry, f, ensure_ascii=False)
+                    else:
+                        json.dump(data_entry, f, ensure_ascii=False)
+                    f.write("\n")
+
+    shard_manifest_filepath = os.path.join(prediction_filepath, f"transcribed_manifest__OP_0..{max_shard_id}_CL_.json")
+    all_manifest_filepaths.append([shard_manifest_filepath])
+
+    return all_manifest_filepaths
+
+
+def write_sampled_transcriptions(manifest_filepaths: List[str]) -> List[str]:
+    """
+    Updates transcriptions by merging predicted data with transcribed manifest data.
+
+    This function processes prediction and transcribed manifest files within given directories.
+    It matches audio file paths to update transcriptions with predictions, ensuring each audio file
+    is properly transcribed. The updated data is written to the transcribed manifest file.
+
+    Args:
+        manifest_filepaths (List[str]): A list of file paths to directories containing
+            the prediction file (`predictions_all.json`) and the transcribed manifest file
+            (`transcribed_manifest.json`).
+
+    Returns:
+        List[str]: A list of file paths to the updated transcribed manifest files.
+    """
+    all_manifest_filepaths = []
+    for prediction_filepath in manifest_filepaths:
+        predicted_data = {}
+        prediction_path = os.path.join(prediction_filepath, "predictions_all.json")
+        with open(prediction_path, 'r') as f:
+            for line in f:
+                data_entry = json.loads(line)
+                path = data_entry['audio_filepath']
+                predicted_data[path] = data_entry
+
+        full_path = os.path.join(prediction_filepath, f"transcribed_manifest.json")
+        all_data_entries = []
+        with open(full_path, 'r') as f:
+            for line in f:
+                data_entry = json.loads(line)
+                all_data_entries.append(data_entry)
+                
+        output_filename = os.path.join(prediction_filepath, f"transcribed_manifest.json")
+        with open(output_filename, 'w') as f:
+            for data_entry in all_data_entries:
+                audio_filepath = data_entry['audio_filepath']
+                if audio_filepath.endswith(".wav"):
+                    if audio_filepath in predicted_data:
+                        predicted_data_entry = predicted_data[audio_filepath]
+                        if 'text' in predicted_data_entry:
+                            predicted_data_entry['orig_text'] = predicted_data_entry.pop('text')
+                        predicted_data_entry['text'] = predicted_data_entry.pop('pred_text')
+                        json.dump(predicted_data_entry, f, ensure_ascii=False)
+                        f.write("\n")
+                    else:
+                        json.dump(data_entry, f, ensure_ascii=False)
+                        f.write("\n")
+        all_manifest_filepaths.append(output_filename)
+    return all_manifest_filepaths
+
+
+
+if __name__ == "__main__":
+    rank = int(os.environ.get("RANK", 0))  # Default to 0 if not set
+
+    parser = argparse.ArgumentParser(description="Script to create or write transcriptions")
+    parser.add_argument("--is_tarred", action="store_true", help="If true, processes tarred manifests")
+    parser.add_argument("--full_pass", action="store_true", help="If true, processes full pass manifests")
+    parser.add_argument(
+        "--prediction_filepaths",
+        type=str,
+        nargs='+',  # Accepts one or more values as a list
+        required=True,
+        help="Paths to one or more inference config YAML files."
+    )
+    
+    args = parser.parse_args()
+
+    lock_dir = os.path.dirname(args.prediction_filepaths[0])
+    lock_file = lock_dir + "/my_script.lock" 
+
+    with FileLock(lock_file):
+        if rank == 0:
+            if args.is_tarred:
+                result = ( 
+                    write_sampled_shard_transcriptions(args.prediction_filepaths)
+                    if not args.full_pass
+                    else create_transcribed_shard_manifests(args.prediction_filepaths)
+                )
+            else:
+                result = (
+                    write_sampled_transcriptions(args.prediction_filepaths)
+                    if not args.full_pass
+                    else create_transcribed_manifests(args.prediction_filepaths)
+                )
+    

From b9471e3abf499a9d4f9955553b488d2e7740c071 Mon Sep 17 00:00:00 2001
From: Nune <ntadevosyan@nvidia.com>
Date: Wed, 14 May 2025 14:03:00 +0400
Subject: [PATCH 02/36] remove

Signed-off-by: Nune <ntadevosyan@nvidia.com>
---
 sdp/processors/IPL/smth.py | 258 -------------------------------------
 1 file changed, 258 deletions(-)
 delete mode 100644 sdp/processors/IPL/smth.py

diff --git a/sdp/processors/IPL/smth.py b/sdp/processors/IPL/smth.py
deleted file mode 100644
index b3d823c6..00000000
--- a/sdp/processors/IPL/smth.py
+++ /dev/null
@@ -1,258 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import argparse
-import glob
-import json
-import os
-from filelock import FileLock
-from typing import List
-
-import torch.distributed as dist
-
-from nemo.utils import logging
-
-
-def create_transcribed_shard_manifests(prediction_filepaths: List[str]) -> List[str]:
-    """
-    Creates transcribed shard manifest files by processing predictions and organizing them by shard ID.
-
-    This function reads a `predictions_all.json` file from each given directory, organizes the data by
-    shard IDs, and writes the entries to separate shard manifest files. For each shard, the `pred_text`
-    field is updated as the main transcription (`text`), and the original transcription (`text`) is
-    stored as `orig_text`.
-
-    Args:
-        prediction_filepaths (List[str]): A list of file paths to directories containing
-            `predictions_all.json` files with prediction data, including shard IDs.
-
-    Returns:
-        List[str]: A list of file paths to the combined manifest files (`transcribed_manifest__OP_0..CL_.json`)
-        created for each directory.
-    """
-    all_manifest_filepaths = []
-    for prediction_filepath in prediction_filepaths:
-        max_shard_id = 0
-        shard_data = {}
-        full_path = os.path.join(prediction_filepath, "predictions_all.json")
-        with open(full_path, 'r') as f:
-            for line in f.readlines():
-                data_entry = json.loads(line)
-                shard_id = data_entry.get("shard_id")
-                if max_shard_id < shard_id:
-                    max_shard_id = shard_id
-                if shard_id not in shard_data:
-                    shard_data[shard_id] = []
-                shard_data[shard_id].append(data_entry)
-        for shard_id, entries in shard_data.items():
-            output_filename = os.path.join(prediction_filepath, f"transcribed_manifest_{shard_id}.json")
-            with open(output_filename, 'w') as f:
-                for data_entry in entries:
-                    if data_entry['audio_filepath'].endswith(".wav"):
-                        if 'text' in data_entry:
-                            data_entry['orig_text'] = data_entry.pop('text')
-                        data_entry['text'] = data_entry.pop('pred_text')
-                        json.dump(data_entry, f, ensure_ascii=False)
-                        f.write("\n")
-        shard_manifest_filepath = os.path.join(
-            prediction_filepath, f"transcribed_manifest__OP_0..{max_shard_id}_CL_.json"
-        )
-        all_manifest_filepaths.append(shard_manifest_filepath)
-    return all_manifest_filepaths
-
-
-def create_transcribed_manifests(prediction_filepaths: List[str]) -> List[str]:
-    """
-    Creates updated transcribed manifest files by processing predictions.
-
-    This function reads prediction files (`predictions_all.json`) from the provided directories,
-    updates the transcription data by renaming the `pred_text` field to `text`, and stores the
-    original `text` field as `orig_text`. The updated data is written to new transcribed manifest
-    files (`transcribed_manifest.json`) in each directory.
-
-    Args:
-        prediction_filepaths (List[str]): A list of file paths to directories containing
-            prediction files (`predictions_all.json`).
-
-    Returns:
-        List[str]: A list of file paths to the newly created transcribed manifest files
-        (`transcribed_manifest.json`).
-    """
-    all_manifest_filepaths = []
-    for prediction_filepath in prediction_filepaths:
-        prediction_name = os.path.join(prediction_filepath, "predictions_all.json")
-        transcripted_name = os.path.join(prediction_filepath, f"transcribed_manifest.json")
-
-        # Open and read the original predictions_all.json file
-        with open(transcripted_name, 'w', encoding='utf-8') as f:
-            with open(prediction_name, 'r', encoding='utf-8') as pred_f:
-
-                for line in pred_f.readlines():
-                    data_entry = json.loads(line)
-                    if 'text' in data_entry:
-                        data_entry['orig_text'] = data_entry.pop('text')
-                    data_entry['text'] = data_entry.pop('pred_text')
-                    json.dump(data_entry, f, ensure_ascii=False)
-                    f.write("\n")
-            # Append the path of the new manifest file to the list
-            all_manifest_filepaths.append(transcripted_name)
-
-    return all_manifest_filepaths
-
-
-def write_sampled_shard_transcriptions(manifest_filepaths: List[str]) -> List[List[str]]:
-    """
-    Updates transcriptions by merging predicted shard data and transcribed manifest data.
-    This function processes prediction and transcribed manifest files, merges them
-    by matching the shard_id and audio file paths. For each shard, the corresponding
-    data entries are written to a new file.
-    Args:
-        manifest_filepaths (List[str]): A list of file paths to directories containing
-            prediction and transcribed manifest files.
-    Returns:
-        List[List[str]]: A list of lists containing the file paths to the generated
-            transcribed shard manifest files.
-    """
-    all_manifest_filepaths = []
-
-    # Process each prediction directory
-    for prediction_filepath in manifest_filepaths:
-        predicted_shard_data = {}
-        # Collect entries from prediction files based on shard id
-        prediction_path = os.path.join(prediction_filepath, "predictions_all.json")
-        with open(prediction_path, 'r') as f:
-            for line in f:
-                data_entry = json.loads(line)
-                shard_id = data_entry.get("shard_id")
-                audio_filepath = data_entry['audio_filepath']
-                predicted_shard_data.setdefault(shard_id, {})[audio_filepath] = data_entry
-    max_shard_id = 0
-    for full_path in glob.glob(os.path.join(prediction_filepath, f"transcribed_manifest_[0-9]*.json")):
-        all_data_entries = []
-        with open(full_path, 'r') as f:
-            for line in f:
-                data_entry = json.loads(line)
-                shard_id = data_entry.get("shard_id")
-                max_shard_id = max(max_shard_id, shard_id)
-                all_data_entries.append(data_entry)
-        # Write the merged data to a new manifest file keeping new transcriptions
-        output_filename = os.path.join(prediction_filepath, f"transcribed_manifest_{shard_id}.json")
-        with open(output_filename, 'w') as f:
-            for data_entry in all_data_entries:
-                audio_filepath = data_entry['audio_filepath']
-                # Escape duplicated audio files that end with *dup
-                if audio_filepath.endswith(".wav"):
-                    if shard_id in predicted_shard_data and audio_filepath in predicted_shard_data[shard_id]:
-                        predicted_data_entry = predicted_shard_data[shard_id][audio_filepath]
-                        if 'text' in predicted_data_entry:
-                            predicted_data_entry['orig_text'] = predicted_data_entry.pop('text')
-                        if "pred_text" in predicted_data_entry:
-                            predicted_data_entry['text'] = predicted_data_entry.pop('pred_text')
-                        json.dump(predicted_data_entry, f, ensure_ascii=False)
-                    else:
-                        json.dump(data_entry, f, ensure_ascii=False)
-                    f.write("\n")
-
-    shard_manifest_filepath = os.path.join(prediction_filepath, f"transcribed_manifest__OP_0..{max_shard_id}_CL_.json")
-    all_manifest_filepaths.append([shard_manifest_filepath])
-
-    return all_manifest_filepaths
-
-
-def write_sampled_transcriptions(manifest_filepaths: List[str]) -> List[str]:
-    """
-    Updates transcriptions by merging predicted data with transcribed manifest data.
-
-    This function processes prediction and transcribed manifest files within given directories.
-    It matches audio file paths to update transcriptions with predictions, ensuring each audio file
-    is properly transcribed. The updated data is written to the transcribed manifest file.
-
-    Args:
-        manifest_filepaths (List[str]): A list of file paths to directories containing
-            the prediction file (`predictions_all.json`) and the transcribed manifest file
-            (`transcribed_manifest.json`).
-
-    Returns:
-        List[str]: A list of file paths to the updated transcribed manifest files.
-    """
-    all_manifest_filepaths = []
-    for prediction_filepath in manifest_filepaths:
-        predicted_data = {}
-        prediction_path = os.path.join(prediction_filepath, "predictions_all.json")
-        with open(prediction_path, 'r') as f:
-            for line in f:
-                data_entry = json.loads(line)
-                path = data_entry['audio_filepath']
-                predicted_data[path] = data_entry
-
-        full_path = os.path.join(prediction_filepath, f"transcribed_manifest.json")
-        all_data_entries = []
-        with open(full_path, 'r') as f:
-            for line in f:
-                data_entry = json.loads(line)
-                all_data_entries.append(data_entry)
-                
-        output_filename = os.path.join(prediction_filepath, f"transcribed_manifest.json")
-        with open(output_filename, 'w') as f:
-            for data_entry in all_data_entries:
-                audio_filepath = data_entry['audio_filepath']
-                if audio_filepath.endswith(".wav"):
-                    if audio_filepath in predicted_data:
-                        predicted_data_entry = predicted_data[audio_filepath]
-                        if 'text' in predicted_data_entry:
-                            predicted_data_entry['orig_text'] = predicted_data_entry.pop('text')
-                        predicted_data_entry['text'] = predicted_data_entry.pop('pred_text')
-                        json.dump(predicted_data_entry, f, ensure_ascii=False)
-                        f.write("\n")
-                    else:
-                        json.dump(data_entry, f, ensure_ascii=False)
-                        f.write("\n")
-        all_manifest_filepaths.append(output_filename)
-    return all_manifest_filepaths
-
-
-
-if __name__ == "__main__":
-    rank = int(os.environ.get("RANK", 0))  # Default to 0 if not set
-
-    parser = argparse.ArgumentParser(description="Script to create or write transcriptions")
-    parser.add_argument("--is_tarred", action="store_true", help="If true, processes tarred manifests")
-    parser.add_argument("--full_pass", action="store_true", help="If true, processes full pass manifests")
-    parser.add_argument(
-        "--prediction_filepaths",
-        type=str,
-        nargs='+',  # Accepts one or more values as a list
-        required=True,
-        help="Paths to one or more inference config YAML files."
-    )
-    
-    args = parser.parse_args()
-
-    lock_dir = os.path.dirname(args.prediction_filepaths[0])
-    lock_file = lock_dir + "/my_script.lock" 
-
-    with FileLock(lock_file):
-        if rank == 0:
-            if args.is_tarred:
-                result = ( 
-                    write_sampled_shard_transcriptions(args.prediction_filepaths)
-                    if not args.full_pass
-                    else create_transcribed_shard_manifests(args.prediction_filepaths)
-                )
-            else:
-                result = (
-                    write_sampled_transcriptions(args.prediction_filepaths)
-                    if not args.full_pass
-                    else create_transcribed_manifests(args.prediction_filepaths)
-                )
-    

From b4dc91c5dbcd24abd977741f7d1179d34e801386 Mon Sep 17 00:00:00 2001
From: Nune <ntadevosyan@nvidia.com>
Date: Thu, 15 May 2025 15:59:24 +0400
Subject: [PATCH 03/36] some commits

Signed-off-by: Nune <ntadevosyan@nvidia.com>
---
 1                                       |    4 +
 inference_output_manifest_filepath.json |    3 +
 run_ipl.py                              |  187 ++++
 run_ipl.yaml                            |    5 +
 run_pt_mcv.yaml                         |   57 ++
 run_pt_mcv_cs_you.yaml                  |   64 ++
 sdp/processors/IPL/ipl_processors.py    |   45 +-
 sdp/processors/__init__.py              |    7 +
 sdp/processors/nemo/ipl_command.py      |  184 ++++
 sdp/processors/nemo/ipl_training.py     |   13 +
 sdp/processors/nemo/ipl_utils.py        |  330 ++++++
 sdp/processors/nemo/nemo_run_ipl.py     |  386 +++++++
 sdp/utils/ipl_utils.py                  |  142 +++
 sdp/utils/nemo_run_utils.py             |  406 ++++++++
 sdp/utils/skills_utils.py               | 1226 +++++++++++++++++++++++
 15 files changed, 3031 insertions(+), 28 deletions(-)
 create mode 100644 1
 create mode 100644 inference_output_manifest_filepath.json
 create mode 100644 run_ipl.py
 create mode 100644 run_ipl.yaml
 create mode 100644 run_pt_mcv.yaml
 create mode 100644 run_pt_mcv_cs_you.yaml
 create mode 100644 sdp/processors/nemo/ipl_command.py
 create mode 100644 sdp/processors/nemo/ipl_training.py
 create mode 100644 sdp/processors/nemo/ipl_utils.py
 create mode 100644 sdp/processors/nemo/nemo_run_ipl.py
 create mode 100644 sdp/utils/ipl_utils.py
 create mode 100644 sdp/utils/nemo_run_utils.py
 create mode 100644 sdp/utils/skills_utils.py

diff --git a/1 b/1
new file mode 100644
index 00000000..94f612ba
--- /dev/null
+++ b/1
@@ -0,0 +1,4 @@
+script: asr_hybrid_transducer_ctc/speech_to_text_hybrid_rnnt_ctc_bpe.py 
+script_config: /home/ntadevosyan/code/fork_ipl/NeMo/examples/asr/conf/update_script_config.yaml 
+inference_config: /home/ntadevosyan/code/nemo_run_ipl_PR/NeMo/examples/asr/inference_config_local_non_tarred.yaml 
+nemo_path: /home/ntadevosyan/code/pr_iplmixin/NeMo/examples/asr/
diff --git a/inference_output_manifest_filepath.json b/inference_output_manifest_filepath.json
new file mode 100644
index 00000000..df932017
--- /dev/null
+++ b/inference_output_manifest_filepath.json
@@ -0,0 +1,3 @@
+{
+    "inference_command": " && python /workspace/nemo/examples/asr/transcribe_speech_parallel.py --config-path /lustre/fsw/portfolios/convai/users/ntadevosyan/results/smth/configs --config-name modified_config_0.yaml predict_ds.shuffle=True && python /workspace/nemo/examples/asr/transcribe_speech_parallel.py --config-path /lustre/fsw/portfolios/convai/users/ntadevosyan/results/smth/configs --config-name modified_config_1.yaml predict_ds.shuffle=True && python /workspace/nemo/examples/asr/transcribe_speech_parallel.py --config-path /lustre/fsw/portfolios/convai/users/ntadevosyan/results/smth/configs --config-name modified_config_2.yaml predict_ds.shuffle=True && python /workspace/nemo/examples/asr/transcribe_speech_parallel.py --config-path /lustre/fsw/portfolios/convai/users/ntadevosyan/results/smth/configs --config-name modified_config_3.yaml predict_ds.shuffle=True && python /workspace/nemo/scripts/pseudo_labeling/write_transcribed_files.py --prediction_filepaths /lustre/fsw/portfolios/convai/users/ntadevosyan/draco_data/datasets/pt/tarred_unlabeled_youtube_2/bucket1/sharded_manifests /lustre/fsw/portfolios/convai/users/ntadevosyan/draco_data/datasets/pt/tarred_unlabeled_youtube_2/bucket2/sharded_manifests /lustre/fsw/portfolios/convai/users/ntadevosyan/draco_data/datasets/pt/tarred_unlabeled_youtube_2/bucket3/sharded_manifests /lustre/fsw/portfolios/convai/users/ntadevosyan/draco_data/datasets/pt/tarred_unlabeled_youtube_2/bucket4/sharded_manifests  --is_tarred"
+}
\ No newline at end of file
diff --git a/run_ipl.py b/run_ipl.py
new file mode 100644
index 00000000..f0512c01
--- /dev/null
+++ b/run_ipl.py
@@ -0,0 +1,187 @@
+import copy
+import glob
+import os
+import subprocess
+import sys
+from pathlib import Path
+from typing import Any, Dict
+import torch
+from typing import List, Optional, Tuple, Union
+from omegaconf import OmegaConf, open_dict
+#import sdp.processors.nemo.ipl_utils as ipl_utils
+#from nemo.core.config import hydra_runner
+from sdp.processors.IPL.ipl_processors import TrainingCommandGenerator
+
+# def check_training_finished(log_dir):
+#     """
+#     Searches the logs to see if Lightning finished training.
+#     Parameters:
+#         log_dir (str): Directory where logs are stored.
+#     """
+#     print(f"************************************************")
+#     print(f"************************************************")
+
+#     if not os.path.exists(log_dir):
+#         print(f"Log directory '{log_dir}' does not exist.")
+#         return
+#     print(f"")
+#     log_pattern = os.path.join(log_dir, f"lightning_logs.txt")
+#     command = f"grep -ri '`Trainer.fit` stopped:' {log_pattern}"
+
+#     result = subprocess.run(command, shell=True, capture_output=True, text=True)
+#     if result.stdout:
+#         print("Stopping reasons found:")
+#         print(result.stdout)
+#         return True
+#     else:
+#         print("No stopping reasons found in the logs.")
+#         return False
+    
+# def get_command_for_inference(
+#     inference_config: str, inference_config_dir: Union[str, Path], p_cache: float, checkpoint: str, nemo_path: str
+# ) -> Tuple[str, List[str], List[str]]:
+#     """
+#     Generates the command string for running speech inference with transcribe_speech_parallel.
+#     Args:
+#         inference_config (str): Path to the base inference configuration file.
+#         inference_config_dir (Union[str, Path]): Directory to store temporary modified configurations.
+#         p_cache (float): Proportion of the dataset to be cached for pseudo-labeling.
+#         checkpoint (str): Path to the model checkpoint to use for inference.
+#     Returns:
+#         Tuple[str, List[str], List[str]]:
+#             - The command string to execute inference for all specified manifests.
+#             - List of output directories corresponding to each manifest.
+#             - List of completed full pass transcribed manifest paths, if any.
+#     """
+#     """"""
+    
+#     manifests, tarr_audio_files = ipl_utils.separate_multiple_transcriptions(inference_config)
+#     num_gpus = torch.cuda.device_count()
+#     output_dirs = []
+#     cmd = ""
+#     for i in range(len(manifests)):
+#         print()
+#         print(f"manifests  {manifests[i]}")
+#         output_dir = os.path.dirname(manifests[i])
+#         output_dirs.append(output_dir)
+#         print(f"output_dir {output_dir}")
+#         base_cfg = OmegaConf.load(inference_config)
+#         print(f"inference_config_dir {inference_config_dir}")
+#         print()
+#         temp_config_dir = Path(str(inference_config_dir) + "/temp_configs").absolute()
+#         os.makedirs(temp_config_dir, exist_ok=True)
+#         modified_cfg = copy.deepcopy(base_cfg)
+
+#         # Check if we need to run inference on the whole set or update part of it
+#         full_pass_done = glob.glob(os.path.join(output_dir, 'transcribed_manifest*'))
+#         if full_pass_done:
+#             number_of_files = ipl_utils.count_files_for_pseudo_labeling(manifests[i], bool(tarr_audio_files))
+#             limit_predict_batches = int((number_of_files * p_cache) / (modified_cfg.predict_ds.batch_size * num_gpus))
+#             OmegaConf.update(modified_cfg, "trainer.limit_predict_batches", limit_predict_batches)
+
+#         # Point the config at this manifest's output dir, inputs and checkpoint
+#         OmegaConf.update(modified_cfg, "output_path", output_dir)
+#         OmegaConf.update(modified_cfg, "predict_ds.manifest_filepath", manifests[i])
+#         if tarr_audio_files:
+#             OmegaConf.update(modified_cfg, "predict_ds.tarred_audio_filepaths", tarr_audio_files[i])
+#         OmegaConf.update(modified_cfg, "model", checkpoint)
+
+#         temp_config_file = os.path.join(temp_config_dir, f"modified_config_{i}.yaml")
+#         OmegaConf.save(modified_cfg, temp_config_file)
+#         trancribe_script = nemo_path + "/" + "transcribe_speech_parallel.py"
+#         cmd += f"python {trancribe_script} --config-path {temp_config_dir} --config-name modified_config_{i}.yaml && "
+
+#     # Remove trailing '&&' from the final command string
+#     cmd = cmd.rstrip(" &&")
+
+#     print(f"Inference command: {cmd}")
+#     return cmd, output_dirs, full_pass_done
+
+
+# def merge_configs(script_config_path, run_config):
+#     # Load the configurations
+#     script_config = OmegaConf.load(script_config_path)
+
+#     print(run_config)
+
+#     # Keep track of the original keys in script_config
+#     original_script_keys = set(script_config.keys())
+
+#     # Merge run_config into script_config; keys not originally in script_config are dropped below
+#     result = OmegaConf.merge(script_config, run_config)
+
+#     with open_dict(result):
+#         for k in run_config.keys():
+#             if k in result and k not in original_script_keys:
+#                 del result[k]
+
+#     def check_missing_values(cfg):
+#         if hasattr(cfg, 'items'):
+#             for k, v in cfg.items():
+#                 if hasattr(v, 'items'):
+#                     check_missing_values(v)
+#                 elif v == '???':
+#                     raise ValueError(f"Missing value for key {k} in the config file")
+
+#     check_missing_values(result)
+#     result.exp_manager.resume_if_exists = True
+#     return result
+
+
+# def get_execution_script(cluster_script_path: str, config_name: str, config_path: str, nemo_path: str) -> str:
+#     """
+#     Constructs a command string to execute a training with the specified configuration.
+#     Args:
+#         cluster_script_path (str): Path to the cluster script to be executed.
+#         config_name (str): Name of the configuration file or object to be passed as a parameter.
+#         config_path (str): Path to the directory where the configuration resides.
+#     Returns:
+#         str: A formatted command string ready for execution.
+#     """
+#     # Create the command to run the script
+#     cluster_script_path = nemo_path + "/" + cluster_script_path
+#     cmd = """
+#         python {cluster_script_path} --config-path {config_path} --config-name "{config_name}" 
+#     """
+#     print("in get_execution_script")
+#     print(f"cluster_script_path {cluster_script_path}")
+#     format_dict = dict(
+#         cluster_script_path=cluster_script_path,
+#         config_path=config_path,
+#         config_name=config_name,
+#     )
+#     cmd = cmd.format(**format_dict)
+#     print(f"format cmd {cmd}")
+
+#     return cmd
+
+
+# def find_checkpoint_dir(base_path):
+#     """
+#     Find the 'checkpoints' folder in the directory structure.
+#     Parameters:
+#         base_path (str): The base directory path to search from.
+#     """
+#     for root, dirs, files in os.walk(base_path):
+#         for dir_name in dirs:
+#             if dir_name == "checkpoints":
+#                 return os.path.join(root, dir_name), root
+#     return None, None
+
+
+def main():
+    """Build a TrainingCommandGenerator from the sample paths below and print the command it returns."""
+    config = {
+        "training_config_local": "/home/ntadevosyan/code/canary_ngpt/NeMo/ngpt_rnnt_bpe.yaml",
+        "training_config_cluster": "path/to/your/cluster/config.yaml",
+        "training_script_path": "path/to/training/script.py",
+        "nemo_directory": "path/to/nemo/directory",
+        "output_manifest_file": "path/to/output/manifest.json",
+        "new_manifest_files": None,  # or list of manifest files if you have them
+        "new_tarred_audio_filepaths": None  # or list of tarred audio paths if you have them
+    }
+    cmd = TrainingCommandGenerator(**config).process()  # process() has no `param` keyword; passing it raised TypeError
+    print("Generated command:", cmd)
+
+if __name__ == '__main__':
+    main()
diff --git a/run_ipl.yaml b/run_ipl.yaml
new file mode 100644
index 00000000..eaff04ca
--- /dev/null
+++ b/run_ipl.yaml
@@ -0,0 +1,5 @@
+script: asr_hybrid_transducer_ctc/speech_to_text_hybrid_rnnt_ctc_bpe.py 
+num_epochs: 2
+script_config: /home/ntadevosyan/code/fork_ipl/NeMo/examples/asr/conf/update_script_config.yaml 
+inference_config: /home/ntadevosyan/code/nemo_run_ipl_PR/NeMo/examples/asr/inference_config_local_non_tarred.yaml 
+nemo_path: /home/ntadevosyan/code/pr_iplmixin/NeMo/examples/asr/
diff --git a/run_pt_mcv.yaml b/run_pt_mcv.yaml
new file mode 100644
index 00000000..1d299241
--- /dev/null
+++ b/run_pt_mcv.yaml
@@ -0,0 +1,57 @@
+# The script to be run.
+script: "examples/asr/asr_hybrid_transducer_ctc/speech_to_text_hybrid_rnnt_ctc_bpe.py"
+script_config: "/home/ntadevosyan/code/nemo_run_ipl_PR/NeMo/examples/asr/conf/mcv_scratch_cs_you.yaml"
+
+exp_name: null  # populated by exp_manager.name if not provided
+results_dir: '/lustre/fsw/portfolios/convai/users/ntadevosyan/results/smth'  # Where to store the results of the run
+
+# Optional arguments
+num_runs: 1
+num_tasks_per_node: 1
+num_gpus: 1
+max_runtime: "00:03:45:00"
+
+########################################################################################################################
+
+executor: slurm
+
+USER: ntadevosyan
+ssh_tunnel:
+  host: draco-oci-login-01.draco-oci-iad.nvidia.com
+  # ------------------------------- Fill this up! -------------------------------
+  user: "${USER}"  # your username; or resolved from ${USER} environment variable ; or can be null which resolved from ${USER} environment variable
+  job_dir: "/lustre/fsw/portfolios/convai/users/${USER}/nemo-run/"
+  identity: "${NEMO_OCI_IAD_SSH_IDENTITY}"
+  # -----------------------------------------------------------------------------
+
+account: convai_convaird_nemo-speech
+partition: batch_block1,batch_block3,batch_block4
+job_name_prefix: "convai_convaird_nemo-speech-pt"
+
+containers:
+  # asr: /lustre/fsw/portfolios/llmservice/users/kpuvvada/local_containers/nemo_dev_20240717_aistore.sqsh
+  asr: /lustre/fsw/portfolios/llmservice/users/pzelasko/containers/nemo-nightly-24jul24-oomptimizer.sqsh
+
+env_vars:
+  - 'TOKENIZERS_PARALLELISM=false'
+  - 'AIS_ENDPOINT="http://asr.iad.oci.aistore.nvidia.com:51080"'
+  - 'LHOTSE_AUDIO_DURATION_MISMATCH_TOLERANCE=0.3'
+  - 'TORCH_CUDNN_V8_API_ENABLED=1'
+  - 'PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True'
+  - 'HYDRA_FULL_ERROR=1'
+
+required_env_vars:
+  - 'HF_TOKEN'
+  - 'WANDB_KEY'
+
+mounts:
+  # Replace with your own paths in your cluster config
+  - /lustre/fsw:/lustre/fsw
+  - /lustre/fsw/portfolios/llmservice/projects/llmservice_nemo_speechlm/data:/data
+  #- /lustre/fsw/portfolios/convai/users/ntadevosyan:/asr_checkpoints
+  - /lustre/fsw/portfolios/convai/users/ntadevosyan:/lustre/fsw/portfolios/convai/users/ntadevosyan
+
+timeouts:
+  batch_block1,batch_block3,batch_block4: 04:00:00
+  interactive: 04:00:00
+  interactive_singlenode: 04:00:00
diff --git a/run_pt_mcv_cs_you.yaml b/run_pt_mcv_cs_you.yaml
new file mode 100644
index 00000000..9614a1da
--- /dev/null
+++ b/run_pt_mcv_cs_you.yaml
@@ -0,0 +1,64 @@
+# The script to be run.
+script: "examples/asr/asr_hybrid_transducer_ctc/speech_to_text_hybrid_rnnt_ctc_bpe.py"
+script_config: "/home/ntadevosyan/code/nemo_run_ipl_PR/NeMo/examples/asr/conf/mcv_scratch_cs_you.yaml"
+
+exp_name: null  # populated by exp_manager.name if not provided
+results_dir: '/lustre/fsw/portfolios/convai/users/ntadevosyan/results/smth'  # Where to store the results of the run
+nemo_directory: "/workspace/nemo"
+
+# Optional arguments
+num_runs: 6
+num_gpus: 8
+num_tasks_per_node: 8
+max_runtime: "00:03:45:00"
+
+########################################################################################################################
+
+executor: slurm
+ipl_training:
+  inference_config: inference_config_cs_you.yaml
+  p_cache: 0.2
+  num_ipl_epochs: 100
+  prefix: mcv_you_3
+
+USER: ntadevosyan
+
+ssh_tunnel:
+  host: cs-oci-ord-login-01.nvidia.com
+  # ------------------------------- Fill this up! -------------------------------
+  user: "${USER}"  # your username; or resolved from ${USER} environment variable ; or can be null which resolved from ${USER} environment variable
+  job_dir: "/lustre/fsw/portfolios/convai/users/${USER}/nemo-run/"
+  identity: ""
+  # -----------------------------------------------------------------------------
+
+account: convai_convaird_nemo-speech
+partition: polar,polar3 
+job_name_prefix: "convai_convaird_nemo-speech-pt"
+
+containers:
+  # asr: /lustre/fsw/portfolios/llmservice/users/kpuvvada/local_containers/nemo_dev_20240717_aistore.sqsh
+  asr: nvcr.io/nvidian/ac-aiapps/nemo_ntad:ipl
+
+env_vars:
+  - 'TOKENIZERS_PARALLELISM=false'
+  - 'AIS_ENDPOINT="http://asr.iad.oci.aistore.nvidia.com:51080"'
+  - 'LHOTSE_AUDIO_DURATION_MISMATCH_TOLERANCE=0.3'
+  - 'TORCH_CUDNN_V8_API_ENABLED=1'
+  - 'PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True'
+  - 'HYDRA_FULL_ERROR=1'
+
+required_env_vars:
+  - 'HF_TOKEN'
+  - 'WANDB_KEY'  # name only; supply the value via the environment, never commit it to the repo
+
+mounts:
+  # Replace with your own paths in your cluster config
+  - /lustre/fsw/:/lustre/fsw/
+  #- /lustre/fsw/portfolios/llmservice/projects/llmservice_nemo_speechlm/data:/data
+  #- /lustre/fsw/portfolios/convai/users/ntadevosyan:/asr_checkpoints
+  - /lustre/fsw/portfolios/convai/users/ntadevosyan:/lustre/fsw/portfolios/convai/users/ntadevosyan
+
+timeouts:
+  polar,polar3: 04:00:00
+  interactive: 04:00:00
+  interactive_singlenode: 04:00:00
diff --git a/sdp/processors/IPL/ipl_processors.py b/sdp/processors/IPL/ipl_processors.py
index 6fed2702..b8373f01 100644
--- a/sdp/processors/IPL/ipl_processors.py
+++ b/sdp/processors/IPL/ipl_processors.py
@@ -40,14 +40,18 @@ def __init__(
         super().__init__(**kwargs)
 
         # Paths on the current machine
-        self.training_config_local = OmegaConf.load(training_config_local)
+        self.training_config_local = training_config_local
         self.training_config_cluster = training_config_cluster
         self.training_script_path = os.path.join(nemo_directory, training_script_path)
         self.nemo_directory = nemo_directory
         self.new_manifest_files = new_manifest_files
         self.new_tarred_audio_filepaths = new_tarred_audio_filepaths
 
-    def process(self) -> str:
+    def process(
+        self, 
+        new_manifest_files=None, 
+        new_tarred_audio_filepaths=None
+    ) -> str:
         """
         Generates the training command based on the processor's configuration.
         If new manifest files are provided, updates the training configuration accordingly.
@@ -55,8 +59,7 @@ def process(self) -> str:
         Returns:
             str: The complete training command to be executed on the cluster
         """
-        
-        if self.new_manifest_files is None:
+        if new_manifest_files is None:
             cmd = self.get_execution_script(
                 cluster_script_path=self.training_script_path,
                 local_config=self.training_config_local,
@@ -65,8 +68,8 @@ def process(self) -> str:
         else:
             updated_manifest_filepaths, updated_tarred_audio_filepaths = self.update_training_sets(
                 config=self.training_config_local,
-                updated_manifest_filepaths=self.new_manifest_files,
-                updated_tarred_audio_filepaths=self.new_tarred_audio_filepaths
+                updated_manifest_filepaths=new_manifest_files,
+                updated_tarred_audio_filepaths=new_tarred_audio_filepaths
             )
             cmd = self.get_execution_script(
                 cluster_script_path=self.training_script_path,
@@ -110,7 +113,7 @@ def get_execution_script(
                     "Please set WANDB_API_KEY to enable WANDB logging."
                 )
 
-        # Prepare the base command
+
         config_path = os.path.dirname(cluster_config_path)
         config_name = os.path.basename(cluster_config_path)
         cmd = (
@@ -126,8 +129,9 @@ def get_execution_script(
         if updated_tarred_filepaths:
             cmd += f" model.train_ds.tarred_audio_filepaths={updated_tarred_filepaths}"
         output_data = {"training_command": cmd}
-        with open(self.output_manifest_file, 'w') as f:
-            json.dump(output_data, f, indent=4)
+
+        # with open(self.output_manifest_file, 'w') as f:
+        #     json.dump(output_data, f, indent=4)
         return cmd
     
     def get_transcribed_names(self, manifest_filepaths: List[str], is_tarred: bool=False) -> List[List[str]]:
@@ -185,7 +189,6 @@ def update_training_sets(
                 - Updated manifest file paths as a string, formatted for Omegaconf
                 - Updated tarred audio file paths as a string, formatted for Omegaconf
         """
-        print(f"updated_manifest_filepaths {updated_manifest_filepaths}")
         updated_manifest_filepaths = self.get_transcribed_names(updated_manifest_filepaths,is_tarred=config.model.train_ds.get("is_tarred", False))
         manifest_filepath = config.model.train_ds.manifest_filepath
         if updated_tarred_audio_filepaths:
@@ -201,7 +204,6 @@ def update_training_sets(
                 updated_tarred_audio_filepaths += tarred_audio_filepaths
                 updated_manifest_filepaths += manifest_filepath
         else:
-            print(f"config.model.train_ds.get {config.model.train_ds.get('use_lhotse')}")
             if config.model.train_ds.get("use_lhotse", False):
                 if isinstance(manifest_filepath, str):
                     updated_manifest_filepaths.append([manifest_filepath])
@@ -243,9 +245,8 @@ def __init__(
         inference_config_paths: str,
         manifests:  str,
         p_cache: float,
-        num_gpus, int,
+        num_gpus: int,
         is_tarred: bool = False,
-        first_run: bool = False,
         **kwargs
     ):
         super().__init__(**kwargs)
@@ -254,29 +255,17 @@ def __init__(
         self.inference_config_paths = inference_config_paths
         self.nemo_directory = nemo_directory
         self.inference_script_path = os.path.join(nemo_directory, "examples/asr/transcribe_speech_parallel.py")
-        self.first_run = first_run  
         self.manifests = manifests
         self.p_cache = p_cache
         self.num_gpus = num_gpus
         self.is_tarred = is_tarred
 
-    def process(self): 
+    def process(self, first_run=False): 
         """
         Generate the pseudo-labeling command for the given configuration and training parameters.
 
         Args:
-            merged_config (Dict): Merged configuration containing model and dataset settings.
-            config_name (str): Name of the configuration file to be used.
-            cluster_script_path (str): Path to the cluster execution script.
-            config_dir (str): Directory containing the configuration files.
-            ipl_training (Dict[str, any]): Dictionary containing:
-                - first_run (bool): Whether this is the first run of pseudo-labeling.
-                - num_gpus (int): Number of GPUs to use.
-                - inference_config_paths (List[str]): List of inference configuration file paths.
-                - manifests (List[str]): List of manifest file paths.
-                - tarr_paths (List[str]): List of tarred audio file paths.
-                - num_ipl_epochs (int): Number of epochs to train with pseudo-labels.
-                - p_cache (float): What part of pseudo-labels to update.
+            first_run (bool, optional): Whether this is the first run of pseudo-labeling.
 
         Returns:
             str: The constructed pseudo-labeling command.
@@ -286,7 +275,7 @@ def process(self):
         inference_config_paths_str = " ".join(self.inference_config_paths)        
         write_transcription_path = os.path.join(self.nemo_directory, "scripts/pseudo_labeling/write_transcribed_files.py")
         update_inference_config_path = os.path.join(self.nemo_directory, "scripts/pseudo_labeling/update_inference_config.pys")
-        if self.first_run:
+        if first_run:
             cmd += f"{self.get_pl_inference_command(self.inference_config_paths, shuffle=False)}"
             cmd += (
                 f" && python {write_transcription_path} "
diff --git a/sdp/processors/__init__.py b/sdp/processors/__init__.py
index ade2ab68..43df6448 100644
--- a/sdp/processors/__init__.py
+++ b/sdp/processors/__init__.py
@@ -115,3 +115,10 @@
 )
 from sdp.processors.nemo.asr_inference import ASRInference
 from sdp.processors.nemo.pc_inference import PCInference
+from sdp.processors.toloka.accept_if import AcceptIfWERLess
+from sdp.processors.toloka.create_pool import CreateTolokaPool
+from sdp.processors.toloka.create_project import CreateTolokaProject
+from sdp.processors.toloka.create_sentence_set import CreateSentenceSet
+from sdp.processors.toloka.create_task_set import CreateTolokaTaskSet
+from sdp.processors.toloka.download_responses import GetTolokaResults
+from sdp.processors.toloka.reject_if import RejectIfBanned
diff --git a/sdp/processors/nemo/ipl_command.py b/sdp/processors/nemo/ipl_command.py
new file mode 100644
index 00000000..a1fb8be8
--- /dev/null
+++ b/sdp/processors/nemo/ipl_command.py
@@ -0,0 +1,184 @@
+
+import os
+import subprocess
+from pathlib import Path
+from typing import Optional
+from typing import Dict, List
+from omegaconf import OmegaConf, open_dict
+from nemo.utils import logging
+from sdp.processors.base_processor import BaseProcessor
+
+
+class IPLCommandGenerator(BaseProcessor):
+    """Assembles shell command strings for iterative pseudo-labeling (IPL) runs.
+
+    The helper methods only build command text (environment exports, training,
+    inference and pseudo-label write-back steps); none of them runs the commands.
+
+    Args:
+        training_config (str): path to the training config; loaded with ``OmegaConf.load``.
+        infenrece_config (str): path to the inference config (name kept as spelled for compatibility).
+        training_script_path (str): training script path, joined onto ``nemo_directory``.
+        nemo_directory (str): directory that ``training_script_path`` is relative to.
+        num_ipl_epochs (int): stored as ``self.num_ipl_epochs``. Defaults to 50.
+        **kwargs: forwarded to ``BaseProcessor.__init__``.
+    """
+
+    def __init__(
+        self,
+        training_config: str,
+        infenrece_config: str,
+        training_script_path: str,
+        nemo_directory: str,
+        num_ipl_epochs: int = 50,
+
+        **kwargs
+    ):
+        super().__init__(**kwargs)
+        # Paths on the current machine
+        self.training_config = OmegaConf.load(training_config)
+        self.infenrece_config = OmegaConf.load(infenrece_config)
+        self.training_script_path = os.path.join(nemo_directory, training_script_path)
+        self.nemo_directory = nemo_directory
+        self.num_ipl_epochs = num_ipl_epochs
+
+    def process(self):
+        """Placeholder entry point: currently does nothing and returns ``None``.
+
+        The command-building helpers below are not called from here yet.
+        """
+
+
+    def get_training_script_cmd(self, cluster_script_path, config_name, updated_manifest_filepaths=None, updated_tarred_filepaths=None):
+        """
+        Create the command to run the script on the cluster.
+
+        Args:
+            cluster_script_path (str): Path to the script to run on the cluster.
+            config_name (str): Name of the config file to use for the script.
+            updated_manifest_filepaths (str, optional): Value for the ``model.train_ds.manifest_filepath`` override. Defaults to None.
+            updated_tarred_filepaths (str, optional): Value for the ``model.train_ds.tarred_audio_filepaths`` override. Defaults to None.
+
+        Returns:
+            str: Command to run the script on the cluster.
+        """
+
+        # Prepare the base command for training
+        cmd = (
+            "find /results/ -name '*-unfinished' -type f -delete && "
+            f"cd {os.path.dirname(cluster_script_path)} && "
+            f"python -u -B {os.path.basename(cluster_script_path)} "
+            f"--config-path \"/results/configs\" --config-name \"{config_name}\""
+        )
+
+        # Add additional parameters if provided
+        if updated_manifest_filepaths:
+            cmd += f" model.train_ds.manifest_filepath={updated_manifest_filepaths}"
+        if updated_tarred_filepaths:
+            cmd += f" model.train_ds.tarred_audio_filepaths={updated_tarred_filepaths}"
+
+        return cmd
+
+    def get_export_variables_cmd(self, merged_cfg):
+        """Return the env-export prefix for the cluster command (raises ValueError if W&B logging is enabled without a key)."""
+        wandb_key = os.environ.get("WANDB_API_KEY") or os.environ.get("WANDB") or os.environ.get("WANDB_KEY", "")
+        if not wandb_key:
+            logging.warning("WANDB key not found in environment variables. WANDB logging will not work.")
+            # A missing key is fatal only when the exp_manager config enables the W&B logger
+            if merged_cfg.get('exp_manager', {}).get('create_wandb_logger', False):
+                raise ValueError(
+                    "WANDB key is required for logging but was not found in environment variables. "
+                    "Please set WANDB_API_KEY to enable WANDB logging."
+                )
+        # NOTE(review): the HF and W&B secrets end up in plain text inside the returned string.
+        cmd = (
+            "nvidia-smi && "
+            "export PYTHONPATH=/nemo_run/code && "
+            f"export HF_TOKEN={os.getenv('HF_TOKEN', '')} && "
+            f"export WANDB_API_KEY={wandb_key} && ")
+
+        return cmd
+
+    def get_pl_inference_command(self, inference_configs, shuffle=None):
+        """
+        Generate a command to run PL inference with multiple configuration files.
+        Args:
+            inference_configs (list): List of configuration file paths.
+            shuffle (bool, optional): When not None, added to every command as ``predict_ds.shuffle=<value>``.
+
+        Returns:
+            str: Combined command string to execute PL inference.
+        """
+        # Base command template
+        base_cmd = "python /nemo_run/code/examples/asr/transcribe_speech_parallel.py --config-path \"/results/configs\" --config-name {config_name}"
+        if shuffle is not None:
+            base_cmd += f" predict_ds.shuffle={shuffle}"
+
+        # Generate the command list
+        cmd_list = [base_cmd.format(config_name=os.path.basename(config)) for config in inference_configs]
+
+        # Combine the commands with " && " separator
+        return " && ".join(cmd_list)
+
+    def get_pseudo_labeling_command(
+        self, merged_config: Dict, config_name: str, cluster_script_path: str, config_dir: str, ipl_training: Dict[str, any]) -> str:
+        """
+        Generate the pseudo-labeling command for the given configuration and training parameters.
+
+        Args:
+            merged_config (Dict): Merged configuration containing model and dataset settings.
+            config_name (str): Name of the configuration file to be used.
+            cluster_script_path (str): Path to the cluster execution script.
+            config_dir (str): Directory containing the configuration files (currently unused).
+            ipl_training (Dict[str, any]): Dictionary containing:
+                - first_run (bool): Whether this is the first run of pseudo-labeling.
+                - num_gpus (int): Number of GPUs to use.
+                - inference_config_paths (List[str]): List of inference configuration file paths.
+                - manifests (List[str]): List of manifest file paths.
+                - tarr_paths (List[str]): List of tarred audio file paths.
+                - num_ipl_epochs (int): Number of epochs to train with pseudo-labels.
+                - p_cache (float): What part of pseudo-labels to update.
+                - prefix (str): Value passed as ``--prefix`` to the write-transcribed-files step.
+
+        Returns:
+            str: The constructed pseudo-labeling command.
+        """
+        prediction_directories_str = " ".join([os.path.dirname(path) for path in ipl_training['manifests']])
+        inference_config_paths_str = " ".join(ipl_training['inference_config_paths'])
+        # NOTE(review): `ipl_utils` is never imported in this module (NameError as written); confirm the intended helper and its 4-arg/2-tuple contract.
+        updated_manifest_filepaths, updated_tarred_audio_filepaths = ipl_utils.update_training_sets(
+            merged_config, ipl_training["manifests"], ipl_training.get("tarr_paths", None), ipl_training["prefix"]
+        )
+        exec_cmd = self.get_export_variables_cmd(merged_cfg=merged_config)
+        exec_cmd += self.get_training_script_cmd(cluster_script_path, config_name)
+        exec_cmd += " && sleep 10"
+        if ipl_training.get("first_run", False):
+            exec_cmd += f" && {self.get_pl_inference_command(ipl_training['inference_config_paths'], shuffle=False)}"
+            exec_cmd += (
+                f" && python /nemo_run/code/examples/asr/run_write_transcribed_files.py "
+                f"--prediction_filepaths {prediction_directories_str} --full_pass --prefix {ipl_training['prefix']}"
+            )
+            if merged_config.model.train_ds.is_tarred:
+                exec_cmd += " --is_tarred"
+            exec_cmd += (
+                f" && python /nemo_run/code/examples/asr/run_update_inf_config.py "
+                f"--inference_configs {inference_config_paths_str} --p_cache {ipl_training['p_cache']} --num_gpus {ipl_training['num_gpus']}"
+            )
+
+        # If the run has been interrupted, the user has to change `num_ipl_epochs` in the config
+        for _ in range(ipl_training["num_ipl_epochs"]):
+            run_script = self.get_training_script_cmd(
+                cluster_script_path, config_name, updated_manifest_filepaths, updated_tarred_audio_filepaths
+            )
+            exec_cmd += " && sleep 10"
+            exec_cmd += f" && {run_script}"
+            exec_cmd += f" && {self.get_pl_inference_command(ipl_training['inference_config_paths'],shuffle=True)}"
+            exec_cmd += (
+                f" && python /nemo_run/code/examples/asr/run_write_transcribed_files.py "
+                f"--prediction_filepaths {prediction_directories_str} "
+                f"--prefix {ipl_training['prefix']}"
+            )
+            if merged_config.model.train_ds.is_tarred:
+                exec_cmd += " --is_tarred"
+
+        return exec_cmd
\ No newline at end of file
diff --git a/sdp/processors/nemo/ipl_training.py b/sdp/processors/nemo/ipl_training.py
new file mode 100644
index 00000000..ecc3520a
--- /dev/null
+++ b/sdp/processors/nemo/ipl_training.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
\ No newline at end of file
diff --git a/sdp/processors/nemo/ipl_utils.py b/sdp/processors/nemo/ipl_utils.py
new file mode 100644
index 00000000..0630be4f
--- /dev/null
+++ b/sdp/processors/nemo/ipl_utils.py
@@ -0,0 +1,330 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import glob
+import json
+import os
+from typing import List, Optional, Tuple, Union
+
+from omegaconf import OmegaConf
+
+def separate_multiple_transcriptions(inference_config: dict) -> Tuple[List[str], Optional[List[str]]]:
+    """
+    Splits the ``predict_ds`` section of an inference config into manifest and tarred audio paths.
+    This function makes it easier to run transcribe_speech_parallel for each bucket separately
+    Args:
+        inference_config: Loaded inference config exposing ``predict_ds`` through attribute access
+            (annotated as ``dict``, but it is read with attribute access, not with keys).
+    Returns:
+        Tuple[List[str], Optional[List[str]]]: A tuple containing:
+            - A list of manifest file paths.
+            - A list of tarred audio file paths, or None when ``predict_ds.is_tarred`` is missing or false.
+    """
+    predict_ds = inference_config.predict_ds
+    if not (hasattr(predict_ds, "is_tarred") and predict_ds.is_tarred):
+        manifest_filepath = predict_ds.manifest_filepath
+        # A single manifest path is wrapped so that callers always receive a list
+        if isinstance(manifest_filepath, str):
+            return [manifest_filepath], None
+        return manifest_filepath, None
+
+    tarred_audio_filepaths = predict_ds.tarred_audio_filepaths
+    manifest_filepaths = predict_ds.manifest_filepath
+    # A single bucket (plain string or at most one element) is returned wrapped, unmodified
+    if isinstance(tarred_audio_filepaths, str) or len(tarred_audio_filepaths) <= 1:
+        return [manifest_filepaths], [tarred_audio_filepaths]
+
+    # Several buckets: every entry is indexed with [0] (presumably a one-element list per bucket)
+    bucket_pairs = list(zip(manifest_filepaths, tarred_audio_filepaths))
+    return [manifest[0] for manifest, _ in bucket_pairs], [tar[0] for _, tar in bucket_pairs]
+
+
+def create_transcribed_shard_manifests(
+    prediction_filepaths: List[str],
+) -> List[str]:
+    """
+    Splits each directory's `predictions_all.json` into one transcribed manifest per shard.
+    Every line of the predictions file is a JSON entry carrying a `shard_id`. Entries whose
+    `audio_filepath` ends with ".wav" are written to `transcribed_manifest_<shard_id>.json`
+    in the same directory, with `pred_text` promoted to `text` and any previous `text`
+    value kept as `orig_text`.
+    Args:
+        prediction_filepaths (List[str]): Directories that each contain a
+            `predictions_all.json` file whose entries include shard IDs.
+    Returns:
+        List[str]: One path per directory, of the form
+        `transcribed_manifest__OP_0..<max_shard_id>_CL_.json`.
+    """
+    combined_paths = []
+    for directory in prediction_filepaths:
+        # Bucket every prediction entry under its shard id, tracking the largest id seen
+        highest_shard = 0
+        by_shard = {}
+        with open(os.path.join(directory, "predictions_all.json"), 'r') as src:
+            for raw in src:
+                entry = json.loads(raw)
+                shard = entry.get("shard_id")
+                highest_shard = max(highest_shard, shard)
+                by_shard.setdefault(shard, []).append(entry)
+
+        # One output manifest per shard; entries not ending in ".wav" are skipped
+        for shard, entries in by_shard.items():
+            shard_manifest = os.path.join(directory, f"transcribed_manifest_{shard}.json")
+            with open(shard_manifest, 'w') as dst:
+                for entry in entries:
+                    if not entry['audio_filepath'].endswith(".wav"):
+                        continue
+                    if 'text' in entry:
+                        entry['orig_text'] = entry.pop('text')
+                    entry['text'] = entry.pop('pred_text')
+                    json.dump(entry, dst, ensure_ascii=False)
+                    dst.write("\n")
+
+        combined_paths.append(
+            os.path.join(directory, f"transcribed_manifest__OP_0..{highest_shard}_CL_.json")
+        )
+    return combined_paths
+
+
+def create_transcribed_manifests(
+    prediction_filepaths: List[str],
+) -> List[str]:
+    """
+    Creates updated transcribed manifest files by processing predictions.
+    For every directory, each JSON line of `predictions_all.json` is rewritten into
+    `transcribed_manifest.json` in the same directory: `pred_text` becomes `text`, and an
+    existing `text` value is preserved as `orig_text`. The predictions file is opened before
+    the output file so that a missing input does not create or truncate the output manifest.
+    Args:
+        prediction_filepaths (List[str]): A list of file paths to directories containing
+            prediction files (`predictions_all.json`).
+    Returns:
+        List[str]: A list of file paths to the newly created transcribed manifest files
+        (`transcribed_manifest.json`).
+    """
+    all_manifest_filepaths = []
+    for prediction_filepath in prediction_filepaths:
+        prediction_name = os.path.join(prediction_filepath, "predictions_all.json")
+        transcripted_name = os.path.join(prediction_filepath, "transcribed_manifest.json")
+
+        # Open the input first: opening the output in 'w' mode truncates it immediately
+        with open(prediction_name, 'r', encoding='utf-8') as pred_f, open(
+            transcripted_name, 'w', encoding='utf-8'
+        ) as f:
+            for line in pred_f:
+                data_entry = json.loads(line)
+                if 'text' in data_entry:
+                    data_entry['orig_text'] = data_entry.pop('text')
+                data_entry['text'] = data_entry.pop('pred_text')
+                json.dump(data_entry, f, ensure_ascii=False)
+                f.write("\n")
+        # Append the path of the new manifest file to the list
+        all_manifest_filepaths.append(transcripted_name)
+
+    return all_manifest_filepaths
+
+
+def write_sampled_shard_transcriptions(manifest_filepaths: List[str]) -> List[List[str]]:
+    """
+    Updates transcriptions by merging predicted shard data and transcribed manifest data.
+    For each directory, entries of `predictions_all.json` are indexed by shard id and audio
+    file path; every `transcribed_manifest_<shard>.json` in that directory is then rewritten,
+    replacing an entry with its new prediction when one exists and keeping it otherwise.
+    Args:
+        manifest_filepaths (List[str]): A list of file paths to directories containing
+            prediction and transcribed manifest files.
+    Returns:
+        List[List[str]]: One single-element list per directory, holding the path
+            `transcribed_manifest__OP_0..<max_shard_id>_CL_.json` for that directory.
+    """
+    all_manifest_filepaths = []
+    # Process each prediction directory; every step below has to run once per directory
+    for prediction_filepath in manifest_filepaths:
+        predicted_shard_data = {}
+        # Collect entries from prediction files based on shard id
+        prediction_path = os.path.join(prediction_filepath, "predictions_all.json")
+        with open(prediction_path, 'r') as f:
+            for line in f:
+                data_entry = json.loads(line)
+                shard_id = data_entry.get("shard_id")
+                audio_filepath = data_entry['audio_filepath']
+                predicted_shard_data.setdefault(shard_id, {})[audio_filepath] = data_entry
+        max_shard_id = 0
+        for full_path in glob.glob(os.path.join(prediction_filepath, "transcribed_manifest_[0-9]*.json")):
+            all_data_entries = []
+            with open(full_path, 'r') as f:
+                for line in f:
+                    data_entry = json.loads(line)
+                    shard_id = data_entry.get("shard_id")
+                    max_shard_id = max(max_shard_id, shard_id)
+                    all_data_entries.append(data_entry)
+            # An empty manifest leaves `shard_id` stale; skip it rather than overwrite another shard
+            if not all_data_entries:
+                continue
+            # Write the merged data to a new manifest file keeping new transcriptions
+            output_filename = os.path.join(prediction_filepath, f"transcribed_manifest_{shard_id}.json")
+            with open(output_filename, 'w') as f:
+                for data_entry in all_data_entries:
+                    audio_filepath = data_entry['audio_filepath']
+                    # Escape duplicated audio files that end with *dup
+                    if audio_filepath.endswith(".wav"):
+                        if shard_id in predicted_shard_data and audio_filepath in predicted_shard_data[shard_id]:
+                            predicted_data_entry = predicted_shard_data[shard_id][audio_filepath]
+                            if 'text' in predicted_data_entry:
+                                predicted_data_entry['orig_text'] = predicted_data_entry.pop('text')
+                            if "pred_text" in predicted_data_entry:
+                                predicted_data_entry['text'] = predicted_data_entry.pop('pred_text')
+                            json.dump(predicted_data_entry, f, ensure_ascii=False)
+                        else:
+                            json.dump(data_entry, f, ensure_ascii=False)
+                        f.write("\n")
+
+        shard_manifest_filepath = os.path.join(prediction_filepath, f"transcribed_manifest__OP_0..{max_shard_id}_CL_.json")
+        all_manifest_filepaths.append([shard_manifest_filepath])
+
+    return all_manifest_filepaths
+
+def write_sampled_transcriptions(manifest_filepaths: List[str]) -> List[str]:
+    """
+    Updates transcriptions by merging predicted data with transcribed manifest data.
+
+    For each directory, the entries of `predictions_all.json` are indexed by audio file path and
+    `transcribed_manifest.json` is rewritten in place. A manifest entry that has a new prediction
+    is replaced by it (`pred_text` becomes `text`, the previous `text` becomes `orig_text`); other
+    entries are kept as they are. Entries whose audio path does not end in ".wav" are dropped.
+
+    Args:
+        manifest_filepaths (List[str]): A list of file paths to directories containing
+            the prediction file (`predictions_all.json`) and the transcribed manifest file
+            (`transcribed_manifest.json`).
+
+    Returns:
+        List[str]: A list of file paths to the updated transcribed manifest files.
+    """
+
+    all_manifest_filepaths = []
+    for prediction_filepath in manifest_filepaths:
+        # Index the new predictions by audio path; a later duplicate replaces an earlier one
+        predicted_data = {}
+        prediction_path = os.path.join(prediction_filepath, "predictions_all.json")
+        with open(prediction_path, 'r') as f:
+            for line in f:
+                data_entry = json.loads(line)
+                predicted_data[data_entry['audio_filepath']] = data_entry
+
+        # Load the whole manifest first, because the same file is overwritten below
+        manifest_path = os.path.join(prediction_filepath, "transcribed_manifest.json")
+        all_data_entries = []
+        with open(manifest_path, 'r') as f:
+            for line in f:
+                all_data_entries.append(json.loads(line))
+
+        with open(manifest_path, 'w') as f:
+            for data_entry in all_data_entries:
+                audio_filepath = data_entry['audio_filepath']
+                if not audio_filepath.endswith(".wav"):
+                    continue
+                if audio_filepath in predicted_data:
+                    # Promote the prediction to `text`, keeping the previous value as `orig_text`
+                    data_entry = predicted_data[audio_filepath]
+                    if 'text' in data_entry:
+                        data_entry['orig_text'] = data_entry.pop('text')
+                    data_entry['text'] = data_entry.pop('pred_text')
+                json.dump(data_entry, f, ensure_ascii=False)
+                f.write("\n")
+
+        all_manifest_filepaths.append(manifest_path)
+
+    return all_manifest_filepaths
+
+
+def update_training_sets(
+    merged_config: OmegaConf, final_cache_manifests: list, tarred_audio_filepaths: Union[list, str]
+) -> OmegaConf:
+    """
+    Adds pseudo-labeled sets to the training datasets based on dataset type and
+    handles tarred audio files differently. The function updates the 'manifest_filepath'
+    and 'tarred_audio_filepaths' fields in the training dataset configuration.
+    The configuration is modified in place and the same object is returned.
+    Args:
+        merged_config: The configuration object containing the model and dataset settings.
+        final_cache_manifests: A list of paths to the manifest files for the pseudo-labeled data.
+        tarred_audio_filepaths: A string or list of tarred audio file paths to be added to the training set.
+            Only read when ``model.train_ds.is_tarred`` is set.
+    Returns:
+        merged_config: The updated configuration object with the new training datasets.
+    """
+
+    if merged_config.model.train_ds.get("is_tarred", False):
+        # Extend the tar paths first; the nesting depends on whether old/new values are strings
+        if isinstance(tarred_audio_filepaths, str):
+            if isinstance(merged_config.model.train_ds['tarred_audio_filepaths'], str):
+                merged_config.model.train_ds['tarred_audio_filepaths'] = [
+                    [merged_config.model.train_ds['tarred_audio_filepaths']],
+                    [tarred_audio_filepaths],
+                ]
+            else:
+                merged_config.model.train_ds.tarred_audio_filepaths.append(tarred_audio_filepaths)
+        else:
+            if isinstance(merged_config.model.train_ds.tarred_audio_filepaths, str):
+                merged_config.model.train_ds.tarred_audio_filepaths = [
+                    [merged_config.model.train_ds.tarred_audio_filepaths]
+                ]
+            merged_config.model.train_ds.tarred_audio_filepaths += tarred_audio_filepaths
+
+        # Then extend the manifest list with the pseudo-labeled manifests
+        if isinstance(merged_config.model.train_ds.manifest_filepath, str):
+            merged_config.model.train_ds.manifest_filepath = [merged_config.model.train_ds.manifest_filepath]
+
+        merged_config.model.train_ds.manifest_filepath += final_cache_manifests
+
+    else:
+        # Non-tarred datasets only need their manifest list extended
+        if isinstance(merged_config.model.train_ds.manifest_filepath, str):
+            merged_config.model.train_ds.manifest_filepath = [merged_config.model.train_ds.manifest_filepath]
+
+        if merged_config.model.train_ds.get("use_lhotse", False):
+            # With use_lhotse the existing manifest list is wrapped in an outer list and the
+            # new manifests are appended to it as one nested list.
+            # NOTE(review): the wrap is applied on every call - confirm that is intended.
+            merged_config.model.train_ds.manifest_filepath = [merged_config.model.train_ds.manifest_filepath]
+            merged_config.model.train_ds.manifest_filepath.append(final_cache_manifests)
+        else:
+            # Without lhotse the new manifests are concatenated onto the existing list
+            merged_config.model.train_ds.manifest_filepath += final_cache_manifests
+
+
+    return merged_config
+
+
+def count_files_for_pseudo_labeling(manifest_filepath: str, is_tarred: bool) -> int:
+    """
+    Counts the manifest lines (one per audio file) available for pseudo-labeling.
+    Args:
+        manifest_filepath (str): The path to the manifest file(s).
+        is_tarred (bool): If True, count over every `<prefix>_<digit>*.json` file located next to
+            `manifest_filepath`, where `<prefix>` is the file name up to its first underscore.
+    Returns:
+        int: The total number of audio files given for pseudo labeling.
+    """
+    if is_tarred:
+        parent_dir, base_name = os.path.split(manifest_filepath)
+        stem = base_name.split('_', 1)[0]
+        manifest_paths = glob.glob(os.path.join(parent_dir, f"{stem}_[0-9]*.json"))
+    else:
+        manifest_paths = [manifest_filepath]
+    total = 0
+    for path in manifest_paths:
+        with open(path, 'r') as handle:
+            total += sum(1 for _ in handle)
+    return total
\ No newline at end of file
diff --git a/sdp/processors/nemo/nemo_run_ipl.py b/sdp/processors/nemo/nemo_run_ipl.py
new file mode 100644
index 00000000..b615e9ca
--- /dev/null
+++ b/sdp/processors/nemo/nemo_run_ipl.py
@@ -0,0 +1,386 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import datetime
+import os
+from pathlib import Path
+from typing import Dict, List
+import argparse
+import nemo_run as run
+from omegaconf import OmegaConf, open_dict
+
+from sdp.utils import nemo_run_utils, ipl_utils
+import logging
+from sdp.processors.IPL.ipl_processors import TrainingCommandGenerator, InferenceCommandGenerator
+# NEMO_ROOT = Path(__file__).absolute().parents[2]
+
+def gather_mounts(cluster_cfg):
+    """
+    Gather all mounts from the cluster config (the `mounts` list plus every top-level `mount_*` key) into `cluster_cfg['mounts']`, in place.
+    It is used because Hydra does not support the ability to append to a list in the config file natively.
+
+    Users can provide additional mounts from the command line using the following syntax:
+    ++mount_<anything>='/src:/dest'
+
+    Args:
+        cluster_cfg: Cluster config dictionary with following fields.
+
+            script (str): Path to the main Python script to be executed.
+            script_config (str): Path to the YAML config used by the script.
+            exp_name (str or None): Name of the experiment. If None, it is inferred from `exp_manager.name`
+              in the script configuration.
+            results_dir (str): Path to the directory where results should be saved.
+
+            num_runs (int): Number of times to repeat the experiment.
+            num_gpus (int): Number of GPUs to allocate per run.
+            num_tasks_per_node (int): Number of tasks per node.
+            max_runtime (str): Max allowed runtime in Slurm format (DD:HH:MM:SS). Default is "00:03:45:00".
+
+            executor (str): Type of job executor, e.g., 'slurm', 'local'.
+
+            ssh_tunnel:
+                host (str): Hostname for the SSH tunnel.
+                user (str): Username for SSH login. Can be `${USER}` to auto-resolve.
+                job_dir (str): Remote path where jobs will be created and results uploaded.
+                identity (str): Path to SSH identity file. Resolved from environment variable `${NEMO_OCI_IAD_SSH_IDENTITY}`.
+
+            account (str): Account name used for SLURM job submissions.
+            partition (str): Comma-separated list of SLURM partitions to use.
+            job_name_prefix (str): Prefix for SLURM job names.
+
+            containers:
+                asr (str): URI or path to the container image used for ASR jobs.
+
+            env_vars:
+                List[str]: List of environment variable declarations to be set in the job,
+                e.g., 'TOKENIZERS_PARALLELISM=false', 'HYDRA_FULL_ERROR=1', etc.
+
+            required_env_vars (List[str]): List of env vars that **must** be present in the environment before running.
+                - 'HF_TOKEN'
+                - 'WANDB_KEY'
+            mounts:
+                - /paths/to/be/mounted:/paths/to/mount/to
+
+            timeouts:
+                partition_name: 04:00:00 (max runtime for execution)
+    """ 
+    # Start from the configured `mounts` list (empty when the key is absent); nothing is returned.
+    mounts = cluster_cfg.get('mounts', [])
+    # Resolve any mounts in the cluster config that need user expansion
+    mounts = [os.path.expanduser(m) for m in mounts]
+
+    keys = list(cluster_cfg.keys())
+    # Check for any additional mounts in the cluster config (keys were snapshotted above because entries are deleted while looping)
+    with open_dict(cluster_cfg):
+        for k in keys:
+            if k.startswith("mount_"):  # Additional mount found
+                logging.info(f"Found additional mount flag in the cluster config `{k}`. Adding it to the mounts list.")
+                mounts.append(cluster_cfg[k])
+                del cluster_cfg[k]  # Remove the key from the cluster config
+
+        cluster_cfg['mounts'] = mounts
+        logging.info(f"Final Mounts: {mounts}")
+
+
+# def check_root_path(path, nemo_root):
+#     """
+#     Check if a path is in the NeMo root directory and convert it to a path that is relative to the NeMo root directory.
+#     This is used to ensure that any path that is provided to this script will be in the NeMo root directory when
+#     mounted in the container.
+
+#     Args:
+#         path: Path to check
+#         nemo_root: NeMo root directory
+
+#     Returns:
+#         str: Path relative to the NeMo root directory
+#     """
+#     path = str(path)
+#     nemo_root = str(nemo_root)
+
+#     if not os.path.exists(path):
+#         raise FileNotFoundError(f"Path {path} does not exist.")
+
+#     if not path.startswith(nemo_root):
+#         raise ValueError(f"Path {path} is not in the NeMo root directory.")
+
+#     new_path = path.replace(nemo_root, '/nemo_run/code/')
+#     return new_path
+
+
+def check_config_mount_paths(script_config, cluster_config):
+    """
+    Check if all path-like strings in the script config are mounted paths in the cluster config.
+    If a path-like string is not a mounted path, raise an error.
+
+    Every nested mapping and sequence is walked, including OmegaConf `ListConfig` values and
+    lists of lists; only strings that start with `os.path.sep` are treated as paths.
+
+    Args:
+        script_config: Script config dictionary that represents the Model training/inference config
+        cluster_config: Cluster config dictionary that represents the cluster configuration
+    """
+
+    def filepath_check(v, cluster_cfg):
+        if not v.startswith(os.path.sep):  # check for absolute paths only
+            return
+        logging.info(f"Checking if {v} is a mounted path")
+        # Check if the path begins with mount path
+        nemo_run_utils.check_if_mounted(cluster_cfg, v)
+
+        # Check the file exists in the cluster at the unmounted path
+        unmounted_path = nemo_run_utils.get_unmounted_filepath(cluster_cfg, v)
+        nemo_run_utils.check_remote_mount_directories(unmounted_path, cluster_cfg)
+
+    def check_mounted_path(node, cluster_cfg):
+        if hasattr(node, 'items'):  # mapping (dict / DictConfig): recurse into every value
+            for _, v in node.items():
+                check_mounted_path(v, cluster_cfg)
+
+        elif isinstance(node, (list, tuple)) or OmegaConf.is_list(node):
+            # OmegaConf list values are `ListConfig` (not a `list` subclass) and may be nested,
+            # so recurse instead of only looking at direct string items.
+            for item in node:
+                check_mounted_path(item, cluster_cfg)
+
+        elif isinstance(node, str):  # leaf string: validated only if it is an absolute path
+            filepath_check(node, cluster_cfg)
+
+    check_mounted_path(script_config, cluster_config)
+
+
+def get_export_variables_cmd(merged_cfg):
+    """
+    Build the shell prefix (ending in ` && `) that runs nvidia-smi and exports PYTHONPATH, HF_TOKEN and WANDB_API_KEY.
+    Raises ValueError when `exp_manager.create_wandb_logger` is enabled but no W&B key is set in the environment.
+    """
+    env = os.environ
+    # The first non-empty variable wins: WANDB_API_KEY, then WANDB, then WANDB_KEY.
+    wandb_key = env.get("WANDB_API_KEY") or env.get("WANDB") or env.get("WANDB_KEY", "")
+    if not wandb_key:
+        logging.warning("WANDB key not found in environment variables. WANDB logging will not work.")
+        # Check if WANDB logging is enabled in the exp_manager config
+        if merged_cfg.get('exp_manager', {}).get('create_wandb_logger', False):
+            raise ValueError(
+                "WANDB key is required for logging but was not found in environment variables. "
+                "Please set WANDB_API_KEY to enable WANDB logging."
+            )
+    steps = ["nvidia-smi", "export PYTHONPATH=/nemo_run/code"]
+    steps += [f"export HF_TOKEN={os.getenv('HF_TOKEN', '')}", f"export WANDB_API_KEY={wandb_key}"]
+    return " && ".join(steps) + " && "
+
+from sdp.processors.IPL.ipl_processors import TrainingCommandGenerator, InferenceCommandGenerator
+
+def get_pseudo_labeling_command(
+    train_command_config: dict,
+    inference_command_config: dict,
+    num_ipl_epochs: int,
+    new_manifest_files,
+    new_tarr_files,
+    first_run: bool = False,
+) -> str:
+    """
+    Generate the pseudo-labeling command for the given configuration and training parameters using processors.
+
+    The command chains one training step and one inference step, followed by `num_ipl_epochs`
+    further train/inference rounds; all steps are joined with ` && ` and separated by `sleep 10` pauses.
+
+    Args:
+        train_command_config (dict): Config for TrainingCommandGenerator.
+        inference_command_config (dict): Config for InferenceCommandGenerator.
+        num_ipl_epochs (int): Number of epochs to train with pseudo-labels.
+        new_manifest_files: Manifests passed to the training processor in every pseudo-label round.
+        new_tarr_files: Tarred audio paths passed together with `new_manifest_files`.
+        first_run (bool): Forwarded to the first inference step only; later steps always use False.
+
+    Returns:
+        str: The constructed pseudo-labeling command.
+    """
+    train_proc = TrainingCommandGenerator(**train_command_config)
+    infer_proc = InferenceCommandGenerator(**inference_command_config)
+
+    # Initial round: train on the configured data, then run inference with the requested `first_run` flag.
+    steps = [train_proc.process(), "sleep 10", infer_proc.process(first_run=first_run)]
+    for _ in range(num_ipl_epochs):
+        steps += ["sleep 10", train_proc.process(new_manifest_files, new_tarr_files)]
+        steps.append(infer_proc.process(first_run=False))
+
+    return " && ".join(steps)
+
+
+def main(config_path: str):
+    """
+    Main entry point for running IPL training.
+
+    Loads the cluster and script configs, checks that absolute paths are mounted, writes the
+    configs to the results directory and schedules `num_runs` chained train/inference tasks.
+
+    Args:
+        config_path (str): Path to the YAML configuration file
+
+    Raises:
+        KeyError: If the script config has no `model.ipl_training` section.
+        ValueError: If no experiment name can be resolved.
+    """
+    # Load the cluster config from YAML
+    cluster_cfg = OmegaConf.load(config_path)
+    # Process the required arguments from the cluster config
+    script_path = cluster_cfg.script
+    script_config_path = cluster_cfg.script_config
+    results_dir = cluster_cfg.results_dir
+    NEMO_ROOT = cluster_cfg.nemo_directory
+
+    script_config_path = Path(script_config_path).absolute()
+
+    # Gather all mounts from the cluster config
+    gather_mounts(cluster_cfg)
+
+    # Add the results directory to the cluster config as a mount path
+    nemo_run_utils.add_mount_path(results_dir, '/results', cluster_cfg)
+
+    # Create results and logdir
+    log_dir = cluster_cfg.get('log_dir', os.path.join(results_dir, 'logs'))
+    nemo_run_utils.create_remote_directory([results_dir, log_dir], cluster_cfg)
+
+    # Load the script config
+    script_config = OmegaConf.load(script_config_path)
+
+    import copy  # function-scope import: `copy` is not imported at the top of this module
+    # Keep a full copy of the IPL settings, then drop `inference_config` from the script config:
+    # it is a local file path and must not go through the mounted-path check below.
+    if "ipl_training" in script_config.model:
+        ipl_training = copy.deepcopy(script_config.model.ipl_training)
+        del script_config.model.ipl_training.inference_config
+    else:
+        raise KeyError("Parameters for `IPL` training are not provided.")
+
+    check_config_mount_paths(script_config, cluster_cfg)
+
+    inference_config = ipl_training.inference_config
+    inference_config_path = Path(inference_config).absolute()
+    inference_config = OmegaConf.load(inference_config_path)
+
+    # Resolve experiment name: prefer `exp_name` from the cluster config, otherwise fall back to
+    # `exp_manager.name` from the script config
+    exp_name = cluster_cfg.get('exp_name')
+    if exp_name is None:
+        if 'exp_manager' in script_config and 'name' in script_config['exp_manager']:
+            exp_name = script_config['exp_manager']['name']
+        else:
+            raise ValueError(
+                "Experiment name not provided in the cluster config (`exp_name`) or the script config (`exp_manager.name`)"
+            )
+
+    # Begin NeMo Run setup
+    with run.Experiment(exp_name) as exp:
+        # Create the config file name
+        timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+        config_name = f"{exp_name}_{timestamp}_config.yaml"
+
+        # Copy the merged config file to remote location's /results/configs directory
+        config_dir = os.path.join(results_dir, 'configs')
+        train_config_cluster = nemo_run_utils.create_remote_config(script_config, config_name, config_dir, cluster_cfg)
+
+        # Prepare arguments for the slurm job
+        job_name = f"{exp_name}_job"
+
+        # Get run parameters from the config
+        num_runs = cluster_cfg.num_runs  # Number of dependent jobs for this script
+        num_gpus = cluster_cfg.get('num_gpus', script_config['trainer']['devices'])
+        if isinstance(num_gpus, list):
+            num_gpus = len(num_gpus)
+        if num_gpus == -1:
+            num_gpus = 1 if cluster_cfg['executor'] == 'local' else 8
+            logging.warning(f"\n\nSetting num_gpus to {num_gpus} as it was set to -1\n\n")
+        num_nodes = cluster_cfg.get('num_nodes', script_config['trainer'].get('num_nodes', 1))
+
+        checkpoint_dir = os.path.join(
+            os.path.join(script_config.exp_manager.exp_dir, script_config.exp_manager.name), "checkpoints"
+        )
+        checkpoint_name = os.path.join(checkpoint_dir, script_config.exp_manager.name + ".nemo")
+        inference_config_paths, manifests, tarr_paths = nemo_run_utils.create_remote_inference_config(
+            cluster_cfg, config_dir, inference_config, checkpoint_name
+        )
+        check_config_mount_paths(inference_config, cluster_cfg)
+
+        train_command_generator_config = {
+            "nemo_directory": NEMO_ROOT,
+            "training_config_local": script_config,
+            "training_config_cluster": train_config_cluster,
+            "training_script_path": script_path,
+            "output_manifest_file": "./train_output_manifest_filepath.json",
+        }
+        inference_command_generator_config = {
+            "nemo_directory": NEMO_ROOT,
+            "inference_config_paths": inference_config_paths,
+            "manifests": manifests,
+            "p_cache": script_config.model.ipl_training.p_cache,
+            "num_gpus": num_nodes * num_gpus,
+            "is_tarred": getattr(script_config.model.train_ds, "is_tarred", False),
+            "output_manifest_file": "./inference_output_manifest_filepath.json",
+        }
+
+        cmd = get_pseudo_labeling_command(
+            train_command_generator_config,
+            inference_command_generator_config,
+            num_ipl_epochs=script_config.model.ipl_training.num_ipl_epochs,
+            new_manifest_files=manifests,
+            new_tarr_files=tarr_paths,
+            first_run=True,
+        )
+
+        # Cast the cluster config to a dictionary for compatibility with NeMo Run
+        cluster_cfg = OmegaConf.to_object(cluster_cfg)
+
+        task = None
+        for run_id in range(num_runs):
+            if run_id > 0:
+                # Later runs depend on the previous task and rebuild the command with first_run=False
+                cmd = get_pseudo_labeling_command(
+                    train_command_generator_config,
+                    inference_command_generator_config,
+                    num_ipl_epochs=script_config.model.ipl_training.num_ipl_epochs,
+                    new_manifest_files=manifests,
+                    new_tarr_files=tarr_paths,
+                    first_run=False,
+                )
+                task = [task]
+            # Add the task to the experiment
+            logging.info(f"Adding task {run_id + 1} of {num_runs} to the experiment")
+            task = nemo_run_utils.add_task(
+                exp,
+                cmd=cmd,
+                task_name=job_name,
+                cluster_config=cluster_cfg,
+                container=cluster_cfg['containers']['asr'],
+                num_tasks=cluster_cfg.get('num_tasks', cluster_cfg.get('num_tasks_per_node', 1)),
+                num_gpus=num_gpus,
+                num_nodes=num_nodes,
+                log_dir=nemo_run_utils.get_mounted_filepath(cluster_cfg, log_dir),
+                partition=cluster_cfg.get('partition', None),
+                task_dependencies=task,
+            )
+
+        # Run the experiment on the cluster with all the tasks
+        nemo_run_utils.run_exp(exp, cluster_cfg)
+
+
+if __name__ == '__main__':
+    # Command-line entry point: the only argument is the YAML file handed to main().
+    parser = argparse.ArgumentParser(description='Run IPL training with configuration')
+    parser.add_argument(
+        '--config', type=str, required=True, help='Path to the YAML configuration file'
+    )
+    args = parser.parse_args()
+    main(args.config)
diff --git a/sdp/utils/ipl_utils.py b/sdp/utils/ipl_utils.py
new file mode 100644
index 00000000..53b6b807
--- /dev/null
+++ b/sdp/utils/ipl_utils.py
@@ -0,0 +1,142 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+from typing import List, Optional, Tuple
+
+from omegaconf import DictConfig
+
+
+def separate_bucket_transcriptions(inference_config: "DictConfig") -> tuple:
+    """
+    Separates manifests and audio file paths from different buckets.
+
+    Args:
+        inference_config (DictConfig): The configuration object for inference; only its
+            `predict_ds` section (`is_tarred`, `manifest_filepath`, `tarred_audio_filepaths`) is read.
+
+    Returns:
+        tuple: A tuple containing:
+            - manifests (list): A list of manifest file paths.
+            - tarr_audio_files (list or None): A list of tarred audio file paths or None if
+              the dataset is not tarred.
+    """
+    predict_ds = inference_config.predict_ds
+    manifest_filepaths = predict_ds.manifest_filepath
+    if not getattr(predict_ds, "is_tarred", False):
+        # Non-tarred data: a single manifest is wrapped in a list, a list is returned as is.
+        if isinstance(manifest_filepaths, str):
+            return [manifest_filepaths], None
+        return manifest_filepaths, None
+
+    tarred_audio_filepaths = predict_ds.tarred_audio_filepaths
+    if isinstance(tarred_audio_filepaths, str) or len(tarred_audio_filepaths) <= 1:
+        return [manifest_filepaths], [tarred_audio_filepaths]
+
+    # One entry per bucket; an entry is either a plain path or a one-item list such as [[a], [b]].
+    def first(entry):
+        return entry if isinstance(entry, str) else entry[0]
+    pairs = list(zip(manifest_filepaths, tarred_audio_filepaths))
+    return [first(m) for m, _ in pairs], [first(t) for _, t in pairs]
+
+
+def get_transcribed_names(manifest_filepaths: List[str], prefix: str, is_tarred: bool=False) -> List[List[str]]:
+    """
+    Builds the output manifest paths for pseudo-labels, keeping each file in its original directory.
+    The use case is for non AIStore datasets
+
+    Args:
+        manifest_filepaths (list of str): A list of file paths to be modified.
+        prefix (str): Text placed in front of `_transcribed_` in every new file name.
+        is_tarred (bool): If True the original file name is kept after the prefix; otherwise
+            every file is named `<prefix>_transcribed_manifest.json`.
+
+    Returns:
+        list of list of str: A list where each element is a single-item list containing the updated file path.
+
+    Example:
+        >>> get_transcribed_names(["/path/to/manifest_1.json", "/other/manifest_2.json"], "prefix", is_tarred=True)
+        [['/path/to/prefix_transcribed_manifest_1.json'], ['/other/prefix_transcribed_manifest_2.json']]
+        >>> get_transcribed_names(["/path/to/manifest_1.json"], "prefix")
+        [['/path/to/prefix_transcribed_manifest.json']]
+    """
+
+    def rename(file_path: str) -> str:
+        # Only the file name changes; the directory part is reused as is.
+        directory, filename = os.path.split(file_path)
+        if is_tarred:
+            new_filename = f"{prefix}_transcribed_{filename}"
+        else:
+            new_filename = f"{prefix}_transcribed_manifest.json"
+        return os.path.join(directory, new_filename)
+
+    # Each new path is wrapped in its own single-item list.
+    return [
+        [rename(file_path)] for file_path in manifest_filepaths
+    ]
+
+
+def update_training_sets(
+    config: DictConfig,
+    updated_manifest_filepaths: List[str],
+    updated_tarred_audio_filepaths: Optional[List[str]] = None,
+    prefix: str = "",
+) -> Tuple[str, Optional[str]]:
+    """
+    Builds the training manifest/tar path lists that add the pseudo-labeled datasets to the paths
+    already present in `config.model.train_ds`; `config` itself is not modified.
+
+    Args:
+        config (DictConfig): Training config whose `model.train_ds` section is read.
+        updated_manifest_filepaths (List[str]): List of updated manifest file paths to be included.
+        updated_tarred_audio_filepaths (Optional[List[str]]): List of updated tarred audio filepaths to be included.
+        prefix (str): Prefix passed to `get_transcribed_names` when renaming the manifests.
+
+    Returns:
+        Tuple[str, Optional[str]]: Manifest paths and tarred audio paths (None when empty), both
+            rendered as strings formatted for Omegaconf.
+
+    Raises:
+        ValueError: If the training set is tarred but `updated_tarred_audio_filepaths` is None.
+    """
+    is_tarred = config.model.train_ds.get("is_tarred", False)
+    if is_tarred and updated_tarred_audio_filepaths is None:
+        raise ValueError("`updated_tarred_audio_filepaths` is required when `train_ds.is_tarred` is set.")
+    updated_manifest_filepaths = get_transcribed_names(updated_manifest_filepaths, prefix, is_tarred=is_tarred)
+    manifest_filepath = config.model.train_ds.manifest_filepath
+    updated_tarred_audio_filepaths = [[path] for path in updated_tarred_audio_filepaths or []]  # never mutate the caller's list
+    if is_tarred:  # tarred: manifests and tar paths stay nested, one list per bucket
+        tarred_audio_filepaths = config.model.train_ds.tarred_audio_filepaths
+        if isinstance(tarred_audio_filepaths, str):
+            updated_tarred_audio_filepaths.append([tarred_audio_filepaths])
+            updated_manifest_filepaths.append([manifest_filepath])
+        else:
+            updated_tarred_audio_filepaths += tarred_audio_filepaths
+            updated_manifest_filepaths += manifest_filepath
+    elif config.model.train_ds.get("use_lhotse", False):  # lhotse: keep the nested layout
+        if isinstance(manifest_filepath, str):
+            updated_manifest_filepaths.append([manifest_filepath])
+        else:
+            updated_manifest_filepaths += manifest_filepath
+    else:
+        updated_manifest_filepaths = [item for sublist in updated_manifest_filepaths for item in sublist]  # flat list
+        if isinstance(manifest_filepath, str):
+            updated_manifest_filepaths.append(manifest_filepath)
+        else:
+            updated_manifest_filepaths += manifest_filepath
+
+    return (
+        str(updated_manifest_filepaths).replace(", ", ","),
+        str(updated_tarred_audio_filepaths).replace(", ", ",") if updated_tarred_audio_filepaths else None,
+    )
\ No newline at end of file
diff --git a/sdp/utils/nemo_run_utils.py b/sdp/utils/nemo_run_utils.py
new file mode 100644
index 00000000..f7252e04
--- /dev/null
+++ b/sdp/utils/nemo_run_utils.py
@@ -0,0 +1,406 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+from functools import lru_cache
+from nemo_run.core.tunnel import LocalTunnel, SSHTunnel
+from omegaconf import DictConfig, OmegaConf
+from sdp.utils.skills_utils import (
+    get_mounts_from_config,
+    check_if_mounted,
+    add_task,
+    run_exp,
+)
+import logging
+import copy
+from sdp.processors.nemo import ipl_utils
+@lru_cache(maxsize=2)  # reuse the tunnel when called again with identical keyword arguments
+def get_tunnel(**ssh_tunnel):  # keyword arguments are forwarded unchanged to SSHTunnel
+    return SSHTunnel(**ssh_tunnel)
+
+
+
+def add_mount_path(mount_source: str, mount_dest: str, cluster_config):
+    """
+    Add a mount path to the cluster config.
+
+    The `source:destination` entry is appended to `cluster_config['mounts']` unless an identical
+    pair is already listed.
+
+    Args:
+        mount_source: The source filepath on the local/remote machine.
+        mount_dest: The destination filepath on the remote/local machine. Must be an absolute path.
+        cluster_config: The cluster config dictionary.
+
+    Raises:
+        ValueError: If `cluster_config` is None or has no `mounts` key.
+    """
+
+    # Check if the cluster config is provided
+    if cluster_config is None:
+        raise ValueError("Cluster config is not provided.")
+
+    # Don't add a new mount path if the mounts key is not present in the cluster config
+    if 'mounts' not in cluster_config:
+        raise ValueError("No mounts found in cluster config, can only add to existing mount list.")
+
+    # Existing mounts as returned by get_mounts_from_config (presumably with environment variables resolved)
+    for mount_path in get_mounts_from_config(cluster_config):
+        # Only the first two fields matter, so entries with extra fields (e.g. `src:dst:ro`) still parse
+        source, destination = mount_path.split(':')[:2]
+        # Check if the mount path already exists in the cluster config
+        if source == mount_source and destination == mount_dest:
+            return
+
+    # Add the mount path to the cluster config if it does not already exist
+    cluster_config['mounts'].append(f"{mount_source}:{mount_dest}")
+    logging.info(f"Added mount path: `{mount_source}:{mount_dest}`")
+
+
+def create_remote_directory(directory: str | list, cluster_config: dict):
+    """
+    Run `mkdir -p` for one or more directories through the tunnel that matches the executor.
+
+    **Note**: The ssh tunnel config must be provided in the cluster config for remote directory creation.
+
+    Args:
+        directory: The directory path to be created on the remote cluster. Can be a single directory path or a list
+            of directory paths.
+        cluster_config: The cluster config dictionary. `executor` must be `local` or `slurm`; for `slurm`
+            the `ssh_tunnel` sub-config is required and gets `job_dir` filled in when it is missing.
+
+    Returns:
+        None
+
+    Raises:
+        ValueError: If the cluster config is None, the executor is unsupported, or `ssh_tunnel` is missing.
+    """
+
+    if cluster_config is None:
+        raise ValueError("Cluster config is not provided.")
+
+    # Normalise a single path to a one-element list
+    directories = [directory] if isinstance(directory, str) else directory
+
+    # Pick the tunnel and the wording of the log message for the configured executor
+    executor = cluster_config.get('executor')
+    if executor == 'local':
+        tunnel = LocalTunnel(job_dir=directories[0])  # temp job dir, unused
+        location = "in local filesystem"
+    elif executor == 'slurm':
+        # Check if the ssh tunnel config is provided in the cluster config
+        ssh_tunnel_config = cluster_config.get('ssh_tunnel', None)
+        if ssh_tunnel_config is None:
+            raise ValueError("`ssh_tunnel` sub-config is not provided in cluster_config.")
+
+        # Use the first directory as job_dir unless the config already names one
+        if 'job_dir' not in ssh_tunnel_config:
+            ssh_tunnel_config['job_dir'] = directories[0]
+
+        # Create (or fetch the cached) tunnel to the cluster
+        tunnel = get_tunnel(**cluster_config['ssh_tunnel'])
+        location = "on remote cluster"
+    else:
+        raise ValueError(f"Unsupported executor: {executor}")
+
+    # The tunnel is deliberately not cleaned up afterwards so that it can be reused
+    # (`tunnel.cleanup()` is intentionally not called).
+    for dir_path in directories:
+        tunnel.run(f'mkdir -p {dir_path}', hide=False, warn=True)
+        logging.info(f"Created directory: {dir_path} {location}.")
+
+
+def create_remote_config(config: dict | DictConfig, config_name: str, config_directory: str, cluster_config: dict):
+    """
+    Utility to write a remote config file on the cluster using the cluster config.
+
+    Args:
+        config: The config dictionary to be written to the file. Can be OmegaConf DictConfig or a dictionary.
+        config_name: The name of the config file to be created.
+        config_directory: The directory path where the config file will be created on the remote machine.
+            Can be a single directory path or a list of directory paths to copy the config file to.
+        cluster_config: The cluster config dictionary.
+
+    Returns:
+        str: Path of the config file written last (the one in the final directory of `config_directory`).
+
+    Raises:
+        ValueError: If the cluster config is None, the executor is unsupported, or `ssh_tunnel` is missing.
+    """
+    import shlex  # function-scope import; used to quote the YAML text for the shell
+
+    if cluster_config is None:
+        raise ValueError("Cluster config is not provided.")
+
+    # Check if the config_name is a string and ends with .yaml
+    if not config_name.endswith('.yaml'):
+        config_name = f"{config_name}.yaml"
+
+    # Check if the config_directory is a string or a list
+    if isinstance(config_directory, str):
+        config_directory = [config_directory]
+
+    # Cast a normal dict to OmegaConf DictConfig
+    if isinstance(config, dict):
+        config = OmegaConf.create(config)
+
+    # Pick the tunnel and the wording of the log message for the configured executor
+    executor = cluster_config.get('executor')
+    if executor == 'local':
+        tunnel = LocalTunnel(job_dir=config_directory[0])
+        location = "in local filesystem"
+    elif executor == 'slurm':
+        # Check if the ssh tunnel config is provided in the cluster config
+        ssh_tunnel_config = cluster_config.get('ssh_tunnel', None)
+        if ssh_tunnel_config is None:
+            raise ValueError("`ssh_tunnel` sub-config is not provided in cluster_config.")
+
+        # Check for pre-existing job_dir in the ssh_tunnel_config
+        if 'job_dir' not in ssh_tunnel_config:
+            ssh_tunnel_config['job_dir'] = config_directory[0]
+
+        tunnel = get_tunnel(**cluster_config['ssh_tunnel'])
+        location = "on remote cluster"
+    else:
+        raise ValueError(f"Unsupported executor: {executor}")
+
+    # The YAML text may contain single quotes, which would end a hand-written '...' shell string
+    # early and corrupt the written file, so the quoting is delegated to shlex.
+    quoted_yaml = shlex.quote(OmegaConf.to_yaml(config))
+
+    # Create the config file in every requested directory.
+    # The tunnel is not cleaned up afterwards so that it can be reused.
+    for dir_path in config_directory:
+        config_filepath = os.path.join(dir_path, config_name)
+        tunnel.run(f'mkdir -p {dir_path}', hide=False, warn=True)
+        tunnel.run(f"touch {config_filepath}", hide=False, warn=True)
+        tunnel.run(f"echo {quoted_yaml} > {config_filepath}", hide=False, warn=True)
+        logging.info(f"Created config file: {config_filepath} {location}.")
+
+    return config_filepath
+
+def create_remote_inference_config(cluster_config, config_directory: str | list, inference_config, checkpoint_path):
+    """
+    Create one inference configuration file per data bucket, on the local filesystem or on the remote cluster.
+
+    Args:
+        cluster_config (dict): The cluster configuration dictionary containing details about the cluster setup,
+            including the executor type (`local` or `slurm`) and optional SSH tunnel configurations.
+        config_directory (str or list of str): The directory path(s) where the inference configuration file(s)
+            will be created. If a single path is provided, it will be converted into a list.
+        inference_config: The base inference configuration object, which will be modified for each bucket.
+            Should be compatible with OmegaConf.
+        checkpoint_path (str): The path to the model checkpoint, stored under the `model` key of each config.
+
+    Returns:
+        tuple: `(new_config_paths, manifests, tarr_audio_files)`: the paths of the written configuration files,
+            the manifest path of each bucket and the tarred audio path of each bucket (or None if not applicable).
+
+    Raises:
+        ValueError: If the executor is neither `local` nor `slurm`, or `ssh_tunnel` is missing for `slurm`.
+    """
+    import shlex  # the YAML dump may contain single quotes, so it is shell-quoted instead of wrapped in '...'
+
+    if isinstance(config_directory, str):
+        config_directory = [config_directory]
+
+    # separating each bucket for creating different inference config
+    manifests, tarr_audio_files = ipl_utils.separate_multiple_transcriptions(inference_config)
+
+    new_config_paths = []
+    for i in range(len(manifests)):
+        output_dir = os.path.dirname(manifests[i])
+        modified_cfg = copy.deepcopy(inference_config)
+        # Updating inference config for exact bucket
+        OmegaConf.update(modified_cfg, "output_path", output_dir)
+        OmegaConf.update(modified_cfg, "predict_ds.manifest_filepath", manifests[i])
+        if tarr_audio_files:
+            OmegaConf.update(modified_cfg, "predict_ds.tarred_audio_filepaths", tarr_audio_files[i])
+        OmegaConf.update(modified_cfg, "model", checkpoint_path)
+        quoted_yaml = shlex.quote(OmegaConf.to_yaml(modified_cfg))
+
+        if cluster_config.get('executor') == 'local':
+            for dir_path in config_directory:
+                inference_config_filepath = os.path.join(dir_path, f"modified_config_{i}.yaml")
+                new_config_paths.append(os.path.abspath(inference_config_filepath))
+                tunnel = LocalTunnel(job_dir=config_directory[0])
+                tunnel.run(f"touch {inference_config_filepath}", hide=False, warn=True)
+                tunnel.run(f"echo {quoted_yaml} > {inference_config_filepath}", hide=False, warn=True)
+                logging.info(f"Created config file: {dir_path} in local filesystem.")
+        elif cluster_config.get('executor') == 'slurm':
+            ssh_tunnel_config = cluster_config.get('ssh_tunnel', None)
+            if ssh_tunnel_config is None:
+                raise ValueError("`ssh_tunnel` sub-config is not provided in cluster_config.")
+            if 'job_dir' not in ssh_tunnel_config:
+                ssh_tunnel_config['job_dir'] = config_directory[0]
+            tunnel = get_tunnel(**cluster_config['ssh_tunnel'])
+            for dir_path in config_directory:
+                inference_config_filepath = os.path.join(dir_path, f"modified_config_{i}.yaml")
+                new_config_paths.append(inference_config_filepath)
+                tunnel.run(f"touch {inference_config_filepath}", hide=False, warn=True)
+                tunnel.run(f"echo {quoted_yaml} > {inference_config_filepath}", hide=False, warn=True)
+        else:
+            raise ValueError(f"Unsupported executor: {cluster_config.get('executor')}")
+
+    return new_config_paths, manifests, tarr_audio_files
+
+
+def check_remote_mount_directories(directories: str | list, cluster_config: dict, exit_on_failure: bool = True):
+    """
+    Check if files and directories at the source location exist for later mounting on the cluster.
+
+    Args:
+        directories: The path(s) to be checked on the local/remote machine. Can be a single
+            path or a list. Each path can be either a file or a directory (checked with `test -e`).
+        cluster_config: The cluster config dictionary; its `executor` must be `local` or `slurm`.
+        exit_on_failure: If True, will raise FileNotFoundError if any path does not exist at the source location.
+    """
+    # NOTE: with exit_on_failure=False missing paths are collected but not reported; nothing is returned.
+    # Check if the cluster config is provided
+    if cluster_config is None:
+        raise ValueError("Cluster config is not provided.")
+
+    # Check if the directories is a string or a list
+    if isinstance(directories, str):
+        directories = [directories]
+
+    # Check if the executor is local
+    if cluster_config.get('executor') == 'local':
+        tunnel = LocalTunnel(job_dir=None)
+
+        # Check if the directories exist at the source location for mounting
+        missing_source_locations = []
+        for directory in directories:
+            result = tunnel.run(f'test -e {directory} && echo "Directory Exists"', hide=True, warn=True)
+
+            if "Directory Exists" not in result.stdout:
+                missing_source_locations.append(directory)
+
+        # Don't clean up, cache the tunnel
+        # tunnel.cleanup()
+
+        # Raise an exception if the directories do not exist at the source location
+        if len(missing_source_locations) > 0 and exit_on_failure:
+            missing_source_locations = [
+                f"{loc} DOES NOT exist at source destination" for loc in missing_source_locations
+            ]
+            missing_source_locations = "\n".join(missing_source_locations)
+            raise FileNotFoundError(
+                f"Some files or directories do not exist at the source location for mounting !!\n\n"
+                f"{missing_source_locations}"
+            )
+
+    # Check if the executor is slurm (same existence check as above, run through the SSH tunnel)
+    elif cluster_config.get('executor') == 'slurm':
+        # Check if the ssh tunnel config is provided in the cluster config
+        ssh_tunnel_config = cluster_config.get('ssh_tunnel', None)
+        if ssh_tunnel_config is None:
+            raise ValueError("`ssh_tunnel` sub-config is not provided in cluster_config.")
+
+        # Check for pre-existing job_dir in the ssh_tunnel_config; the default is written back into the config
+        if 'job_dir' not in ssh_tunnel_config:
+            ssh_tunnel_config['job_dir'] = os.getcwd()
+
+        tunnel = get_tunnel(**cluster_config['ssh_tunnel'])
+        missing_source_locations = []
+
+        # Check if the directories exist at the source location for mounting
+        for directory in directories:
+            result = tunnel.run(f'test -e {directory} && echo "Directory Exists"', hide=True, warn=True)
+
+            if "Directory Exists" not in result.stdout:
+                missing_source_locations.append(directory)
+
+        # Don't clean up, cache the tunnel
+        # tunnel.cleanup()
+
+        # Raise an exception if the directories do not exist at the source location
+        if len(missing_source_locations) > 0 and exit_on_failure:
+            missing_source_locations = [
+                f"{loc} DOES NOT exist at source destination" for loc in missing_source_locations
+            ]
+            missing_source_locations = "\n".join(missing_source_locations)
+            raise FileNotFoundError(
+                f"Some files or directories do not exist at the source location for mounting !!\n\n"
+                f"{missing_source_locations}"
+            )
+
+    else:
+        raise ValueError(f"Unsupported executor: {cluster_config.get('executor')}")
+
+
+def get_unmounted_filepath(cluster_config: dict, filepath: str):
+    """
+    Map a mounted (destination-side) path back to the source path it is mounted from.
+    The first entry of `cluster_config['mounts']` whose destination is a prefix of `filepath` is used.
+
+    Args:
+        cluster_config: The cluster config dictionary; `mounts` holds `source:destination[:options]` strings.
+        filepath: The filepath to be unmounted using the cluster config.
+
+    Returns:
+        str: unmounted filepath
+
+    Raises:
+        ValueError: If no mount destination is a prefix of `filepath`.
+    """
+    # Find the first mount whose destination is a prefix of the filepath
+    matched = None
+    for mount in cluster_config['mounts']:
+        # only source and destination matter here; tolerate trailing mount options such as `:ro`
+        mount_source, mount_dest = mount.split(':')[:2]
+        if filepath.startswith(mount_dest):
+            matched = (mount_source, mount_dest)
+            break
+    if matched is None:
+        raise ValueError(
+            f"Could not find a mount path for the file path `{filepath}`. Below paths are mounted: \n"
+            f"{cluster_config['mounts']}"
+        )
+    # replace the mount destination inside the filepath with the mount source
+    mount_source, mount_dest = matched
+    return mount_source + filepath[len(mount_dest) :]
+
+
+def get_mounted_filepath(cluster_config: dict, filepath: str):
+    """
+    Map a source-side path to the path it is visible at once mounted.
+    The first entry of `cluster_config['mounts']` whose source is a prefix of `filepath` is used.
+
+    Args:
+        cluster_config: The cluster config dictionary; `mounts` holds `source:destination[:options]` strings.
+        filepath: The filepath to be mounted using the cluster config.
+
+    Returns:
+        str: mounted filepath
+
+    Raises:
+        ValueError: If no mount source is a prefix of `filepath`.
+    """
+    # Find the first mount whose source is a prefix of the filepath
+    matched = None
+    for mount in cluster_config['mounts']:
+        # only source and destination matter here; tolerate trailing mount options such as `:ro`
+        mount_source, mount_dest = mount.split(':')[:2]
+        if filepath.startswith(mount_source):
+            matched = (mount_source, mount_dest)
+            break
+    if matched is None:
+        raise ValueError(
+            f"Could not find a mount path for the file path `{filepath}`. Below paths are mounted: \n"
+            f"{cluster_config['mounts']}"
+        )
+    # replace the mount source inside the filepath with the mount destination
+    mount_source, mount_dest = matched
+    return mount_dest + filepath[len(mount_source) :]
\ No newline at end of file
diff --git a/sdp/utils/skills_utils.py b/sdp/utils/skills_utils.py
new file mode 100644
index 00000000..b8a1d707
--- /dev/null
+++ b/sdp/utils/skills_utils.py
@@ -0,0 +1,1226 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# 
+# This file is maintained in sync with `nemo_skills/pipeline/utils.py`
+# and is intended to be copied as-is to ensure consistency across projects.
+
+import logging
+import os
+import shlex
+import subprocess
+import sys
+import tarfile
+from contextlib import contextmanager
+from dataclasses import dataclass
+from datetime import datetime
+from functools import lru_cache
+from pathlib import Path
+from typing import Optional
+
+import nemo_run as run
+import yaml
+from huggingface_hub import get_token
+from invoke import StreamWatcher
+from nemo_run.config import set_nemorun_home
+from nemo_run.core.execution.docker import DockerExecutor
+from nemo_run.core.execution.slurm import SlurmJobDetails, get_packaging_job_key
+from nemo_run.core.tunnel import SSHTunnel
+from omegaconf import DictConfig
+from torchx.specs.api import AppState
+
+LOG = logging.getLogger(__file__)  # NOTE(review): the logger is named after the file path, not `__name__`
+
+
+# TODO: this file is way too big - we need to split it into pieces
+
+# keeping a global variable for the first submitted experiment (per cluster) and reusing it by default.
+# we are using the ssh tunnel as a proxy for cluster identity, since even if other parameters are different
+# we can still reuse code as long as the ssh tunnel matches
+REUSE_CODE_EXP = {}
+
+
+@dataclass
+class RepoMetadata:
+    """Name and filesystem location of a repo that is used in the experiment."""
+
+    name: str
+    path: Path
+
+    def __post_init__(self):
+        # accept plain strings for convenience, but always store a Path
+        self.path = Path(self.path) if isinstance(self.path, str) else self.path
+        # fail at construction time when the location is missing on disk
+        if not self.path.exists():
+            raise ValueError(f"Repository path `{self.path}` does not exist.")
+
+
+# Registry of external repos that should be packaged with the code in the experiment
+EXTERNAL_REPOS = {
+    'nemo_skills': RepoMetadata(
+        name='nemo_skills', path=Path(__file__).absolute().parents[1]
+    ),  # parent of this file's directory; NOTE(review): confirm this is the intended repo root in this project
+}
+
+
+
+def register_external_repo(metadata: RepoMetadata):
+    """Add an external repo to the registry of repos packaged with the experiment code.
+
+    Args:
+        metadata (RepoMetadata): Metadata for the external repo; a name may only be registered once.
+    """
+    repo_name = metadata.name
+    if repo_name in EXTERNAL_REPOS:
+        raise ValueError(f"External repo {metadata.name} is already registered.")
+    EXTERNAL_REPOS[repo_name] = metadata
+
+
+def get_registered_external_repo(name: str) -> Optional[RepoMetadata]:
+    """Look up the metadata of a registered external repo.
+
+    Args:
+        name (str): Name of the external repo.
+
+    Returns:
+        The `RepoMetadata` registered under `name`, or None if no repo
+        has been registered under that name.
+    """
+    # a dictionary lookup with a None fallback covers both the hit and the miss
+    repo_metadata = EXTERNAL_REPOS.get(name)
+    return repo_metadata
+
+
+def check_if_mounted(cluster_config, path_to_check):
+    """Raise ValueError unless path_to_check starts with the destination of one of the configured mounts."""
+    for mount in get_mounts_from_config(cluster_config) + ['/nemo_run/code:/nemo_run/code']:  # code dir always counts
+        if path_to_check.startswith(mount.split(":")[1]):  # plain string-prefix test against the destination
+            return
+    raise ValueError(f"The path '{path_to_check}' is not mounted. Check cluster config.")
+
+
+def get_unmounted_path(cluster_config, path):
+    """Return the source-side path for a mounted path (None stays None); raise ValueError if nothing matches."""
+    if path is None:
+        return None
+    for mount in get_mounts_from_config(cluster_config):
+        if path.startswith(mount.split(":")[1]):  # first mount whose destination is a string prefix wins
+            return mount.split(":")[0] + path[len(mount.split(":")[1]) :]  # swap destination prefix for source
+    raise ValueError(f"The path '{path}' is not mounted. Check cluster config.")
+
+
+# caching the status assuming it doesn't change while experiment is being scheduled
+# otherwise this results in too many ssh calls
+@lru_cache
+def get_exp_handles(expname: str, ignore_finished=True, ignore_exp_not_exists=True) -> list[str]:
+    """Will return the handles of the tasks in the experiment.
+
+    `expname` is tried as an experiment title first and then as an experiment id; a `run.Experiment`
+    object is used directly. Results are memoized by `lru_cache` on the call arguments.
+    If ignore_finished=True, only handles of tasks in RUNNING, PENDING, SUBMITTED or UNKNOWN state
+    are returned. Useful for filtering handles to set dependencies on.
+    If ignore_exp_not_exists=True, a missing experiment logs a warning and returns [] instead of raising ValueError.
+    TODO: it's still possible that job submission fails if the tasks exist when this function
+          is called, but finish before nemo-run submits a new job (which might take minutes)
+    """
+
+    def _get_handles(exp):
+        handles = []
+        for job in exp.jobs:
+            if not ignore_finished or (
+                job.status(exp._runner) in [AppState.RUNNING, AppState.PENDING, AppState.SUBMITTED, AppState.UNKNOWN]
+            ):
+                handles.append(job.handle)
+                continue  # no-op: this is already the last statement of the loop body
+        return handles
+
+    # if we are given an experiment object, we can directly get the handles
+    if isinstance(expname, run.Experiment):
+        return _get_handles(expname)
+
+    try:
+        with run.Experiment.from_title(expname) as exp:
+            return _get_handles(exp)
+    except FileNotFoundError:  # no experiment with that title: retry treating expname as an id
+        try:
+            with run.Experiment.from_id(expname) as exp:
+                return _get_handles(exp)
+        except AssertionError:  # the id lookup failed as well
+            if ignore_exp_not_exists:
+                LOG.warning("Experiment %s not found!", expname)
+                return []
+            raise ValueError(f"Experiment {expname} not found!")
+
+
+def get_timeout(cluster_config, partition):
+    """Return the timeout as days:hours:minutes:seconds, 15 minutes below the partition's HH:MM:SS limit."""
+    if 'timeouts' not in cluster_config:
+        return "10000:00:00:00"
+    timeout = cluster_config["timeouts"][partition or cluster_config["partition"]]
+    # subtracting 15 minutes to account for the time it takes to save the model
+    time_diff = datetime.strptime(timeout, "%H:%M:%S") - datetime.strptime("00:15:00", "%H:%M:%S")
+    total_seconds = int(time_diff.total_seconds())
+    if total_seconds < 0:
+        # timedelta.seconds wraps around for negative differences and would report almost 24 hours
+        raise ValueError(f"Timeout {timeout} is shorter than the 15 minutes reserved for saving the model.")
+    # the format expected by nemo is days:hours:minutes:seconds
+    return f'00:{total_seconds // 3600:02d}:{(total_seconds % 3600) // 60:02d}:{total_seconds % 60:02d}'
+
+
+def get_free_port(exclude: list[int] | None = None, strategy: int | str = 5000) -> int:
+    """Return a port outside `exclude`; whether it is actually unbound on the host is not checked.
+    An int strategy scans upwards from that number, "random" draws from 1024-65535.
+    """
+    blocked = exclude or []
+    if isinstance(strategy, int):
+        candidate = strategy
+        while candidate in blocked:
+            candidate += 1
+        return candidate
+    if strategy != "random":
+        raise ValueError(f"Strategy {strategy} not supported.")
+    import random
+    candidate = random.randint(1024, 65535)
+    while candidate in blocked:
+        candidate = random.randint(1024, 65535)
+    return candidate
+
+
+def get_generation_command(server_address, generation_commands):
+    """Build the shell command that waits for the server to respond and then runs `generation_commands`."""
+    steps = [
+        "export PYTHONPATH=$PYTHONPATH:/nemo_run/code",
+        "cd /nemo_run/code",
+        # might be required if we are not hosting server ourselves
+        # this will try to handshake in a loop and unblock when the server responds
+        f"echo 'Waiting for the server to start at {server_address}'",
+        f"while [ $(curl -X PUT {server_address} >/dev/null 2>&1; echo $?) -ne 0 ]; do sleep 3; done",
+        f"{generation_commands}",  # will run in a single task always (no need to check mpi env vars)
+    ]
+    return " && ".join(steps)
+
+
+def get_reward_server_command(
+    server_type: str,
+    num_gpus: int,
+    num_nodes: int,
+    model_path: str,
+    cluster_config: dict,
+    server_port: int,
+    server_args: str = "",
+):
+    """Build the shell command for a reward-model server (`nemo` or `vllm`); returns (server_cmd, num_tasks)."""
+    num_tasks = num_gpus
+    # check if the model path is mounted if not vllm;
+    # vllm can also pass model name as "model_path" so we need special processing
+    if server_type != "vllm":
+        check_if_mounted(cluster_config, model_path)
+
+    # the model path will be mounted, so generally it will start with /
+    elif server_type == "vllm" and model_path.startswith("/"):
+        check_if_mounted(cluster_config, model_path)
+
+    if server_type == 'nemo':
+        nemo_aligner_reward_model_port = get_free_port(strategy="random", exclude=[server_port])
+        server_start_cmd = (
+            # Note: The order of the two commands is important as the reward model server
+            # needs to be the first command so it can get the HF_TOKEN from the environment
+            f"python -m nemo_skills.inference.server.serve_nemo_aligner_reward_model "
+            f"    ++rm_model_file={model_path} "
+            f"    trainer.devices={num_gpus} "
+            f"    trainer.num_nodes={num_nodes} "
+            f"    +model.tensor_model_parallel_size={num_gpus} "
+            f"    +model.pipeline_model_parallel_size={num_nodes} "
+            # This port could be configurable, but is hard coded to reduce
+            # the divergence of the server command parameters from pipeline/generate.py
+            f"    inference.port={nemo_aligner_reward_model_port} "
+            f"    {server_args} & "
+            f"python -m nemo_skills.inference.server.serve_nemo_reward_model "
+            # These ports could be configurable, but are hard coded to reduce
+            # the divergence of the server command parameters from pipeline/generate.py
+            f"    inference_port={server_port}  "
+            f"    triton_server_address=localhost:{nemo_aligner_reward_model_port} "
+        )
+
+        # somehow on slurm nemo needs multiple tasks, but locally only 1
+        if cluster_config["executor"] == "local":
+            num_tasks = 1
+
+    elif server_type == "vllm":
+        if num_nodes > 1:
+            raise ValueError("VLLM server does not support multi-node execution")
+
+        server_start_cmd = (
+            f"python3 -m nemo_skills.inference.server.serve_vllm "
+            f"    --model {model_path} "
+            f"    --num_gpus {num_gpus} "
+            f"    --port {server_port} "
+            f"    {server_args} "
+        )
+        num_tasks = 1
+    else:
+        raise ValueError(f"Server type '{server_type}' not supported for reward model.")
+
+    # common prefix: print GPU status, enter the code directory and expose it on PYTHONPATH
+    server_cmd = (
+        f"nvidia-smi && "
+        f"cd /nemo_run/code && "
+        f"export PYTHONPATH=$PYTHONPATH:/nemo_run/code && "
+        f"{server_start_cmd} "
+    )
+    return server_cmd, num_tasks
+
+
+def get_ray_server_cmd(start_cmd):
+    """Wrap start_cmd in a shell snippet: SLURM_PROCID 0 starts the Ray head and runs it, others join as workers."""
+    ports = (
+        "--node-manager-port=12345 "
+        "--object-manager-port=12346 "
+        "--dashboard-port=8265 "
+        "--dashboard-agent-grpc-port=12347 "
+        "--runtime-env-agent-port=12349 "
+        "--metrics-export-port=12350 "
+        "--min-worker-port=14349 "
+        "--max-worker-port=18349 "
+    )
+    ray_start_cmd = (
+        "if [ \"${SLURM_PROCID:-0}\" = 0 ]; then "
+        "    echo 'Starting head node' && "
+        "    export RAY_raylet_start_wait_time_s=120 && "
+        "    ray start "
+        "        --head "
+        "        --port=6379 "
+        f"       {ports} && "
+        f"   {start_cmd} ;"
+        "else "
+        "    echo 'Starting worker node' && "
+        "    export RAY_raylet_start_wait_time_s=120 && "
+        "    echo \"Connecting to head node at $SLURM_MASTER_NODE\" && "
+        "    ray start "
+        "        --block "
+        "        --address=$SLURM_MASTER_NODE:6379 "
+        f"       {ports} ;"
+        "fi"
+    )
+    return ray_start_cmd
+
+
+def get_server_command(
+    server_type: str,
+    num_gpus: int,
+    num_nodes: int,
+    model_path: str,
+    cluster_config: dict,
+    server_port: int,
+    server_args: str = "",
+):
+    """Build the shell command that starts a model server for `server_type`; returns (server_cmd, num_tasks)."""
+    num_tasks = num_gpus
+    # check if the model path is mounted if not vllm;
+    # vllm can also pass model name as "model_path" so we need special processing
+    if server_type != "vllm":
+        check_if_mounted(cluster_config, model_path)
+
+    # the model path will be mounted, so generally it will start with /
+    elif server_type == "vllm" and model_path.startswith("/"):
+        check_if_mounted(cluster_config, model_path)
+
+    if server_type == 'nemo':
+        server_start_cmd = (
+            f"python -m nemo_skills.inference.server.serve_nemo "
+            f"    gpt_model_file={model_path} "
+            f"    trainer.devices={num_gpus} "
+            f"    trainer.num_nodes={num_nodes} "
+            f"    tensor_model_parallel_size={num_gpus} "
+            f"    pipeline_model_parallel_size={num_nodes} "
+            f"    ++port={server_port} "
+            f"    {server_args} "
+        )
+
+        # somehow on slurm nemo needs multiple tasks, but locally only 1
+        if cluster_config["executor"] == "local":
+            num_tasks = 1
+    elif server_type == 'vllm':
+        start_vllm_cmd = (
+            f"python3 -m nemo_skills.inference.server.serve_vllm "
+            f"    --model {model_path} "
+            f"    --num_gpus {num_gpus} "
+            f"    --port {server_port} "
+            f"    {server_args} "
+        )
+        server_start_cmd = get_ray_server_cmd(start_vllm_cmd)
+        num_tasks = 1
+    elif server_type == 'sglang':
+        if num_nodes > 1:
+            multinode_args = f" --dist_init_addr $SLURM_MASTER_NODE --node_rank $SLURM_PROCID "
+        else:
+            multinode_args = ""
+        server_start_cmd = (
+            f"python3 -m nemo_skills.inference.server.serve_sglang "
+            f"    --model {model_path} "
+            f"    --num_gpus {num_gpus} "
+            f"    --num_nodes {num_nodes} "
+            f"    --port {server_port} "
+            f"    {multinode_args} "
+            f"    {server_args} "
+        )
+        num_tasks = 1
+    else:
+        # any other server_type falls through to the serve_trt server (no explicit type check here)
+        # need this flag for stable Nemotron-4-340B deployment
+        server_start_cmd = (
+            f"FORCE_NCCL_ALL_REDUCE_STRATEGY=1 python -m nemo_skills.inference.server.serve_trt "
+            f"    --model_path {model_path} "
+            f"    --port {server_port} "
+            f"    {server_args} "
+        )
+        num_tasks = num_gpus
+    # common prefix: print GPU status, enter the code directory and expose it on PYTHONPATH
+    server_cmd = (
+        f"nvidia-smi && "
+        f"cd /nemo_run/code && "
+        f"export PYTHONPATH=$PYTHONPATH:/nemo_run/code && "
+        f"{server_start_cmd} "
+    )
+    return server_cmd, num_tasks
+
+
+def get_sandox_command():  # NOTE(review): "sandox" looks like a typo of "sandbox"; renaming changes the public name
+    return "/entrypoint.sh && /start.sh"  # runs /entrypoint.sh and, only if it succeeds, /start.sh
+
+
+@dataclass(kw_only=True)
+class CustomJobDetails(SlurmJobDetails):
+    # we have 1 srun per sub-task (e.g. server/sandbox/main), but only a single sbatch
+    srun_prefix: str = "main"  # prefix of the srun log file name
+    sbatch_prefix: str = ""  # prefix of the sbatch log file name
+
+    @property
+    def stdout(self) -> Path:
+        return Path(self.folder) / f"{self.sbatch_prefix}%j_sbatch.log"  # same file as `stderr`
+
+    @property
+    def srun_stdout(self) -> Path:
+        return Path(self.folder) / f"{self.srun_prefix}%j_srun.log"  # same file as `srun_stderr`
+
+    @property
+    def stderr(self) -> Path:
+        return Path(self.folder) / f"{self.sbatch_prefix}%j_sbatch.log"  # same file as `stdout`
+
+    @property
+    def srun_stderr(self) -> Path:
+        return Path(self.folder) / f"{self.srun_prefix}%j_srun.log"  # same file as `srun_stdout`
+
+    @property
+    def ls_term(self) -> str:
+        """This term will be used to fetch the logs.
+
+        The command used to list the files is ls -1 {ls_term} 2> /dev/null
+        """
+        assert self.folder
+        return os.path.join(self.folder, "*srun.log")  # glob that matches the srun logs only
+
+
+def read_config(config_file):
+    """Parse the YAML file at `config_file` and return its content."""
+    # text mode with explicit UTF-8 decoding, parsed with the safe YAML loader
+    with open(config_file, mode="rt", encoding="utf-8") as yaml_stream:
+        return yaml.safe_load(yaml_stream)
+
+
+def get_cluster_config(cluster=None, config_dir=None):
+    """Trying to find an appropriate cluster config.
+
+    When `cluster` is a name, `<cluster>.yaml` is looked up in the following order:
+    1. config_dir parameter
+    2. NEMO_SKILLS_CONFIG_DIR environment variable
+    3. Current folder / cluster_configs
+    4. This file folder / ../../cluster_configs
+    If (1) or (2) is set, the file is read from there directly, without falling back to (3)/(4).
+    If NEMO_SKILLS_CONFIG is provided and cluster is None,
+    it will be used as a full path to the config file
+    and NEMO_SKILLS_CONFIG_DIR will be ignored.
+
+    If cluster is a python object (dict-like), then we simply
+    return the cluster config, under the assumption that the
+    config is prepared by the user. ValueError is raised when no config name/path is given or it is not found.
+    """
+    # if cluster is provided, we try to find it in one of the folders
+    if cluster is not None:
+        # check if cluster is a python object instead of a str path, pass through
+        if isinstance(cluster, (dict, DictConfig)):
+            return cluster
+
+        # either using the provided config_dir or getting from env var
+        config_dir = config_dir or os.environ.get("NEMO_SKILLS_CONFIG_DIR")
+        if config_dir:
+            return read_config(Path(config_dir) / f"{cluster}.yaml")
+
+        # if it's not defined we are trying to find locally
+        if (Path.cwd() / 'cluster_configs' / f"{cluster}.yaml").exists():
+            return read_config(Path.cwd() / 'cluster_configs' / f"{cluster}.yaml")
+
+        if (Path(__file__).parents[2] / 'cluster_configs' / f"{cluster}.yaml").exists():
+            return read_config(Path(__file__).parents[2] / 'cluster_configs' / f"{cluster}.yaml")
+
+        raise ValueError(f"Cluster config {cluster} not found in any of the supported folders.")
+
+    # no cluster given: NEMO_SKILLS_CONFIG must hold the full path of the config file
+    config_file = os.environ.get("NEMO_SKILLS_CONFIG")
+    if not config_file:
+        raise ValueError("Either cluster or NEMO_SKILLS_CONFIG must be provided.")
+
+    if not Path(config_file).exists():
+        raise ValueError(f"Cluster config {config_file} not found.")
+
+    cluster_config = read_config(config_file)
+    # NOTE: the check below only runs on this NEMO_SKILLS_CONFIG path, not for the early returns above
+    if cluster_config['executor'] == 'slurm' and "ssh_tunnel" not in cluster_config:
+        if "job_dir" not in cluster_config:
+            raise ValueError("job_dir must be provided in the cluster config if ssh_tunnel is not provided.")
+        set_nemorun_home(cluster_config["job_dir"])
+
+    return cluster_config
+
+
+@lru_cache  # calls with identical connection settings return the same SSHTunnel object
+def _get_tunnel_cached(
+    job_dir: str,
+    host: str,
+    user: str,
+    identity: str | None = None,
+    shell: str | None = None,
+    pre_command: str | None = None,
+):
+    return run.SSHTunnel(  # every argument is forwarded unchanged to the tunnel constructor
+        host=host,
+        user=user,
+        identity=identity,
+        shell=shell,
+        pre_command=pre_command,
+        job_dir=job_dir,
+    )
+
+
+def tunnel_hash(tunnel):  # colon-joined connection fields of `tunnel`
+    return ":".join(f"{getattr(tunnel, a)}" for a in ("job_dir", "host", "user", "identity", "shell", "pre_command"))
+
+
+def get_tunnel(cluster_config):
+    if "ssh_tunnel" not in cluster_config:  # no SSH settings: fall back to a local tunnel
+        LOG.info("No ssh_tunnel configuration found, assuming we are running from the cluster already.")
+        return run.LocalTunnel(job_dir="")
+    return _get_tunnel_cached(**cluster_config["ssh_tunnel"])  # one cached tunnel per distinct set of SSH settings
+
+
+# Helper class and function to support streaming updates
+class OutputWatcher(StreamWatcher):
+    """Class for streaming remote tar/compression process."""
+
+    def submit(self, stream):
+        print(stream, end='\r')  # '\r' returns to the line start, so the next update overwrites this one
+        sys.stdout.flush()
+        return []  # this watcher only displays output; it submits no responses
+
+
+def progress_callback(transferred: int, total: int) -> None:
+    """Display SFTP transfer progress; a zero-byte transfer (total == 0) is reported as 100% complete."""
+    percent = (transferred / total) * 100 if total else 100.0
+    bar = '=' * int(percent / 2) + '>'
+    sys.stdout.write(
+        f'\rFile Transfer Progress: [{bar:<50}] {percent:.1f}% '
+        f'({transferred/1024/1024:.1f}MB/{total/1024/1024:.1f}MB)'
+    )
+    sys.stdout.flush()
+
+
+def cluster_download(
+    tunnel: SSHTunnel, remote_dir: str, local_dir: str, remote_tar_dir: Optional[str] = None, verbose: bool = True
+):
+    """
+    Downloads a directory from a remote cluster by creating a tar archive and transferring it.
+
+    Args:
+        tunnel: SSHTunnel connection
+        remote_dir: Path to the directory on remote server
+        local_dir: Local path to save the downloaded directory (created if it does not exist)
+        remote_tar_dir: Optional directory for temporary tar file creation (defaults to the parent of remote_dir)
+        verbose: Print download progress
+    """
+
+    remote_dir = remote_dir.rstrip('/')
+    remote_dir_parent, remote_dir_name = os.path.split(remote_dir)
+
+    # Directory where the remote tarball is written
+    remote_tar_dir = remote_tar_dir if remote_tar_dir else remote_dir_parent
+    # Path of the remote tar file
+    remote_tar_filename = f"{remote_dir_name}.tar.gz"
+
+    # Remote and local tar files
+    remote_tar = f"{os.path.join(remote_tar_dir, remote_tar_filename)}"
+    local_tar = os.path.join(local_dir, remote_tar_filename)
+
+    # Get the directory size
+    result = tunnel.run(f'du -sb {remote_dir} | cut -f1')
+    total_size = int(result.stdout.strip())
+
+    # Check if result directory compression is streamable
+    streaming_possible = False
+    try:
+        # Check whether the command pv is present on the remote system or not.
+        # Certain systems may not have the `pv` command
+        result = tunnel.run('which pv', warn=True)
+        streaming_possible = result.exited == 0
+    except Exception:
+        streaming_possible = False
+
+    if streaming_possible and verbose:
+        # We can do streaming compression (note: only this branch excludes `*.log` files from the archive)
+        # Command for streaming the compression progress
+        command = (
+            f'cd {remote_dir_parent} && '
+            f'tar --exclude="*.log" -cf - {remote_dir_name} | '
+            f'pv -s {total_size} -p -t -e -b -F "Compressing Remote Directory: %b %t %p" | '
+            f'gzip > {remote_tar}'
+        )
+        # Run the remote compression command and stream the progress
+        result = tunnel.run(command, watchers=[OutputWatcher()], pty=True, hide=(not verbose))
+    else:
+        command = f'cd {remote_dir_parent} && tar -czf {remote_tar} {remote_dir_name}'
+        result = tunnel.run(command, hide=(not verbose))
+
+    # The local directory must exist before the tarball can be written into it
+    os.makedirs(local_dir, exist_ok=True)
+
+    # Get SFTP client from tunnel's session's underlying client; the `with` block closes it after the transfer
+    with tunnel.session.client.open_sftp() as sftp:
+        sftp.get(remote_tar, local_tar, callback=progress_callback if verbose else None)
+    print(f"\nTransfer complete: {local_tar}")
+
+    # Extract the tarball locally
+    with tarfile.open(local_tar, "r:gz") as tar:
+        tar.extractall(path=local_dir)
+
+    # Clean up the tarball from the remote server
+    tunnel.run(f'rm {remote_tar}', hide=True)
+
+    # Clean up the local tarball
+    os.remove(local_tar)
+
+
+def cluster_upload(tunnel: SSHTunnel, local_file: str, remote_dir: str, verbose: bool = True):
+    """
+    Uploads a file to cluster.
+    TODO: extend to a folder.
+
+    Args:
+        tunnel: SSHTunnel connection
+        local_file: Path to the local file to upload
+        remote_dir: Cluster path where to save the file. It is passed unchanged as the
+            destination argument of the SFTP ``put`` call.
+        verbose: Print upload progress
+    """
+    sftp = tunnel.session.client.open_sftp()
+    try:
+        sftp.put(str(local_file), str(remote_dir), callback=progress_callback if verbose else None)
+    finally:
+        # Always release the SFTP channel, even if the transfer fails; otherwise one
+        # channel stays open on the SSH session for every upload.
+        sftp.close()
+    print("\nTransfer complete")
+
+
+def get_git_repo_path(path: str | Path = None):
+    """Check if the path is a git repo.
+
+    Args:
+        path: Path to the directory to check. If None (or empty), will check the current directory.
+
+    Returns:
+        Path to the top-level directory of the repo if ``git rev-parse --show-toplevel``
+        succeeds in that directory, otherwise None.
+    """
+    try:
+        # Run git directly in the target directory via ``cwd`` instead of os.chdir():
+        # changing the working directory affects the whole process (other threads included)
+        # and had to be undone afterwards.
+        result = subprocess.run(
+            ["git", "rev-parse", "--show-toplevel"],
+            cwd=path or None,
+            capture_output=True,
+            check=True,
+        )
+    except subprocess.CalledProcessError:
+        # git exited with a non-zero status, e.g. the directory is not inside a work tree
+        return None
+
+    return Path(result.stdout.decode().strip())
+
+
+def get_packager(extra_package_dirs: tuple[str] | None = None):
+    """Will check if we are running from a git repo and use git packager or default packager otherwise.
+
+    Args:
+        extra_package_dirs: optional directories to package in addition to the main code.
+            Each directory ``d`` is added as the include pattern ``d/*``, relative to ``d``'s parent.
+
+    Returns:
+        The root packager: a ``run.GitArchivePackager`` when the current working directory is inside
+        a git repo, a ``run.PatternPackager`` otherwise. If ``EXTERNAL_REPOS`` has more than one entry,
+        a ``run.HybridPackager`` is returned instead; it holds the root packager under the ``nemo_run``
+        key plus one packager per additional registered repo.
+    """
+    # location of the registered nemo_skills package (may be a repo checkout or an installed copy)
+    nemo_skills_dir = get_registered_external_repo('nemo_skills').path
+
+    if extra_package_dirs:
+        include_patterns = [str(Path(d) / '*') for d in extra_package_dirs]
+        include_pattern_relative_paths = [str(Path(d).parent) for d in extra_package_dirs]
+    else:
+        include_patterns = []
+        include_pattern_relative_paths = []
+
+    check_uncommited_changes = not bool(os.getenv('NEMO_SKILLS_DISABLE_UNCOMMITTED_CHANGES_CHECK', 0))
+
+    # are we in a git repo? If yes, we are uploading the current code
+    repo_path = get_git_repo_path(path=None)  # check if we are in a git repo in pwd
+
+    if repo_path:
+        # Do we have nemo_skills package in this repo? If no, we need to pick it up from installed location
+        if not (Path(repo_path) / 'nemo_skills').is_dir():
+            logging.warning(
+                "Not running from NeMo-Skills repo, trying to upload installed package. "
+                "Make sure there are no extra files in %s",
+                str(nemo_skills_dir / '*'),
+            )
+            include_patterns.append(str(nemo_skills_dir / '*'))
+        else:
+            # picking up local dataset files if we are in the right repo
+            include_patterns.append(str(nemo_skills_dir / "dataset/**/*.jsonl"))
+        # every include pattern needs a matching relative path entry
+        include_pattern_relative_paths.append(str(nemo_skills_dir.parent))
+
+        root_package = run.GitArchivePackager(
+            include_pattern=include_patterns,
+            include_pattern_relative_path=include_pattern_relative_paths,
+            check_uncommitted_changes=check_uncommited_changes,
+        )
+    else:
+        logging.warning(
+            "Not running from a git repo, trying to upload installed package. Make sure there are no extra files in %s",
+            str(nemo_skills_dir / '*'),
+        )
+        include_patterns.append(str(nemo_skills_dir / '*'))
+        include_pattern_relative_paths.append(str(nemo_skills_dir.parent))
+
+        root_package = run.PatternPackager(
+            include_pattern=include_patterns,
+            relative_path=include_pattern_relative_paths,
+        )
+
+    # more than one registered repo means there is something besides nemo_skills to package
+    extra_repos = {}
+    if len(EXTERNAL_REPOS) > 1:
+        # Insert root package as the first package
+        extra_repos['nemo_run'] = root_package
+
+        for repo_name, repo_meta in EXTERNAL_REPOS.items():
+            # nemo_skills itself is already covered by the root package
+            if repo_name == 'nemo_skills':
+                continue
+
+            repo_path = repo_meta.path
+            if get_git_repo_path(repo_path):
+                # Extra repos is a git repos, so we need to package only committed files
+                extra_repos[repo_name] = run.GitArchivePackager(
+                    basepath=str(repo_path), check_uncommitted_changes=check_uncommited_changes
+                )
+            else:
+                # Extra repos is not a git repo, so we need to package all files in the directory
+                repo_include_pattern = [str(Path(repo_path) / '*')]
+                repo_include_pattern_relative_path = [str(Path(repo_path).parent)]
+                extra_repos[repo_name] = run.PatternPackager(
+                    include_pattern=repo_include_pattern,
+                    relative_path=repo_include_pattern_relative_path,
+                )
+
+        # Return hybrid packager
+        return run.HybridPackager(sub_packagers=extra_repos, extract_at_root=True)
+
+    return root_package
+
+
+def _parse_env_assignment(entry, kind):
+    """Split a ``NAME=VALUE`` config entry into a stripped ``(name, value)`` pair.
+
+    Raises:
+        ValueError: if ``entry`` does not contain exactly one ``=``. ``kind`` ("required" or
+            "optional") is only used in the error message.
+    """
+    if entry.count("=") != 1:
+        raise ValueError(f"Invalid {kind} environment variable format: {entry}")
+    name, value = entry.split("=")
+    return name.strip(), value.strip()
+
+
+def get_env_variables(cluster_config):
+    """
+    Will get the environment variables from the cluster config and the user environment.
+
+    The following items in the cluster config are supported:
+    - `required_env_vars` - list of required environment variables
+    - `env_vars` - list of optional environment variables
+
+    Each entry is either ``NAME=VALUE`` (the value is taken from the config) or just ``NAME``
+    (the value is taken from ``os.environ``).
+
+    WANDB_API_KEY, NVIDIA_API_KEY, OPENAI_API_KEY, and HF_TOKEN are always added if they exist.
+    HF_TOKEN additionally falls back to ``get_token()`` when it is not set in the environment;
+    if that returns None the variable is skipped.
+
+    Args:
+        cluster_config: cluster config dictionary
+
+    Returns:
+        dict: dictionary of environment
+
+    Raises:
+        ValueError: if a ``NAME=VALUE`` entry contains more than one ``=``, or if a required
+            variable given by name only is missing from the user environment.
+    """
+    env_vars = {}
+    # Check for user requested env variables
+    for env_var in cluster_config.get("required_env_vars", []):
+        if "=" in env_var:
+            name, value = _parse_env_assignment(env_var, "required")
+            env_vars[name] = value
+            logging.info(f"Adding required environment variable {name}")
+        elif env_var in os.environ:
+            logging.info(f"Adding required environment variable {env_var} from environment")
+            env_vars[env_var] = os.environ[env_var]
+        else:
+            raise ValueError(f"Required environment variable {env_var} not found.")
+
+    # It is fine to have these as always optional even if they are required for some configs
+    # Assume it is required, then this will override the value set above with the same
+    # value, assuming it has not been updated externally between these two calls
+    always_optional_env_vars = ["WANDB_API_KEY", "NVIDIA_API_KEY", "OPENAI_API_KEY", "HF_TOKEN"]
+    # Fallbacks for variables missing from os.environ. Kept as lambdas so the factory is only
+    # looked up and called when it is actually needed.
+    # NOTE(review): get_token is presumably huggingface_hub's helper, which can return None - confirm.
+    default_factories = {
+        "HF_TOKEN": lambda: get_token(),
+    }
+    # Add optional env variables
+    optional_env_vars = cluster_config.get("env_vars", [])
+    for env_var in [*optional_env_vars, *always_optional_env_vars]:
+        if "=" in env_var:
+            name, value = _parse_env_assignment(env_var, "optional")
+            env_vars[name] = value
+            logging.info(f"Adding optional environment variable {name}")
+        elif env_var in os.environ:
+            logging.info(f"Adding optional environment variable {env_var} from environment")
+            env_vars[env_var] = os.environ[env_var]
+        elif env_var in default_factories:
+            value = default_factories[env_var]()
+            if value is None:
+                # Do not export the literal string "None" when the factory has nothing to offer
+                logging.info(f"Optional environment variable {env_var} not found in user environment; skipping.")
+            else:
+                env_vars[env_var] = str(value)
+                logging.info(f"Adding optional environment variable {env_var} from default factory")
+        else:
+            logging.info(f"Optional environment variable {env_var} not found in user environment; skipping.")
+
+    return env_vars
+
+
+def _resolve_mount_part(part: str) -> str:
+    """Return ``part`` as is, or the value of the environment variable when written as ``{NAME}``."""
+    if part.startswith("{") and part.endswith("}"):
+        env_name = part[1:-1]
+        if env_name not in os.environ:
+            raise ValueError(
+                f"Required environment variable {env_name} not found in env variables passed in cluster configs."
+            )
+        return os.environ[env_name]
+    return part
+
+
+def get_mounts_from_config(cluster_config: dict):
+    """
+    Determines if there are mount paths that are being passed via environment variables.
+    Selects the key in the cluster config called `mounts` which is a list of strings.
+    Each string is in the format of `<str | {env_var}>:<str | {env_var}>` where `env_var`
+    is the name of the environment variable.
+
+    Note that the `mounts` list stored in the cluster config is updated in place with the
+    resolved values; the same list object is returned.
+
+    Args:
+        cluster_config (dict): cluster config dictionary
+
+    Returns:
+        list: updated list of mounts
+
+    Raises:
+        ValueError: if a mount does not consist of exactly one non-empty source and one
+            non-empty target separated by a colon, or if a referenced environment variable
+            is not set.
+    """
+    mounts = cluster_config.get('mounts', [])
+
+    for mount_id, mount in enumerate(mounts):
+        if ":" not in mount:
+            raise ValueError(f"Invalid mount format: {mount}. The mount path must be separated by a colon.")
+
+        parts = mount.split(":")
+        # Reject empty sides (used to fail with IndexError) and extra colons (used to fail
+        # with an opaque unpacking error) with an explicit message instead.
+        if len(parts) != 2 or not all(parts):
+            raise ValueError(
+                f"Invalid mount format: {mount}. "
+                "Expected exactly one non-empty source and one non-empty target separated by a colon."
+            )
+
+        # Source is resolved before target, so a missing source variable is reported first
+        mount_source = _resolve_mount_part(parts[0])
+        mount_target = _resolve_mount_part(parts[1])
+
+        # store the resolved mount back into the list of mounts
+        mounts[mount_id] = f"{mount_source}:{mount_target}"
+
+    return mounts
+
+
+def get_executor(
+    cluster_config,
+    container,
+    num_nodes,
+    tasks_per_node,
+    gpus_per_node,
+    job_name,
+    log_dir,
+    log_prefix: str = "main",
+    mounts=None,
+    partition=None,
+    time_min=None,
+    dependencies=None,
+    extra_package_dirs: tuple[str] | None = None,
+    heterogeneous=False,
+    het_group=None,
+    total_het_groups=None,
+    slurm_kwargs: dict | None = None,
+):
+    """Build the executor for a single task.
+
+    Returns a ``DockerExecutor`` when ``cluster_config["executor"] == "local"`` and a
+    ``run.SlurmExecutor`` otherwise.
+
+    Args:
+        cluster_config: cluster config dictionary. Besides what the helper functions read,
+            this function uses ``executor``, ``account``, ``partition``, ``timeouts``,
+            ``mail_type``, ``mail_user``, ``disable_gpus_per_node``, ``dependency_type``
+            and ``job_name_prefix``.
+        container: container image to run in.
+        num_nodes: number of nodes (must be 1 for the local executor).
+        tasks_per_node: number of tasks per node (slurm only; local always uses 1).
+        gpus_per_node: number of GPUs per node.
+        job_name: job name, also used to build the log file prefixes.
+        log_dir: directory for logs; converted with ``get_unmounted_path`` for slurm.
+        log_prefix: prefix of the srun log files.
+        mounts: mounts to use. ``None`` means "use the mounts from the cluster config";
+            an empty sequence means "mount nothing".
+        partition: slurm partition; defaults to ``cluster_config["partition"]``.
+        time_min: if not None, passed to slurm as the ``time_min`` additional parameter.
+        dependencies: job dependencies forwarded to the slurm executor.
+        extra_package_dirs: extra directories forwarded to ``get_packager``.
+        heterogeneous: whether this executor is part of a heterogeneous job.
+        het_group: index of this executor's heterogeneous group.
+        total_het_groups: total number of heterogeneous groups.
+        slurm_kwargs: extra keyword arguments forwarded to ``run.SlurmExecutor``.
+
+    Raises:
+        ValueError: if the local executor is requested with more than one node.
+    """
+    env_vars = get_env_variables(cluster_config)
+    config_mounts = get_mounts_from_config(cluster_config)
+
+    # Only fall back to the config mounts when the caller did not specify any. An explicitly
+    # empty value (e.g. ``tuple()``) must stay empty; ``mounts or config_mounts`` silently
+    # replaced it with all configured mounts.
+    mounts = config_mounts if mounts is None else list(mounts)
+    if extra_package_dirs is not None:
+        extra_package_dirs = tuple(extra_package_dirs)
+    packager = get_packager(extra_package_dirs=extra_package_dirs)
+    if cluster_config["executor"] == "local":
+        if num_nodes > 1:
+            raise ValueError("Local executor does not support multi-node execution")
+
+        env_vars["PYTHONUNBUFFERED"] = "1"  # this makes sure logs are streamed right away
+        return DockerExecutor(
+            container_image=container,
+            packager=packager,
+            ipc_mode="host",
+            volumes=mounts,
+            ntasks_per_node=1,
+            num_gpus=gpus_per_node,
+            network="host",
+            env_vars=env_vars,
+            additional_kwargs={"entrypoint": ""},
+        )
+
+    if not heterogeneous:
+        env_vars["SLURM_MASTER_NODE"] = "$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n1)"
+    else:
+        # master node will be within the same group
+        env_vars["SLURM_MASTER_NODE"] = (
+            f"$(scontrol show hostnames $SLURM_JOB_NODELIST_HET_GROUP_{het_group} | head -n1)"
+        )
+        # in addition defining master nodes for all groups to allow communication
+        for group in range(total_het_groups):
+            env_vars[f"SLURM_MASTER_NODE_HET_GROUP_{group}"] = (
+                f"$(scontrol show hostnames $SLURM_JOB_NODELIST_HET_GROUP_{group} | head -n1)"
+            )
+
+    partition = partition or cluster_config.get("partition")
+    if 'timeouts' not in cluster_config:
+        # no timeouts configured, using a very large default
+        timeout = "10000:00:00:00"
+    else:
+        timeout = cluster_config["timeouts"][partition]
+
+    additional_parameters = {'time_min': time_min} if time_min is not None else {}
+    if cluster_config.get('mail_type') is not None:
+        additional_parameters['mail_type'] = cluster_config['mail_type']
+    if cluster_config.get('mail_user') is not None:
+        additional_parameters['mail_user'] = cluster_config['mail_user']
+    srun_args = [
+        "--no-container-mount-home",
+        "--overlap",
+        "--mpi=pmix",
+        '--wait=10',
+        # we need to be explicit about this in srun as commands might need to run in parallel
+        f"--ntasks-per-node={tasks_per_node}",
+        f"--nodes={num_nodes}",
+        # NeMo-run should take care of this, but we'll put it here temporarily
+        f"--container-env={','.join(k.strip() for k in env_vars)}",
+    ]
+    if not cluster_config.get("disable_gpus_per_node", False) and gpus_per_node is not None:
+        srun_args.append(f"--gpus-per-node={gpus_per_node}")
+
+    dependency_type = cluster_config.get("dependency_type", "afterany")
+
+    return run.SlurmExecutor(
+        account=cluster_config["account"],
+        partition=partition,
+        nodes=num_nodes,
+        ntasks_per_node=tasks_per_node,
+        tunnel=get_tunnel(cluster_config),
+        container_image=container,
+        container_mounts=mounts,
+        time=timeout,
+        additional_parameters=additional_parameters,
+        packager=packager,
+        gpus_per_node=gpus_per_node if not cluster_config.get("disable_gpus_per_node", False) else None,
+        srun_args=srun_args,
+        job_details=CustomJobDetails(
+            job_name=cluster_config.get("job_name_prefix", "") + job_name,
+            folder=get_unmounted_path(cluster_config, log_dir),
+            srun_prefix=log_prefix + '_' + job_name + '_',
+            sbatch_prefix=job_name + '_',
+        ),
+        wait_time_for_group_job=0.01,
+        monitor_group_job_wait_time=20,
+        dependencies=dependencies,
+        dependency_type=dependency_type,
+        heterogeneous=heterogeneous,
+        env_vars=env_vars,
+        **(slurm_kwargs or {}),
+    )
+
+
+@contextmanager
+def temporary_env_update(cluster_config, updates):
+    """Temporarily append ``KEY=VALUE`` entries to ``cluster_config["env_vars"]``.
+
+    Args:
+        cluster_config: cluster config dictionary; its ``env_vars`` list is replaced for the
+            duration of the ``with`` block.
+        updates: mapping of variable name to value; each item is appended as ``f"{key}={value}"``.
+
+    On exit (including when the body raises) the previous ``env_vars`` content is restored.
+    If the config had no ``env_vars`` key, the key is removed again instead of being left
+    behind as an empty list.
+    """
+    had_env_vars = "env_vars" in cluster_config
+    original_env_vars = list(cluster_config.get("env_vars", []))
+    # Build the extended list once (not once per item), so the config is updated the same
+    # way regardless of how many updates there are.
+    cluster_config["env_vars"] = original_env_vars + [f"{key}={value}" for key, value in updates.items()]
+    try:
+        yield
+    finally:
+        if had_env_vars:
+            cluster_config["env_vars"] = original_env_vars
+        else:
+            cluster_config.pop("env_vars", None)
+
+
+# TODO: this function has become too cumbersome to use with all recently added support
+#       we should make it simpler by perhaps removing separate logic for server/sandbox
+#       and supporting them through a list of cmds directly
+#       should also make heterogenous logic very clear and more robust
+#       and all parameters that can be list should be list for consistency
+def add_task(
+    exp,
+    cmd: str | list[str],
+    task_name,
+    cluster_config,
+    container: str | list[str],
+    num_tasks: int | list[int] = 1,
+    num_gpus=None,
+    num_nodes=1,
+    log_dir=None,
+    partition=None,
+    time_min=None,
+    with_sandbox=False,
+    sandbox_port: int | None = None,
+    server_config=None,
+    reuse_code_exp: str | run.Experiment | None = None,
+    reuse_code: bool = True,
+    task_dependencies: list[str] = None,
+    run_after: str | list[str] | None = None,
+    get_server_command=get_server_command,
+    extra_package_dirs: list[str] | None = None,
+    slurm_kwargs: dict | None = None,
+    heterogeneous: bool = False,
+):
+    """Wrapper for nemo-run exp.add to help setting up executors and dependencies.
+
+    Up to three kinds of commands are added, in this order: the server (if ``server_config``
+    is given), the main command(s) (if ``cmd`` is not empty) and the sandbox (if ``with_sandbox``).
+
+    Note that there are two parameters that control dependencies.
+        - task_dependencies: list of tasks that this task depends on **within the same experiment**
+        - run_after: a string with experiment name or a list of experiment names that this task
+          should run after. Will schedule dependencies on all tasks inside `run_after` experiments.
+          It needs to already be launched and running.
+
+    Example of how to set task_dependencies:
+
+    with run.Experiment(expname) as exp:
+        task1 = add_task(exp, ...)
+        task2 = add_task(exp, ..., task_dependencies=[task1])
+
+    You can use `reuse_code_exp` to reuse the code from another experiment
+    (and thus avoid costly packaging/ssh uploading). You can provide either experiment
+    name or the experiment object itself.
+
+    By default we will reuse the code of the first submitted experiment.
+    If you want to avoid this, set `reuse_code=False`.
+
+    Args:
+        exp: experiment the task is added to (via ``exp.add``).
+        cmd: main command or list of main commands; can be empty to only run server/sandbox.
+        task_name: name used as the job name of all executors.
+        cluster_config: cluster config dictionary.
+        container: container (or list of containers, one per command) for the main command(s).
+        num_tasks: tasks per node (or list, one per command) for the main command(s).
+        num_gpus: GPUs per node for the main command(s) and the sandbox. If None on slurm,
+            defaults to 1 unless the partition name contains ``cpu``.
+        num_nodes: number of nodes for the main command(s).
+        log_dir: directory for logs.
+        partition: slurm partition override.
+        time_min: forwarded to the executors.
+        with_sandbox: whether to add a sandbox command.
+        sandbox_port: sandbox port; a random free port is picked if None.
+        server_config: keyword arguments for ``get_server_command``; ``num_nodes``, ``num_gpus``
+            and either ``container`` or ``server_type`` are also read from it here.
+        reuse_code_exp: experiment (or its id/title) to reuse packaged code from.
+        reuse_code: whether to try reusing already packaged code.
+        task_dependencies: dependencies within the same experiment.
+        run_after: experiment name(s) this task should run after (slurm only).
+        get_server_command: function building the server command and its number of tasks.
+        extra_package_dirs: extra directories to package with the code.
+        slurm_kwargs: extra keyword arguments for the slurm executors.
+        heterogeneous: whether to submit the commands as a heterogeneous job.
+
+    Returns:
+        The result of ``exp.add`` for the created task.
+
+    Raises:
+        ValueError: if the numbers of commands, containers and num_tasks do not match.
+    """
+    if run_after is not None and cluster_config["executor"] == "slurm":
+        if isinstance(run_after, (str, run.Experiment)):
+            run_after = [run_after]
+        dependencies = []
+        for dep_expname in run_after:
+            exp_handles = get_exp_handles(dep_expname)
+            if len(exp_handles) == 0:
+                LOG.warning(
+                    "No pending or running tasks found for experiment %s, cannot set dependencies.", dep_expname
+                )
+            dependencies.extend(exp_handles)
+        if len(dependencies) == 0:
+            dependencies = None
+    else:
+        dependencies = None
+
+    if num_gpus is None and cluster_config['executor'] == "slurm":
+        if 'cpu' not in (partition or cluster_config.get("partition", "")):
+            num_gpus = 1
+
+    if sandbox_port is None:
+        sandbox_port = get_free_port(strategy="random")
+
+    het_group = 0
+    het_group_indices = []
+    # one group each for the server, the main command(s) and the sandbox, when present
+    total_het_groups = (server_config is not None) + bool(cmd) + with_sandbox
+
+    commands = []
+    executors = []
+    # assuming server always has the largest resources request, so it needs to go first
+    if server_config is not None:
+        # NOTE(review): a 'container' key, if present, is also forwarded to get_server_command
+        # through **server_config - confirm that it accepts it.
+        server_cmd, num_server_tasks = get_server_command(**server_config, cluster_config=cluster_config)
+        if 'container' in server_config:
+            # an explicitly requested container wins; previously this case left
+            # server_container unassigned and failed below
+            server_container = server_config['container']
+        else:
+            server_container = cluster_config["containers"][server_config['server_type']]
+        server_executor = get_executor(
+            cluster_config=cluster_config,
+            container=server_container,
+            num_nodes=server_config['num_nodes'],
+            tasks_per_node=num_server_tasks,
+            gpus_per_node=server_config['num_gpus'],
+            partition=partition,
+            time_min=time_min,
+            dependencies=dependencies,
+            job_name=task_name,
+            log_dir=log_dir,
+            log_prefix="server",
+            extra_package_dirs=extra_package_dirs,
+            slurm_kwargs=slurm_kwargs,
+            heterogeneous=heterogeneous,
+            het_group=het_group,
+            total_het_groups=total_het_groups,
+        )
+        if cluster_config["executor"] == "local" and num_server_tasks > 1:
+            server_cmd = f"mpirun --allow-run-as-root -np {num_server_tasks} bash -c {shlex.quote(server_cmd)}"
+        commands.append(server_cmd)
+        executors.append(server_executor)
+        het_group_indices.append(het_group)
+        het_group += 1
+
+    # then goes the main task(s) unless it's empty
+    if cmd:
+        if isinstance(cmd, str):
+            cmd = [cmd]
+        if isinstance(container, str):
+            container = [container]
+        if isinstance(num_tasks, int):
+            num_tasks = [num_tasks]
+        if len(cmd) != len(container) or len(cmd) != len(num_tasks):
+            raise ValueError("Number of commands, containers and num_tasks must match.")
+        for cur_idx, (cur_cmd, cur_container, cur_tasks) in enumerate(zip(cmd, container, num_tasks)):
+            if cluster_config["executor"] == "local" and cur_tasks > 1:
+                cur_cmd = f"mpirun --allow-run-as-root -np {cur_tasks} bash -c {shlex.quote(cur_cmd)}"
+            with temporary_env_update(cluster_config, {"NEMO_SKILLS_SANDBOX_PORT": sandbox_port}):
+                commands.append(cur_cmd)
+                executors.append(
+                    get_executor(
+                        cluster_config=cluster_config,
+                        container=cur_container,
+                        num_nodes=num_nodes,
+                        tasks_per_node=cur_tasks,
+                        gpus_per_node=num_gpus,
+                        partition=partition,
+                        time_min=time_min,
+                        dependencies=dependencies,
+                        job_name=task_name,
+                        log_dir=log_dir,
+                        log_prefix="main" if len(cmd) == 1 else f"main_{cur_idx}",
+                        extra_package_dirs=extra_package_dirs,
+                        slurm_kwargs=slurm_kwargs,
+                        heterogeneous=heterogeneous,
+                        het_group=het_group,
+                        total_het_groups=total_het_groups,
+                    )
+                )
+                het_group_indices.append(het_group)
+        # all main commands share one het group, so the index only advances after the loop
+        het_group += 1
+
+    # finally a sandbox if needed
+    if with_sandbox:
+        sandbox_env_updates = {"LISTEN_PORT": sandbox_port}
+        current_env_vars = cluster_config.get("env_vars", []).copy()
+        for override in current_env_vars:
+            if "PYTHONPATH" in override:
+                if override.startswith("PYTHONPATH="):
+                    # strip the "PYTHONPATH=" prefix (11 characters) to keep only the value
+                    override = override[11:]
+                sandbox_env_updates["PYTHONPATH"] = override + ":/app"
+
+        with temporary_env_update(cluster_config, sandbox_env_updates):
+            commands.append(get_sandox_command())
+            sandbox_executor = get_executor(
+                cluster_config=cluster_config,
+                container=cluster_config["containers"]["sandbox"],
+                num_nodes=executors[0].nodes if cluster_config["executor"] == "slurm" else 1,
+                tasks_per_node=1,
+                gpus_per_node=num_gpus,
+                partition=partition,
+                time_min=time_min,
+                mounts=tuple(),  # we don't want to mount anything
+                dependencies=dependencies,
+                job_name=task_name,
+                log_dir=log_dir,
+                log_prefix="sandbox",
+                extra_package_dirs=extra_package_dirs,
+                slurm_kwargs=slurm_kwargs,
+                heterogeneous=heterogeneous,
+                het_group=het_group,
+                total_het_groups=total_het_groups,
+            )
+            executors.append(sandbox_executor)
+            het_group_indices.append(het_group)
+        het_group += 1
+
+    if cluster_config["executor"] != "local":
+        tunnel = get_tunnel(cluster_config)
+        if isinstance(tunnel, run.SSHTunnel) and reuse_code:
+            reuse_code_exp = reuse_code_exp or REUSE_CODE_EXP.get(tunnel_hash(tunnel))
+            if reuse_code_exp is not None:
+                if isinstance(reuse_code_exp, str):
+                    try:
+                        reuse_code_exp = run.Experiment.from_id(reuse_code_exp)
+                    except Exception:
+                        LOG.debug(f"Failed to create experiment from id {reuse_code_exp}, trying to find it by title")
+                        reuse_code_exp = run.Experiment.from_title(reuse_code_exp)
+
+                LOG.info("Trying to reuse code from experiment %s", reuse_code_exp._title)
+                reuse_key = get_packaging_job_key(reuse_code_exp._id, "nemo-run")
+                if reuse_key in reuse_code_exp.tunnels[tunnel.key].packaging_jobs:
+                    reuse_dir = reuse_code_exp.tunnels[tunnel.key].packaging_jobs[reuse_key].dst_path
+
+                    for executor in executors:
+                        executor.packager.symlink_from_remote_dir = reuse_dir
+                    LOG.info(f"Successfully reused code from {reuse_key}")
+                else:
+                    LOG.warning("Relevant packaging job not found for experiment %s", reuse_code_exp._title)
+        # if current is not reused, we are refreshing the cache as there is a reason to believe it's outdated
+        elif isinstance(tunnel, run.SSHTunnel):
+            REUSE_CODE_EXP.pop(tunnel_hash(tunnel), None)
+
+    if len(commands) == 1:
+        # to keep sbatch script simpler, we don't wrap in a list in this case
+        return exp.add(
+            run.Script(inline=commands[0]),
+            executor=executors[0],
+            name="nemo-run",
+            dependencies=task_dependencies,
+        )
+    else:
+        if heterogeneous:
+            executors[0].het_group_indices = het_group_indices
+        return exp.add(
+            [run.Script(inline=command) for command in commands],
+            executor=executors,
+            name="nemo-run",
+            dependencies=task_dependencies,
+        )
+
+
+def run_exp(exp, cluster_config, sequential=None):
+    """Run the experiment, attached for the local executor and detached otherwise.
+
+    When `sequential` is left as None it defaults to True for local runs and to False
+    for all other executors; an explicit value is passed through unchanged.
+    """
+    is_local = cluster_config['executor'] == 'local'
+    if sequential is None:
+        sequential = is_local
+
+    if is_local:
+        exp.run(detach=False, tail_logs=True, sequential=sequential)
+        return
+
+    exp.run(detach=True, sequential=sequential)
+
+    # remember the first experiment submitted through this ssh tunnel so its code can be reused
+    tunnel = get_tunnel(cluster_config)
+    if not isinstance(tunnel, run.SSHTunnel):
+        return
+    REUSE_CODE_EXP.setdefault(tunnel_hash(tunnel), exp)
\ No newline at end of file

From abc22408164a0882f679d8ea927e6a32f730a608 Mon Sep 17 00:00:00 2001
From: Nune <ntadevosyan@nvidia.com>
Date: Wed, 28 May 2025 20:05:48 +0400
Subject: [PATCH 04/36] IPL Processors

Signed-off-by: Nune <ntadevosyan@nvidia.com>
---
 inference_output_manifest_filepath.json |  2 +-
 sdp/processors/IPL/ipl_processors.py    | 20 ++++++++++----------
 sdp/processors/__init__.py              |  1 +
 sdp/utils/skills_utils.py               |  2 +-
 4 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/inference_output_manifest_filepath.json b/inference_output_manifest_filepath.json
index df932017..ac8711aa 100644
--- a/inference_output_manifest_filepath.json
+++ b/inference_output_manifest_filepath.json
@@ -1,3 +1,3 @@
 {
-    "inference_command": " && python /workspace/nemo/examples/asr/transcribe_speech_parallel.py --config-path /lustre/fsw/portfolios/convai/users/ntadevosyan/results/smth/configs --config-name modified_config_0.yaml predict_ds.shuffle=True && python /workspace/nemo/examples/asr/transcribe_speech_parallel.py --config-path /lustre/fsw/portfolios/convai/users/ntadevosyan/results/smth/configs --config-name modified_config_1.yaml predict_ds.shuffle=True && python /workspace/nemo/examples/asr/transcribe_speech_parallel.py --config-path /lustre/fsw/portfolios/convai/users/ntadevosyan/results/smth/configs --config-name modified_config_2.yaml predict_ds.shuffle=True && python /workspace/nemo/examples/asr/transcribe_speech_parallel.py --config-path /lustre/fsw/portfolios/convai/users/ntadevosyan/results/smth/configs --config-name modified_config_3.yaml predict_ds.shuffle=True && python /workspace/nemo/scripts/pseudo_labeling/write_transcribed_files.py --prediction_filepaths /lustre/fsw/portfolios/convai/users/ntadevosyan/draco_data/datasets/pt/tarred_unlabeled_youtube_2/bucket1/sharded_manifests /lustre/fsw/portfolios/convai/users/ntadevosyan/draco_data/datasets/pt/tarred_unlabeled_youtube_2/bucket2/sharded_manifests /lustre/fsw/portfolios/convai/users/ntadevosyan/draco_data/datasets/pt/tarred_unlabeled_youtube_2/bucket3/sharded_manifests /lustre/fsw/portfolios/convai/users/ntadevosyan/draco_data/datasets/pt/tarred_unlabeled_youtube_2/bucket4/sharded_manifests  --is_tarred"
+    "inference_command": " && python /workspace/nemo/examples/asr/transcribe_speech_parallel.py --config-path /lustre/fsw/portfolios/convai/users/ntadevosyan/results/smth/configs --config-name modified_config_0.yaml predict_ds.shuffle=True && python /workspace/nemo/examples/asr/transcribe_speech_parallel.py --config-path /lustre/fsw/portfolios/convai/users/ntadevosyan/results/smth/configs --config-name modified_config_1.yaml predict_ds.shuffle=True && python /workspace/nemo/scripts/pseudo_labeling/write_transcribed_files.py --prediction_filepaths /lustre/fsw/portfolios/convai/users/ntadevosyan/draco_data/datasets/pt/tarred_unlabeled_youtube_2/bucket1/sharded_manifests /lustre/fsw/portfolios/convai/users/ntadevosyan/draco_data/datasets/pt/tarred_unlabeled_youtube_2/bucket2/sharded_manifests  --is_tarred"
 }
\ No newline at end of file
diff --git a/sdp/processors/IPL/ipl_processors.py b/sdp/processors/IPL/ipl_processors.py
index b8373f01..18c159ef 100644
--- a/sdp/processors/IPL/ipl_processors.py
+++ b/sdp/processors/IPL/ipl_processors.py
@@ -274,7 +274,8 @@ def process(self, first_run=False):
         prediction_directories_str = " ".join([os.path.dirname(path) for path in self.manifests])
         inference_config_paths_str = " ".join(self.inference_config_paths)        
         write_transcription_path = os.path.join(self.nemo_directory, "scripts/pseudo_labeling/write_transcribed_files.py")
-        update_inference_config_path = os.path.join(self.nemo_directory, "scripts/pseudo_labeling/update_inference_config.pys")
+        update_inference_config_path = os.path.join(self.nemo_directory, "scripts/pseudo_labeling/update_inference_config.py")
+        
         if first_run:
             cmd += f"{self.get_pl_inference_command(self.inference_config_paths, shuffle=False)}"
             cmd += (
@@ -287,15 +288,14 @@ def process(self, first_run=False):
                 f" && python {update_inference_config_path} "
                 f"--inference_configs {inference_config_paths_str} --p_cache {self.p_cache} --num_gpus {self.num_gpus}"
             )
-
-       
-        cmd += f" && {self.get_pl_inference_command(self.inference_config_paths, shuffle=True)}"
-        cmd += (
-            f" && python {write_transcription_path} "
-            f"--prediction_filepaths {prediction_directories_str} "
-        )
-        if self.is_tarred:
-            cmd += " --is_tarred"
+        else:
+            cmd += f" && {self.get_pl_inference_command(self.inference_config_paths, shuffle=True)}"
+            cmd += (
+                f" && python {write_transcription_path} "
+                f"--prediction_filepaths {prediction_directories_str} "
+            )
+            if self.is_tarred:
+                cmd += " --is_tarred"
 
         output_data = {"inference_command": cmd}
         with open(self.output_manifest_file, 'w') as f:
diff --git a/sdp/processors/__init__.py b/sdp/processors/__init__.py
index 43df6448..ce3b71b5 100644
--- a/sdp/processors/__init__.py
+++ b/sdp/processors/__init__.py
@@ -114,6 +114,7 @@
     MakeLettersUppercaseAfterPeriod,
 )
 from sdp.processors.nemo.asr_inference import ASRInference
+from sdp.processors.nemo.nemo_run_processor import NemoRunIPLProcessor
 from sdp.processors.nemo.pc_inference import PCInference
 from sdp.processors.toloka.accept_if import AcceptIfWERLess
 from sdp.processors.toloka.create_pool import CreateTolokaPool
diff --git a/sdp/utils/skills_utils.py b/sdp/utils/skills_utils.py
index b8a1d707..892fdcd1 100644
--- a/sdp/utils/skills_utils.py
+++ b/sdp/utils/skills_utils.py
@@ -674,7 +674,7 @@ def get_packager(extra_package_dirs: tuple[str] | None = None):
         include_patterns = []
         include_pattern_relative_paths = []
 
-    check_uncommited_changes = not bool(os.getenv('NEMO_SKILLS_DISABLE_UNCOMMITTED_CHANGES_CHECK', 0))
+    check_uncommited_changes = False
 
     # are we in a git repo? If yes, we are uploading the current code
     repo_path = get_git_repo_path(path=None)  # check if we are in a git repo in pwd

From bfdc49c5413314e7ebadfae6a29ecb9b7afe70d2 Mon Sep 17 00:00:00 2001
From: Nune <ntadevosyan@nvidia.com>
Date: Wed, 28 May 2025 21:51:26 +0400
Subject: [PATCH 05/36] IPL Processors

Signed-off-by: Nune <ntadevosyan@nvidia.com>
---
 sdp/processors/nemo/nemo_run_processor.py | 318 ++++++++++++++++++++++
 1 file changed, 318 insertions(+)
 create mode 100644 sdp/processors/nemo/nemo_run_processor.py

diff --git a/sdp/processors/nemo/nemo_run_processor.py b/sdp/processors/nemo/nemo_run_processor.py
new file mode 100644
index 00000000..f6160270
--- /dev/null
+++ b/sdp/processors/nemo/nemo_run_processor.py
@@ -0,0 +1,318 @@
+from sdp.processors.base_processor import BaseProcessor
+from sdp.processors.IPL.ipl_processors import TrainingCommandGenerator, InferenceCommandGenerator
+from omegaconf import OmegaConf, open_dict
+import os
+from pathlib import Path
+import logging
+import datetime
+import nemo_run as run
+from sdp.utils import nemo_run_utils
+
+class NemoRunIPLProcessor(BaseProcessor):
+    """
+    A processor that handles Iterative Pseudo-Labeling (IPL) training workflow.
+    
+    Args:
+        config_path (str): Path to the YAML configuration file containing IPL settings
+        output_manifest_file (str): Path where the output manifest file will be written
+        input_manifest_file (str, optional): Path to the input manifest file
+    """
+    
+    def __init__(
+        self,
+        config_path: str,
+        **kwargs
+    ):
+        super().__init__(**kwargs)
+        self.config_path = config_path
+        
+    def process(self):
+        """
+        Main processing method that implements the IPL workflow.
+        This method:
+        1. Loads and validates configurations
+        2. Sets up training and inference command generators
+        3. Executes the IPL training pipeline
+        """
+        # Load the cluster config from YAML
+        cluster_cfg = OmegaConf.load(self.config_path)
+        
+        # Process the required arguments from the cluster config
+        script_path = cluster_cfg.script
+        script_config_path = cluster_cfg.script_config
+        results_dir = cluster_cfg.results_dir
+        nemo_root = cluster_cfg.nemo_directory
+        inference_config = cluster_cfg.inference_config
+        do_average = cluster_cfg.get('do_average', False)
+        inference_config_path = Path(inference_config).absolute()
+
+        inference_config = OmegaConf.load(inference_config_path)
+
+        script_config_path = Path(script_config_path).absolute()
+
+        # Gather all mounts from the cluster config
+        self.gather_mounts(cluster_cfg)
+
+        # Add the results directory to the cluster config as a mount path
+        nemo_run_utils.add_mount_path(results_dir, '/results', cluster_cfg)
+
+        # Create results and logdir
+        log_dir = cluster_cfg.get('log_dir', os.path.join(results_dir, 'logs'))
+        nemo_run_utils.create_remote_directory([results_dir, log_dir], cluster_cfg)
+
+        # Load the script config
+        script_config = OmegaConf.load(script_config_path)
+
+        # Validate IPL training configuration
+        if "ipl_training" not in script_config.model:
+            raise KeyError("Parameters for `IPL` training are not provided.")
+        # Check all paths in configs are properly mounted
+   
+        self.check_config_mount_paths(script_config, cluster_cfg)
+        # Resolve experiment name
+        exp_name = cluster_cfg.exp_name
+        if exp_name is None:
+            if 'exp_manager' in script_config and 'name' in script_config['exp_manager']:
+                exp_name = script_config['exp_manager']['name']
+            else:
+                raise ValueError(
+                    "Experiment name not provided in the run config file (`exp_name`) or the cluster config (inside exp_manager.name)"
+                )
+
+        # Begin NeMo Run setup
+        with run.Experiment(exp_name) as exp:
+            # Create the config file name
+            timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+            config_name = f"{exp_name}_{timestamp}_config.yaml"
+
+            # Copy the merged config file to remote location's /results/configs directory
+            config_dir = os.path.join(results_dir, 'configs')
+            train_config_cluster = nemo_run_utils.create_remote_config(script_config, config_name, config_dir, cluster_cfg)
+
+            # Get run parameters from the config
+            num_runs = cluster_cfg.num_runs
+            num_gpus = cluster_cfg.get('num_gpus', script_config['trainer']['devices'])
+            if isinstance(num_gpus, list):
+                num_gpus = len(num_gpus)
+            if num_gpus == -1:
+                num_gpus = 1 if cluster_cfg['executor'] == 'local' else 8
+                logging.warning(f"\n\nSetting num_gpus to {num_gpus} as it was set to -1\n\n")
+            num_nodes = cluster_cfg.get('num_nodes', script_config['trainer'].get('num_nodes', 1))
+
+            # Set up checkpoint paths
+            checkpoint_dir = os.path.join(
+                os.path.join(script_config.exp_manager.exp_dir, script_config.exp_manager.name), "checkpoints"
+            )
+            checkpoint_name = os.path.join(checkpoint_dir, script_config.exp_manager.name + ".nemo")
+            
+            # Create remote inference config
+            if do_average:
+                avg_cmd, averaged_checkpoint = self.average_checkpoints(checkpoint_name, nemo_root)
+            else:
+                avg_cmd = None
+                averaged_checkpoint = checkpoint_name
+            inference_config_paths, manifests, tarr_paths = nemo_run_utils.create_remote_inference_config(
+                cluster_cfg, config_dir, inference_config, averaged_checkpoint
+            )
+            self.check_config_mount_paths(inference_config, cluster_cfg)
+            # Configure command generators
+            train_command_generator_config = { 
+                "nemo_directory": nemo_root,
+                "training_config_local": script_config,
+                "training_config_cluster": train_config_cluster,
+                "training_script_path": script_path,
+                "output_manifest_file": "./train_output_manifest_filepath.json",
+            }   
+            inference_command_generator_config = {
+                "nemo_directory": nemo_root,
+                "inference_config_paths": inference_config_paths,
+                "manifests": manifests,
+                "p_cache": script_config.model.ipl_training.p_cache,
+                "num_gpus": num_nodes * num_gpus,
+                "is_tarred": getattr(script_config.model.train_ds, "is_tarred", False),
+                "output_manifest_file": "./inference_output_manifest_filepath.json",
+            }
+
+            # Generate the complete IPL command
+            cmd = self.get_pseudo_labeling_command(
+                train_command_generator_config,
+                inference_command_generator_config,
+                num_ipl_epochs=script_config.model.ipl_training.num_ipl_epochs,
+                new_manifest_files=manifests,
+                new_tarr_files=tarr_paths,
+                first_run=True,
+                avg_cmd=avg_cmd
+            )
+
+            # Cast the cluster config to a dictionary for compatibility with NeMo Run
+            cluster_cfg = OmegaConf.to_object(cluster_cfg)
+
+            # Schedule tasks
+            task = None
+            for run_id in range(num_runs):
+                if run_id == 0:
+                    task = None
+                else:
+                    cmd = self.get_pseudo_labeling_command(
+                        train_command_generator_config,
+                        inference_command_generator_config,
+                        num_ipl_epochs=script_config.model.ipl_training.num_ipl_epochs,
+                        new_manifest_files=manifests,
+                        new_tarr_files=tarr_paths,
+                        first_run=False
+                    )
+                    task = [task]
+
+                task = nemo_run_utils.add_task(
+                    exp,
+                    cmd=cmd,
+                    task_name=f"{exp_name}_job",
+                    cluster_config=cluster_cfg,
+                    container=cluster_cfg['containers']['asr'],
+                    num_tasks=cluster_cfg.get('num_tasks', cluster_cfg.get('num_tasks_per_node', 1)),
+                    num_gpus=num_gpus,
+                    num_nodes=num_nodes,
+                    log_dir=nemo_run_utils.get_mounted_filepath(cluster_cfg, log_dir),
+                    partition=cluster_cfg.get('partition', None),
+                    task_dependencies=task,
+                )
+
+            # Run the experiment
+            nemo_run_utils.run_exp(exp, cluster_cfg)
+
+    def gather_mounts(self, cluster_cfg):
+        """
+        Gather all mounts from the cluster config including ones which are disjoint from the cluster_cfg.mounts list.
+        
+        Args:
+            cluster_cfg: Cluster config dictionary
+        """
+        mounts = cluster_cfg.get('mounts', [])
+        mounts = [os.path.expanduser(m) for m in mounts]
+
+        keys = list(cluster_cfg.keys())
+        with open_dict(cluster_cfg):
+            for k in keys:
+                if k.startswith("mount_"):
+                    logging.info(f"Found additional mount flag in the cluster config `{k}`. Adding it to the mounts list.")
+                    mounts.append(cluster_cfg[k])
+                    del cluster_cfg[k]
+
+            cluster_cfg['mounts'] = mounts
+            logging.info(f"Final Mounts: {mounts}")
+
+    def check_config_mount_paths(self, script_config, cluster_config):
+        """
+        Check if all path-like strings in the script config are mounted paths in the cluster config.
+        
+        Args:
+            script_config: Script config dictionary
+            cluster_config: Cluster config dictionary
+        """
+        def filepath_check(v, cluster_cfg):
+            if v.startswith(os.path.sep):
+                logging.info(f"Checking if {v} is a mounted path")
+                nemo_run_utils.check_if_mounted(cluster_cfg, v)
+                unmounted_path = nemo_run_utils.get_unmounted_filepath(cluster_cfg, v)
+                nemo_run_utils.check_remote_mount_directories(unmounted_path, cluster_cfg)
+
+        def check_mounted_path(cfg, cluster_cfg):
+            if hasattr(cfg, 'items'):
+                for k, v in cfg.items():
+                    if hasattr(v, 'items'):
+                        check_mounted_path(v, cluster_cfg)
+                    elif isinstance(v, list):
+                        for item in v:
+                            if isinstance(item, str):
+                                filepath_check(item, cluster_cfg)
+                    elif isinstance(v, str):
+                        filepath_check(v, cluster_cfg)
+
+        check_mounted_path(script_config, cluster_config)
+
+    def get_pseudo_labeling_command(
+        self,
+        train_command_config: dict,
+        inference_command_config: dict,
+        num_ipl_epochs: int,
+        new_manifest_files,
+        new_tarr_files,
+        first_run: bool = False,
+        avg_cmd: str = None
+    ) -> str:
+        """
+        Generate the pseudo-labeling command for the given configuration and training parameters.
+
+        Args:
+            train_command_config (dict): Config for TrainingCommandGenerator
+            inference_command_config (dict): Config for InferenceCommandGenerator
+            num_ipl_epochs (int): Number of epochs to train with pseudo-labels
+            new_manifest_files: List of manifest files to use
+            new_tarr_files: List of tarred audio files to use
+            first_run (bool): Whether this is the first run of pseudo-labeling
+
+        Returns:
+            str: The constructed pseudo-labeling command
+        """
+        train_proc = TrainingCommandGenerator(**train_command_config)
+        infer_proc = InferenceCommandGenerator(**inference_command_config)
+
+        exec_cmd = self.get_export_variables_cmd(train_command_config["training_config_local"], train_command_config["nemo_directory"])
+
+        exec_cmd += train_proc.process()
+        exec_cmd += " && sleep 10"
+        if avg_cmd:
+            exec_cmd += " && " + avg_cmd
+        exec_cmd += " && " + infer_proc.process(first_run=first_run)
+
+        for _ in range(num_ipl_epochs):
+            exec_cmd += " && sleep 10"
+            exec_cmd += " && " + train_proc.process(new_manifest_files, new_tarr_files)
+            if avg_cmd:
+                exec_cmd += " && " + avg_cmd
+            exec_cmd += " " + infer_proc.process(first_run=False)
+
+        return exec_cmd
+
+    def get_export_variables_cmd(self, merged_cfg , nemo_root):
+        """Generate command to export required environment variables."""
+        wandb_key = os.environ.get("WANDB_API_KEY") or os.environ.get("WANDB") or os.environ.get("WANDB_KEY", "")
+        if not wandb_key:
+            logging.warning("WANDB key not found in environment variables. WANDB logging will not work.")
+
+            if merged_cfg.get('exp_manager', {}).get('create_wandb_logger', False):
+                raise ValueError(
+                    "WANDB key is required for logging but was not found in environment variables. "
+                    "Please set WANDB_API_KEY to enable WANDB logging."
+                )
+
+        cmd = (
+            "nvidia-smi && "
+            f"export PYTHONPATH={nemo_root} && "
+            f"export HF_TOKEN={os.getenv('HF_TOKEN', '')} && "
+            f"export WANDB_API_KEY={wandb_key} && ")
+        
+        return cmd
+    
+    def average_checkpoints(self, checkpoint_path: str, nemo_root:str) -> str:
+        """
+        Generates the command to average all checkpoints in the given directory and returns the path to the averaged checkpoint.
+        
+        Args:
+            checkpoint_path (str): Path to the directory containing checkpoints
+            
+        Returns:
+            tuple: (command to run, path to the averaged checkpoint file)
+        """
+        # Get the directory containing the checkpoints
+        checkpoint_dir = os.path.dirname(checkpoint_path)
+        
+        # Construct the command for checkpoint averaging
+        cmd = f"python {nemo_root}/scripts/checkpoint_averaging/legacy/checkpoint_averaging.py {checkpoint_dir}"
+        
+        # The averaged checkpoint will have the same name but with '-averaged' suffix
+        checkpoint_name = os.path.basename(checkpoint_path)
+        base_name = os.path.splitext(checkpoint_name)[0]
+        averaged_checkpoint = os.path.join(checkpoint_dir, f"{base_name}-averaged.nemo")
+        
+        return cmd, averaged_checkpoint
\ No newline at end of file

From aae3a02aa561c0c889028ca5be5be5bc7f5f7d64 Mon Sep 17 00:00:00 2001
From: Nune <ntadevosyan@nvidia.com>
Date: Wed, 28 May 2025 21:52:47 +0400
Subject: [PATCH 06/36] IPL Processors

Signed-off-by: Nune <ntadevosyan@nvidia.com>
---
 sdp/processors/nemo/nemo_run_processor.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/sdp/processors/nemo/nemo_run_processor.py b/sdp/processors/nemo/nemo_run_processor.py
index f6160270..6fefad6d 100644
--- a/sdp/processors/nemo/nemo_run_processor.py
+++ b/sdp/processors/nemo/nemo_run_processor.py
@@ -258,7 +258,6 @@ def get_pseudo_labeling_command(
         infer_proc = InferenceCommandGenerator(**inference_command_config)
 
         exec_cmd = self.get_export_variables_cmd(train_command_config["training_config_local"], train_command_config["nemo_directory"])
-
         exec_cmd += train_proc.process()
         exec_cmd += " && sleep 10"
         if avg_cmd:
@@ -315,4 +314,4 @@ def average_checkpoints(self, checkpoint_path: str, nemo_root:str) -> str:
         base_name = os.path.splitext(checkpoint_name)[0]
         averaged_checkpoint = os.path.join(checkpoint_dir, f"{base_name}-averaged.nemo")
         
-        return cmd, averaged_checkpoint
\ No newline at end of file
+        return cmd, averaged_checkpoint

From 125699ac5c917f670ecc60bb2391ce663f250030 Mon Sep 17 00:00:00 2001
From: Nune <ntadevosyan@nvidia.com>
Date: Wed, 28 May 2025 22:15:25 +0400
Subject: [PATCH 07/36] Remove unnecessary files

Signed-off-by: Nune <ntadevosyan@nvidia.com>
---
 inference_output_manifest_filepath.json |   3 -
 run_ipl.py                              | 187 ------------
 run_ipl.yaml                            |   5 -
 run_pt_mcv.yaml                         |  57 ----
 run_pt_mcv_cs_you.yaml                  |  64 ----
 sdp/processors/nemo/ipl_command.py      | 184 -----------
 sdp/processors/nemo/ipl_training.py     |  13 -
 sdp/processors/nemo/ipl_utils.py        | 330 --------------------
 sdp/processors/nemo/nemo_run_ipl.py     | 386 ------------------------
 sdp/utils/ipl_utils.py                  | 356 ++++++++++++++++------
 sdp/utils/nemo_run_utils.py             |   2 +-
 11 files changed, 273 insertions(+), 1314 deletions(-)
 delete mode 100644 inference_output_manifest_filepath.json
 delete mode 100644 run_ipl.py
 delete mode 100644 run_ipl.yaml
 delete mode 100644 run_pt_mcv.yaml
 delete mode 100644 run_pt_mcv_cs_you.yaml
 delete mode 100644 sdp/processors/nemo/ipl_command.py
 delete mode 100644 sdp/processors/nemo/ipl_training.py
 delete mode 100644 sdp/processors/nemo/ipl_utils.py
 delete mode 100644 sdp/processors/nemo/nemo_run_ipl.py

diff --git a/inference_output_manifest_filepath.json b/inference_output_manifest_filepath.json
deleted file mode 100644
index ac8711aa..00000000
--- a/inference_output_manifest_filepath.json
+++ /dev/null
@@ -1,3 +0,0 @@
-{
-    "inference_command": " && python /workspace/nemo/examples/asr/transcribe_speech_parallel.py --config-path /lustre/fsw/portfolios/convai/users/ntadevosyan/results/smth/configs --config-name modified_config_0.yaml predict_ds.shuffle=True && python /workspace/nemo/examples/asr/transcribe_speech_parallel.py --config-path /lustre/fsw/portfolios/convai/users/ntadevosyan/results/smth/configs --config-name modified_config_1.yaml predict_ds.shuffle=True && python /workspace/nemo/scripts/pseudo_labeling/write_transcribed_files.py --prediction_filepaths /lustre/fsw/portfolios/convai/users/ntadevosyan/draco_data/datasets/pt/tarred_unlabeled_youtube_2/bucket1/sharded_manifests /lustre/fsw/portfolios/convai/users/ntadevosyan/draco_data/datasets/pt/tarred_unlabeled_youtube_2/bucket2/sharded_manifests  --is_tarred"
-}
\ No newline at end of file
diff --git a/run_ipl.py b/run_ipl.py
deleted file mode 100644
index f0512c01..00000000
--- a/run_ipl.py
+++ /dev/null
@@ -1,187 +0,0 @@
-import copy
-import glob
-import os
-import subprocess
-import sys
-from pathlib import Path
-from typing import Any, Dict
-import torch
-from typing import List, Optional, Tuple, Union
-from omegaconf import OmegaConf, open_dict
-#import sdp.processors.nemo.ipl_utils as ipl_utils
-#from nemo.core.config import hydra_runner
-from sdp.processors.IPL.ipl_processors import TrainingCommandGenerator
-
-# def check_training_finished(log_dir):
-#     """
-#     Searches to see ig lightning finished training .
-#     Parameters:
-#         log_dir (str): Directory where logs are stored.
-#     """
-#     print(f"************************************************")
-#     print(f"************************************************")
-
-#     if not os.path.exists(log_dir):
-#         print(f"Log directory '{log_dir}' does not exist.")
-#         return
-#     print(f"")
-#     log_pattern = os.path.join(log_dir, f"lightning_logs.txt")
-#     command = f"grep -ri '`Trainer.fit` stopped:' {log_pattern}"
-
-#     result = subprocess.run(command, shell=True, capture_output=True, text=True)
-#     if result.stdout:
-#         print("Stopping reasons found:")
-#         print(result.stdout)
-#         return True
-#     else:
-#         print("No stopping reasons found in the logs.")
-#         return False
-    
-# def get_command_for_inference(
-#     inference_config: str, inference_config_dir: Union[str, Path], p_cache: float, checkpoint: str, nemo_path: str
-# ) -> Tuple[str, List[str], List[str]]:
-#     """
-#     Generates the command string for running speech inference with transcribe_speech_parallel.
-#     Args:
-#         inference_config (str): Path to the base inference configuration file.
-#         inference_config_dir (Union[str, Path]): Directory to store temporary modified configurations.
-#         p_cache (float): Proportion of the dataset to be cached for pseudo-labeling.
-#         checkpoint (str): Path to the model checkpoint to use for inference.
-#     Returns:
-#         Tuple[str, List[str], List[str]]:
-#             - The command string to execute inference for all specified manifests.
-#             - List of output directories corresponding to each manifest.
-#             - List of completed full pass transcribed manifest paths, if any.
-#     """
-#     """"""
-    
-#     manifests, tarr_audio_files = ipl_utils.separate_multiple_transcriptions(inference_config)
-#     num_gpus = torch.cuda.device_count()
-#     output_dirs = []
-#     cmd = ""
-#     for i in range(len(manifests)):
-#         print()
-#         print(f"manifests  {manifests[i]}")
-#         output_dir = os.path.dirname(manifests[i])
-#         output_dirs.append(output_dir)
-#         print(f"output_dir {output_dir}")
-#         base_cfg = OmegaConf.load(inference_config)
-#         print(f"inference_config_dir {inference_config_dir}")
-#         print()
-#         temp_config_dir = Path(str(inference_config_dir) + "/temp_configs").absolute()
-#         os.makedirs(temp_config_dir, exist_ok=True)
-#         modified_cfg = copy.deepcopy(base_cfg)
-
-#         # Check if we need to run inference on the whole set or update part of it
-#         full_pass_done = glob.glob(os.path.join(output_dir, 'transcribed_manifest*'))
-#         if full_pass_done:
-#             number_of_files = ipl_utils.count_files_for_pseudo_labeling(manifests[i], bool(tarr_audio_files))
-#             limit_predict_batches = int((number_of_files * p_cache) / (modified_cfg.predict_ds.batch_size * num_gpus))
-#             OmegaConf.update(modified_cfg, "trainer.limit_predict_batches", limit_predict_batches)
-
-#         # Replace OmegaConf updates with simple assignments
-#         OmegaConf.update(modified_cfg, "output_path", output_dir)
-#         OmegaConf.update(modified_cfg, "predict_ds.manifest_filepath", manifests[i])
-#         if tarr_audio_files:
-#             OmegaConf.update(modified_cfg, "predict_ds.tarred_audio_filepaths", tarr_audio_files[i])
-#         OmegaConf.update(modified_cfg, "model", checkpoint)
-
-#         temp_config_file = os.path.join(temp_config_dir, f"modified_config_{i}.yaml")
-#         OmegaConf.save(modified_cfg, temp_config_file)
-#         trancribe_script = nemo_path + "/" + "transcribe_speech_parallel.py"
-#         cmd += f"python {trancribe_script} --config-path {temp_config_dir} --config-name modified_config_{i}.yaml && "
-
-#     # Remove trailing '&&' from the final command string
-#     cmd = cmd.rstrip(" &&")
-
-#     print(f"Inference command: {cmd}")
-#     return cmd, output_dirs, full_pass_done
-
-
-# def merge_configs(script_config_path, run_config):
-#     # Load the configurations
-#     script_config = OmegaConf.load(script_config_path)
-
-#     print(run_config)
-
-#     # Keep track of the original keys in script_config
-#     original_script_keys = set(script_config.keys())
-
-#     # Merge only the 'training' part of run_config with script_config
-#     result = OmegaConf.merge(script_config, run_config)
-
-#     with open_dict(result):
-#         for k in run_config.keys():
-#             if k in result and k not in original_script_keys:
-#                 del result[k]
-
-#     def check_missing_values(cfg):
-#         if hasattr(cfg, 'items'):
-#             for k, v in cfg.items():
-#                 if hasattr(v, 'items'):
-#                     check_missing_values(v)
-#                 elif v == '???':
-#                     raise ValueError(f"Missing value for key {k} in the config file")
-
-#     check_missing_values(result)
-#     result.exp_manager.resume_if_exists = True
-#     return result
-
-
-# def get_execution_script(cluster_script_path: str, config_name: str, config_path: str, nemo_path: str) -> str:
-#     """
-#     Constructs a command string to execute a training with the specified configuration.
-#     Args:
-#         cluster_script_path (str): Path to the cluster script to be executed.
-#         config_name (str): Name of the configuration file or object to be passed as a parameter.
-#         config_path (str): Path to the directory where the configuration resides.
-#     Returns:
-#         str: A formatted command string ready for execution.
-#     """
-#     # Create the command to run the script
-#     cluster_script_path = nemo_path + "/" + cluster_script_path
-#     cmd = """
-#         python {cluster_script_path} --config-path {config_path} --config-name "{config_name}" 
-#     """
-#     print("in get_execution_script")
-#     print(f"cluster_script_path {cluster_script_path}")
-#     format_dict = dict(
-#         cluster_script_path=cluster_script_path,
-#         config_path=config_path,
-#         config_name=config_name,
-#     )
-#     cmd = cmd.format(**format_dict)
-#     print(f"format cmd {cmd}")
-
-#     return cmd
-
-
-# def find_checkpoint_dir(base_path):
-#     """
-#     Find the 'checkpoints' folder in the directory structure.
-#     Parameters:
-#         base_path (str): The base directory path to search from.
-#     """
-#     for root, dirs, files in os.walk(base_path):
-#         for dir_name in dirs:
-#             if dir_name == "checkpoints":
-#                 return os.path.join(root, dir_name), root
-#     return None, None
-
-
-def main():
-    config = {
-        "training_config_local": "/home/ntadevosyan/code/canary_ngpt/NeMo/ngpt_rnnt_bpe.yaml",
-        "training_config_cluster": "path/to/your/cluster/config.yaml",
-        "training_script_path": "path/to/training/script.py",
-        "nemo_directory": "path/to/nemo/directory",
-        "output_manifest_file": "path/to/output/manifest.json",
-        "new_manifest_files": None,  # or list of manifest files if you have them
-        "new_tarred_audio_filepaths": None  # or list of tarred audio paths if you have them
-    }
-    processor = TrainingCommandGenerator(**config)
-    cmd = processor.process(param="str")
-    print("Generated command:", cmd)
-
-if __name__ == '__main__':
-    main()
diff --git a/run_ipl.yaml b/run_ipl.yaml
deleted file mode 100644
index eaff04ca..00000000
--- a/run_ipl.yaml
+++ /dev/null
@@ -1,5 +0,0 @@
-script: asr_hybrid_transducer_ctc/speech_to_text_hybrid_rnnt_ctc_bpe.py 
-num_epochs: 2
-script_config: /home/ntadevosyan/code/fork_ipl/NeMo/examples/asr/conf/update_script_config.yaml 
-inference_config: /home/ntadevosyan/code/nemo_run_ipl_PR/NeMo/examples/asr/inference_config_local_non_tarred.yaml 
-nemo_path: /home/ntadevosyan/code/pr_iplmixin/NeMo/examples/asr/
diff --git a/run_pt_mcv.yaml b/run_pt_mcv.yaml
deleted file mode 100644
index 1d299241..00000000
--- a/run_pt_mcv.yaml
+++ /dev/null
@@ -1,57 +0,0 @@
-# The script to be run.
-script: "examples/asr/asr_hybrid_transducer_ctc/speech_to_text_hybrid_rnnt_ctc_bpe.py"
-script_config: "/home/ntadevosyan/code/nemo_run_ipl_PR/NeMo/examples/asr/conf/mcv_scratch_cs_you.yaml"
-
-exp_name: null  # populated by exp_manager.name if not provided
-results_dir: '/lustre/fsw/portfolios/convai/users/ntadevosyan/results/smth'  # Where to store the results of the run
-
-# Optional arguments
-num_runs: 1
-num_tasks_per_node: 1
-num_gpus: 1
-max_runtime: "00:03:45:00"
-
-########################################################################################################################
-
-executor: slurm
-
-USER: ntadevosyan
-ssh_tunnel:
-  host: draco-oci-login-01.draco-oci-iad.nvidia.com
-  # ------------------------------- Fill this up! -------------------------------
-  user: "${USER}"  # your username; or resolved from ${USER} environment variable ; or can be null which resolved from ${USER} environment variable
-  job_dir: "/lustre/fsw/portfolios/convai/users/${USER}/nemo-run/"
-  identity: "${NEMO_OCI_IAD_SSH_IDENTITY}"
-  # -----------------------------------------------------------------------------
-
-account: convai_convaird_nemo-speech
-partition: batch_block1,batch_block3,batch_block4
-job_name_prefix: "convai_convaird_nemo-speech-pt"
-
-containers:
-  # asr: /lustre/fsw/portfolios/llmservice/users/kpuvvada/local_containers/nemo_dev_20240717_aistore.sqsh
-  asr: /lustre/fsw/portfolios/llmservice/users/pzelasko/containers/nemo-nightly-24jul24-oomptimizer.sqsh
-
-env_vars:
-  - 'TOKENIZERS_PARALLELISM=false'
-  - 'AIS_ENDPOINT="http://asr.iad.oci.aistore.nvidia.com:51080"'
-  - 'LHOTSE_AUDIO_DURATION_MISMATCH_TOLERANCE=0.3'
-  - 'TORCH_CUDNN_V8_API_ENABLED=1'
-  - 'PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True'
-  - 'HYDRA_FULL_ERROR=1'
-
-required_env_vars:
-  - 'HF_TOKEN'
-  - 'WANDB_KEY'
-
-mounts:
-  # Replace with your own paths in your cluster config
-  - /lustre/fsw:/lustre/fsw
-  - /lustre/fsw/portfolios/llmservice/projects/llmservice_nemo_speechlm/data:/data
-  #- /lustre/fsw/portfolios/convai/users/ntadevosyan:/asr_checkpoints
-  - /lustre/fsw/portfolios/convai/users/ntadevosyan:/lustre/fsw/portfolios/convai/users/ntadevosyan
-
-timeouts:
-  batch_block1,batch_block3,batch_block4: 04:00:00
-  interactive: 04:00:00
-  interactive_singlenode: 04:00:00
diff --git a/run_pt_mcv_cs_you.yaml b/run_pt_mcv_cs_you.yaml
deleted file mode 100644
index 9614a1da..00000000
--- a/run_pt_mcv_cs_you.yaml
+++ /dev/null
@@ -1,64 +0,0 @@
-# The script to be run.
-script: "examples/asr/asr_hybrid_transducer_ctc/speech_to_text_hybrid_rnnt_ctc_bpe.py"
-script_config: "/home/ntadevosyan/code/nemo_run_ipl_PR/NeMo/examples/asr/conf/mcv_scratch_cs_you.yaml"
-
-exp_name: null  # populated by exp_manager.name if not provided
-results_dir: '/lustre/fsw/portfolios/convai/users/ntadevosyan/results/smth'  # Where to store the results of the run
-nemo_directory: "/workspace/nemo"
-
-# Optional arguments
-num_runs: 6
-num_gpus: 8
-num_tasks_per_node: 8
-max_runtime: "00:03:45:00"
-
-########################################################################################################################
-
-executor: slurm
-ipl_training:
-  inference_config: inference_config_cs_you.yaml
-  p_cache: 0.2
-  num_ipl_epochs: 100
-  prefix: mcv_you_3
-
-USER: ntadevosyan
-
-ssh_tunnel:
-  host: cs-oci-ord-login-01.nvidia.com
-  # ------------------------------- Fill this up! -------------------------------
-  user: "${USER}"  # your username; or resolved from ${USER} environment variable ; or can be null which resolved from ${USER} environment variable
-  job_dir: "//lustre/fsw/portfolios/convai/users/${USER}/nemo-run/"
-  identity: ""
-  # -----------------------------------------------------------------------------
-
-account: convai_convaird_nemo-speech
-partition: polar,polar3 
-job_name_prefix: "convai_convaird_nemo-speech-pt"
-
-containers:
-  # asr: /lustre/fsw/portfolios/llmservice/users/kpuvvada/local_containers/nemo_dev_20240717_aistore.sqsh
-  asr: nvcr.io/nvidian/ac-aiapps/nemo_ntad:ipl
-
-env_vars:
-  - 'TOKENIZERS_PARALLELISM=false'
-  - 'AIS_ENDPOINT="http://asr.iad.oci.aistore.nvidia.com:51080"'
-  - 'LHOTSE_AUDIO_DURATION_MISMATCH_TOLERANCE=0.3'
-  - 'TORCH_CUDNN_V8_API_ENABLED=1'
-  - 'PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True'
-  - 'HYDRA_FULL_ERROR=1'
-
-required_env_vars:
-  - 'HF_TOKEN'
-  - 'WANDB_KEY=037abd530ba9fc776c9d617c95c91f5dd0340471' 
-
-mounts:
-  # Replace with your own paths in your cluster config
-  - /lustre/fsw/:/lustre/fsw/
-  #- /lustre/fsw/portfolios/llmservice/projects/llmservice_nemo_speechlm/data:/data
-  #- /lustre/fsw/portfolios/convai/users/ntadevosyan:/asr_checkpoints
-  - /lustre/fsw/portfolios/convai/users/ntadevosyan:/lustre/fsw/portfolios/convai/users/ntadevosyan
-
-timeouts:
-  polar,polar3: 04:00:00
-  interactive: 04:00:00
-  interactive_singlenode: 04:00:00
diff --git a/sdp/processors/nemo/ipl_command.py b/sdp/processors/nemo/ipl_command.py
deleted file mode 100644
index a1fb8be8..00000000
--- a/sdp/processors/nemo/ipl_command.py
+++ /dev/null
@@ -1,184 +0,0 @@
-
-import os
-import subprocess
-from pathlib import Path
-from typing import Optional
-from typing import Dict, List
-from omegaconf import OmegaConf, open_dict
-from nemo.utils import logging
-from sdp.processors.base_processor import BaseProcessor
-
-
-class IPLCommandGenerator(BaseProcessor):
-    """This processor performs ASR inference on each utterance of the input manifest.
-
-    ASR predictions will be saved in the ``pred_text`` key.
-
-    Args:
-        pretrained_model (str): the name or the filepath of the pretrained NeMo ASR model
-            which will be used to do inference.
-        batch_size (int): the batch size to use for ASR inference. Defaults to 32.
-
-    Returns:
-         The same data as in the input manifest with an additional field
-         ``pred_text`` containing ASR model's predictions.
-    """
-
-    def __init__(
-        self,
-        training_config: str,
-        infenrece_config: str,
-        training_script_path: str,
-        nemo_directory: str,
-        num_ipl_epochs: 50,
-
-        **kwargs
-    ):
-        super().__init__(**kwargs)
-        # Paths on the current machine
-        self.training_config = OmegaConf.load(training_config)
-        self.infenrece_config = OmegaConf.load(infenrece_config) 
-        self.training_script_path = os.path.join(nemo_directory, training_script_path)
-        self.nemo_directory = nemo_directory
-        self.num_ipl_epochs = num_ipl_epochs
-
-    def process(self):
-        """.""" 
-        
-
-
-        
-
-    def get_training_script_cmd(self, cluster_script_path, config_name, updated_manifest_filepaths=None, updated_tarred_filepaths=None):
-        """
-        Create the command to run the script on the cluster.
-
-        Args:
-            cluster_script_path (str): Path to the script to run on the cluster.
-            config_name (str): Name of the config file to use for the script.
-            updated_manifest_filepaths (str, optional): Path to the updated manifest file. Defaults to None.
-            updated_tarred_filepaths (str, optional): Path to the updated tarred audio filepaths. Defaults to None.
-
-        Returns:
-            str: Command to run the script on the cluster.
-        """
-
-        # Prepare the base command for training
-        cmd = (
-            "find /results/ -name '*-unfinished' -type f -delete && "
-            f"cd {os.path.dirname(cluster_script_path)} && "
-            f"python -u -B {os.path.basename(cluster_script_path)} "
-            f"--config-path \"/results/configs\" --config-name \"{config_name}\""
-        )
-
-        # Add additional parameters if provided
-        if updated_manifest_filepaths:
-            cmd += f" model.train_ds.manifest_filepath={updated_manifest_filepaths}"
-        if updated_tarred_filepaths:
-            cmd += f" model.train_ds.tarred_audio_filepaths={updated_tarred_filepaths}"
-
-        return cmd
-
-    def get_export_variables_cmd(self, merged_cfg):
-        wandb_key = os.environ.get("WANDB_API_KEY") or os.environ.get("WANDB") or os.environ.get("WANDB_KEY", "")
-        if not wandb_key:
-            logging.warning("WANDB key not found in environment variables. WANDB logging will not work.")
-
-            # Check if WANDB logging is enabled in the exp_manager config
-            if merged_cfg.get('exp_manager', {}).get('create_wandb_logger', False):
-                raise ValueError(
-                    "WANDB key is required for logging but was not found in environment variables. "
-                    "Please set WANDB_API_KEY to enable WANDB logging."
-                )
-
-        cmd = (
-            "nvidia-smi && "
-            "export PYTHONPATH=/nemo_run/code && "
-            f"export HF_TOKEN={os.getenv('HF_TOKEN', '')} && "
-            f"export WANDB_API_KEY={wandb_key} && ")
-        
-        return cmd
-
-    def get_pl_inference_command(self, inference_configs, shuffle=None):
-        """
-        Generate a command to run PL inference with multiple configuration files.
-        Args:
-            inference_configs (list): List of configuration file paths.
-
-        Returns:
-            str: Combined command string to execute PL inference.
-        """
-        # Base command template
-
-        base_cmd = "python /nemo_run/code/examples/asr/transcribe_speech_parallel.py --config-path \"/results/configs\" --config-name {config_name}"
-        if shuffle is not None:
-            base_cmd += f" predict_ds.shuffle={shuffle}"
-
-        # Generate the command list
-        cmd_list = [base_cmd.format(config_name=os.path.basename(config)) for config in inference_configs]
-
-        # Combine the commands with " && " separator
-        return " && ".join(cmd_list)
-
-    def get_pseudo_labeling_command(
-        self, merged_config: Dict, config_name: str, cluster_script_path: str, config_dir: str, ipl_training: Dict[str, any]) -> str:
-        """
-        Generate the pseudo-labeling command for the given configuration and training parameters.
-
-        Args:
-            merged_config (Dict): Merged configuration containing model and dataset settings.
-            config_name (str): Name of the configuration file to be used.
-            cluster_script_path (str): Path to the cluster execution script.
-            config_dir (str): Directory containing the configuration files.
-            ipl_training (Dict[str, any]): Dictionary containing:
-                - first_run (bool): Whether this is the first run of pseudo-labeling.
-                - num_gpus (int): Number of GPUs to use.
-                - inference_config_paths (List[str]): List of inference configuration file paths.
-                - manifests (List[str]): List of manifest file paths.
-                - tarr_paths (List[str]): List of tarred audio file paths.
-                - num_ipl_epochs (int): Number of epochs to train with pseudo-labels.
-                - p_cache (float): What part of pseudo-labels to update.
-
-        Returns:
-            str: The constructed pseudo-labeling command.
-        """
-        
-        prediction_directories_str = " ".join([os.path.dirname(path) for path in ipl_training['manifests']])
-        inference_config_paths_str = " ".join(ipl_training['inference_config_paths'])
-
-        updated_manifest_filepaths, updated_tarred_audio_filepaths = ipl_utils.update_training_sets(
-            merged_config, ipl_training["manifests"], ipl_training.get("tarr_paths", None), ipl_training["prefix"]
-        )
-        exec_cmd = self.get_export_variables_cmd(merged_cfg=merged_config)
-        exec_cmd += self.get_training_script_cmd(cluster_script_path, config_name)
-        exec_cmd += " && sleep 10"
-        if ipl_training.get("first_run", False):
-            exec_cmd += f" && {self.get_pl_inference_command(ipl_training['inference_config_paths'], shuffle=False)}"
-            exec_cmd += (
-                f" && python /nemo_run/code/examples/asr/run_write_transcribed_files.py "
-                f"--prediction_filepaths {prediction_directories_str} --full_pass --prefix {ipl_training['prefix']}"
-            )
-            if merged_config.model.train_ds.is_tarred:
-                exec_cmd += " --is_tarred"
-            exec_cmd += (
-                f" && python /nemo_run/code/examples/asr/run_update_inf_config.py "
-                f"--inference_configs {inference_config_paths_str} --p_cache {ipl_training['p_cache']} --num_gpus {ipl_training['num_gpus']}"
-            )
-
-        # If run has been interupted user has to change `num_ipl_epochs` in the config
-        for _ in range(ipl_training["num_ipl_epochs"]):
-            run_script = self.get_training_script_cmd(
-                cluster_script_path, config_name, updated_manifest_filepaths, updated_tarred_audio_filepaths
-            )
-            exec_cmd += " && sleep 10"
-            exec_cmd += f" && {run_script}"
-            exec_cmd += f" && {self.get_pl_inference_command(ipl_training['inference_config_paths'],shuffle=True)}"
-            exec_cmd += (
-                f" && python /nemo_run/code/examples/asr/run_write_transcribed_files.py "
-                f"--prediction_filepaths {prediction_directories_str} "
-                f"--prefix {ipl_training['prefix']}"
-            )
-            if merged_config.model.train_ds.is_tarred:
-                exec_cmd += " --is_tarred"
-
-        return exec_cmd
\ No newline at end of file
diff --git a/sdp/processors/nemo/ipl_training.py b/sdp/processors/nemo/ipl_training.py
deleted file mode 100644
index ecc3520a..00000000
--- a/sdp/processors/nemo/ipl_training.py
+++ /dev/null
@@ -1,13 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
\ No newline at end of file
diff --git a/sdp/processors/nemo/ipl_utils.py b/sdp/processors/nemo/ipl_utils.py
deleted file mode 100644
index 0630be4f..00000000
--- a/sdp/processors/nemo/ipl_utils.py
+++ /dev/null
@@ -1,330 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import glob
-import json
-import os
-from typing import List, Optional, Tuple, Union
-
-from omegaconf import OmegaConf
-
-def separate_multiple_transcriptions(inference_config: dict) -> Tuple[List[str], Optional[List[str]]]:
-    """
-    Separates and returns the manifest and tarred audio file paths from the configuration.
-    This function makes it easier to run transcribe_speech_parallel for each bucket separately
-    Args:
-        inference_config (str): Path to the inference configuration file.
-    Returns:
-        Tuple[List[str], Optional[List[str]]]: A tuple containing:
-            - A list of manifest file paths.
-            - An optional list of tarred audio file paths, or None if not applicable.
-    """
-    
-    if hasattr(inference_config.predict_ds, "is_tarred") and inference_config.predict_ds.is_tarred:
-        tarred_audio_filepaths = inference_config.predict_ds.tarred_audio_filepaths
-        manifest_filepaths = inference_config.predict_ds.manifest_filepath
-        if type(tarred_audio_filepaths) != str and len(tarred_audio_filepaths) > 1:
-            manifests = []
-            tarr_audio_files = []
-            for manifest_filepath, tarred_audio_filepath in zip(manifest_filepaths, tarred_audio_filepaths):
-                manifests.append(manifest_filepath[0])
-                tarr_audio_files.append(tarred_audio_filepath[0])
-            return manifests, tarr_audio_files
-        else:
-            return [manifest_filepaths], [tarred_audio_filepaths]
-    else:
-        if isinstance(inference_config.predict_ds.manifest_filepath, str):
-            return [inference_config.predict_ds.manifest_filepath], None
-        else:
-            return inference_config.predict_ds.manifest_filepath, None
-
-
-def create_transcribed_shard_manifests(
-    prediction_filepaths: List[str],
-) -> List[str]:
-    """
-    Creates transcribed shard manifest files by processing predictions and organizing them by shard ID.
-    This function reads a `predictions_all.json` file from each given directory, organizes the data by
-    shard IDs, and writes the entries to separate shard manifest files. For each shard, the `pred_text`
-    field is updated as the main transcription (`text`), and the original transcription (`text`) is
-    stored as `orig_text`.
-    Args:
-        prediction_filepaths (List[str]): A list of file paths to directories containing
-            `predictions_all.json` files with prediction data, including shard IDs.
-    Returns:
-        List[str]: A list of file paths to the combined manifest files (`transcribed_manifest__OP_0..CL_.json`)
-        created for each directory.
-    """
-    all_manifest_filepaths = []
-    for prediction_filepath in prediction_filepaths:
-        max_shard_id = 0
-        shard_data = {}
-        full_path = os.path.join(prediction_filepath, "predictions_all.json")
-        with open(full_path, 'r') as f:
-            for line in f.readlines():
-                data_entry = json.loads(line)
-                shard_id = data_entry.get("shard_id")
-                if max_shard_id < shard_id:
-                    max_shard_id = shard_id
-                if shard_id not in shard_data:
-                    shard_data[shard_id] = []
-                shard_data[shard_id].append(data_entry)
-        for shard_id, entries in shard_data.items():
-            output_filename = os.path.join(prediction_filepath, f"transcribed_manifest_{shard_id}.json")
-            with open(output_filename, 'w') as f:
-                for data_entry in entries:
-                    if data_entry['audio_filepath'].endswith(".wav"):
-                        if 'text' in data_entry:
-                            data_entry['orig_text'] = data_entry.pop('text')
-                        data_entry['text'] = data_entry.pop('pred_text')
-                        json.dump(data_entry, f, ensure_ascii=False)
-                        f.write("\n")
-        shard_manifest_filepath = os.path.join(
-            prediction_filepath, f"transcribed_manifest__OP_0..{max_shard_id}_CL_.json"
-        )
-        all_manifest_filepaths.append(shard_manifest_filepath)
-    return all_manifest_filepaths
-
-
-def create_transcribed_manifests(
-    prediction_filepaths: List[str],
-) -> List[str]:
-    """
-    Creates updated transcribed manifest files by processing predictions.
-    This function reads prediction files (`predictions_all.json`) from the provided directories,
-    updates the transcription data by renaming the `pred_text` field to `text`, and stores the
-    original `text` field as `orig_text`. The updated data is written to new transcribed manifest
-    files (`transcribed_manifest.json`) in each directory.
-    Args:
-        prediction_filepaths (List[str]): A list of file paths to directories containing
-            prediction files (`predictions_all.json`).
-    Returns:
-        List[str]: A list of file paths to the newly created transcribed manifest files
-        (`transcribed_manifest.json`).
-    """
-    all_manifest_filepaths = []
-    for prediction_filepath in prediction_filepaths:
-        prediction_name = os.path.join(prediction_filepath, "predictions_all.json")
-        transcripted_name = os.path.join(prediction_filepath, f"transcribed_manifest.json")
-
-        # Open and read the original predictions_all.json file
-        with open(transcripted_name, 'w', encoding='utf-8') as f:
-            with open(prediction_name, 'r', encoding='utf-8') as pred_f:
-
-                for line in pred_f.readlines():
-                    data_entry = json.loads(line)
-                    if 'text' in data_entry:
-                        data_entry['orig_text'] = data_entry.pop('text')
-                    data_entry['text'] = data_entry.pop('pred_text')
-                    json.dump(data_entry, f, ensure_ascii=False)
-                    f.write("\n")
-            # Append the path of the new manifest file to the list
-            all_manifest_filepaths.append(transcripted_name)
-
-    return all_manifest_filepaths
-
-
-def write_sampled_shard_transcriptions(manifest_filepaths: List[str]) -> List[List[str]]:
-    """
-    Updates transcriptions by merging predicted shard data and transcribed manifest data.
-    This function processes prediction and transcribed manifest files, merges them
-    by matching the shard_id and audio file paths. For each shard, the corresponding
-    data entries are written to a new file.
-    Args:
-        manifest_filepaths (List[str]): A list of file paths to directories containing
-            prediction and transcribed manifest files.
-    Returns:
-        List[List[str]]: A list of lists containing the file paths to the generated
-            transcribed shard manifest files.
-    """
-    all_manifest_filepaths = []
-
-    # Process each prediction directory
-    for prediction_filepath in manifest_filepaths:
-        predicted_shard_data = {}
-        # Collect entries from prediction files based on shard id
-        prediction_path = os.path.join(prediction_filepath, "predictions_all.json")
-        with open(prediction_path, 'r') as f:
-            for line in f:
-                data_entry = json.loads(line)
-                shard_id = data_entry.get("shard_id")
-                audio_filepath = data_entry['audio_filepath']
-                predicted_shard_data.setdefault(shard_id, {})[audio_filepath] = data_entry
-    max_shard_id = 0
-    for full_path in glob.glob(os.path.join(prediction_filepath, f"transcribed_manifest_[0-9]*.json")):
-        all_data_entries = []
-        with open(full_path, 'r') as f:
-            for line in f:
-                data_entry = json.loads(line)
-                shard_id = data_entry.get("shard_id")
-                max_shard_id = max(max_shard_id, shard_id)
-                all_data_entries.append(data_entry)
-        # Write the merged data to a new manifest file keeping new transcriptions
-        output_filename = os.path.join(prediction_filepath, f"transcribed_manifest_{shard_id}.json")
-        with open(output_filename, 'w') as f:
-            for data_entry in all_data_entries:
-                audio_filepath = data_entry['audio_filepath']
-                # Escape duplicated audio files that end with *dup
-                if audio_filepath.endswith(".wav"):
-                    if shard_id in predicted_shard_data and audio_filepath in predicted_shard_data[shard_id]:
-                        predicted_data_entry = predicted_shard_data[shard_id][audio_filepath]
-                        if 'text' in predicted_data_entry:
-                            predicted_data_entry['orig_text'] = predicted_data_entry.pop('text')
-                        if "pred_text" in predicted_data_entry:
-                            predicted_data_entry['text'] = predicted_data_entry.pop('pred_text')
-                        json.dump(predicted_data_entry, f, ensure_ascii=False)
-                    else:
-                        json.dump(data_entry, f, ensure_ascii=False)
-                    f.write("\n")
-
-    shard_manifest_filepath = os.path.join(
-        prediction_filepath, f"transcribed_manifest__OP_0..{max_shard_id}_CL_.json"
-    )
-    all_manifest_filepaths.append([shard_manifest_filepath])
-
-    return all_manifest_filepaths
-
-def write_sampled_transcriptions(manifest_filepaths: List[str]) -> List[str]:
-    """
-    Updates transcriptions by merging predicted data with transcribed manifest data.
-    This function processes prediction and transcribed manifest files within given directories.
-    It matches audio file paths to update transcriptions with predictions, ensuring each audio file
-    is properly transcribed. The updated data is written to the transcribed manifest file.
-    Args:
-        manifest_filepaths (List[str]): A list of file paths to directories containing
-            the prediction file (`predictions_all.json`) and the transcribed manifest file
-            (`transcribed_manifest.json`).
-    Returns:
-        List[str]: A list of file paths to the updated transcribed manifest files.
-    """
-
-    all_manifest_filepaths = []
-    for prediction_filepath in manifest_filepaths:
-        predicted_data = {}
-
-        prediction_path = os.path.join(prediction_filepath, "predictions_all.json")
-        with open(prediction_path, 'r') as f:
-            for line in f:
-                data_entry = json.loads(line)
-                path = data_entry['audio_filepath']
-    
-                predicted_data[path] = data_entry
-        full_path = os.path.join(prediction_filepath, f"transcribed_manifest.json")
-        all_data_entries = []
-        count = 0
-        with open(full_path, 'r') as f:
-            for line in f:
-                count += 1
-                data_entry = json.loads(line)
-                all_data_entries.append(data_entry)
-               
-
-        output_filename = os.path.join(prediction_filepath, f"transcribed_manifest.json")
-        with open(output_filename, 'w') as f:
-            for data_entry in all_data_entries:
-                audio_filepath = data_entry['audio_filepath']
-                if audio_filepath.endswith(".wav"):
-                    if audio_filepath in predicted_data:
-                        predicted_data_entry = predicted_data[audio_filepath]
-                        if 'text' in predicted_data_entry:
-                            predicted_data_entry['orig_text'] = predicted_data_entry.pop('text')
-                        predicted_data_entry['text'] = predicted_data_entry.pop('pred_text')
-                        json.dump(predicted_data_entry, f, ensure_ascii=False)
-                        f.write("\n")
-                    else:
-                        json.dump(data_entry, f, ensure_ascii=False)
-                        f.write("\n")
-        all_manifest_filepaths.append(output_filename)
-    return all_manifest_filepaths
-
-
-def update_training_sets(
-    merged_config: OmegaConf, final_cache_manifests: list, tarred_audio_filepaths: Union[list, str]
-) -> OmegaConf:
-    """
-    Adds pseudo-labeled sets to the training datasets based on dataset type and
-    handles tarred audio files differently. The function updates the 'manifest_filepath'
-    and 'tarred_audio_filepaths' fields in the training dataset configuration.
-    Args:
-        merged_config: The configuration object containing the model and dataset settings.
-        final_cache_manifests: A list of paths to the manifest files for the pseudo-labeled data.
-        tarred_audio_filepaths: A string or list of tarred audio file paths to be added to the training set.
-    Returns:
-        merged_config: The updated configuration object with the new training datasets.
-    """
-
-    print()
-    print(f"update_training_sets")
-    print(f"")
-    if merged_config.model.train_ds.get("is_tarred", False):
-        if isinstance(tarred_audio_filepaths, str):
-            if isinstance(merged_config.model.train_ds['tarred_audio_filepaths'], str):
-                merged_config.model.train_ds['tarred_audio_filepaths'] = [
-                    [merged_config.model.train_ds['tarred_audio_filepaths']],
-                    [tarred_audio_filepaths],
-                ]
-            else:
-                merged_config.model.train_ds.tarred_audio_filepaths.append(tarred_audio_filepaths)
-        else:
-            if isinstance(merged_config.model.train_ds.tarred_audio_filepaths, str):
-                merged_config.model.train_ds.tarred_audio_filepaths = [
-                    [merged_config.model.train_ds.tarred_audio_filepaths]
-                ]
-            merged_config.model.train_ds.tarred_audio_filepaths += tarred_audio_filepaths
-
-        if isinstance(merged_config.model.train_ds.manifest_filepath, str):
-            merged_config.model.train_ds.manifest_filepath = [merged_config.model.train_ds.manifest_filepath]
-
-        merged_config.model.train_ds.manifest_filepath += final_cache_manifests
-
-    else:
-        print(f"is not tarred")
-        if isinstance(merged_config.model.train_ds.manifest_filepath, str):
-            print(f"is str")
-            merged_config.model.train_ds.manifest_filepath = [merged_config.model.train_ds.manifest_filepath]
-
-        if merged_config.model.train_ds.get("use_lhotse", False):
-            print(f"is lhotse")
-            merged_config.model.train_ds.manifest_filepath = [merged_config.model.train_ds.manifest_filepath]
-            merged_config.model.train_ds.manifest_filepath.append(final_cache_manifests)
-        else:
-            print(f"not lhotse")
-            print(f"merged_config.model.train_ds.manifest_filepath {merged_config.model.train_ds.manifest_filepath}")
-            print(f"final_cache_manifests {final_cache_manifests}")
-            merged_config.model.train_ds.manifest_filepath += final_cache_manifests
-
-
-    return merged_config
-
-
-def count_files_for_pseudo_labeling(manifest_filepath: str, is_tarred: bool) -> int:
-    """
-    Counts the number of files for pseudo-labeling.
-    Args:
-        manifest_filepath (str): The path to the manifest file(s).
-        is_tarred (bool): Flag to determine whether to count files for multiple shard manifests.
-    Returns:
-        int: The total number of audio files given for pseudo labeling.
-    """
-    if is_tarred:
-        dir_path, filename = os.path.split(manifest_filepath)
-        prefix = filename.split('_', 1)[0]
-        number_of_files = 0
-        for full_path in glob.glob(os.path.join(dir_path, f"{prefix}_[0-9]*.json")):
-            with open(full_path, 'r') as f:
-                number_of_files += len(f.readlines())
-    else:
-        with open(manifest_filepath, 'r') as f:
-            number_of_files = len(f.readlines())
-
-    return number_of_files
\ No newline at end of file
diff --git a/sdp/processors/nemo/nemo_run_ipl.py b/sdp/processors/nemo/nemo_run_ipl.py
deleted file mode 100644
index b615e9ca..00000000
--- a/sdp/processors/nemo/nemo_run_ipl.py
+++ /dev/null
@@ -1,386 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import datetime
-import os
-from pathlib import Path
-from typing import Dict, List
-import argparse
-import nemo_run as run
-from omegaconf import OmegaConf, open_dict
-
-from sdp.utils import nemo_run_utils, ipl_utils
-import logging
-from sdp.processors.IPL.ipl_processors import TrainingCommandGenerator, InferenceCommandGenerator
-# NEMO_ROOT = Path(__file__).absolute().parents[2]
-
-def gather_mounts(cluster_cfg):
-    """
-    Gather all mounts from the cluster config including ones which are disjoint from the cluster_cfg.mounts list.
-    It is used because Hydra does not support the ability to append to a list in the config file natively.
-
-    Users can provide additional mounts from the command line using the following syntax:
-    ++mount_<anything>='/src:/dest'
-
-    Args:
-        cluster_cfg: Cluster config dictionary with following fields.
-            
-            script (str): Path to the main Python script to be executed.
-            script_config (str): Path to the YAML config used by the script.
-            exp_name (str or None): Name of the experiment. If None, it is inferred from `exp_manager.name`
-              in the script configuration.
-            results_dir (str): Path to the directory where results should be saved.
-            
-            num_runs (int): Number of times to repeat the experiment.
-            num_gpus (int): Number of GPUs to allocate per run.
-            num_tasks_per_node (int): Number of tasks per node.
-            max_runtime (str): Max allowed runtime in Slurm format (DD:HH:MM:SS). Default is "00:03:45:00".
-
-            executor (str): Type of job executor, e.g., 'slurm', 'local'.
-
-            ssh_tunnel:
-                host (str): Hostname for the SSH tunnel.
-                user (str): Username for SSH login. Can be `${USER}` to auto-resolve.
-                job_dir (str): Remote path where jobs will be created and results uploaded.
-                identity (str): Path to SSH identity file. Resolved from environment variable `${NEMO_OCI_IAD_SSH_IDENTITY}`.
-
-            account (str): Account name used for SLURM job submissions.
-            partition (str): Comma-separated list of SLURM partitions to use.
-            job_name_prefix (str): Prefix for SLURM job names.
-
-            containers:
-                asr (str): URI or path to the container image used for ASR jobs.
-
-            env_vars:
-                List[str]: List of environment variable declarations to be set in the job,
-                e.g., 'TOKENIZERS_PARALLELISM=false', 'HYDRA_FULL_ERROR=1', etc.
-             
-            required_env_vars (List[str]): List of env vars that **must** be present in the environment before running.
-                - 'HF_TOKEN'
-                - 'WANDB_KEY'
-            mounts:
-                - /paths/to/be/mounted:/paths/to/mount/t
-
-            timeouts:
-                partition_name: 04:00:00 (max runtime for execution)
-    """ 
-    # Gather all mounts from the cluster config including ones which are disjoint from the cluster_cfg.mounts list.
-    mounts = cluster_cfg.get('mounts', [])
-    # Resolve any mounts in th cluster config that need user expansion
-    mounts = [os.path.expanduser(m) for m in mounts]
-
-    keys = list(cluster_cfg.keys())
-    # Check for any additional mounts in the cluster config
-    with open_dict(cluster_cfg):
-        for k in keys:
-            if k.startswith("mount_"):  # Additional mount found
-                logging.info(f"Found additional mount flag in the cluster config `{k}`. Adding it to the mounts list.")
-                mounts.append(cluster_cfg[k])
-                del cluster_cfg[k]  # Remove the key from the cluster config
-
-        cluster_cfg['mounts'] = mounts
-        logging.info(f"Final Mounts: {mounts}")
-
-
-# def check_root_path(path, nemo_root):
-#     """
-#     Check if a path is in the NeMo root directory and convert it to a path that is relative to the NeMo root directory.
-#     This is used to ensure that any path that is provided to this script will be in the NeMo root directory when
-#     mounted in the container.
-
-#     Args:
-#         path: Path to check
-#         nemo_root: NeMo root directory
-
-#     Returns:
-#         str: Path relative to the NeMo root directory
-#     """
-#     path = str(path)
-#     nemo_root = str(nemo_root)
-
-#     if not os.path.exists(path):
-#         raise FileNotFoundError(f"Path {path} does not exist.")
-
-#     if not path.startswith(nemo_root):
-#         raise ValueError(f"Path {path} is not in the NeMo root directory.")
-
-#     new_path = path.replace(nemo_root, '/nemo_run/code/')
-#     return new_path
-
-
-def check_config_mount_paths(script_config, cluster_config):
-    """
-    Check if all path-like strings in the script config are mounted paths in the cluster config.
-    If a path-like string is not a mounted path, raise an error.
-
-    Args:
-        script_config: Script config dictionary that represents the Model training/inference config
-        cluster_config: Cluster config dictionary that represents the cluster configuration
-    """
-    # recursively walk all values of the script_config, checking if its a path-like string and if so, check if the path is a mounted path
-    # if it is not, raise an error
-
-    def filepath_check(v, cluster_cfg):
-        if v.startswith(os.path.sep):  # check for absolute paths only
-            logging.info(f"Checking if {v} is a mounted path")
-            # Check if the path begins with mount path
-            nemo_run_utils.check_if_mounted(cluster_cfg, v)
-
-            # Check the file exists in the cluster at the unmounted path
-            unmounted_path = nemo_run_utils.get_unmounted_filepath(cluster_cfg, v)
-            nemo_run_utils.check_remote_mount_directories(unmounted_path, cluster_cfg)
-
-    def check_mounted_path(cfg, cluster_cfg):
-        if hasattr(cfg, 'items'):  # if the object is a dictionary
-            for k, v in cfg.items():
-                if hasattr(v, 'items'):  # if the value is a dictionary, recurse
-                    check_mounted_path(v, cluster_cfg)
-
-                elif isinstance(v, list):  # if the value is a list, check if its items are an absolute path
-                    for item in v:
-                        if isinstance(item, str):
-                            filepath_check(item, cluster_cfg)
-
-                elif isinstance(v, str):  # if the value is a string, check if its an absolute a path
-                    filepath_check(v, cluster_cfg)
-
-    check_mounted_path(script_config, cluster_config)
-
-    return 
-
-
-def get_export_variables_cmd(merged_cfg):
-    wandb_key = os.environ.get("WANDB_API_KEY") or os.environ.get("WANDB") or os.environ.get("WANDB_KEY", "")
-    if not wandb_key:
-        logging.warning("WANDB key not found in environment variables. WANDB logging will not work.")
-
-        # Check if WANDB logging is enabled in the exp_manager config
-        if merged_cfg.get('exp_manager', {}).get('create_wandb_logger', False):
-            raise ValueError(
-                "WANDB key is required for logging but was not found in environment variables. "
-                "Please set WANDB_API_KEY to enable WANDB logging."
-            )
-
-    cmd = (
-        "nvidia-smi && "
-        "export PYTHONPATH=/nemo_run/code && "
-        f"export HF_TOKEN={os.getenv('HF_TOKEN', '')} && "
-        f"export WANDB_API_KEY={wandb_key} && ")
-    
-    return cmd
-
-from sdp.processors.IPL.ipl_processors import TrainingCommandGenerator, InferenceCommandGenerator
-
-def get_pseudo_labeling_command(
-    train_command_config: dict, 
-    inference_command_config: dict, 
-    num_ipl_epochs: int,
-    new_manifest_files,
-    new_tarr_files,
-    first_run: False,
-    
-) -> str:
-    """
-    Generate the pseudo-labeling command for the given configuration and training parameters using processors.
-
-    Args:
-        train_command_config (dict): Config for TrainingCommandGenerator.
-        inference_command_config (dict): Config for InferenceCommandGenerator.
-        num_ipl_epochs (int): Number of epochs to train with pseudo-labels.
-
-    Returns:
-        str: The constructed pseudo-labeling command.
-    """
-    # Instantiate processors
-    train_proc = TrainingCommandGenerator(**train_command_config)
-    infer_proc = InferenceCommandGenerator(**inference_command_config)
-
-    exec_cmd = train_proc.process()
-    exec_cmd += " && sleep 10"
-    exec_cmd += " && " + infer_proc.process(first_run=first_run)
-
-    # For subsequent epochs, set first_run to False
-    for _ in range(num_ipl_epochs):
-        exec_cmd += " && sleep 10"
-        exec_cmd += " && " + train_proc.process(new_manifest_files, new_tarr_files)
-        exec_cmd += " && " + infer_proc.process(first_run=False)
-
-    return exec_cmd
-
-
-def main(config_path: str):
-    """
-    Main entry point for running IPL training.
-    
-    Args:
-        config_path (str): Path to the YAML configuration file
-    """
-    # Load the cluster config from YAML
-    cluster_cfg = OmegaConf.load(config_path)
-    
-    # Process the required arguments from the cluster config
-    script_path = cluster_cfg.script
-    script_config_path = cluster_cfg.script_config
-    results_dir = cluster_cfg.results_dir
-    NEMO_ROOT = cluster_cfg.nemo_directory
-
-    script_config_path = Path(script_config_path).absolute()
-
-    # Gather all mounts from the cluster config
-    gather_mounts(cluster_cfg)
-
-    # Add the results directory to the cluster config as a mount path
-    nemo_run_utils.add_mount_path(results_dir, '/results', cluster_cfg)
-
-    # Create results and logdir
-    log_dir = cluster_cfg.get('log_dir', os.path.join(results_dir, 'logs'))
-    nemo_run_utils.create_remote_directory([results_dir, log_dir], cluster_cfg)
-
-    # Load the script config
-    script_config = OmegaConf.load(script_config_path)
-
-    # Update the exp_manager runtime with the max_runtime from the cluster config
-    import copy
-    # Perform all path checks in the merged config
-    if "ipl_training" in script_config.model:
-        ipl_training = copy.deepcopy(script_config.model.ipl_training)
-        # not to check the path
-        del script_config.model.ipl_training.inference_config
-    else:
-        raise KeyError("Parameters for `IPL` training are not provided.")
-    
-    check_config_mount_paths(script_config, cluster_cfg)
-
-    inference_config = ipl_training.inference_config
-    inference_config_path = Path(inference_config).absolute()
-    inference_config = OmegaConf.load(inference_config_path)
-
-    # Resolve experiment name; if not provided in the script config file, check the cluster config
-    exp_name = cluster_cfg.exp_name
-    if exp_name is None:
-        if 'exp_manager' in script_config and 'name' in script_config['exp_manager']:
-            exp_name = script_config['exp_manager']['name']
-        else:
-            raise ValueError(
-                "Experiment name not provided in the run config file (`exp_name`)) or the cluster config (inside exp_manager.name)"
-            )
-
-    # Begin NeMo Run setup
-    with run.Experiment(exp_name) as exp:
-        # Create the config file name
-        timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
-        config_name = f"{exp_name}_{timestamp}_config.yaml"
-
-        # Copy the merged config file to remote location's /results/configs directory
-        config_dir = os.path.join(results_dir, 'configs')
-        train_config_cluster = nemo_run_utils.create_remote_config(script_config, config_name, config_dir, cluster_cfg)
-
-        # Prepare arguments for the slurm job
-        job_name = f"{exp_name}_job"
-
-        # Get run parameters from the config
-        num_runs = cluster_cfg.num_runs  # Number of dependent jobs for this script
-        num_gpus = cluster_cfg.get('num_gpus', script_config['trainer']['devices'])
-        if isinstance(num_gpus, list):
-            num_gpus = len(num_gpus)
-        if num_gpus == -1:
-            num_gpus = 1 if cluster_cfg['executor'] == 'local' else 8
-            logging.warning(f"\n\nSetting num_gpus to {num_gpus} as it was set to -1\n\n")
-        num_nodes = cluster_cfg.get('num_nodes', script_config['trainer'].get('num_nodes', 1))
-
-
-        checkpoint_dir = os.path.join(
-            os.path.join(script_config.exp_manager.exp_dir, script_config.exp_manager.name), "checkpoints"
-        )
-        checkpoint_name = os.path.join(checkpoint_dir, script_config.exp_manager.name + ".nemo")
-        inference_config_paths, manifests, tarr_paths = nemo_run_utils.create_remote_inference_config(
-            cluster_cfg, config_dir, inference_config, checkpoint_name
-        )
-        check_config_mount_paths(inference_config, cluster_cfg)
-
-        train_command_generator_config = { 
-            "nemo_directory": NEMO_ROOT,
-            "training_config_local": script_config,
-            "training_config_cluster": train_config_cluster,
-            "training_script_path": script_path,
-            "output_manifest_file": "./train_output_manifest_filepath.json",
-        }   
-        inference_command_generator_config = {
-            "nemo_directory": NEMO_ROOT,
-            "inference_config_paths": inference_config_paths,
-            "manifests": manifests,
-            "p_cache": script_config.model.ipl_training.p_cache,
-            "num_gpus": num_nodes * num_gpus,
-            "is_tarred": getattr(script_config.model.train_ds, "is_tarred", False),
-            "output_manifest_file": "./inference_output_manifest_filepath.json",
-        }
-
-
-        cmd = get_pseudo_labeling_command(
-            train_command_generator_config,
-            inference_command_generator_config,
-            num_ipl_epochs=script_config.model.ipl_training.num_ipl_epochs,
-            new_manifest_files=manifests,
-            new_tarr_files=tarr_paths,
-            first_run=True,
-        ) 
-
-        # # Cast the cluster config to a dictionary for compatibility with NeMo Run
-        cluster_cfg = OmegaConf.to_object(cluster_cfg)
-
-       # logging.info(f"Scheduling {num_runs} runs of the script {script_path}...")
-
-        task = None
-        for run_id in range(num_runs):
-            # Add the task to the experiment
-            if run_id == 0:
-                task = None
-            else:
-                if ipl_training:
-                    cmd = get_pseudo_labeling_command(
-                        train_command_generator_config,
-                        inference_command_generator_config,
-                        num_ipl_epochs=script_config.model.ipl_training.num_ipl_epochs,
-                        new_manifest_files=manifests,
-                        new_tarr_files=tarr_paths,
-                        first_run=False
-                    ) 
-                task = [task]
-            print(f"will add task")
-            task = nemo_run_utils.add_task(
-                exp,
-                cmd=cmd,
-                task_name=job_name,
-                cluster_config=cluster_cfg,
-                container=cluster_cfg['containers']['asr'],
-                num_tasks=cluster_cfg.get('num_tasks', cluster_cfg.get('num_tasks_per_node', 1)),
-                num_gpus=num_gpus,
-                num_nodes=num_nodes,
-                log_dir=nemo_run_utils.get_mounted_filepath(cluster_cfg, log_dir),
-                partition=cluster_cfg.get('partition', None),
-                task_dependencies=task,
-            )
-
-        # Run the experiment on the cluster with all the tasks
-        nemo_run_utils.run_exp(exp, cluster_cfg)
-
-
-if __name__ == '__main__':
-    
-    
-    parser = argparse.ArgumentParser(description='Run IPL training with configuration')
-    parser.add_argument('--config', type=str, required=True, help='Path to the YAML configuration file')
-    args = parser.parse_args()
-    
-    main(args.config)
diff --git a/sdp/utils/ipl_utils.py b/sdp/utils/ipl_utils.py
index 53b6b807..0630be4f 100644
--- a/sdp/utils/ipl_utils.py
+++ b/sdp/utils/ipl_utils.py
@@ -11,27 +11,25 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
+import glob
+import json
 import os
-from typing import List, Optional, Tuple
-
-from omegaconf import DictConfig
+from typing import List, Optional, Tuple, Union
 
+from omegaconf import OmegaConf
 
-def separate_bucket_transcriptions(inference_config: str) -> tuple:
+def separate_multiple_transcriptions(inference_config: dict) -> Tuple[List[str], Optional[List[str]]]:
     """
-    Separates manifests and audio file paths from different buckets.
-
+    Separates and returns the manifest and tarred audio file paths from the configuration.
+    This function makes it easier to run transcribe_speech_parallel for each bucket separately
     Args:
-        inference_config (str): The configuration object for inference.
-
+        inference_config: Loaded inference configuration object (attribute access); its `predict_ds` section is read.
     Returns:
-        tuple: A tuple containing:
-            - manifests (list): A list of manifest file paths.
-            - tarr_audio_files (list or None): A list of tarred audio file paths or None if
-              the dataset is not tarred.
+        Tuple[List[str], Optional[List[str]]]: A tuple containing:
+            - A list of manifest file paths.
+            - An optional list of tarred audio file paths, or None if not applicable.
     """
-
+    
     if hasattr(inference_config.predict_ds, "is_tarred") and inference_config.predict_ds.is_tarred:
         tarred_audio_filepaths = inference_config.predict_ds.tarred_audio_filepaths
         manifest_filepaths = inference_config.predict_ds.manifest_filepath
@@ -46,97 +44,287 @@ def separate_bucket_transcriptions(inference_config: str) -> tuple:
             return [manifest_filepaths], [tarred_audio_filepaths]
     else:
         if isinstance(inference_config.predict_ds.manifest_filepath, str):
-            return [inference_config.predict_ds.manifest_filepath ], None
+            return [inference_config.predict_ds.manifest_filepath], None
         else:
             return inference_config.predict_ds.manifest_filepath, None
 
 
-def get_transcribed_names(manifest_filepaths: List[str], prefix: str, is_tarred: bool=False) -> List[List[str]]:
+def create_transcribed_shard_manifests(
+    prediction_filepaths: List[str],
+) -> List[str]:
     """
-    Generates a list of modified file paths by prepending 'transcribed_' to the filenames.
-    The use case is for non AIStore datasets
-
+    Creates transcribed shard manifest files by processing predictions and organizing them by shard ID.
+    This function reads a `predictions_all.json` file from each given directory, groups the entries by
+    their `shard_id`, and writes each group to `transcribed_manifest_<shard_id>.json`. For every entry
+    written, `pred_text` becomes the main transcription (`text`) and a pre-existing `text` value is
+    kept as `orig_text`. Entries whose `audio_filepath` does not end in ".wav" are not written.
     Args:
-        manifest_filepaths (list of str): A list of file paths to be modified.
-
+        prediction_filepaths (List[str]): A list of file paths to directories containing
+            `predictions_all.json` files with prediction data, including shard IDs.
     Returns:
-        list of list of str: A list where each element is a single-item list containing the updated file path.
-    Example:
-        >>> manifest_filepaths = [
-        ...     "/path/to/manifest_1.json",
-        ...     "/path/to/manifest_2.json"
-        ... ]
-        >>> get_transcribed_names(manifest_filepaths)
-        [
-            ["/path/to/prefix_transcribed_manifest_1.json"],
-            ["/path/to/prefix_transcribed_manifest_2.json"]
-        ]
+        List[str]: One range-style path per directory (`transcribed_manifest__OP_0..<max_shard_id>_CL_.json`);
+        only the per-shard files are written, the returned name itself is not created here.
     """
-    # For manifest_filepath, modify the filenames by prepending 'prefix_transcribed_'
-    transcribed_paths = []
-
-    for file_path in manifest_filepaths:
-        directory, filename = os.path.split(file_path)
-        
-        new_filename = (
-            f"{prefix}_transcribed_{filename}" if is_tarred 
-            else f"{prefix}_transcribed_manifest.json"
+    all_manifest_filepaths = []
+    for prediction_filepath in prediction_filepaths:
+        max_shard_id = 0
+        shard_data = {}
+        full_path = os.path.join(prediction_filepath, "predictions_all.json")
+        # UTF-8 is explicit because entries are written back with ensure_ascii=False.
+        with open(full_path, 'r', encoding='utf-8') as f:
+            for line in f:
+                data_entry = json.loads(line)
+                shard_id = data_entry.get("shard_id")
+                max_shard_id = max(max_shard_id, shard_id)
+                shard_data.setdefault(shard_id, []).append(data_entry)
+        for shard_id, entries in shard_data.items():
+            output_filename = os.path.join(prediction_filepath, f"transcribed_manifest_{shard_id}.json")
+            with open(output_filename, 'w', encoding='utf-8') as f:
+                for data_entry in entries:
+                    # Only entries whose audio path ends in ".wav" are kept.
+                    if data_entry['audio_filepath'].endswith(".wav"):
+                        if 'text' in data_entry:
+                            data_entry['orig_text'] = data_entry.pop('text')
+                        data_entry['text'] = data_entry.pop('pred_text')
+                        json.dump(data_entry, f, ensure_ascii=False)
+                        f.write("\n")
+        # Report a single range-style name covering shards 0..max_shard_id of this directory.
+        shard_manifest_filepath = os.path.join(
+            prediction_filepath, f"transcribed_manifest__OP_0..{max_shard_id}_CL_.json"
     )
-        transcribed_paths.append([os.path.join(directory, new_filename)])
+        all_manifest_filepaths.append(shard_manifest_filepath)
+    return all_manifest_filepaths
 
-    return transcribed_paths
 
-
-def update_training_sets(
-    config: DictConfig,
-    updated_manifest_filepaths: List[str],
-    updated_tarred_audio_filepaths: Optional[List[str]] = None,
-    prefix:str  = ""
-) -> Tuple[str, str]:
+def create_transcribed_manifests(
+    prediction_filepaths: List[str],
+) -> List[str]:
+    """
+    Creates updated transcribed manifest files by processing predictions.
+    This function reads prediction files (`predictions_all.json`) from the provided directories,
+    updates the transcription data by renaming the `pred_text` field to `text`, and stores the
+    original `text` field as `orig_text`. The updated data is written to new transcribed manifest
+    files (`transcribed_manifest.json`) in each directory.
+    Args:
+        prediction_filepaths (List[str]): A list of file paths to directories containing
+            prediction files (`predictions_all.json`).
+    Returns:
+        List[str]: A list of file paths to the newly created transcribed manifest files
+        (`transcribed_manifest.json`).
     """
-    Updates the training dataset configuration by adding pseudo-labeled datasets
-    to the training paths based on the dataset type.
+    all_manifest_filepaths = []
+    for prediction_filepath in prediction_filepaths:
+        prediction_name = os.path.join(prediction_filepath, "predictions_all.json")
+        transcripted_name = os.path.join(prediction_filepath, "transcribed_manifest.json")
+
+        # Open the predictions first so a missing input cannot truncate an existing manifest
+        with open(prediction_name, 'r', encoding='utf-8') as pred_f:
+            with open(transcripted_name, 'w', encoding='utf-8') as f:
 
+                for line in pred_f:
+                    data_entry = json.loads(line)
+                    if 'text' in data_entry:
+                        data_entry['orig_text'] = data_entry.pop('text')
+                    data_entry['text'] = data_entry.pop('pred_text')
+                    json.dump(data_entry, f, ensure_ascii=False)
+                    f.write("\n")
+        # Record the manifest only after it has been written completely
+        all_manifest_filepaths.append(transcripted_name)
+
+    return all_manifest_filepaths
+
+
+def write_sampled_shard_transcriptions(manifest_filepaths: List[str]) -> List[List[str]]:
+    """
+    Updates transcriptions by merging predicted shard data and transcribed manifest data.
+    For every directory, `predictions_all.json` is indexed by shard_id and audio file path, and each
+    `transcribed_manifest_<N>.json` found there is rewritten: entries that have a fresh prediction are
+    replaced by it (`pred_text` becomes `text`, the previous `text` becomes `orig_text`).
     Args:
-        config (DictConfig): Training config file to be updated.
-        updated_manifest_filepaths (List[str]): List of updated manifest file paths to be included.
-        updated_tarred_audio_filepaths (Optional[List[str]]): List of updated tarred audio filepaths to be included.
+        manifest_filepaths (List[str]): A list of file paths to directories containing
+            prediction and transcribed manifest files.
+    Returns:
+        List[List[str]]: One single-item list per input directory, holding the range-style
+            manifest name (`transcribed_manifest__OP_0..<max_shard_id>_CL_.json`).
+    """
+    all_manifest_filepaths = []
 
+    # Process each prediction directory
+    for prediction_filepath in manifest_filepaths:
+        predicted_shard_data = {}
+        # Collect entries from prediction files based on shard id
+        prediction_path = os.path.join(prediction_filepath, "predictions_all.json")
+        with open(prediction_path, 'r', encoding='utf-8') as f:
+            for line in f:
+                data_entry = json.loads(line)
+                shard_id = data_entry.get("shard_id")
+                audio_filepath = data_entry['audio_filepath']
+                predicted_shard_data.setdefault(shard_id, {})[audio_filepath] = data_entry
+        max_shard_id = 0
+        for full_path in glob.glob(os.path.join(prediction_filepath, "transcribed_manifest_[0-9]*.json")):
+            all_data_entries = []
+            with open(full_path, 'r', encoding='utf-8') as f:
+                for line in f:
+                    data_entry = json.loads(line)
+                    shard_id = data_entry.get("shard_id")
+                    max_shard_id = max(max_shard_id, shard_id)
+                    all_data_entries.append(data_entry)
+            # Rewrite this shard's manifest keeping new transcriptions (shard_id comes from its last entry)
+            output_filename = os.path.join(prediction_filepath, f"transcribed_manifest_{shard_id}.json")
+            with open(output_filename, 'w', encoding='utf-8') as f:
+                for data_entry in all_data_entries:
+                    audio_filepath = data_entry['audio_filepath']
+                    # Skip duplicated audio entries, i.e. paths that do not end in ".wav" (e.g. *dup)
+                    if audio_filepath.endswith(".wav"):
+                        if shard_id in predicted_shard_data and audio_filepath in predicted_shard_data[shard_id]:
+                            predicted_data_entry = predicted_shard_data[shard_id][audio_filepath]
+                            if 'text' in predicted_data_entry:
+                                predicted_data_entry['orig_text'] = predicted_data_entry.pop('text')
+                            if "pred_text" in predicted_data_entry:
+                                predicted_data_entry['text'] = predicted_data_entry.pop('pred_text')
+                            json.dump(predicted_data_entry, f, ensure_ascii=False)
+                        else:
+                            json.dump(data_entry, f, ensure_ascii=False)
+                        f.write("\n")
+
+        shard_manifest_filepath = os.path.join(
+            prediction_filepath, f"transcribed_manifest__OP_0..{max_shard_id}_CL_.json"
+        )
+        all_manifest_filepaths.append([shard_manifest_filepath])
+
+    return all_manifest_filepaths
+
+def write_sampled_transcriptions(manifest_filepaths: List[str]) -> List[str]:
+    """
+    Updates transcriptions by merging predicted data with transcribed manifest data.
+    For every directory, `predictions_all.json` is indexed by audio file path and
+    `transcribed_manifest.json` is rewritten in place: entries with a fresh prediction are replaced
+    by it (`pred_text` becomes `text`, the previous `text` becomes `orig_text`), others are kept.
+    Args:
+        manifest_filepaths (List[str]): A list of file paths to directories containing
+            the prediction file (`predictions_all.json`) and the transcribed manifest file
+            (`transcribed_manifest.json`).
     Returns:
-        Tuple[str, str]: A tuple containing:
-            - Updated manifest file paths as a string, formatted for Omegaconf.
-            - Updated tarred audio file paths as a string, formatted for Omegaconf.
+        List[str]: A list of file paths to the updated transcribed manifest files.
     """
-    updated_manifest_filepaths = get_transcribed_names(updated_manifest_filepaths, prefix, is_tarred=config.model.train_ds.get("is_tarred", False))
-    manifest_filepath = config.model.train_ds.manifest_filepath
 
-    if updated_tarred_audio_filepaths:
-        updated_tarred_audio_filepaths = [[path] for path in updated_tarred_audio_filepaths]
+    all_manifest_filepaths = []
+    for prediction_filepath in manifest_filepaths:
+        predicted_data = {}
+
+        # Index predictions by audio path so manifest entries can be matched directly.
+        prediction_path = os.path.join(prediction_filepath, "predictions_all.json")
+        with open(prediction_path, 'r', encoding='utf-8') as f:
+            for line in f:
+                data_entry = json.loads(line)
+                path = data_entry['audio_filepath']
+                predicted_data[path] = data_entry
+
+        # The manifest is rewritten in place below, so load it completely first.
+        full_path = os.path.join(prediction_filepath, "transcribed_manifest.json")
+        all_data_entries = []
+        with open(full_path, 'r', encoding='utf-8') as f:
+            for line in f:
+                data_entry = json.loads(line)
+                all_data_entries.append(data_entry)
 
-    # Updating the configuration based on dataset types
-    if config.model.train_ds.get("is_tarred", False):
-        tarred_audio_filepaths = config.model.train_ds.tarred_audio_filepaths
+        output_filename = os.path.join(prediction_filepath, "transcribed_manifest.json")
+        with open(output_filename, 'w', encoding='utf-8') as f:
+            for data_entry in all_data_entries:
+                audio_filepath = data_entry['audio_filepath']
+                if audio_filepath.endswith(".wav"):
+                    if audio_filepath in predicted_data:
+                        predicted_data_entry = predicted_data[audio_filepath]
+                        # Convert once; a repeated audio path reuses the already converted entry.
+                        if 'pred_text' in predicted_data_entry:
+                            if 'text' in predicted_data_entry:
+                                predicted_data_entry['orig_text'] = predicted_data_entry.pop('text')
+                            predicted_data_entry['text'] = predicted_data_entry.pop('pred_text')
+                        json.dump(predicted_data_entry, f, ensure_ascii=False)
+                    else:
+                        json.dump(data_entry, f, ensure_ascii=False)
+                    f.write("\n")
+        all_manifest_filepaths.append(output_filename)
+    return all_manifest_filepaths
+
+
+def update_training_sets(
+    merged_config: OmegaConf, final_cache_manifests: list, tarred_audio_filepaths: Union[list, str]
+) -> OmegaConf:
+    """
+    Adds pseudo-labeled sets to the training datasets based on dataset type and
+    handles tarred audio files differently. The function updates the 'manifest_filepath'
+    and 'tarred_audio_filepaths' fields in the training dataset configuration.
+    Args:
+        merged_config: The configuration object containing the model and dataset settings.
+        final_cache_manifests: A list of paths to the manifest files for the pseudo-labeled data.
+        tarred_audio_filepaths: A string or list of tarred audio file paths to be added to the training set.
+    Returns:
+        merged_config: The updated configuration object with the new training datasets.
+    """
+
+    print()
+    print(f"update_training_sets")
+    print(f"")
+    if merged_config.model.train_ds.get("is_tarred", False):
         if isinstance(tarred_audio_filepaths, str):
-            updated_tarred_audio_filepaths.append([tarred_audio_filepaths])
-            updated_manifest_filepaths.append([manifest_filepath])
+            if isinstance(merged_config.model.train_ds['tarred_audio_filepaths'], str):
+                merged_config.model.train_ds['tarred_audio_filepaths'] = [
+                    [merged_config.model.train_ds['tarred_audio_filepaths']],
+                    [tarred_audio_filepaths],
+                ]
+            else:
+                merged_config.model.train_ds.tarred_audio_filepaths.append(tarred_audio_filepaths)
         else:
-            updated_tarred_audio_filepaths += tarred_audio_filepaths
-            updated_manifest_filepaths += manifest_filepath
+            if isinstance(merged_config.model.train_ds.tarred_audio_filepaths, str):
+                merged_config.model.train_ds.tarred_audio_filepaths = [
+                    [merged_config.model.train_ds.tarred_audio_filepaths]
+                ]
+            merged_config.model.train_ds.tarred_audio_filepaths += tarred_audio_filepaths
+
+        if isinstance(merged_config.model.train_ds.manifest_filepath, str):
+            merged_config.model.train_ds.manifest_filepath = [merged_config.model.train_ds.manifest_filepath]
+
+        merged_config.model.train_ds.manifest_filepath += final_cache_manifests
+
     else:
-        if config.model.train_ds.get("use_lhotse", False):
-            if isinstance(manifest_filepath, str):
-                updated_manifest_filepaths.append([manifest_filepath])
-            else:
-                updated_manifest_filepaths += manifest_filepath
+        print(f"is not tarred")
+        if isinstance(merged_config.model.train_ds.manifest_filepath, str):
+            print(f"is str")
+            merged_config.model.train_ds.manifest_filepath = [merged_config.model.train_ds.manifest_filepath]
+
+        if merged_config.model.train_ds.get("use_lhotse", False):
+            print(f"is lhotse")
+            merged_config.model.train_ds.manifest_filepath = [merged_config.model.train_ds.manifest_filepath]
+            merged_config.model.train_ds.manifest_filepath.append(final_cache_manifests)
         else:
-            updated_manifest_filepaths = [item for sublist in updated_manifest_filepaths for item in sublist]
-            if isinstance(manifest_filepath, str):
-                updated_manifest_filepaths.append(manifest_filepath)
-            else:
-                updated_manifest_filepaths += manifest_filepath
+            print(f"not lhotse")
+            print(f"merged_config.model.train_ds.manifest_filepath {merged_config.model.train_ds.manifest_filepath}")
+            print(f"final_cache_manifests {final_cache_manifests}")
+            merged_config.model.train_ds.manifest_filepath += final_cache_manifests
+
+
+    return merged_config
+
+
+def count_files_for_pseudo_labeling(manifest_filepath: str, is_tarred: bool) -> int:
+    """
+    Counts the number of files for pseudo-labeling.
+    Args:
+        manifest_filepath (str): The path to the manifest file(s).
+        is_tarred (bool): Flag to determine whether to count files for multiple shard manifests.
+    Returns:
+        int: The total number of audio files given for pseudo labeling.
+    """
+    if is_tarred:
+        dir_path, filename = os.path.split(manifest_filepath)
+        prefix = filename.split('_', 1)[0]
+        number_of_files = 0
+        for full_path in glob.glob(os.path.join(dir_path, f"{prefix}_[0-9]*.json")):
+            with open(full_path, 'r') as f:
+                number_of_files += len(f.readlines())
+    else:
+        with open(manifest_filepath, 'r') as f:
+            number_of_files = len(f.readlines())
 
-    # Returning strings formatted for Omegaconf
-    return (
-        str(updated_manifest_filepaths).replace(", ", ","),
-        str(updated_tarred_audio_filepaths).replace(", ", ",") if updated_tarred_audio_filepaths else None,
-    )
\ No newline at end of file
+    return number_of_files
\ No newline at end of file
diff --git a/sdp/utils/nemo_run_utils.py b/sdp/utils/nemo_run_utils.py
index f7252e04..6289da80 100644
--- a/sdp/utils/nemo_run_utils.py
+++ b/sdp/utils/nemo_run_utils.py
@@ -24,7 +24,7 @@
 )
 import logging
 import copy
-from sdp.processors.nemo import ipl_utils
+from sdp.utils import ipl_utils
 @lru_cache(maxsize=2)
 def get_tunnel(**ssh_tunnel):
     return SSHTunnel(**ssh_tunnel)

From f5227e147e1c545f126ec7a779807fe515ab4f33 Mon Sep 17 00:00:00 2001
From: Nune <ntadevosyan@nvidia.com>
Date: Thu, 29 May 2025 15:37:48 +0400
Subject: [PATCH 08/36] IPL dependencies

Signed-off-by: Nune <ntadevosyan@nvidia.com>
---
 requirements/ipl.txt | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 requirements/ipl.txt

diff --git a/requirements/ipl.txt b/requirements/ipl.txt
new file mode 100644
index 00000000..2950b05a
--- /dev/null
+++ b/requirements/ipl.txt
@@ -0,0 +1 @@
+nemo_run

From c4ed0caf116671d4415ec17569e8ef55dab24e1b Mon Sep 17 00:00:00 2001
From: Nune <ntadevosyan@nvidia.com>
Date: Thu, 29 May 2025 15:43:52 +0400
Subject: [PATCH 09/36] Small changes

Signed-off-by: Nune <ntadevosyan@nvidia.com>
---
 sdp/processors/nemo/nemo_run_processor.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/sdp/processors/nemo/nemo_run_processor.py b/sdp/processors/nemo/nemo_run_processor.py
index 6fefad6d..26da6089 100644
--- a/sdp/processors/nemo/nemo_run_processor.py
+++ b/sdp/processors/nemo/nemo_run_processor.py
@@ -127,7 +127,7 @@ def process(self):
                 "nemo_directory": nemo_root,
                 "inference_config_paths": inference_config_paths,
                 "manifests": manifests,
-                "p_cache": script_config.model.ipl_training.p_cache,
+                "p_cache": cluster_cfg.p_cache,
                 "num_gpus": num_nodes * num_gpus,
                 "is_tarred": getattr(script_config.model.train_ds, "is_tarred", False),
                 "output_manifest_file": "./inference_output_manifest_filepath.json",
@@ -137,7 +137,7 @@ def process(self):
             cmd = self.get_pseudo_labeling_command(
                 train_command_generator_config,
                 inference_command_generator_config,
-                num_ipl_epochs=script_config.model.ipl_training.num_ipl_epochs,
+                num_ipl_epochs=cluster_cfg.num_ipl_epochs,
                 new_manifest_files=manifests,
                 new_tarr_files=tarr_paths,
                 first_run=True,
@@ -156,7 +156,7 @@ def process(self):
                     cmd = self.get_pseudo_labeling_command(
                         train_command_generator_config,
                         inference_command_generator_config,
-                        num_ipl_epochs=script_config.model.ipl_training.num_ipl_epochs,
+                        num_ipl_epochs=cluster_cfg.num_ipl_epochs,
                         new_manifest_files=manifests,
                         new_tarr_files=tarr_paths,
                         first_run=False

From e99bbaedc567b9e3e0e47b5f9c1187db8a7b01fe Mon Sep 17 00:00:00 2001
From: Nune <ntadevosyan@nvidia.com>
Date: Thu, 29 May 2025 15:46:12 +0400
Subject: [PATCH 10/36] Small changes

Signed-off-by: Nune <ntadevosyan@nvidia.com>
---
 sdp/processors/__init__.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/sdp/processors/__init__.py b/sdp/processors/__init__.py
index ce3b71b5..43df6448 100644
--- a/sdp/processors/__init__.py
+++ b/sdp/processors/__init__.py
@@ -114,7 +114,6 @@
     MakeLettersUppercaseAfterPeriod,
 )
 from sdp.processors.nemo.asr_inference import ASRInference
-from sdp.processors.nemo.nemo_run_processor import NemoRunIPLProcessor
 from sdp.processors.nemo.pc_inference import PCInference
 from sdp.processors.toloka.accept_if import AcceptIfWERLess
 from sdp.processors.toloka.create_pool import CreateTolokaPool

From fd64c043e33e9abf200cac3a1e6e7cdb8464a937 Mon Sep 17 00:00:00 2001
From: Nune <ntadevosyan@nvidia.com>
Date: Thu, 29 May 2025 15:46:53 +0400
Subject: [PATCH 11/36] Small changes

Signed-off-by: Nune <ntadevosyan@nvidia.com>
---
 1 | 4 ----
 1 file changed, 4 deletions(-)
 delete mode 100644 1

diff --git a/1 b/1
deleted file mode 100644
index 94f612ba..00000000
--- a/1
+++ /dev/null
@@ -1,4 +0,0 @@
-script: asr_hybrid_transducer_ctc/speech_to_text_hybrid_rnnt_ctc_bpe.py 
-script_config: /home/ntadevosyan/code/fork_ipl/NeMo/examples/asr/conf/update_script_config.yaml 
-inference_config: /home/ntadevosyan/code/nemo_run_ipl_PR/NeMo/examples/asr/inference_config_local_non_tarred.yaml 
-nemo_path: /home/ntadevosyan/code/pr_iplmixin/NeMo/examples/asr/

From 26c7fb869330c7cb157c4406bd216840eb1f56a6 Mon Sep 17 00:00:00 2001
From: Nune <ntadevosyan@nvidia.com>
Date: Fri, 30 May 2025 09:47:45 +0400
Subject: [PATCH 12/36] Config changes

Signed-off-by: Nune <ntadevosyan@nvidia.com>
---
 dataset_configs/IPL/config.yaml               |  7 +++
 dataset_configs/IPL/nemo_run_config.yaml      | 60 +++++++++++++++++++
 .../{nemo => IPL}/nemo_run_processor.py       |  2 +-
 3 files changed, 68 insertions(+), 1 deletion(-)
 create mode 100644 dataset_configs/IPL/config.yaml
 create mode 100644 dataset_configs/IPL/nemo_run_config.yaml
 rename sdp/processors/{nemo => IPL}/nemo_run_processor.py (100%)

diff --git a/dataset_configs/IPL/config.yaml b/dataset_configs/IPL/config.yaml
new file mode 100644
index 00000000..53ec6717
--- /dev/null
+++ b/dataset_configs/IPL/config.yaml
@@ -0,0 +1,7 @@
+processors_to_run: all
+
+processors:
+  - _target_: sdp.processors.IPL.nemo_run_processor.NemoRunIPLProcessor
+    config_path: ./nemo_run_config.yaml
+    output_manifest_file: ???
+
diff --git a/dataset_configs/IPL/nemo_run_config.yaml b/dataset_configs/IPL/nemo_run_config.yaml
new file mode 100644
index 00000000..1b2dbd44
--- /dev/null
+++ b/dataset_configs/IPL/nemo_run_config.yaml
@@ -0,0 +1,60 @@
+# The script to be run.
+script: # Script path  to run relative to directory 
+script_config: # Training config file for the script. ipl_epoch_stopper_callback should be provided in the config
+inference_config: # Inference config file of unlabeled data for transcribe_speech_parallel
+
+exp_name: null  # populated by exp_manager.name if not provided
+results_dir: # Where to store the results of the run
+
+nemo_directory: # Nemo directory path
+do_average: # Boolean value indicating whether to do average of checkpoints for pseudo-label generation
+p_cache: # Probability with which update pseudo-labeled set
+num_ipl_epochs: How many epochs do pseudo-labeling
+
+# Optional arguments
+num_runs: 
+num_gpus: 
+num_tasks_per_node: 
+max_runtime: # Specify for clusters
+
+########################################################################################################################
+
+executor: slurm # or local
+
+USER: ntadevosyan
+
+# Fields for cluster run
+ssh_tunnel:
+  host: 
+  # ------------------------------- Fill this up! -------------------------------
+  user: "${USER}"  # your username; or resolved from ${USER} environment variable ; or can be null which resolved from ${USER} environment variable
+  job_dir: "" # Job directory to keep created files
+  identity: ""
+  # -----------------------------------------------------------------------------
+
+account: 
+partition:
+job_name_prefix: 
+
+containers:
+  asr: # Container image
+
+
+env_vars:
+  - 'TOKENIZERS_PARALLELISM='
+  - 'AIS_ENDPOINT='
+  - 'LHOTSE_AUDIO_DURATION_MISMATCH_TOLERANCE='
+  - 'TORCH_CUDNN_V8_API_ENABLED='
+  - 'PYTORCH_CUDA_ALLOC_CONF='
+  - 'HYDRA_FULL_ERROR=1'
+
+required_env_vars:
+  - 'HF_TOKEN='
+  - 'WANDB_KEY=' 
+
+mounts:
+  # Replace with your own paths in your cluster config
+  - /path/to/mount:/where/to/mount/
+
+timeouts:
+  partition_name: # Specify time
diff --git a/sdp/processors/nemo/nemo_run_processor.py b/sdp/processors/IPL/nemo_run_processor.py
similarity index 100%
rename from sdp/processors/nemo/nemo_run_processor.py
rename to sdp/processors/IPL/nemo_run_processor.py
index 26da6089..8e907e46 100644
--- a/sdp/processors/nemo/nemo_run_processor.py
+++ b/sdp/processors/IPL/nemo_run_processor.py
@@ -1,5 +1,5 @@
-from sdp.processors.base_processor import BaseProcessor
 from sdp.processors.IPL.ipl_processors import TrainingCommandGenerator, InferenceCommandGenerator
+from sdp.processors.base_processor import BaseProcessor
 from omegaconf import OmegaConf, open_dict
 import os
 from pathlib import Path

From d0e41806f3940f876e8e91a4c0e9a3d8c110b001 Mon Sep 17 00:00:00 2001
From: Nune <ntadevosyan@nvidia.com>
Date: Fri, 30 May 2025 09:52:41 +0400
Subject: [PATCH 13/36] Config place change

Signed-off-by: Nune <ntadevosyan@nvidia.com>
---
 {dataset_configs/IPL => sdp/processors/IPL/conf}/config.yaml      | 0
 .../IPL => sdp/processors/IPL/conf}/nemo_run_config.yaml          | 0
 2 files changed, 0 insertions(+), 0 deletions(-)
 rename {dataset_configs/IPL => sdp/processors/IPL/conf}/config.yaml (100%)
 rename {dataset_configs/IPL => sdp/processors/IPL/conf}/nemo_run_config.yaml (100%)

diff --git a/dataset_configs/IPL/config.yaml b/sdp/processors/IPL/conf/config.yaml
similarity index 100%
rename from dataset_configs/IPL/config.yaml
rename to sdp/processors/IPL/conf/config.yaml
diff --git a/dataset_configs/IPL/nemo_run_config.yaml b/sdp/processors/IPL/conf/nemo_run_config.yaml
similarity index 100%
rename from dataset_configs/IPL/nemo_run_config.yaml
rename to sdp/processors/IPL/conf/nemo_run_config.yaml

From 792066334b388f3110f6f89561f2c7bc90997c13 Mon Sep 17 00:00:00 2001
From: Nune <ntadevosyan@nvidia.com>
Date: Wed, 4 Jun 2025 17:47:22 +0400
Subject: [PATCH 14/36] Moving configs

Signed-off-by: Nune <ntadevosyan@nvidia.com>
---
 {sdp/processors/IPL/conf => dataset_configs/ipl}/config.yaml      | 0
 .../IPL/conf => dataset_configs/ipl}/nemo_run_config.yaml         | 0
 sdp/processors/{IPL => ipl}/__init__.py                           | 0
 sdp/processors/{IPL => ipl}/ipl_processors.py                     | 0
 sdp/processors/{IPL => ipl}/nemo_run_processor.py                 | 0
 5 files changed, 0 insertions(+), 0 deletions(-)
 rename {sdp/processors/IPL/conf => dataset_configs/ipl}/config.yaml (100%)
 rename {sdp/processors/IPL/conf => dataset_configs/ipl}/nemo_run_config.yaml (100%)
 rename sdp/processors/{IPL => ipl}/__init__.py (100%)
 rename sdp/processors/{IPL => ipl}/ipl_processors.py (100%)
 rename sdp/processors/{IPL => ipl}/nemo_run_processor.py (100%)

diff --git a/sdp/processors/IPL/conf/config.yaml b/dataset_configs/ipl/config.yaml
similarity index 100%
rename from sdp/processors/IPL/conf/config.yaml
rename to dataset_configs/ipl/config.yaml
diff --git a/sdp/processors/IPL/conf/nemo_run_config.yaml b/dataset_configs/ipl/nemo_run_config.yaml
similarity index 100%
rename from sdp/processors/IPL/conf/nemo_run_config.yaml
rename to dataset_configs/ipl/nemo_run_config.yaml
diff --git a/sdp/processors/IPL/__init__.py b/sdp/processors/ipl/__init__.py
similarity index 100%
rename from sdp/processors/IPL/__init__.py
rename to sdp/processors/ipl/__init__.py
diff --git a/sdp/processors/IPL/ipl_processors.py b/sdp/processors/ipl/ipl_processors.py
similarity index 100%
rename from sdp/processors/IPL/ipl_processors.py
rename to sdp/processors/ipl/ipl_processors.py
diff --git a/sdp/processors/IPL/nemo_run_processor.py b/sdp/processors/ipl/nemo_run_processor.py
similarity index 100%
rename from sdp/processors/IPL/nemo_run_processor.py
rename to sdp/processors/ipl/nemo_run_processor.py

From d5fe869e552550db7bc18972d5bc06f53f82e698 Mon Sep 17 00:00:00 2001
From: Nune <ntadevosyan@nvidia.com>
Date: Wed, 4 Jun 2025 18:38:36 +0400
Subject: [PATCH 15/36] Readme file

Signed-off-by: Nune <ntadevosyan@nvidia.com>
---
 sdp/processors/ipl/README.md | 40 ++++++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)
 create mode 100644 sdp/processors/ipl/README.md

diff --git a/sdp/processors/ipl/README.md b/sdp/processors/ipl/README.md
new file mode 100644
index 00000000..d8b18cb7
--- /dev/null
+++ b/sdp/processors/ipl/README.md
@@ -0,0 +1,40 @@
+# 🧠 TopIPL: Iterative Pseudo-Labeling for ASR
+
+TopIPL is an **iterative pseudo-labeling algorithm** designed for training ASR models using both labeled and unlabeled data. It maintains a **dynamic pseudo-label cache** and leverages **top-N averaged checkpoints** as a teacher model to generate high-quality pseudo-labels across training iterations.
+
+## 📦 Contents
+
+- `NemoRunIPLProcessor` — Command generator and job submitter for IPL runs, compatible with local and cluster environments.
+- `nemo_run_config.yaml` — Main configuration file. Users should define all required paths and parameters here.
+
+## 🚀 Getting Started
+
+TopIPL runs like any other processor in the `nemo_run` framework. To use it, you must pass:
+
+- `output_manifest_file`: Path where the resulting manifest will be saved.
+- `config_path`: Path to the NeMo-Run YAML file (e.g. `nemo_run_config.yaml`) containing the IPL setup, training/inference configs, and NeMo-Run settings.
+
+### 🔧 Training Config Requirements
+
+Your training config must include:
+
+```yaml
+exp_manager:
+  create_ipl_epoch_stopper_callback: True
+```
+If you're not using Lhotse, also include:
+
+```yaml
+ipl_epoch_stopper_callback_params:
+  stop_every_n_epochs: 2
+
+```
+
+### Prerequisites
+- nemo_run
+- `pip install -r requirements/ipl.txt`
+
+### Running the Code
+
+```bash
+python main.py --config-path=/path/to/directory/config --config-name=config.yaml
\ No newline at end of file

From c6e0cbc788dae03b4696c1910d90a375c93a2fef Mon Sep 17 00:00:00 2001
From: Nune <ntadevosyan@nvidia.com>
Date: Sat, 7 Jun 2025 15:47:06 +0400
Subject: [PATCH 16/36] Fix test

Signed-off-by: Nune <ntadevosyan@nvidia.com>
---
 tests/test_cfg_runtime_tests.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/test_cfg_runtime_tests.py b/tests/test_cfg_runtime_tests.py
index cce1b820..eb3cb4ec 100644
--- a/tests/test_cfg_runtime_tests.py
+++ b/tests/test_cfg_runtime_tests.py
@@ -25,7 +25,8 @@
 def get_test_cases():
     """Returns paths to all configs that are checked in."""
     for config_path in glob.glob(f"{DATASET_CONFIGS_ROOT}/**/*.yaml", recursive=True):
-        yield config_path
+        if not config_path.endswith("nemo_run_config.yaml"):
+            yield config_path
 
 
 @pytest.mark.parametrize("config_path", get_test_cases())

From 96bef792b5068199530713fb9efaad4bd1710075 Mon Sep 17 00:00:00 2001
From: nune-tadevosyan <152167970+nune-tadevosyan@users.noreply.github.com>
Date: Mon, 9 Jun 2025 12:50:56 +0400
Subject: [PATCH 17/36] Update nemo_run_config.yaml

Signed-off-by: Nune <ntadevosyan@nvidia.com>
---
 dataset_configs/ipl/nemo_run_config.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dataset_configs/ipl/nemo_run_config.yaml b/dataset_configs/ipl/nemo_run_config.yaml
index 1b2dbd44..3d518cb4 100644
--- a/dataset_configs/ipl/nemo_run_config.yaml
+++ b/dataset_configs/ipl/nemo_run_config.yaml
@@ -9,7 +9,7 @@ results_dir: # Where to store the results of the run
 nemo_directory: # Nemo directory path
 do_average: # Boolean value indicating whether to do average of checkpoints for pseudo-label generation
 p_cache: # Probability with which update pseudo-labeled set
-num_ipl_epochs: How many epochs do pseudo-labeling
+num_ipl_epochs: #How many epochs do pseudo-labeling
 
 # Optional arguments
 num_runs: 

From 3c4bda2f1efb038bc2a8097b7b813fca5d5761f4 Mon Sep 17 00:00:00 2001
From: nune-tadevosyan <152167970+nune-tadevosyan@users.noreply.github.com>
Date: Mon, 9 Jun 2025 12:51:16 +0400
Subject: [PATCH 18/36] Update nemo_run_config.yaml

Signed-off-by: Nune <ntadevosyan@nvidia.com>
---
 dataset_configs/ipl/nemo_run_config.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dataset_configs/ipl/nemo_run_config.yaml b/dataset_configs/ipl/nemo_run_config.yaml
index 3d518cb4..0389094c 100644
--- a/dataset_configs/ipl/nemo_run_config.yaml
+++ b/dataset_configs/ipl/nemo_run_config.yaml
@@ -9,7 +9,7 @@ results_dir: # Where to store the results of the run
 nemo_directory: # Nemo directory path
 do_average: # Boolean value indicating whether to do average of checkpoints for pseudo-label generation
 p_cache: # Probability with which update pseudo-labeled set
-num_ipl_epochs: #How many epochs do pseudo-labeling
+num_ipl_epochs: # How many epochs do pseudo-labeling
 
 # Optional arguments
 num_runs: 

From 4a121393bad739b9971cf3e029dec3bb3524b3c3 Mon Sep 17 00:00:00 2001
From: Nune <ntadevosyan@nvidia.com>
Date: Thu, 12 Jun 2025 13:17:23 +0400
Subject: [PATCH 19/36] Adding copyrights

Signed-off-by: Nune <ntadevosyan@nvidia.com>
---
 dataset_configs/ipl/nemo_run_config.yaml | 16 +++++++++++++++-
 sdp/processors/ipl/nemo_run_processor.py | 17 ++++++++++++++++-
 sdp/utils/nemo_run_utils.py              |  2 +-
 3 files changed, 32 insertions(+), 3 deletions(-)

diff --git a/dataset_configs/ipl/nemo_run_config.yaml b/dataset_configs/ipl/nemo_run_config.yaml
index 0389094c..03846825 100644
--- a/dataset_configs/ipl/nemo_run_config.yaml
+++ b/dataset_configs/ipl/nemo_run_config.yaml
@@ -1,3 +1,17 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 # The script to be run.
 script: # Script path  to run relative to directory 
 script_config: # Training config file for the script. ipl_epoch_stopper_callback should be provided in the config
@@ -21,7 +35,7 @@ max_runtime: # Specify for clusters
 
 executor: slurm # or local
 
-USER: ntadevosyan
+USER:
 
 # Fields for cluster run
 ssh_tunnel:
diff --git a/sdp/processors/ipl/nemo_run_processor.py b/sdp/processors/ipl/nemo_run_processor.py
index 8e907e46..ff5c6374 100644
--- a/sdp/processors/ipl/nemo_run_processor.py
+++ b/sdp/processors/ipl/nemo_run_processor.py
@@ -1,4 +1,19 @@
-from sdp.processors.IPL.ipl_processors import TrainingCommandGenerator, InferenceCommandGenerator
+
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from sdp.processors.ipl.ipl_processors import TrainingCommandGenerator, InferenceCommandGenerator
 from sdp.processors.base_processor import BaseProcessor
 from omegaconf import OmegaConf, open_dict
 import os
diff --git a/sdp/utils/nemo_run_utils.py b/sdp/utils/nemo_run_utils.py
index 6289da80..513ae48b 100644
--- a/sdp/utils/nemo_run_utils.py
+++ b/sdp/utils/nemo_run_utils.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

From a40f89c132e25d3642d34293745d3a9766024bdc Mon Sep 17 00:00:00 2001
From: Nune <ntadevosyan@nvidia.com>
Date: Thu, 12 Jun 2025 13:21:03 +0400
Subject: [PATCH 20/36] Adding imports from main

Signed-off-by: Nune <ntadevosyan@nvidia.com>
---
 sdp/processors/__init__.py | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/sdp/processors/__init__.py b/sdp/processors/__init__.py
index 43df6448..c3ff70b6 100644
--- a/sdp/processors/__init__.py
+++ b/sdp/processors/__init__.py
@@ -24,6 +24,8 @@
 from sdp.processors.datasets.fleurs.create_initial_manifest import (
     CreateInitialManifestFleurs,
 )
+from sdp.processors.datasets.hifitts2.download_dataset import DownloadHiFiTTS2
+from sdp.processors.datasets.hifitts2.remove_failed_chapters import RemovedFailedChapters
 from sdp.processors.datasets.uzbekvoice.create_initial_manifest import (
     CreateInitialManifestUzbekvoice,
 )
@@ -64,6 +66,9 @@
 from sdp.processors.datasets.voxpopuli.normalize_from_non_pc_text import (
     NormalizeFromNonPCTextVoxpopuli,
 )
+from sdp.processors.datasets.ytc.create_initial_manifest import (
+    CreateInitialManifestYTC,
+)
 from sdp.processors.huggingface.speech_recognition import ASRTransformers
 from sdp.processors.huggingface.create_initial_manifest import CreateInitialManifestHuggingFace
 
@@ -78,14 +83,23 @@
     SortManifest,
     SplitOnFixedDuration,
 )
-from sdp.processors.modify_manifest.create_manifest import CreateInitialManifestByExt
+from sdp.processors.modify_manifest.create_manifest import (
+    CreateCombinedManifests,
+    CreateInitialManifestByExt,
+)
 from sdp.processors.modify_manifest.data_to_data import (
+    ASRFileCheck,
+    CopyManifestData,
     CountNumWords,
+    ExtractFromBrackets,
     FfmpegConvert,
     GetAudioDuration,
+    GetWER,
     InsIfASRInsertion,
     InverseNormalizeText,
     NormalizeText,
+    MakeSentence,
+    ReadDocxLines,
     ReadTxtLines,
     SoxConvert,
     SplitLineBySentence,
@@ -96,6 +110,7 @@
 from sdp.processors.modify_manifest.data_to_dropbool import (
     DropASRError,
     DropASRErrorBeginningEnd,
+    DropDuplicates,
     DropHighCER,
     DropHighLowCharrate,
     DropHighLowDuration,
@@ -114,6 +129,7 @@
     MakeLettersUppercaseAfterPeriod,
 )
 from sdp.processors.nemo.asr_inference import ASRInference
+from sdp.processors.nemo.estimate_bandwidth import EstimateBandwidth
 from sdp.processors.nemo.pc_inference import PCInference
 from sdp.processors.toloka.accept_if import AcceptIfWERLess
 from sdp.processors.toloka.create_pool import CreateTolokaPool

From 87d7912739334fc0e2742ed4ef8dd80b84d9c047 Mon Sep 17 00:00:00 2001
From: Nune <ntadevosyan@nvidia.com>
Date: Thu, 12 Jun 2025 13:29:15 +0400
Subject: [PATCH 21/36] Adding copyrights

Signed-off-by: Nune <ntadevosyan@nvidia.com>
---
 sdp/processors/ipl/ipl_processors.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/sdp/processors/ipl/ipl_processors.py b/sdp/processors/ipl/ipl_processors.py
index 18c159ef..cf73830f 100644
--- a/sdp/processors/ipl/ipl_processors.py
+++ b/sdp/processors/ipl/ipl_processors.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 # Standard library imports
 import os
 import subprocess

From 9bceadf75af23100bf2a290ed9df1f2408419cc4 Mon Sep 17 00:00:00 2001
From: Nune <ntadevosyan@nvidia.com>
Date: Sat, 14 Jun 2025 13:04:41 +0400
Subject: [PATCH 22/36] Doc update

Signed-off-by: Nune <ntadevosyan@nvidia.com>
---
 docs/src/sdp/api.rst              |  8 ++++++++
 docs/src/sdp/existing_configs.rst | 14 ++++++++++++++
 2 files changed, 22 insertions(+)

diff --git a/docs/src/sdp/api.rst b/docs/src/sdp/api.rst
index bfa2bc62..92d700ab 100644
--- a/docs/src/sdp/api.rst
+++ b/docs/src/sdp/api.rst
@@ -379,6 +379,14 @@ Miscellaneous
 .. autodata:: sdp.processors.tts.prepare_tts_segments.PrepareTTSSegmentsProcessor
    :annotation:
 
+.. autodata:: sdp.processors.ipl.NemoRunIPLProcessor
+   :annotation:
+
+.. autodata:: sdp.processors.ipl.TrainingCommandGenerator
+   :annotation:
+
+.. autodata:: sdp.processors.ipl.InferenceCommandGenerator
+   :annotation:
 
 .. _sdp-base-classes:
 
diff --git a/docs/src/sdp/existing_configs.rst b/docs/src/sdp/existing_configs.rst
index 5e7b7c97..2b9c9036 100644
--- a/docs/src/sdp/existing_configs.rst
+++ b/docs/src/sdp/existing_configs.rst
@@ -407,3 +407,17 @@ HiFiTTS-2
    config-docs/english/hifitts2/config_22khz
    config-docs/english/hifitts2/config_44khz
    config-docs/english/hifitts2/config_bandwidth
+
+Unlabeled
+~~~~~~~~~
+
+**Supported configs**.
+
+* **Portuguese**:
+  `config <https://github.com/NVIDIA/NeMo-speech-data-processor/blob/main/dataset_configs/portuguese/unlabeled/config.yaml>`__ |
+  :doc:`documentation <config-docs/portuguese/unlabeled/config>`
+
+.. toctree::
+   :hidden:
+
+   config-docs/portuguese/unlabeled/config
\ No newline at end of file

From c6ea89c547639f0ca57eceff75cd71a865bd7285 Mon Sep 17 00:00:00 2001
From: Nune <ntadevosyan@nvidia.com>
Date: Sat, 14 Jun 2025 13:19:53 +0400
Subject: [PATCH 23/36] Doc update

Signed-off-by: Nune <ntadevosyan@nvidia.com>
---
 docs/src/sdp/api.rst | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/src/sdp/api.rst b/docs/src/sdp/api.rst
index 92d700ab..dcdd13bc 100644
--- a/docs/src/sdp/api.rst
+++ b/docs/src/sdp/api.rst
@@ -379,13 +379,13 @@ Miscellaneous
 .. autodata:: sdp.processors.tts.prepare_tts_segments.PrepareTTSSegmentsProcessor
    :annotation:
 
-.. autodata:: sdp.processors.ipl.NemoRunIPLProcessor
+.. autodata:: sdp.processors.ipl.nemo_run_processor.NemoRunIPLProcessor
    :annotation:
 
-.. autodata:: sdp.processors.ipl.TrainingCommandGenerator
+.. autodata:: sdp.processors.ipl.ipl_processors.TrainingCommandGenerator
    :annotation:
 
-.. autodata:: sdp.processors.ipl.InferenceCommandGenerator
+.. autodata:: sdp.processors.ipl.ipl_processors.InferenceCommandGenerator
    :annotation:
 
 .. _sdp-base-classes:

From d2c61ff457b45eda4acc2e6ba7435545978a3d07 Mon Sep 17 00:00:00 2001
From: Nune <ntadevosyan@nvidia.com>
Date: Sat, 14 Jun 2025 13:30:15 +0400
Subject: [PATCH 24/36] Doc update

Signed-off-by: Nune <ntadevosyan@nvidia.com>
---
 docs/src/sdp/existing_configs.rst | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/docs/src/sdp/existing_configs.rst b/docs/src/sdp/existing_configs.rst
index 2b9c9036..7e68556a 100644
--- a/docs/src/sdp/existing_configs.rst
+++ b/docs/src/sdp/existing_configs.rst
@@ -408,16 +408,16 @@ HiFiTTS-2
    config-docs/english/hifitts2/config_44khz
    config-docs/english/hifitts2/config_bandwidth
 
-Unlabeled
-~~~~~~~~~
+NemoRunIPL
+~~~~~~~~~~
 
 **Supported configs**.
 
-* **Portuguese**:
-  `config <https://github.com/NVIDIA/NeMo-speech-data-processor/blob/main/dataset_configs/portuguese/unlabeled/config.yaml>`__ |
-  :doc:`documentation <config-docs/portuguese/unlabeled/config>`
+* **IPL**:
+  `config <https://github.com/NVIDIA/NeMo-speech-data-processor/blob/main/dataset_configs/ipl/config.yaml>`__ |
+  :doc:`documentation <config-docs/ipl/config>`
 
 .. toctree::
    :hidden:
 
-   config-docs/portuguese/unlabeled/config
\ No newline at end of file
+   config-docs/ipl/config
\ No newline at end of file

From 8f303d1ec9303cebc3a23077a1b412a913de987a Mon Sep 17 00:00:00 2001
From: Nune <ntadevosyan@nvidia.com>
Date: Sat, 14 Jun 2025 13:44:09 +0400
Subject: [PATCH 25/36] Update config

Signed-off-by: Nune <ntadevosyan@nvidia.com>
---
 dataset_configs/ipl/config.yaml | 43 +++++++++++++++++++++++++++++++++
 1 file changed, 43 insertions(+)

diff --git a/dataset_configs/ipl/config.yaml b/dataset_configs/ipl/config.yaml
index 53ec6717..3c95e62c 100644
--- a/dataset_configs/ipl/config.yaml
+++ b/dataset_configs/ipl/config.yaml
@@ -1,3 +1,46 @@
+
+documentation: |
+  TopIPL
+  ######
+
+  This config is used to run the `TopIPL: Iterative Pseudo-Labeling for ASR <https://arxiv.org/abs/2506.07659>`_ training algorithm using NeMo-Run.
+
+  TopIPL is a **semi-supervised training method** for automatic speech recognition (ASR) that iteratively alternates between model training and pseudo-label generation for unlabeled data. It uses a **top-N checkpoint averaging strategy** to create a strong teacher model and maintains a **dynamic cache** of pseudo-labels throughout the process.
+
+  The pipeline is implemented as a processor compatible with the `nemo_run` framework. It generates an output manifest containing updated labels based on pseudo-labeling iterations.
+
+  This config performs the following steps:
+
+  1. Runs training and inference commands using NeMo-Run.
+  2. Periodically stops training to generate pseudo-labels with a top-N checkpoint ensemble.
+  3. Maintains a dynamic cache of pseudo-labels for unlabeled data.
+  4. Produces a new output manifest after each iteration.
+
+  **Required arguments**.
+
+  * **output_manifest_file**: path where the final manifest with pseudo-labels will be saved.
+  * **nemo_run_config**: YAML config file specifying the training, inference, and IPL parameters.
+
+  **Training config requirements**.
+
+  Your training config must include the following setting to enable IPL:
+
+  ```yaml
+  exp_manager:
+    create_ipl_epoch_stopper_callback: True
+  ```
+  If you're not using Lhotse, also include:
+
+  ```yaml
+  ipl_epoch_stopper_callback_params:
+  stop_every_n_epochs: 2
+
+  ```
+  ### Prerequisites
+  - nemo_run
+  - `pip install -r ipl.txt`
+
+
 processors_to_run: all
 
 processors:

From ecfdaf491b0c108b787f472d131f9a741164538c Mon Sep 17 00:00:00 2001
From: nune-tadevosyan <152167970+nune-tadevosyan@users.noreply.github.com>
Date: Mon, 16 Jun 2025 16:01:10 +0400
Subject: [PATCH 26/36] Update nemo_run_config.yaml

Signed-off-by: Nune <ntadevosyan@nvidia.com>
---
 dataset_configs/ipl/nemo_run_config.yaml | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/dataset_configs/ipl/nemo_run_config.yaml b/dataset_configs/ipl/nemo_run_config.yaml
index 03846825..df968da1 100644
--- a/dataset_configs/ipl/nemo_run_config.yaml
+++ b/dataset_configs/ipl/nemo_run_config.yaml
@@ -20,6 +20,12 @@ inference_config: # Inference config file of unlabeled data for transcribe_speec
 exp_name: null  # populated by exp_manager.name if not provided
 results_dir: # Where to store the results of the run
 
+# Path to the local NeMo repository. This is used to locate scripts and configs from NeMo.
+# To set this up:
+#   1. Clone the NeMo repository:
+#        git clone https://github.com/NVIDIA/NeMo.git /your/desired/path/to/nemo
+#   2. Set the path here:
+# Make sure this path is valid and NeMo is up to date if you're using its scripts.
 nemo_directory: # Nemo directory path
 do_average: # Boolean value indicating whether to do average of checkpoints for pseudo-label generation
 p_cache: # Probability with which update pseudo-labeled set

From 39c822110ddcb781ab74a555d2ce72b6045b68b1 Mon Sep 17 00:00:00 2001
From: nune-tadevosyan <152167970+nune-tadevosyan@users.noreply.github.com>
Date: Mon, 16 Jun 2025 16:02:27 +0400
Subject: [PATCH 27/36] Update ipl.txt

Signed-off-by: Nune <ntadevosyan@nvidia.com>
---
 requirements/ipl.txt | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/requirements/ipl.txt b/requirements/ipl.txt
index 2950b05a..de76dca4 100644
--- a/requirements/ipl.txt
+++ b/requirements/ipl.txt
@@ -1 +1,11 @@
 nemo_run
+
+# The NeMo repository path is also required; it is used to locate scripts and configs from NeMo.
+#
+# To set this up:
+#   1. Clone the NeMo repository:
+#        git clone https://github.com/NVIDIA/NeMo.git /your/desired/path/to/nemo
+#   2. Set the path in nemo_run_config.yaml:
+#        nemo_directory: /your/desired/path/to/nemo
+#
+# Make sure this path is valid and NeMo is up to date if you're using its scripts.

From 4f6c355babe367240cecafeedee5a6a29646abf8 Mon Sep 17 00:00:00 2001
From: Nune <ntadevosyan@nvidia.com>
Date: Mon, 16 Jun 2025 17:32:42 +0400
Subject: [PATCH 28/36] update

Signed-off-by: Nune <ntadevosyan@nvidia.com>
---
 sdp/processors/ipl/README.md | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/sdp/processors/ipl/README.md b/sdp/processors/ipl/README.md
index d8b18cb7..e7d9872c 100644
--- a/sdp/processors/ipl/README.md
+++ b/sdp/processors/ipl/README.md
@@ -31,8 +31,15 @@ stop_every_n_epochs: 2
 ```
 
 ### Prerequisites
-- nemo_run
-- `pip install -r ipl.txt`
+
+Before using TopIPL, make sure the following are set up:
+
+- Clone the NeMo repository:
+  ```bash
+  git clone https://github.com/NVIDIA/NeMo.git /your/desired/path/to/nemo
+  ```
+- Set the path to NeMo in your `nemo_run_config.yaml`: `nemo_directory: /your/desired/path/to/nemo`
+- `pip install -r requirements/ipl.txt`
 
 ### Running the Code
 

From 9cbac0f9cbe9674fe89c4692bd2531197631bad0 Mon Sep 17 00:00:00 2001
From: Nune <ntadevosyan@nvidia.com>
Date: Thu, 26 Jun 2025 18:52:10 +0400
Subject: [PATCH 29/36] Small change

Signed-off-by: Nune <ntadevosyan@nvidia.com>
---
 sdp/processors/ipl/nemo_run_processor.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/sdp/processors/ipl/nemo_run_processor.py b/sdp/processors/ipl/nemo_run_processor.py
index ff5c6374..d8d5586c 100644
--- a/sdp/processors/ipl/nemo_run_processor.py
+++ b/sdp/processors/ipl/nemo_run_processor.py
@@ -148,11 +148,12 @@ def process(self):
                 "output_manifest_file": "./inference_output_manifest_filepath.json",
             }
 
+            print(f"cluster_cf {cluster_cfg}")
             # Generate the complete IPL command
             cmd = self.get_pseudo_labeling_command(
                 train_command_generator_config,
                 inference_command_generator_config,
-                num_ipl_epochs=cluster_cfg.num_ipl_epochs,
+                num_ipl_epochs=cluster_cfg['num_ipl_epochs'],
                 new_manifest_files=manifests,
                 new_tarr_files=tarr_paths,
                 first_run=True,
@@ -171,7 +172,7 @@ def process(self):
                     cmd = self.get_pseudo_labeling_command(
                         train_command_generator_config,
                         inference_command_generator_config,
-                        num_ipl_epochs=cluster_cfg.num_ipl_epochs,
+                        num_ipl_epochs=cluster_cfg['num_ipl_epochs'],
                         new_manifest_files=manifests,
                         new_tarr_files=tarr_paths,
                         first_run=False

From 0b4a9d61a4dcf61178cd6e6a2517f16a5f33540f Mon Sep 17 00:00:00 2001
From: Nune <ntadevosyan@nvidia.com>
Date: Thu, 26 Jun 2025 20:46:53 +0400
Subject: [PATCH 30/36] small update

Signed-off-by: Nune <ntadevosyan@nvidia.com>
---
 sdp/processors/ipl/ipl_processors.py     | 2 +-
 sdp/processors/ipl/nemo_run_processor.py | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/sdp/processors/ipl/ipl_processors.py b/sdp/processors/ipl/ipl_processors.py
index cf73830f..6a87fae0 100644
--- a/sdp/processors/ipl/ipl_processors.py
+++ b/sdp/processors/ipl/ipl_processors.py
@@ -291,7 +291,7 @@ def process(self, first_run=False):
         update_inference_config_path = os.path.join(self.nemo_directory, "scripts/pseudo_labeling/update_inference_config.py")
         
         if first_run:
-            cmd += f"{self.get_pl_inference_command(self.inference_config_paths, shuffle=False)}"
+            cmd += f" && {self.get_pl_inference_command(self.inference_config_paths, shuffle=False)}"
             cmd += (
                 f" && python {write_transcription_path} "
                 f"--prediction_filepaths {prediction_directories_str} --full_pass"
diff --git a/sdp/processors/ipl/nemo_run_processor.py b/sdp/processors/ipl/nemo_run_processor.py
index d8d5586c..5cc78433 100644
--- a/sdp/processors/ipl/nemo_run_processor.py
+++ b/sdp/processors/ipl/nemo_run_processor.py
@@ -278,7 +278,8 @@ def get_pseudo_labeling_command(
         exec_cmd += " && sleep 10"
         if avg_cmd:
             exec_cmd += " && " + avg_cmd
-        exec_cmd += " && " + infer_proc.process(first_run=first_run)
+    
+        exec_cmd += " " + infer_proc.process(first_run=first_run)
 
         for _ in range(num_ipl_epochs):
             exec_cmd += " && sleep 10"

From 6cb8b406969f8b2b552b9ec34489ae261b4d4c39 Mon Sep 17 00:00:00 2001
From: George Zelenfroind <gzelenfroind@nvidia.com>
Date: Thu, 26 Jun 2025 13:11:40 -0700
Subject: [PATCH 31/36] force jiwer

Signed-off-by: George Zelenfroind <gzelenfroind@nvidia.com>
---
 requirements/main.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements/main.txt b/requirements/main.txt
index d133867a..74ce0255 100644
--- a/requirements/main.txt
+++ b/requirements/main.txt
@@ -18,7 +18,7 @@ python-docx
 pydub
 dask
 distributed
-
+jiwer>=3.1.0,<4.0.0
 # toloka-kit  # Temporarily disabled due to Toloka's technical pause; keep as reference for past and future API support
 # for some processers, additionally https://github.com/NVIDIA/NeMo is required
 # for some processers, additionally nemo_text_processing is required

From 77b64f272d0d2918c0bc9099e166aa9c46e962af Mon Sep 17 00:00:00 2001
From: George Zelenfroind <gzelenfroind@nvidia.com>
Date: Thu, 26 Jun 2025 15:35:14 -0700
Subject: [PATCH 32/36] attempt 1 to fix certificates

Signed-off-by: George Zelenfroind <gzelenfroind@nvidia.com>
---
 .github/workflows/tests.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 2ad8e665..51012aa8 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -75,6 +75,8 @@ jobs:
         pip install nemo-toolkit[asr,nlp]==1.23.0
         pip install nemo_text_processing
         pip install -r requirements/huggingface.txt
+        pip install certifi
+        export SSL_CERT_FILE=$(python -m certifi)
         python -m pip cache purge
 
     - name: Run all tests

From a559483383a12fe7b922ef6dbd0c8c5e5575332e Mon Sep 17 00:00:00 2001
From: George Zelenfroind <gzelenfroind@nvidia.com>
Date: Thu, 26 Jun 2025 16:33:41 -0700
Subject: [PATCH 33/36] attempt 2 to fix cert

Signed-off-by: George Zelenfroind <gzelenfroind@nvidia.com>
---
 .github/workflows/tests.yml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 51012aa8..f1e4860a 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -78,6 +78,7 @@ jobs:
         pip install certifi
         export SSL_CERT_FILE=$(python -m certifi)
         python -m pip cache purge
+        
 
     - name: Run all tests
       env:
@@ -85,6 +86,9 @@ jobs:
         AWS_ACCESS_KEY: ${{ secrets.AWS_ACCESS_KEY }}
         CLEAN_UP_TMP_PATH: 1
       run: |
+        wget https://uit.stanford.edu/sites/default/files/2023/10/11/incommon-rsa-ca2.pem
+        sudo cp incommon-rsa-ca2.pem     /usr/local/share/ca-certificates/incommon-rsa-server-ca-2.crt
+        sudo update-ca-certificates
         set -o pipefail # this will make sure next line returns non-0 exit code if tests fail
         python -m pytest tests/ --junitxml=pytest.xml --ignore=tests/test_tts_sdp_end_to_end.py --cov-report=term-missing:skip-covered --cov=sdp --durations=30 -rs | tee pytest-coverage.txt
 

From 3a92ee29cab1f74f98f34faeee7398e4a1567445 Mon Sep 17 00:00:00 2001
From: Nune <ntadevosyan@nvidia.com>
Date: Fri, 27 Jun 2025 20:16:23 +0400
Subject: [PATCH 34/36] small change

Signed-off-by: Nune <ntadevosyan@nvidia.com>
---
 dataset_configs/ipl/config.yaml | 2 --
 1 file changed, 2 deletions(-)

diff --git a/dataset_configs/ipl/config.yaml b/dataset_configs/ipl/config.yaml
index 3c95e62c..902876c7 100644
--- a/dataset_configs/ipl/config.yaml
+++ b/dataset_configs/ipl/config.yaml
@@ -1,4 +1,3 @@
-
 documentation: |
   TopIPL
   ######
@@ -47,4 +46,3 @@ processors:
   - _target_: sdp.processors.IPL.nemo_run_processor.NemoRunIPLProcessor
     config_path: ./nemo_run_config.yaml
     output_manifest_file: ???
-

From c8ba85af7a2836bc2cc4bff64ebad27c9e1fe330 Mon Sep 17 00:00:00 2001
From: Nune <ntadevosyan@nvidia.com>
Date: Wed, 2 Jul 2025 16:04:21 +0400
Subject: [PATCH 35/36] Doc changes

Signed-off-by: Nune <ntadevosyan@nvidia.com>
---
 dataset_configs/ipl/config.yaml          | 29 ++++++++++++------------
 docs/src/sdp/existing_configs.rst        |  6 ++++-
 sdp/processors/ipl/ipl_processors.py     | 14 ++++++------
 sdp/processors/ipl/nemo_run_processor.py |  2 +-
 sdp/utils/ipl_utils.py                   |  4 ++--
 sdp/utils/nemo_run_utils.py              |  4 ++--
 sdp/utils/skills_utils.py                |  9 +++++---
 7 files changed, 38 insertions(+), 30 deletions(-)

diff --git a/dataset_configs/ipl/config.yaml b/dataset_configs/ipl/config.yaml
index 902876c7..5d69c742 100644
--- a/dataset_configs/ipl/config.yaml
+++ b/dataset_configs/ipl/config.yaml
@@ -15,30 +15,31 @@ documentation: |
   3. Maintains a dynamic cache of pseudo-labels for unlabeled data.
   4. Produces a new output manifest after each iteration.
 
-  **Required arguments**.
+  **Required arguments**
 
-  * **output_manifest_file**: path where the final manifest with pseudo-labels will be saved.
-  * **nemo_run_config**: YAML config file specifying the training, inference, and IPL parameters.
+  - **output_manifest_file**: path where the final manifest with pseudo-labels will be saved.
+  - **nemo_run_config**: YAML config file specifying the training, inference, and IPL parameters.
 
-  **Training config requirements**.
+  **Training config requirements**
 
   Your training config must include the following setting to enable IPL:
 
-  ```yaml
-  exp_manager:
-    create_ipl_epoch_stopper_callback: True
-  ```
+  .. code-block:: yaml
+
+    exp_manager:
+      create_ipl_epoch_stopper_callback: True
+
   If you're not using Lhotse, also include:
 
-  ```yaml
-  ipl_epoch_stopper_callback_params:
-  stop_every_n_epochs: 2
+  .. code-block:: yaml
+
+    ipl_epoch_stopper_callback_params:
+      stop_every_n_epochs: 2
 
-  ```
   ### Prerequisites
-  - nemo_run
-  - `pip install -r ipl.txt`
 
+  - nemo_run
+  - ``pip install -r requirements/ipl.txt``
 
 processors_to_run: all
 
diff --git a/docs/src/sdp/existing_configs.rst b/docs/src/sdp/existing_configs.rst
index 7e68556a..d0a3e64e 100644
--- a/docs/src/sdp/existing_configs.rst
+++ b/docs/src/sdp/existing_configs.rst
@@ -416,8 +416,12 @@ NemoRunIPL
 * **IPL**:
   `config <https://github.com/NVIDIA/NeMo-speech-data-processor/blob/main/dataset_configs/ipl/config.yaml>`__ |
   :doc:`documentation <config-docs/ipl/config>`
+* **NeMoRun**:
+  `config <https://github.com/NVIDIA/NeMo-speech-data-processor/blob/main/dataset_configs/ipl/nemo_run_config.yaml>`__ |
+  :doc:`documentation <config-docs/ipl/nemo_run_config>`
 
 .. toctree::
    :hidden:
 
-   config-docs/ipl/config
\ No newline at end of file
+   config-docs/ipl/config
+   config-docs/ipl/nemo_run_config
\ No newline at end of file
diff --git a/sdp/processors/ipl/ipl_processors.py b/sdp/processors/ipl/ipl_processors.py
index 6a87fae0..0a29c656 100644
--- a/sdp/processors/ipl/ipl_processors.py
+++ b/sdp/processors/ipl/ipl_processors.py
@@ -36,8 +36,8 @@ class TrainingCommandGenerator(BaseProcessor):
         training_config_cluster (str): Path to the cluster configuration file
         training_script_path (str): Path to the training script relative to nemo_directory
         nemo_directory (str): Base directory for NeMo framework
-        new_manifest_files (str, optional): New manifest files to add to the training configuration
-        new_tarred_audio_filepaths (str, optional): New tarred audio filepaths to add to the training configuration
+        new_manifest_files (str, Optional): New manifest files to add to the training configuration
+        new_tarred_audio_filepaths (str, Optional): New tarred audio filepaths to add to the training configuration
         **kwargs: Additional arguments passed to the parent BaseProcessor class
     """
 
@@ -109,8 +109,8 @@ def get_execution_script(
             cluster_script_path (str): Path to the script to run on the cluster
             local_config (DictConfig): Local configuration loaded from training_config_local
             cluster_config_path (str): Path to the cluster configuration file
-            updated_manifest_filepaths (str, optional): Path to the updated manifest file
-            updated_tarred_filepaths (str, optional): Path to the updated tarred audio filepaths
+            updated_manifest_filepaths (str, Optional): Path to the updated manifest file
+            updated_tarred_filepaths (str, Optional): Path to the updated tarred audio filepaths
 
         Returns:
             str: Command to run the script on the cluster
@@ -279,7 +279,7 @@ def process(self, first_run=False):
         Generate the pseudo-labeling command for the given configuration and training parameters.
 
         Args:
-            first_run (bool, optional): Whether this is the first run of pseudo-labeling.
+            first_run (bool, Optional): Whether this is the first run of pseudo-labeling.
 
         Returns:
             str: The constructed pseudo-labeling command.
@@ -323,7 +323,7 @@ def get_pl_inference_command(self, inference_configs, shuffle=None):
         Generate a command to run PL inference with multiple configuration files.
         Args:
             inference_configs (list): List of configuration file paths.
-            shuffle (bool, optional): Whether to enable shuffling in predict_ds.
+            shuffle (bool, Optional): Whether to enable shuffling in predict_ds.
 
         Returns:
             str: Combined command string to execute PL inference.
@@ -338,4 +338,4 @@ def get_pl_inference_command(self, inference_configs, shuffle=None):
             cmd_list.append(cmd)
 
         return " && ".join(cmd_list)
-    
\ No newline at end of file
+    
diff --git a/sdp/processors/ipl/nemo_run_processor.py b/sdp/processors/ipl/nemo_run_processor.py
index 5cc78433..529a128c 100644
--- a/sdp/processors/ipl/nemo_run_processor.py
+++ b/sdp/processors/ipl/nemo_run_processor.py
@@ -30,7 +30,7 @@ class NemoRunIPLProcessor(BaseProcessor):
     Args:
         config_path (str): Path to the YAML configuration file containing IPL settings
         output_manifest_file (str): Path where the output manifest file will be written
-        input_manifest_file (str, optional): Path to the input manifest file
+        input_manifest_file (str, Optional): Path to the input manifest file
     """
     
     def __init__(
diff --git a/sdp/utils/ipl_utils.py b/sdp/utils/ipl_utils.py
index 0630be4f..07d50c5d 100644
--- a/sdp/utils/ipl_utils.py
+++ b/sdp/utils/ipl_utils.py
@@ -27,7 +27,7 @@ def separate_multiple_transcriptions(inference_config: dict) -> Tuple[List[str],
     Returns:
         Tuple[List[str], Optional[List[str]]]: A tuple containing:
             - A list of manifest file paths.
-            - An optional list of tarred audio file paths, or None if not applicable.
+            - A list of tarred audio file paths, or None if not applicable.
     """
     
     if hasattr(inference_config.predict_ds, "is_tarred") and inference_config.predict_ds.is_tarred:
@@ -327,4 +327,4 @@ def count_files_for_pseudo_labeling(manifest_filepath: str, is_tarred: bool) ->
         with open(manifest_filepath, 'r') as f:
             number_of_files = len(f.readlines())
 
-    return number_of_files
\ No newline at end of file
+    return number_of_files
diff --git a/sdp/utils/nemo_run_utils.py b/sdp/utils/nemo_run_utils.py
index 513ae48b..5cbd8575 100644
--- a/sdp/utils/nemo_run_utils.py
+++ b/sdp/utils/nemo_run_utils.py
@@ -121,7 +121,7 @@ def create_remote_directory(directory: str | list, cluster_config: dict):
         raise ValueError(f"Unsupported executor: {cluster_config.get('executor')}")
 
 
-def create_remote_config(config: dict | DictConfig, config_name: str, config_directory: str, cluster_config: dict):
+def create_remote_config(config: dict, config_name: str, config_directory: str, cluster_config: dict):
     """
     Utility to write a remote config file on the cluster using the cluster config.
 
@@ -403,4 +403,4 @@ def get_mounted_filepath(cluster_config: dict, filepath: str):
     mount_source, mount_dest = mount_path.split(':')
     filepath = mount_dest + filepath[len(mount_source) :]  # replace the mount destination with the mount source
 
-    return filepath
\ No newline at end of file
+    return filepath
diff --git a/sdp/utils/skills_utils.py b/sdp/utils/skills_utils.py
index 892fdcd1..5e370f0e 100644
--- a/sdp/utils/skills_utils.py
+++ b/sdp/utils/skills_utils.py
@@ -31,7 +31,10 @@
 import nemo_run as run
 import yaml
 from huggingface_hub import get_token
-from invoke import StreamWatcher
+try:
+    from invoke import StreamWatcher
+except ImportError:
+    StreamWatcher = object  # fallback if invoke is not installed
 from nemo_run.config import set_nemorun_home
 from nemo_run.core.execution.docker import DockerExecutor
 from nemo_run.core.execution.slurm import SlurmJobDetails, get_packaging_job_key
@@ -1001,7 +1004,7 @@ def add_task(
     with_sandbox=False,
     sandbox_port: int | None = None,
     server_config=None,
-    reuse_code_exp: str | run.Experiment | None = None,
+    reuse_code_exp: str = None,
     reuse_code: bool = True,
     task_dependencies: list[str] = None,
     run_after: str | list[str] | None = None,
@@ -1223,4 +1226,4 @@ def run_exp(exp, cluster_config, sequential=None):
         if isinstance(tunnel, run.SSHTunnel):
             ssh_hash = tunnel_hash(tunnel)
             if ssh_hash not in REUSE_CODE_EXP:
-                REUSE_CODE_EXP[ssh_hash] = exp
\ No newline at end of file
+                REUSE_CODE_EXP[ssh_hash] = exp

From 4392ef2839637a810193c7583d316782ea48e4e6 Mon Sep 17 00:00:00 2001
From: Nune <ntadevosyan@nvidia.com>
Date: Wed, 2 Jul 2025 16:15:43 +0400
Subject: [PATCH 36/36] Doc changes

Signed-off-by: Nune <ntadevosyan@nvidia.com>
---
 sdp/utils/skills_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sdp/utils/skills_utils.py b/sdp/utils/skills_utils.py
index 5e370f0e..ac536043 100644
--- a/sdp/utils/skills_utils.py
+++ b/sdp/utils/skills_utils.py
@@ -40,7 +40,6 @@
 from nemo_run.core.execution.slurm import SlurmJobDetails, get_packaging_job_key
 from nemo_run.core.tunnel import SSHTunnel
 from omegaconf import DictConfig
-from torchx.specs.api import AppState
 
 LOG = logging.getLogger(__file__)
 
@@ -136,6 +135,7 @@ def get_exp_handles(expname: str, ignore_finished=True, ignore_exp_not_exists=Tr
     TODO: it's still possible that job submission fails if the tasks exist when this function
           is called, but finish before nemo-run submits a new job (which might take minutes)
     """
+    from torchx.specs.api import AppState
 
     def _get_handles(exp):
         handles = []