From ca14a61c68fd71392a872b994c2ad00a88214518 Mon Sep 17 00:00:00 2001 From: SOWJANYAJAGADISH123 Date: Wed, 11 Feb 2026 17:46:22 +0530 Subject: [PATCH 01/77] Update omnia.sh --- omnia.sh | 331 +++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 320 insertions(+), 11 deletions(-) diff --git a/omnia.sh b/omnia.sh index 235cc1dbc1..9c46a04dc9 100755 --- a/omnia.sh +++ b/omnia.sh @@ -979,10 +979,11 @@ start_container_session() { } show_help() { - echo "Usage: $0 [--install | --uninstall | --upgrade | --version | --help]" + echo "Usage: $0 [--install | --uninstall | --upgrade | --rollback | --version | --help]" echo " -i, --install Install and start the Omnia core container" echo " -u, --uninstall Uninstall the Omnia core container and clean up configuration" - echo " --upgrade Upgrade the Omnia core container from image tag 1.0 to 1.1" + echo " --upgrade Upgrade the Omnia core container to newer version" + echo " --rollback Rollback the Omnia core container to previous version" echo " -v, --version Display Omnia version information" echo " -h, --help More information about usage" } @@ -1248,15 +1249,6 @@ phase1_validate() { return 1 fi - if ! echo "$current_image" | grep -qE '(:|@)1\.0(\b|$)'; then - echo "[ERROR] [ORCHESTRATOR] Container version mismatch: expected 1.0, got: $current_image" - return 1 - fi - - echo "[INFO] [ORCHESTRATOR] Container version validated: 1.0 (Omnia 2.0.0.0)" - - - if ! podman inspect "omnia_core:1.1" >/dev/null 2>&1; then echo "[ERROR] [ORCHESTRATOR] Target image missing locally: omnia_core:1.1" echo "[ERROR] [ORCHESTRATOR] Omnia does not pull from Docker Hub. Build/load the image locally and retry." @@ -1372,6 +1364,9 @@ phase4_container_swap() { if [ ! 
-f "$quadlet_file" ]; then echo "[ERROR] [ORCHESTRATOR] Phase 4.3 failed: Quadlet file not found: $quadlet_file" + echo "[ERROR] [ORCHESTRATOR] Upgrade failed: Quadlet configuration file missing" + echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore 1.0 container..." + rollback_omnia_core return 1 fi @@ -1385,27 +1380,42 @@ phase4_container_swap() { if podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then echo "[ERROR] [ORCHESTRATOR] Failed to stop omnia_core container" + echo "[ERROR] [ORCHESTRATOR] Upgrade failed: Could not stop 1.0 container" + echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore 1.0 container..." + rollback_omnia_core return 1 fi echo "[INFO] [ORCHESTRATOR] Starting omnia_core 1.1 Quadlet unit" if ! podman inspect "omnia_core:1.1" >/dev/null 2>&1; then echo "[ERROR] [ORCHESTRATOR] Target image missing locally: omnia_core:1.1" + echo "[ERROR] [ORCHESTRATOR] Upgrade failed: 1.1 image not available" + echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore 1.0 container..." + rollback_omnia_core return 1 fi if ! sed -i 's/^Image=omnia_core:.*/Image=omnia_core:1.1/' "$quadlet_file"; then echo "[ERROR] [ORCHESTRATOR] Phase 4.3 failed: Failed to update Image to 1.1 in quadlet file" + echo "[ERROR] [ORCHESTRATOR] Upgrade failed: Could not update container image tag" + echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore 1.0 container..." + rollback_omnia_core return 1 fi systemctl daemon-reload || { echo "[ERROR] [ORCHESTRATOR] Phase 4.3 failed: systemctl daemon-reload failed" + echo "[ERROR] [ORCHESTRATOR] Upgrade failed: System daemon reload failed" + echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore 1.0 container..." 
+ rollback_omnia_core return 1 } systemctl start omnia_core.service || { echo "[ERROR] [ORCHESTRATOR] Phase 4.3 failed: Failed to start omnia_core.service" + echo "[ERROR] [ORCHESTRATOR] Upgrade failed: Could not start 1.1 container" + echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore 1.0 container..." + rollback_omnia_core return 1 } @@ -1419,6 +1429,9 @@ phase4_container_swap() { if ! podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then echo "[ERROR] [ORCHESTRATOR] Phase 4.4 failed: Container failed health check after swap" + echo "[ERROR] [ORCHESTRATOR] Upgrade failed: 1.1 container failed health check" + echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore 1.0 container..." + rollback_omnia_core return 1 fi @@ -1436,6 +1449,9 @@ phase4_container_swap() { fi "; then echo "[ERROR] [ORCHESTRATOR] Phase 4.5 failed: Failed to update metadata version" + echo "[ERROR] [ORCHESTRATOR] Upgrade failed: Could not update version metadata" + echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore 1.0 container..." + rollback_omnia_core return 1 fi @@ -1490,6 +1506,296 @@ upgrade_omnia_core() { exit 0 } +# Validate backup directory structure and files +validate_backup_directory() { + local backup_path="$1" + + echo "[INFO] [ROLLBACK] Validating backup directory: $backup_path" + + # Check if backup directory exists + if ! podman exec -u root omnia_core test -d "$backup_path"; then + echo "[ERROR] [ROLLBACK] Backup directory does not exist: $backup_path" + return 1 + fi + + # Check for required subdirectories + for subdir in input metadata configs; do + if ! podman exec -u root omnia_core test -d "$backup_path/$subdir"; then + echo "[ERROR] [ROLLBACK] Missing required subdirectory: $backup_path/$subdir" + return 1 + fi + done + + # Check for required files + if ! 
podman exec -u root omnia_core test -f "$backup_path/metadata/oim_metadata.yml"; then + echo "[ERROR] [ROLLBACK] Missing metadata file: $backup_path/metadata/oim_metadata.yml" + return 1 + fi + + if ! podman exec -u root omnia_core test -f "$backup_path/configs/omnia_core.container"; then + echo "[ERROR] [ROLLBACK] Missing container config: $backup_path/configs/omnia_core.container" + return 1 + fi + + # Verify metadata contains version information + if ! podman exec -u root omnia_core grep -q "^omnia_version:" "$backup_path/metadata/oim_metadata.yml"; then + echo "[ERROR] [ROLLBACK] Metadata file does not contain version information" + return 1 + fi + + echo "[INFO] [ROLLBACK] Backup validation successful" + return 0 +} + +# Stop container gracefully with timeout +stop_container_gracefully() { + local container_name="$1" + local timeout="${2:-30}" + + echo "[INFO] [ROLLBACK] Stopping $container_name container gracefully..." + + # Try graceful stop first + if podman stop -t "$timeout" "$container_name" >/dev/null 2>&1; then + echo "[INFO] [ROLLBACK] Container stopped gracefully" + return 0 + fi + + # Check if container is still running + if podman ps --format '{{.Names}}' | grep -qw "$container_name"; then + echo "[WARN] [ROLLBACK] Graceful stop failed, force stopping container..." + if podman stop "$container_name" >/dev/null 2>&1; then + echo "[INFO] [ROLLBACK] Container force stopped" + return 0 + else + echo "[ERROR] [ROLLBACK] Failed to stop container" + return 1 + fi + fi + + return 0 +} + +# Restore files from backup +restore_from_backup() { + local backup_path="$1" + + echo "[INFO] [ROLLBACK] Restoring from backup: $backup_path" + + # Restore input files + if ! podman exec -u root omnia_core bash -c " + set -e + rm -rf /opt/omnia/input + cp -a '$backup_path/input' /opt/omnia/ + "; then + echo "[ERROR] [ROLLBACK] Failed to restore input files" + return 1 + fi + + # Restore metadata + if ! 
podman exec -u root omnia_core cp -a "$backup_path/metadata/oim_metadata.yml" /opt/omnia/.data/; then + echo "[ERROR] [ROLLBACK] Failed to restore metadata" + return 1 + fi + + # Restore container config on host + if ! podman cp "omnia_core:$backup_path/configs/omnia_core.container" /etc/containers/systemd/; then + echo "[ERROR] [ROLLBACK] Failed to restore container config" + return 1 + fi + + echo "[INFO] [ROLLBACK] Files restored successfully" + return 0 +} + +# Main rollback function +rollback_omnia_core() { + echo -e "${GREEN}================================================================================${NC}" + echo -e "${GREEN} OMNIA CORE ROLLBACK${NC}" + echo -e "${GREEN}================================================================================${NC}" + echo "" + + # Audit log start + local rollback_start=$(date -Iseconds) + echo "[AUDIT] Rollback operation started at: $rollback_start" + + # Check if omnia_core container is running + if ! podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then + echo -e "${RED}ERROR: Omnia core container is not running.${NC}" + exit 1 + fi + + # Get current version + if ! podman exec -u root omnia_core test -f "/opt/omnia/.data/oim_metadata.yml"; then + echo -e "${RED}ERROR: Metadata file not found: /opt/omnia/.data/oim_metadata.yml${NC}" + exit 1 + fi + + local current_version=$(podman exec -u root omnia_core grep '^omnia_version:' /opt/omnia/.data/oim_metadata.yml 2>/dev/null | cut -d':' -f2 | tr -d ' \t\n\r') + if [ "$current_version" != "2.1.0.0" ]; then + echo -e "${RED}ERROR: Cannot rollback from version $current_version. Rollback is only supported from version 2.1.0.0.${NC}" + exit 1 + fi + + # List available backups + echo "[INFO] [ROLLBACK] Scanning for available backups..." 
+ local backup_dirs=() + while IFS= read -r line; do + backup_dirs+=("$line") + done < <(podman exec -u root omnia_core find /opt/omnia/backups/upgrade -maxdepth 1 -type d -name "version_*" 2>/dev/null | sort -r) + + if [ ${#backup_dirs[@]} -eq 0 ]; then + echo -e "${RED}ERROR: No backup directories found.${NC}" + exit 1 + fi + + echo "" + echo "Available backup versions:" + for i in "${!backup_dirs[@]}"; do + local version=$(basename "${backup_dirs[$i]}" | sed 's/version_//') + local backup_date=$(podman exec -u root omnia_core stat -c '%y' "${backup_dirs[$i]}" 2>/dev/null | cut -d' ' -f1,2 | cut -d'.' -f1) + echo " $((i+1)). Version $version (created: $backup_date)" + done + + # Prompt for backup selection + echo "" + echo -n "Select backup to restore from (1-${#backup_dirs[@]}): " + read -r selection + + # Validate selection + if ! [[ "$selection" =~ ^[0-9]+$ ]] || [ "$selection" -lt 1 ] || [ "$selection" -gt ${#backup_dirs[@]} ]; then + echo -e "${RED}ERROR: Invalid selection.${NC}" + exit 1 + fi + + local selected_backup="${backup_dirs[$((selection-1))]}" + local backup_version=$(basename "$selected_backup" | sed 's/version_//') + + echo "" + echo "Selected backup: Version $backup_version" + echo -n "Are you sure you want to rollback to version $backup_version? [y/N]: " + read -r confirm + + if [[ ! "$confirm" =~ ^[yY] ]]; then + echo "Rollback cancelled by user." + exit 0 + fi + + # Validate selected backup - only check if directory exists without podman exec + if ! podman exec -u root omnia_core test -d "$selected_backup" 2>/dev/null; then + # Try to check on host if container check fails + # Get shared path from metadata to check on host + local shared_path=$(podman exec -u root omnia_core grep '^oim_shared_path:' /opt/omnia/.data/oim_metadata.yml 2>/dev/null | cut -d':' -f2- | tr -d ' \t\n\r') + local host_backup_path="${selected_backup#/opt/omnia}" + if [ -z "$shared_path" ] || [ ! 
-d "$shared_path$host_backup_path" ]; then + echo -e "${RED}ERROR: Backup directory does not exist: $selected_backup${NC}" + exit 1 + fi + fi + + echo "" + echo "[INFO] [ROLLBACK] Starting rollback process..." + + # Step 1: Stop 1.1 container gracefully + echo "" + echo "[INFO] [ROLLBACK] Step 1: Stopping Omnia core 1.1 container..." + if ! stop_container_gracefully "omnia_core" 30; then + echo -e "${RED}ERROR: Failed to stop container.${NC}" + exit 1 + fi + + # Step 2: Check for 1.0 image + echo "" + echo "[INFO] [ROLLBACK] Step 2: Checking for Omnia core 1.0 image..." + if ! podman inspect omnia_core:1.0 >/dev/null 2>&1; then + echo -e "${YELLOW}WARNING: Omnia core 1.0 image not found locally.${NC}" + echo -e "${YELLOW}Attempting to tag image...${NC}" + + # Try to tag latest as 1.0 if available + if podman inspect omnia_core:latest >/dev/null 2>&1; then + podman tag omnia_core:latest omnia_core:1.0 + else + echo -e "${RED}ERROR: Omnia core 1.0 image not available. Please load the image first.${NC}" + exit 1 + fi + fi + + # Step 3: Start 1.0 container + echo "" + echo "[INFO] [ROLLBACK] Step 3: Starting Omnia core 1.0 container..." + systemctl daemon-reload + if ! systemctl start omnia_core.service; then + echo -e "${RED}ERROR: Failed to start container service.${NC}" + exit 1 + fi + + # Step 4: Wait for container to be healthy + echo "" + echo "[INFO] [ROLLBACK] Step 4: Waiting for container to be healthy..." + local health_timeout=60 + local health_count=0 + + while [ $health_count -lt $health_timeout ]; do + if podman ps --format '{{.Names}} {{.Status}}' | grep -E "omnia_core.*Up" | grep -q "healthy\|Up"; then + echo "[INFO] [ROLLBACK] Container is healthy" + break + fi + sleep 1 + health_count=$((health_count + 1)) + echo -n "." 
+ done + + if [ $health_count -ge $health_timeout ]; then + echo "" + echo -e "${RED}ERROR: Container failed to become healthy within 60 seconds.${NC}" + exit 1 + fi + + # Step 5: Validate backup directory structure + echo "" + echo "[INFO] [ROLLBACK] Step 5: Validating backup directory structure..." + if ! validate_backup_directory "$selected_backup"; then + echo -e "${RED}ERROR: Backup validation failed.${NC}" + exit 1 + fi + + # Step 6: Restore files from backup + echo "" + echo "[INFO] [ROLLBACK] Step 6: Restoring files from backup..." + if ! restore_from_backup "$selected_backup"; then + echo -e "${RED}ERROR: Failed to restore from backup.${NC}" + exit 1 + fi + + # Step 7: Verify container version + echo "" + echo "[INFO] [ROLLBACK] Step 7: Verifying container version..." + local verify_version=$(podman exec -u root omnia_core grep '^omnia_version:' /opt/omnia/.data/oim_metadata.yml 2>/dev/null | cut -d':' -f2 | tr -d ' \t\n\r') + + if [ "$verify_version" != "$backup_version" ]; then + echo -e "${RED}ERROR: Version verification failed. 
Expected: $backup_version, Found: $verify_version${NC}" + exit 1 + fi + + # Audit log end + local rollback_end=$(date -Iseconds) + echo "[AUDIT] Rollback operation completed at: $rollback_end" + echo "[AUDIT] Rolled back from version $current_version to $backup_version" + + echo "" + echo -e "${GREEN}================================================================================${NC}" + echo -e "${GREEN} ROLLBACK COMPLETED SUCCESSFULLY${NC}" + echo -e "${GREEN}================================================================================${NC}" + echo "" + echo -e "${GREEN}✓ Omnia core has been rolled back to version $backup_version${NC}" + echo -e "${GREEN}✓ Container is running and healthy${NC}" + echo -e "${GREEN}✓ Configuration restored from backup${NC}" + echo "" + + # Initialize SSH config and start container session + init_ssh_config + start_container_session +} + # Main function to check if omnia_core container is already running. # If yes, ask the user if they want to enter the container or reinstall. # If no, set it up. 
@@ -1504,6 +1810,9 @@ main() { --upgrade) upgrade_omnia_core ;; + --rollback) + rollback_omnia_core + ;; --version|-v) display_version ;; From 46c63c095c51a3f2df5097a3b9739e61e7b8b6ad Mon Sep 17 00:00:00 2001 From: pullan1 Date: Wed, 11 Feb 2026 18:06:48 +0530 Subject: [PATCH 02/77] cleanup of files under offline_repo dir during pulp cleanup Signed-off-by: pullan1 --- common/library/modules/pulp_cleanup.py | 104 ++++++++++++++++++++++--- local_repo/pulp_cleanup.yml | 2 + 2 files changed, 95 insertions(+), 11 deletions(-) diff --git a/common/library/modules/pulp_cleanup.py b/common/library/modules/pulp_cleanup.py index 00ed27d0dd..f3da3e2004 100644 --- a/common/library/modules/pulp_cleanup.py +++ b/common/library/modules/pulp_cleanup.py @@ -27,6 +27,7 @@ import csv import glob import json +import shutil import subprocess from typing import Dict, List, Any, Tuple @@ -399,7 +400,7 @@ def delete_file_from_pulp(name: str, repo_name: str, content_href: str, logger) return False, f"Pulp deletion error: {str(e)}" -def cleanup_pip_module(name: str, base_path: str, logger) -> Dict[str, Any]: +def cleanup_pip_module(name: str, base_path: str, repo_store_path: str, logger) -> Dict[str, Any]: """Cleanup a pip module from Pulp Python repository. 
Pip modules are stored as: pip_module== @@ -408,6 +409,7 @@ def cleanup_pip_module(name: str, base_path: str, logger) -> Dict[str, Any]: result = {"name": name, "type": "pip_module", "status": "Failed", "message": ""} messages = [] pulp_deleted = False + content_removed = False try: # Pulp Python repo name format: pip_module @@ -467,11 +469,17 @@ def cleanup_pip_module(name: str, base_path: str, logger) -> Dict[str, Any]: messages.append("Status files updated") mark_software_partial(affected, base_path, logger, 'pip_module') - if pulp_deleted: + # Clean up uploaded content from filesystem + fs_result = cleanup_content_directory(name, 'pip_module', repo_store_path, logger) + if fs_result["status"] == "Success": + content_removed = True + messages.append(fs_result["message"]) + + if pulp_deleted or content_removed: result["status"] = "Success" result["message"] = "; ".join(messages) if messages else "Cleaned up" else: - result["message"] = f"pip_module '{name}' not found in Pulp" + result["message"] = f"pip_module '{name}' not found in Pulp or filesystem" except Exception as e: result["message"] = f"Error: {str(e)}" @@ -493,7 +501,7 @@ def get_pulp_file_repo_name(name: str, file_type: str) -> str: return name -def cleanup_file_repository(name: str, file_type: str, base_path: str, logger) -> Dict[str, Any]: +def cleanup_file_repository(name: str, file_type: str, base_path: str, repo_store_path: str, logger) -> Dict[str, Any]: """Cleanup artifact from Pulp File repository. 
Handles: tarball, git, manifest, ansible_galaxy_collection @@ -503,6 +511,7 @@ def cleanup_file_repository(name: str, file_type: str, base_path: str, logger) - messages = [] pulp_deleted = False status_removed = False + content_removed = False try: # Get the expected Pulp repository name @@ -559,12 +568,18 @@ def cleanup_file_repository(name: str, file_type: str, base_path: str, logger) - messages.append("Status files updated") mark_software_partial(affected, base_path, logger, file_type) + # Clean up uploaded content from filesystem + fs_result = cleanup_content_directory(name, file_type, repo_store_path, logger) + if fs_result["status"] == "Success": + content_removed = True + messages.append(fs_result["message"]) + # Determine overall result - if pulp_deleted or status_removed: + if pulp_deleted or status_removed or content_removed: result["status"] = "Success" result["message"] = "; ".join(messages) if messages else "Cleaned up" else: - result["message"] = f"{file_type} '{name}' not found in Pulp or status files" + result["message"] = f"{file_type} '{name}' not found in Pulp, status files, or filesystem" except Exception as e: result["message"] = f"Error: {str(e)}" @@ -572,7 +587,7 @@ def cleanup_file_repository(name: str, file_type: str, base_path: str, logger) - return result -def cleanup_file(name: str, base_path: str, logger) -> Dict[str, Any]: +def cleanup_file(name: str, base_path: str, repo_store_path: str, logger) -> Dict[str, Any]: """Cleanup a file artifact. 
Routes to appropriate handler: @@ -583,10 +598,75 @@ def cleanup_file(name: str, base_path: str, logger) -> Dict[str, Any]: # Handle pip modules separately - they use Python repositories if file_type == "pip_module": - return cleanup_pip_module(name, base_path, logger) + return cleanup_pip_module(name, base_path, repo_store_path, logger) # All other file types use Pulp File repository - return cleanup_file_repository(name, file_type, base_path, logger) + return cleanup_file_repository(name, file_type, base_path, repo_store_path, logger) + + +# ============================================================================= +# FILESYSTEM CONTENT CLEANUP +# ============================================================================= + +def cleanup_content_directory(content_name: str, content_type: str, repo_store_path: str, logger) -> Dict[str, Any]: + """Remove uploaded content directory from the filesystem. + + Builds the content path the same way as download_common.py: + /offline_repo/cluster//rhel/// + + This mirrors how remove_from_status_files iterates over ARCH_SUFFIXES to + clean status.csv entries. 
+ + Args: + content_name: Name of the content item (e.g., 'helm-v3.19.0-amd64') + content_type: Directory category (tarball, git, pip_module, manifest, + ansible_galaxy_collection, rpm_file) + repo_store_path: Root store path (e.g., '/opt/omnia') + logger: Logger instance + + Returns: + Dict with name, type, status, and message keys + """ + result = {"name": content_name, "type": f"filesystem_{content_type}", + "status": "Failed", "message": ""} + removed_dirs = [] + + cluster_path = os.path.join(repo_store_path, "offline_repo", "cluster") + if not os.path.exists(cluster_path): + result["message"] = f"Content store path not found: {cluster_path}" + logger.warning(result["message"]) + return result + + try: + for arch in ARCH_SUFFIXES: + # Walk version directories (e.g., rhel/10.0) + arch_path = os.path.join(cluster_path, arch) + if not os.path.isdir(arch_path): + continue + + for version_dir in glob.glob(f"{arch_path}/rhel/*/"): + content_dir = os.path.join(version_dir, content_type, content_name) + if os.path.exists(content_dir): + logger.info(f"Removing content directory: {content_dir}") + if os.path.isdir(content_dir): + shutil.rmtree(content_dir) + else: + os.remove(content_dir) + removed_dirs.append(content_dir) + + if removed_dirs: + result["status"] = "Success" + result["message"] = f"Removed content: {', '.join(removed_dirs)}" + else: + result["message"] = (f"No filesystem content found for " + f"'{content_name}' under {content_type}") + logger.info(result["message"]) + + except Exception as e: + result["message"] = f"Filesystem cleanup error: {str(e)}" + logger.error(f"Failed to cleanup content {content_name}: {e}") + + return result # ============================================================================= @@ -868,7 +948,8 @@ def run_module(): cleanup_repos=dict(type='list', elements='str', default=[]), cleanup_containers=dict(type='list', elements='str', default=[]), cleanup_files=dict(type='list', elements='str', default=[]), - 
base_path=dict(type='str', default=CLEANUP_BASE_PATH_DEFAULT) + base_path=dict(type='str', default=CLEANUP_BASE_PATH_DEFAULT), + repo_store_path=dict(type='str', default='/opt/omnia') ), supports_check_mode=True ) @@ -877,6 +958,7 @@ def run_module(): cleanup_containers = module.params['cleanup_containers'] cleanup_files = module.params['cleanup_files'] base_path = module.params['base_path'] + repo_store_path = module.params['repo_store_path'] # Setup logger - setup_standard_logger expects a directory, creates standard.log inside log_dir = os.path.join(base_path, "cleanup") @@ -915,7 +997,7 @@ def run_module(): # Process files for file in cleanup_files: - result = cleanup_file(file, base_path, logger) + result = cleanup_file(file, base_path, repo_store_path, logger) all_results.append(result) logger.info(f"File {file}: {result['status']} - {result['message']}") diff --git a/local_repo/pulp_cleanup.yml b/local_repo/pulp_cleanup.yml index 5d409bbc1f..93e379833b 100644 --- a/local_repo/pulp_cleanup.yml +++ b/local_repo/pulp_cleanup.yml @@ -77,6 +77,8 @@ cleanup_repos: "{{ repo_list | default([]) }}" cleanup_containers: "{{ container_list | default([]) }}" cleanup_files: "{{ file_list | default([]) }}" + base_path: "{{ base_path | default('/opt/omnia/log/local_repo') }}" + repo_store_path: "{{ repo_store_path | default('/opt/omnia') }}" register: cleanup_result post_tasks: From 7ef0c3153135cfdd1d82b59f09ceb9bcc30da584 Mon Sep 17 00:00:00 2001 From: Katakam-Rakesh Date: Thu, 12 Feb 2026 11:44:15 +0530 Subject: [PATCH 03/77] removing doca-ofed from nfs share Signed-off-by: Katakam-Rakesh --- .../templates/doca-ofed/doca-install.sh.j2 | 3 --- discovery/roles/k8s_config/vars/main.yml | 13 ++----------- discovery/roles/slurm_config/vars/main.yml | 12 ++---------- 3 files changed, 4 insertions(+), 24 deletions(-) diff --git a/discovery/roles/configure_ochami/templates/doca-ofed/doca-install.sh.j2 b/discovery/roles/configure_ochami/templates/doca-ofed/doca-install.sh.j2 index 
111abcb3a1..db8a7cb9cc 100644 --- a/discovery/roles/configure_ochami/templates/doca-ofed/doca-install.sh.j2 +++ b/discovery/roles/configure_ochami/templates/doca-ofed/doca-install.sh.j2 @@ -44,9 +44,6 @@ else dnf install -y kernel-headers-$(uname -r) fi -echo "Bootstrap doca-ofed package..." -rpm -i "/var/lib/packages/${arch}/doca-ofed/doca-host-3.2.1-044000_25.10_rhel10.${arch}.rpm" - echo "Installing doca-ofed..." if rpm -q doca-ofed >/dev/null 2>&1; then echo "doca-ofed package is already installed." diff --git a/discovery/roles/k8s_config/vars/main.yml b/discovery/roles/k8s_config/vars/main.yml index 433b8e9f76..a80fb9b257 100644 --- a/discovery/roles/k8s_config/vars/main.yml +++ b/discovery/roles/k8s_config/vars/main.yml @@ -78,19 +78,10 @@ packages_base_dir_aarch64: "{{ k8s_client_mount_path }}/packages/aarch64" offline_repo_basepath_x86_64: "{{ oim_shared_path }}/omnia/offline_repo/cluster/x86_64/rhel/10.0/iso" offline_repo_basepath_aarch64: "{{ oim_shared_path }}/omnia/offline_repo/cluster/aarch64/rhel/10.0/iso" packages_layout_x86_64: - - doca-ofed - cuda packages_layout_aarch64: - - doca-ofed - cuda print_copy_msg: "Copying {{ item.name }} from {{ item.source_path }} to {{ item.dest_path }}" -offline_path_x86_64: - - name: doca-ofed - source_path: "{{ offline_repo_basepath_x86_64 }}/doca-ofed" - dest_path: "{{ packages_base_dir_x86_64 }}/doca-ofed" -offline_path_aarch64: - - name: doca-ofed - source_path: "{{ offline_repo_basepath_aarch64 }}/doca-ofed" - dest_path: "{{ packages_base_dir_aarch64 }}/doca-ofed" - +offline_path_x86_64: [] +offline_path_aarch64: [] ssh_private_key_path: /root/.ssh/oim_rsa diff --git a/discovery/roles/slurm_config/vars/main.yml b/discovery/roles/slurm_config/vars/main.yml index 43ee995e5a..3616b55068 100644 --- a/discovery/roles/slurm_config/vars/main.yml +++ b/discovery/roles/slurm_config/vars/main.yml @@ -141,19 +141,11 @@ packages_base_dir_aarch64: "{{ slurm_config_path }}/packages/aarch64" offline_repo_basepath_x86_64: "{{ 
oim_shared_path }}/omnia/offline_repo/cluster/x86_64/rhel/10.0/iso" offline_repo_basepath_aarch64: "{{ oim_shared_path }}/omnia/offline_repo/cluster/aarch64/rhel/10.0/iso" packages_layout_x86_64: - - doca-ofed - cuda packages_layout_aarch64: - - doca-ofed - cuda print_copy_msg: "Copying {{ item.name }} from {{ item.source_path }} to {{ item.dest_path }}" -offline_path_x86_64: - - name: doca-ofed - source_path: "{{ offline_repo_basepath_x86_64 }}/doca-ofed" - dest_path: "{{ packages_base_dir_x86_64 }}/doca-ofed" -offline_path_aarch64: - - name: doca-ofed - source_path: "{{ offline_repo_basepath_aarch64 }}/doca-ofed" - dest_path: "{{ packages_base_dir_aarch64 }}/doca-ofed" +offline_path_x86_64: [] +offline_path_aarch64: [] ssh_private_key_path: /root/.ssh/oim_rsa From b4f064ee0d7feed5bf0b3bd6233e992a5bd133e1 Mon Sep 17 00:00:00 2001 From: mithileshreddy04 Date: Thu, 12 Feb 2026 15:52:26 +0530 Subject: [PATCH 04/77] Upgrade of input credential files to 2.1 --- .../tasks/display_warnings.yml | 53 ++++++ .../import_input_parameters/tasks/main.yml | 12 ++ .../restore_omnia_config_credentials.yml | 171 ++++++++++++++++++ .../restore_user_registry_credential.yml | 130 +++++++++++++ .../tasks/set_backup_location.yml | 33 ++++ .../templates/omnia_config_credentials.yml.j2 | 48 +++++ .../import_input_parameters/vars/main.yml | 66 ++++++- 7 files changed, 512 insertions(+), 1 deletion(-) create mode 100644 upgrade/roles/import_input_parameters/tasks/display_warnings.yml create mode 100644 upgrade/roles/import_input_parameters/tasks/restore_omnia_config_credentials.yml create mode 100644 upgrade/roles/import_input_parameters/tasks/restore_user_registry_credential.yml create mode 100644 upgrade/roles/import_input_parameters/tasks/set_backup_location.yml create mode 100644 upgrade/roles/import_input_parameters/templates/omnia_config_credentials.yml.j2 diff --git a/upgrade/roles/import_input_parameters/tasks/display_warnings.yml 
b/upgrade/roles/import_input_parameters/tasks/display_warnings.yml new file mode 100644 index 0000000000..ac1eb69998 --- /dev/null +++ b/upgrade/roles/import_input_parameters/tasks/display_warnings.yml @@ -0,0 +1,53 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +- name: Display collected warnings + ansible.builtin.debug: + msg: | + ================================= + UPGRADE WARNINGS SUMMARY + ================================= + + {% if upgrade_warnings | length > 0 %} + {{ upgrade_warnings | length }} warning{{ 's' if upgrade_warnings | length > 1 else '' }} detected. + You will now be shown the detailed list. + {% else %} + No warnings detected. Upgrade completed successfully! + {% endif %} + when: upgrade_warnings is defined + + +- name: Pause for user to review warnings + ansible.builtin.pause: + prompt: | + ╔════════════════════════════════════════════╗ + ║ ⚠️ UPGRADE WARNINGS REVIEW ⚠️ ║ + ╚════════════════════════════════════════════╝ + + {% if upgrade_warnings | length > 0 %} + {{ upgrade_warnings | length }} warning{{ 's' if upgrade_warnings | length > 1 else '' }} detected: + + {% for warning in upgrade_warnings %} + {{ loop.index }}. {{ warning }} + {% endfor %} + + Please review these warnings carefully. + Press ENTER to continue or CTRL+C to abort. + {% else %} + No warnings detected. Upgrade completed successfully! + + Press ENTER to continue... 
+ {% endif %} + when: upgrade_warnings is defined diff --git a/upgrade/roles/import_input_parameters/tasks/main.yml b/upgrade/roles/import_input_parameters/tasks/main.yml index ff77cf2c0e..2aacba7451 100644 --- a/upgrade/roles/import_input_parameters/tasks/main.yml +++ b/upgrade/roles/import_input_parameters/tasks/main.yml @@ -13,6 +13,9 @@ # limitations under the License. --- +- name: Set backup location based on oim_metadata.yml + ansible.builtin.include_tasks: set_backup_location.yml + - name: Validate backup location for upgrade input processing ansible.builtin.include_tasks: precheck_backup_location.yml @@ -39,3 +42,12 @@ - name: Restore input files from backup ansible.builtin.include_tasks: restore_input_files.yml + +- name: Restore user_registry_credential.yml from backup + ansible.builtin.include_tasks: restore_user_registry_credential.yml + +- name: Restore omnia_config_credentials.yml from backup + ansible.builtin.include_tasks: restore_omnia_config_credentials.yml + +- name: Display upgrade warnings summary + ansible.builtin.include_tasks: display_warnings.yml diff --git a/upgrade/roles/import_input_parameters/tasks/restore_omnia_config_credentials.yml b/upgrade/roles/import_input_parameters/tasks/restore_omnia_config_credentials.yml new file mode 100644 index 0000000000..0abafee26b --- /dev/null +++ b/upgrade/roles/import_input_parameters/tasks/restore_omnia_config_credentials.yml @@ -0,0 +1,171 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +--- + +- name: Check if backup omnia_config_credentials.yml exists + ansible.builtin.stat: + path: "{{ backup_location }}/omnia_config_credentials.yml" + register: backup_omnia_config_credentials_stat + +- name: Check if backup omnia_config_credentials_key exists + ansible.builtin.stat: + path: "{{ backup_location }}/.omnia_config_credentials_key" + register: backup_omnia_config_credentials_key_stat + +- name: Add warning for missing omnia_config_credentials.yml to list + ansible.builtin.set_fact: + upgrade_warnings: >- + {{ upgrade_warnings + [msg_omnia_config_credentials_missing] }} + when: + - not backup_omnia_config_credentials_stat.stat.exists + - "'WARNING: omnia_config_credentials.yml not found in backup at' not in (upgrade_warnings | join(' '))" + +- name: Process omnia_config_credentials.yml when present in backup + block: + - name: Check if backup file is encrypted + ansible.builtin.command: + cmd: cat "{{ backup_location }}/omnia_config_credentials.yml" + register: backup_omnia_config_credentials_content + changed_when: false + failed_when: false + no_log: true + + - name: "Case 1: Key present and file encrypted - Process and update" + block: + - name: Copy encrypted omnia_config_credentials.yml from backup to temp location + ansible.builtin.copy: + src: "{{ backup_location }}/omnia_config_credentials.yml" + dest: "{{ input_project_dir }}/omnia_config_credentials.yml.tmp" + mode: '0600' + remote_src: true + + - name: Copy omnia_config_credentials_key from backup + ansible.builtin.copy: + src: "{{ backup_location }}/.omnia_config_credentials_key" + dest: "{{ input_project_dir }}/.omnia_config_credentials_key" + mode: '0600' + remote_src: true + + - name: Decrypt omnia_config_credentials.yml using the key + ansible.builtin.shell: + cmd: | + ansible-vault decrypt "{{ input_project_dir }}/omnia_config_credentials.yml.tmp" \ + --vault-password-file "{{ 
input_project_dir }}/.omnia_config_credentials_key" \ + --output "{{ input_project_dir }}/omnia_config_credentials.yml.decrypted" + args: + executable: /bin/bash + no_log: true + register: vault_decrypt_result + failed_when: vault_decrypt_result.rc != 0 + + - name: Read decrypted content + ansible.builtin.slurp: + src: "{{ input_project_dir }}/omnia_config_credentials.yml.decrypted" + register: decrypted_content + no_log: true + + - name: Parse YAML content and extract credentials + ansible.builtin.set_fact: + credentials_dict: >- + {{ decrypted_content.content | b64decode | from_yaml }} + no_log: true + + rescue: + - name: Fail with decryption error message + ansible.builtin.fail: + msg: "{{ msg_omnia_config_decrypt_error }}" + + - name: "Case 1.1: Apply template and encrypt" + block: + - name: Set template variables from credentials + ansible.builtin.set_fact: + provision_password: "{{ credentials_dict.provision_password | default('') }}" + bmc_username: "{{ credentials_dict.bmc_username | default('') }}" + bmc_password: "{{ credentials_dict.bmc_password | default('') }}" + minio_s3_password: "{{ credentials_dict.minio_s3_password | default('') }}" + pulp_password: "{{ credentials_dict.pulp_password | default('') }}" + docker_username: "{{ credentials_dict.docker_username | default('') }}" + docker_password: "{{ credentials_dict.docker_password | default('') }}" + slurm_db_password: "{{ credentials_dict.slurm_db_password | default('') }}" + openldap_db_username: "{{ credentials_dict.openldap_db_username | default('') }}" + openldap_db_password: "{{ credentials_dict.openldap_db_password | default('') }}" + mysqldb_user: "{{ credentials_dict.mysqldb_user | default('') }}" + mysqldb_password: "{{ credentials_dict.mysqldb_password | default('') }}" + mysqldb_root_password: "{{ credentials_dict.mysqldb_root_password | default('') }}" + csi_username: "{{ credentials_dict.csi_username | default('') }}" + csi_password: "{{ credentials_dict.csi_password | default('') }}" 
+ ldms_sampler_password: "{{ credentials_dict.ldms_sampler_password | default('') }}" + no_log: true + + - name: Write updated content using template + ansible.builtin.template: + src: omnia_config_credentials.yml.j2 + dest: "{{ input_project_dir }}/omnia_config_credentials.yml.decrypted" + mode: '0600' + no_log: true + + - name: Encrypt updated file using the same key + ansible.builtin.shell: + cmd: | + ansible-vault encrypt "{{ input_project_dir }}/omnia_config_credentials.yml.decrypted" \ + --vault-password-file "{{ input_project_dir }}/.omnia_config_credentials_key" \ + --output "{{ input_project_dir }}/omnia_config_credentials.yml" + args: + executable: /bin/bash + no_log: true + register: vault_encrypt_result + failed_when: vault_encrypt_result.rc != 0 + + - name: Clean up temporary files + ansible.builtin.file: + path: "{{ item }}" + state: absent + loop: + - "{{ input_project_dir }}/omnia_config_credentials.yml.tmp" + - "{{ input_project_dir }}/omnia_config_credentials.yml.decrypted" + + - name: Display success message + ansible.builtin.debug: + msg: "{{ msg_omnia_config_credentials_success }}" + + rescue: + - name: Fail with template/encryption error message + ansible.builtin.fail: + msg: "{{ msg_omnia_config_template_error }}\n{{ msg_omnia_config_encrypt_error }}" + when: >- + backup_omnia_config_credentials_key_stat.stat.exists and + backup_omnia_config_credentials_content.stdout is defined and + '$ANSIBLE_VAULT;' in backup_omnia_config_credentials_content.stdout + + - name: "Case 2: Both key and file missing - Add info warning" + ansible.builtin.set_fact: + upgrade_warnings: >- + {{ upgrade_warnings + [msg_omnia_config_credentials_info_missing] }} + when: >- + not backup_omnia_config_credentials_key_stat.stat.exists and + (backup_omnia_config_credentials_content.stdout is not defined or + '$ANSIBLE_VAULT;' not in backup_omnia_config_credentials_content.stdout) and + "'INFO: Both omnia_config_credentials.yml and .omnia_config_credentials_key' not in 
(upgrade_warnings | join(' '))" + + - name: "Case 3: Error - Mismatched state" + ansible.builtin.fail: + msg: "{{ msg_omnia_config_credentials_error }}" + when: >- + (not backup_omnia_config_credentials_key_stat.stat.exists and + backup_omnia_config_credentials_content.stdout is defined and + '$ANSIBLE_VAULT;' in backup_omnia_config_credentials_content.stdout) or + (backup_omnia_config_credentials_key_stat.stat.exists and + backup_omnia_config_credentials_content.stdout is defined and + '$ANSIBLE_VAULT;' not in backup_omnia_config_credentials_content.stdout) + when: backup_omnia_config_credentials_stat.stat.exists diff --git a/upgrade/roles/import_input_parameters/tasks/restore_user_registry_credential.yml b/upgrade/roles/import_input_parameters/tasks/restore_user_registry_credential.yml new file mode 100644 index 0000000000..de337310b8 --- /dev/null +++ b/upgrade/roles/import_input_parameters/tasks/restore_user_registry_credential.yml @@ -0,0 +1,130 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- + +- name: Check if backup user_registry_credential.yml exists + ansible.builtin.stat: + path: "{{ backup_location }}/user_registry_credential.yml" + register: backup_user_registry_credential_stat + +- name: Check if user_registry_credential.yml exists in current directory + ansible.builtin.stat: + path: "{{ input_project_dir }}/user_registry_credential.yml" + register: user_registry_credential_stat + +- name: Check if backup local_repo_credentials_key exists + ansible.builtin.stat: + path: "{{ backup_location }}/.local_repo_credentials_key" + register: backup_local_repo_credentials_key_stat + +- name: Add warning for missing user_registry_credential.yml to list + ansible.builtin.set_fact: + upgrade_warnings: >- + {{ upgrade_warnings + [ + "WARNING: user_registry_credential.yml not found in backup at " + + backup_location + "/user_registry_credential.yml. " + + "This might be due to complete Omnia execution not being completed. " + + "Skipping restoration of this file." + ] }} + when: + - not backup_user_registry_credential_stat.stat.exists + - "'WARNING: user_registry_credential.yml not found in backup at' not in (upgrade_warnings | join(' '))" + +- name: Process user_registry_credential.yml when present in backup + block: + - name: Check if backup file is encrypted + ansible.builtin.command: + cmd: cat "{{ backup_location }}/user_registry_credential.yml" + register: backup_user_registry_content + changed_when: false + failed_when: false + no_log: true + + - name: "Case 1: Key present and file encrypted - Copy both" + block: + - name: Decrypt user_registry_credential.yml using the key + ansible.builtin.shell: + cmd: | + ansible-vault decrypt "{{ input_project_dir }}/user_registry_credential.yml.tmp" \ + --vault-password-file "{{ input_project_dir }}/.local_repo_credentials_key" \ + --output "{{ input_project_dir }}/user_registry_credential.yml.decrypted" + args: + executable: /bin/bash + no_log: true + register: vault_decrypt_result + failed_when: 
vault_decrypt_result.rc != 0 + + - name: Copy encrypted user_registry_credential.yml from backup + ansible.builtin.copy: + src: "{{ backup_location }}/user_registry_credential.yml" + dest: "{{ input_project_dir }}/user_registry_credential.yml" + mode: '0600' + remote_src: true + + - name: Copy local_repo_credentials_key from backup + ansible.builtin.copy: + src: "{{ backup_location }}/.local_repo_credentials_key" + dest: "{{ input_project_dir }}/.local_repo_credentials_key" + mode: '0600' + remote_src: true + + - name: Display success message for encrypted file restoration + ansible.builtin.debug: + msg: | + user_registry_credential.yml restored from backup. + Backup: {{ backup_location }}/user_registry_credential.yml + Target: {{ input_project_dir }}/user_registry_credential.yml + Status: Encrypted (key file also restored) + rescue: + - name: Fail with decryption error message + ansible.builtin.fail: + msg: "{{ msg_user_registry_decrypt_error }}" + when: >- + backup_local_repo_credentials_key_stat.stat.exists and + backup_user_registry_content.stdout is defined and + '$ANSIBLE_VAULT;' in backup_user_registry_content.stdout + + - name: "Case 2: Both key and file missing - Add info warning" + ansible.builtin.set_fact: + upgrade_warnings: >- + {{ upgrade_warnings + [ + "INFO: Both user_registry_credential.yml and .local_repo_credentials_key " + + "are not present in backup. This is expected if registry credentials " + + "were not configured in the source installation." 
+ ] }} + when: >- + not backup_local_repo_credentials_key_stat.stat.exists and + (backup_user_registry_content.stdout is not defined or + '$ANSIBLE_VAULT;' not in backup_user_registry_content.stdout) and + "'INFO: Both user_registry_credential.yml and .local_repo_credentials_key' not in (upgrade_warnings | join(' '))" + + - name: "Case 3: Error - Mismatched state" + ansible.builtin.fail: + msg: | + ERROR: Inconsistent state detected for user_registry_credential.yml: + {% if not backup_local_repo_credentials_key_stat.stat.exists and backup_user_registry_content.stdout is defined and '$ANSIBLE_VAULT;' in backup_user_registry_content.stdout %} + - File is encrypted but key file (.local_repo_credentials_key) is missing + {% elif backup_local_repo_credentials_key_stat.stat.exists and backup_user_registry_content.stdout is defined and '$ANSIBLE_VAULT;' not in backup_user_registry_content.stdout %} + - Key file exists but file is not encrypted + {% endif %} + Please check the backup integrity and ensure both files are present + in consistent states. + when: >- + (not backup_local_repo_credentials_key_stat.stat.exists and + backup_user_registry_content.stdout is defined and + '$ANSIBLE_VAULT;' in backup_user_registry_content.stdout) or + (backup_local_repo_credentials_key_stat.stat.exists and + backup_user_registry_content.stdout is defined and + '$ANSIBLE_VAULT;' not in backup_user_registry_content.stdout) + when: backup_user_registry_credential_stat.stat.exists diff --git a/upgrade/roles/import_input_parameters/tasks/set_backup_location.yml b/upgrade/roles/import_input_parameters/tasks/set_backup_location.yml new file mode 100644 index 0000000000..4f6a96e83f --- /dev/null +++ b/upgrade/roles/import_input_parameters/tasks/set_backup_location.yml @@ -0,0 +1,33 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +- name: Read oim_metadata.yml to get upgrade_backup_dir + ansible.builtin.slurp: + src: /opt/omnia/.data/oim_metadata.yml + register: oim_metadata_slurp + +- name: Parse oim_metadata.yml + ansible.builtin.set_fact: + oim_metadata: "{{ oim_metadata_slurp.content | b64decode | from_yaml }}" + +- name: Set backup_location from metadata + ansible.builtin.set_fact: + backup_location: "{{ oim_metadata.upgrade_backup_dir }}/input/project_default" + when: oim_metadata.upgrade_backup_dir is defined + +- name: Fail if upgrade_backup_dir is not defined in metadata + ansible.builtin.fail: + msg: "{{ msg_upgrade_backup_dir_missing }}" + when: oim_metadata.upgrade_backup_dir is not defined diff --git a/upgrade/roles/import_input_parameters/templates/omnia_config_credentials.yml.j2 b/upgrade/roles/import_input_parameters/templates/omnia_config_credentials.yml.j2 new file mode 100644 index 0000000000..4b3b63d8c7 --- /dev/null +++ b/upgrade/roles/import_input_parameters/templates/omnia_config_credentials.yml.j2 @@ -0,0 +1,48 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +--- + +# Provision credentials +provision_password: "{{ provision_password | default('') }}" +bmc_username: "{{ bmc_username | default('') }}" +bmc_password: "{{ bmc_password | default('') }}" + +# Prepare_oim credentials +minio_s3_password: "{{ minio_s3_password | default('') }}" +pulp_password: "{{ pulp_password | default('') }}" +docker_username: "{{ docker_username | default('') }}" +docker_password: "{{ docker_password | default('') }}" + +# Omnia credentials +slurm_db_password: "{{ slurm_db_password | default('') }}" + +# Security credentials +openldap_db_username: "{{ openldap_db_username | default('') }}" +openldap_db_password: "{{ openldap_db_password | default('') }}" + +# iDrac Telemetry credentials +mysqldb_user: "{{ mysqldb_user | default('') }}" +mysqldb_password: "{{ mysqldb_password | default('') }}" +mysqldb_root_password: "{{ mysqldb_root_password | default('') }}" + +# csi powerscale credentials +csi_username: "{{ csi_username | default('') }}" +csi_password: "{{ csi_password | default('') }}" + +# LDMS sampler +ldms_sampler_password: "{{ ldms_sampler_password | default('') }}" + +# postgres credentials +postgres_user: "{{ postgres_user | default('') }}" +postgres_password: "{{ postgres_password | default('') }}" diff --git a/upgrade/roles/import_input_parameters/vars/main.yml b/upgrade/roles/import_input_parameters/vars/main.yml index c27f111cde..5eee4a2f50 100644 --- a/upgrade/roles/import_input_parameters/vars/main.yml +++ b/upgrade/roles/import_input_parameters/vars/main.yml @@ -13,18 +13,82 @@ # limitations under the License. 
--- -backup_location: /opt/omnia/backups/upgrade/input/project_default +# backup_location will be set from oim_metadata.yml upgrade_backup_dir +# Format: /opt/omnia/backups/upgrade/version_2.0.0.0/input/project_default +backup_location: "" backup_dir_mode: '0755' default_file_mode: '0644' +# List to collect warnings during execution +upgrade_warnings: [] + # Precheck backup location messages msg_backup_location_missing: "backup_location must be provided" +msg_upgrade_backup_dir_missing: "upgrade_backup_dir not found in /opt/omnia/.data/oim_metadata.yml" # Restore input files messages msg_restore_item_name_missing: "restore_item must define 'name'" msg_validation_failed: "Validation failed for {{ restore_item.name }}" msg_backup_file_missing: "Backup file missing: {{ restore_item.name }}" +msg_user_registry_credential_missing: |- + \033[93mWARNING: user_registry_credential.yml not found in backup at {{ backup_location }}/user_registry_credential.yml\033[0m + This might be due to complete Omnia execution not being completed. + Skipping restoration of this file. + +# Omnia config credentials messages +msg_omnia_config_credentials_missing: |- + WARNING: omnia_config_credentials.yml not found in backup at {{ backup_location }}/omnia_config_credentials.yml. + This might be due to complete Omnia execution not being completed. + Skipping restoration of this file. + +msg_omnia_config_credentials_info_missing: |- + INFO: Both omnia_config_credentials.yml and .omnia_config_credentials_key + are not present in backup. This is expected if credentials + were not configured in the source installation. + +msg_omnia_config_credentials_success: |- + omnia_config_credentials.yml restored and updated from backup. 
+ Backup: {{ backup_location }}/omnia_config_credentials.yml + Target: {{ input_project_dir }}/omnia_config_credentials.yml + Status: Updated with postgres credentials and re-encrypted (key file also restored) + +msg_omnia_config_credentials_error: |- + ERROR: Inconsistent state detected for omnia_config_credentials.yml: + {% if not backup_omnia_config_credentials_key_stat.stat.exists and backup_omnia_config_credentials_content.stdout is defined and '$ANSIBLE_VAULT;' in backup_omnia_config_credentials_content.stdout %} + - File is encrypted but key file (.omnia_config_credentials_key) is missing + {% elif backup_omnia_config_credentials_key_stat.stat.exists and backup_omnia_config_credentials_content.stdout is defined and '$ANSIBLE_VAULT;' not in backup_omnia_config_credentials_content.stdout %} + - Key file exists but file is not encrypted + {% endif %} + Please check the backup integrity and ensure both files are present + in consistent states. + +# Rescue warning messages +msg_user_registry_decrypt_error: |- + ERROR: Failed to decrypt user_registry_credential.yml. + The backup key file may be corrupted or incompatible. + Please check the backup integrity and ensure the key file + matches the encrypted file. + +msg_omnia_config_decrypt_error: |- + ERROR: Failed to decrypt omnia_config_credentials.yml. + The backup key file may be corrupted or incompatible. + Please check the backup integrity and ensure the key file + matches the encrypted file. + +msg_omnia_config_template_error: |- + ERROR: Failed to generate updated omnia_config_credentials.yml. + Template processing may have failed due to invalid data format. + Please check the backup file format and ensure it contains valid YAML. + +msg_omnia_config_encrypt_error: |- + ERROR: Failed to encrypt updated omnia_config_credentials.yml. + The key file may be corrupted or there may be permission issues. + Please check the key file integrity and file permissions. + +msg_decryption_failed: "Decryption failed. 
Check warnings for details." +msg_template_failed: "Template processing failed. Check warnings for details." +msg_encryption_failed: "Encryption failed. Check warnings for details." # Network spec transformation messages msg_backup_network_spec_missing: "Backup network_spec.yml missing" From d3b9c749b5096eaa4ca708def872e51ad38e1ed4 Mon Sep 17 00:00:00 2001 From: pullan1 Date: Thu, 12 Feb 2026 16:16:44 +0530 Subject: [PATCH 05/77] Added new package type rpm_repo Signed-off-by: pullan1 --- .../input_validation/common_utils/config.py | 1 + .../library/module_utils/local_repo/config.py | 2 +- .../local_repo/parse_and_download.py | 183 ++++++++++++------ .../module_utils/local_repo/software_utils.py | 6 +- common/library/modules/parallel_tasks.py | 163 ++++++++++------ common/library/modules/pulp_cleanup.py | 177 +++++++++++------ local_repo/pulp_cleanup.yml | 13 +- 7 files changed, 354 insertions(+), 191 deletions(-) diff --git a/common/library/module_utils/input_validation/common_utils/config.py b/common/library/module_utils/input_validation/common_utils/config.py index e6e8a09042..0f369f3950 100644 --- a/common/library/module_utils/input_validation/common_utils/config.py +++ b/common/library/module_utils/input_validation/common_utils/config.py @@ -147,6 +147,7 @@ "rpm": ["package", "repo_name"], "rpm_list": ["package_list", "repo_name"], "rpm_file": ["package", "url"], + "rpm_repo": ["package", "repo_name"], "ansible_galaxy_collection": ["package", "version"], "git": ["package", "version", "url"], "image": ["package", ["tag", "digest"]], # Special: one of tag or digest diff --git a/common/library/module_utils/local_repo/config.py b/common/library/module_utils/local_repo/config.py index 0518e2bb01..cfc3b20c9d 100644 --- a/common/library/module_utils/local_repo/config.py +++ b/common/library/module_utils/local_repo/config.py @@ -51,7 +51,7 @@ # Used by software_utils.py # ---------------------------- PACKAGE_TYPES = ['rpm', 'deb', 'tarball', 'image', 'manifest', 'git', 
- 'pip_module', 'deb', 'shell', 'ansible_galaxy_collection', 'iso', 'rpm_list', 'rpm_file'] + 'pip_module', 'deb', 'shell', 'ansible_galaxy_collection', 'iso', 'rpm_list', 'rpm_file', 'rpm_repo'] CSV_COLUMNS = {"column1": "name", "column2": "status"} SOFTWARE_CONFIG_SUBDIR = "config" RPM_LABEL_TEMPLATE = "RPMs for {key}" diff --git a/common/library/module_utils/local_repo/parse_and_download.py b/common/library/module_utils/local_repo/parse_and_download.py index 367f9561f5..72efd4566b 100644 --- a/common/library/module_utils/local_repo/parse_and_download.py +++ b/common/library/module_utils/local_repo/parse_and_download.py @@ -1,4 +1,4 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,12 +12,19 @@ # See the License for the specific language governing permissions and # limitations under the License. # pylint: disable=import-error,no-name-in-module +""" +Utility functions for parsing and downloading artifacts. + +This module provides common functions for command execution, status file management, +and repository operations used across the local repo management system. 
+""" + import os import subprocess import json import re from multiprocessing import Lock -from ansible.module_utils.local_repo.standard_logger import setup_standard_logger +from ansible.module_utils.local_repo.config import ARCH_SUFFIXES, STATUS_CSV_HEADER def mask_sensitive_data(cmd_string): @@ -57,35 +64,87 @@ def execute_command(cmd_string, logger, type_json=False): stderr=subprocess.PIPE, shell=True, ) - - status["returncode"] = cmd.returncode - status["stdout"] = cmd.stdout.strip() if cmd.stdout else None - status["stderr"] = cmd.stderr.strip() if cmd.stderr else None - - if cmd.returncode != 0: - logger.error(f"Command failed with return code {cmd.returncode}") - logger.error(f"Error: {status['stderr']}") - return False - - if type_json and status["stdout"]: - try: - status["stdout"] = json.loads(status["stdout"]) - except json.JSONDecodeError as error: - logger.error(f"Failed to parse JSON output: {error}") - return False - - return status - - except Exception as error: - logger.error(f"Error executing command: {error}") + logger.info(f"Command succeeded: {cmd_string}") + return True + except subprocess.CalledProcessError as e: + logger.error(f"Command failed: {cmd_string} - {e}") + return False + except subprocess.TimeoutExpired as e: + logger.error(f"Command timed out: {cmd_string} - {e}") + return False + except OSError as e: + logger.error(f"OS error during command: {cmd_string} - {e}") return False finally: logger.info("#" * 30 + f" {execute_command.__name__} end " + "#" * 30) +def get_arch_from_status_path(status_file_path): + """Extract architecture from status file path. 
+ + Args: + status_file_path: Path like '/opt/omnia/log/local_repo/x86_64/software_name/status.csv' + + Returns: + str: Architecture ('x86_64' or 'aarch64') or None if not found + """ + for arch in ARCH_SUFFIXES: + if f"/{arch}/" in status_file_path: + return arch + return None + +def _prefix_repo_name_with_arch(repo_name: str, status_file_path: str, logger) -> str: + """Add architecture prefix to repo_name if not already present. + + Args: + repo_name: Repository name to prefix + status_file_path: Path to extract architecture from + logger: Logger instance + + Returns: + str: Repository name with architecture prefix + """ + if not repo_name: + return repo_name + + arch = get_arch_from_status_path(status_file_path) + if arch and not any(repo_name.startswith(f"{prefix}_") for prefix in ARCH_SUFFIXES): + prefixed_name = f"{arch}_{repo_name}" + logger.info(f"Auto-prefixed repo_name with architecture: {prefixed_name}") + return prefixed_name + return repo_name + + +def _update_existing_line(line: str, package_name: str, package_type: str, status: str, repo_name: str, status_file_path: str) -> str: + """Update an existing line in status file. 
+ + Args: + line: Existing line content + package_name: Package name to match + package_type: Package type + status: New status + repo_name: Repository name + status_file_path: Path for architecture extraction + + Returns: + str: Updated line content + """ + parts = line.strip().split(',') + if len(parts) >= 4: + final_repo_name = _prefix_repo_name_with_arch(repo_name, status_file_path, None) + parts[2] = final_repo_name if final_repo_name else '' + parts[3] = status + return ','.join(parts) + '\n' + + # Handle short lines + final_repo_name = _prefix_repo_name_with_arch(repo_name, status_file_path, None) + return f"{package_name},{package_type},{final_repo_name if final_repo_name else ''},{status}\n" + + def write_status_to_file(status_file_path, package_name, package_type, status, logger, file_lock: Lock, repo_name=None): """ - Writes or updates the status of a package in the status file, using a lock to ensure safe access across processes. + Writes or updates the status of a package in the status file. 
+ Args: status_file_path: Path to the status file package_name: Name of the package @@ -97,44 +156,56 @@ def write_status_to_file(status_file_path, package_name, package_type, status, l """ logger.info("#" * 30 + f" {write_status_to_file.__name__} start " + "#" * 30) + # Auto-prefix repo_name with architecture if needed + repo_name = _prefix_repo_name_with_arch(repo_name, status_file_path, logger) + try: with file_lock: # Ensure only one process can write at a time if os.path.exists(status_file_path): - with open(status_file_path, "r") as f: - lines = f.readlines() - - updated = False - with open(status_file_path, "w") as f: - # Write header (new files always have repo_name column) - if lines: - f.write(lines[0]) # Keep existing header - - # Write data lines - for line in lines[1:]: # Skip header - if line.startswith(f"{package_name},"): - # f.write(f"{package_name},{package_type},{status}\n") - # Update existing line with repo_name (order: name,type,repo_name,status) - parts = line.strip().split(',') - if len(parts) >= 4: - parts[2] = repo_name if repo_name else '' - parts[3] = status - f.write(','.join(parts) + '\n') - else: - f.write(f"{package_name},{package_type},{repo_name if repo_name else ''},{status}\n") - updated = True - else: - f.write(line) - - if not updated: - f.write(f"{package_name},{package_type},{repo_name if repo_name else ''},{status}\n") + _update_existing_file(status_file_path, package_name, package_type, status, repo_name) else: - with open(status_file_path, "w") as f: - f.write(STATUS_CSV_HEADER) - f.write(f"{package_name},{package_type},{repo_name if repo_name else ''},{status}\n") + _create_new_file(status_file_path, package_name, package_type, status, repo_name) logger.info(f"Status written to {status_file_path} for {package_name}.") - except Exception as e: + except OSError as e: logger.error(f"Failed to write to status file: {status_file_path}. Error: {str(e)}") - raise RuntimeError(f"Failed to write to status file: {status_file_path}. 
Error: {str(e)}") + raise RuntimeError( + f"Failed to write to status file: {status_file_path}. Error: {str(e)}" + ) from e finally: logger.info("#" * 30 + f" {write_status_to_file.__name__} end " + "#" * 30) + + +def _update_existing_file(status_file_path, package_name, package_type, status, repo_name): + """Update existing status file with new package status.""" + with open(status_file_path, "r", encoding='utf-8') as f: + lines = f.readlines() + + updated = False + with open(status_file_path, "w", encoding='utf-8') as f: + # Write header + if lines: + f.write(lines[0]) + + # Write data lines + for line in lines[1:]: # Skip header + if line.startswith(f"{package_name},"): + updated_line = _update_existing_line( + line, package_name, package_type, status, repo_name, status_file_path + ) + f.write(updated_line) + updated = True + else: + f.write(line) + + if not updated: + final_repo_name = _prefix_repo_name_with_arch(repo_name, status_file_path, None) + f.write(f"{package_name},{package_type},{final_repo_name if final_repo_name else ''},{status}\n") + + +def _create_new_file(status_file_path, package_name, package_type, status, repo_name): + """Create new status file with package status.""" + with open(status_file_path, "w", encoding='utf-8') as f: + f.write(STATUS_CSV_HEADER) + final_repo_name = _prefix_repo_name_with_arch(repo_name, status_file_path, None) + f.write(f"{package_name},{package_type},{final_repo_name if final_repo_name else ''},{status}\n") diff --git a/common/library/module_utils/local_repo/software_utils.py b/common/library/module_utils/local_repo/software_utils.py index a915f25f8b..3e06ddc7cd 100644 --- a/common/library/module_utils/local_repo/software_utils.py +++ b/common/library/module_utils/local_repo/software_utils.py @@ -179,7 +179,7 @@ def transform_package_dict(data, arch_val,logger): repo_mapping = {} for item in items: - if item.get("type") == "rpm": + if item.get("type") in ("rpm", "rpm_repo"): rpm_packages.append(item["package"]) # 
Preserve repo_name if available if "repo_name" in item: @@ -832,7 +832,7 @@ def remove_duplicates_from_trans(trans): if group == "default_packages": # Handle nested rpm_list case for pkg in items: - if pkg.get("type") == "rpm" and "rpm_list" in pkg: + if pkg.get("type") in ("rpm", "rpm_repo") and "rpm_list" in pkg: pkg["rpm_list"] = list(dict.fromkeys(pkg["rpm_list"])) continue @@ -856,7 +856,7 @@ def remove_duplicates_from_trans(trans): elif type_ == "git": key = (item.get("url"), item.get("version")) - elif type_ == "rpm" and "rpm_list" in item: + elif type_ in ("rpm", "rpm_repo") and "rpm_list" in item: item["rpm_list"] = list(dict.fromkeys(item["rpm_list"])) key = item.get("package") diff --git a/common/library/modules/parallel_tasks.py b/common/library/modules/parallel_tasks.py index 5951a525b2..17c14cf51f 100644 --- a/common/library/modules/parallel_tasks.py +++ b/common/library/modules/parallel_tasks.py @@ -34,7 +34,9 @@ from ansible.module_utils.local_repo.download_image import process_image from ansible.module_utils.local_repo.download_rpm import process_rpm from ansible.module_utils.local_repo.standard_logger import setup_standard_logger -from ansible.module_utils.local_repo.common_functions import generate_vault_key, process_file, is_encrypted +from ansible.module_utils.local_repo.common_functions import ( + generate_vault_key, process_file, is_encrypted +) from ansible.module_utils.local_repo.software_utils import ( load_json, set_version_variables, @@ -125,7 +127,10 @@ def update_status_csv(csv_dir, software, overall_status,slogger): slogger.info(f"Successfully updated status CSV at {status_file}") -def determine_function(task, repo_store_path, csv_file_path, user_data, version_variables, arc, user_registries, docker_username, docker_password): +def determine_function( + task, repo_store_path, csv_file_path, user_data, version_variables, arc, + user_registries, docker_username, docker_password +): """ Determines the appropriate function and its 
arguments to process a given task. @@ -160,27 +165,55 @@ def determine_function(task, repo_store_path, csv_file_path, user_data, version_ task_type = task.get("type") if task_type == "manifest": - return process_manifest, [task, repo_store_path, status_file, cluster_os_type, cluster_os_version, arc] + return process_manifest, [ + task, repo_store_path, status_file, cluster_os_type, + cluster_os_version, arc + ] if task_type == "git": - return process_git, [task, repo_store_path, status_file, cluster_os_type, cluster_os_version, arc] + return process_git, [ + task, repo_store_path, status_file, cluster_os_type, + cluster_os_version, arc + ] if task_type == "tarball": - return process_tarball, [task, repo_store_path, status_file, version_variables, cluster_os_type, cluster_os_version, arc] + return process_tarball, [ + task, repo_store_path, status_file, version_variables, + cluster_os_type, cluster_os_version, arc + ] if task_type == "shell": - return process_shell, [task, repo_store_path, status_file, cluster_os_type, cluster_os_version, arc] + return process_shell, [ + task, repo_store_path, status_file, cluster_os_type, + cluster_os_version, arc + ] if task_type == "ansible_galaxy_collection": - return process_ansible_galaxy_collection, [task, repo_store_path, status_file, cluster_os_type, cluster_os_version, arc] + return process_ansible_galaxy_collection, [ + task, repo_store_path, status_file, cluster_os_type, + cluster_os_version, arc + ] if task_type == "iso": - return process_iso, [task, repo_store_path, status_file, - cluster_os_type, cluster_os_version, version_variables, arc] + return process_iso, [ + task, repo_store_path, status_file, cluster_os_type, + cluster_os_version, version_variables, arc + ] if task_type == "pip_module": - return process_pip, [task, repo_store_path, status_file, cluster_os_type, cluster_os_version, arc] + return process_pip, [ + task, repo_store_path, status_file, cluster_os_type, + cluster_os_version, arc + ] if task_type == 
"image": - return process_image, [task, status_file, version_variables, user_registries, docker_username, docker_password] + return process_image, [ + task, status_file, version_variables, user_registries, + docker_username, docker_password + ] if task_type == "rpm_file": - return process_rpm_file, [task, repo_store_path, status_file, cluster_os_type, cluster_os_version, arc] - if task_type == "rpm": - return process_rpm, [task, repo_store_path, status_file, - cluster_os_type, cluster_os_version, repo_config_value, arc] + return process_rpm_file, [ + task, repo_store_path, status_file, cluster_os_type, + cluster_os_version, arc + ] + if task_type in ("rpm", "rpm_repo"): + return process_rpm, [ + task, repo_store_path, status_file, cluster_os_type, + cluster_os_version, repo_config_value, arc + ] raise ValueError(f"Unknown task type: {task_type}") except Exception as e: @@ -272,57 +305,43 @@ def main(): Args: tasks (list): A list of tasks (dictionaries) that need to be processed in parallel. nthreads (int): The number of worker processes to run in parallel. - timeout (int): The maximum time allowed for all tasks to execute. If `None`, no timeout is enforced. + timeout (int): The maximum time allowed for all tasks to execute. + If `None`, no timeout is enforced. log_dir (str): The directory where log files for the worker processes will be saved. log_file (str): The path to the log file for the overall task execution. slog_file (str): The path to the log file for the standard logger. csv_file_path (str): The path to a CSV file that may be needed for processing some tasks. repo_store_path (str): The path to the repository where task-related files are stored. software (list): A list of software names. - user_json_file (str): The path to the JSON file containing use - show_softwares_status (bool): Whether to display the software status; optional, defaults to False. 
- overall_status_dict (dict): A list containing overall software status information; optional, defaults to an empty dict. - Dictionary containing software status information grouped by software names. - Each key (e.g., 'service_k8s') maps to a list of dictionaries, - where each dictionary contains: - - 'arch' (str): Architecture name, e.g., 'x86_64' or 'aarch64'. - - 'overall_status' (str): Status of the software on that architecture, e.g., 'SUCCESS'. - Example: - { - "service_k8s": [ - {"arch": "x86_64", "overall_status": "SUCCESS"}, - {"arch": "aarch64", "overall_status": "SUCCESS"} - ] - } - Defaults to an empty dict if not provided. + user_json_file (str): The path to the JSON file containing user data. + show_softwares_status (bool): Whether to display the software status; + optional, defaults to False. + overall_status_dict (dict): A dictionary containing overall software status + information; optional, defaults to an empty dict. + Dictionary containing software status information grouped by software names. + Each key (e.g., 'service_k8s') maps to a list of dictionaries, + where each dictionary contains: + - 'arch' (str): Architecture name, e.g., 'x86_64' or 'aarch64'. + - 'overall_status' (str): Status of the software on that architecture, + e.g., 'SUCCESS'. + Example: + { + "service_k8s": [ + {"arch": "x86_64", "overall_status": "SUCCESS"}, + {"arch": "aarch64", "overall_status": "SUCCESS"} + ] + } + Defaults to an empty dict if not provided. Returns: tuple: A tuple containing: - - overall_status (str): The overall status of task execution ("SUCCESS", "FAILED", "PARTIAL", "TIMEOUT"). - - task_results_data (list): A list of dictionaries, each containing the result of an individual task. + - overall_status (str): The overall status of task execution + ("SUCCESS", "FAILED", "PARTIAL", "TIMEOUT"). + - task_results_data (list): A list of dictionaries, each containing + the result of an individual task. Raises: Exception: If an error occurs during execution. 
""" - # module_args = { - # "tasks": {"type": "list", "required": True}, - # "nthreads": {"type": "int", "required": False, "default": DEFAULT_NTHREADS}, - # "timeout": {"type": "int", "required": False, "default": DEFAULT_TIMEOUT}, - # "log_dir": {"type": "str", "required": False, "default": LOG_DIR_DEFAULT}, - # "log_file": {"type": "str", "required": False, "default": DEFAULT_LOG_FILE}, - # "slog_file": {"type": "str", "required": False, "default": DEFAULT_SLOG_FILE}, - # "csv_file_path": {"type": "str", "required": False, "default": CSV_FILE_PATH_DEFAULT}, - # "repo_store_path": {"type": "str", "required": False, "default": DEFAULT_REPO_STORE_PATH}, - # "software": {"type": "list", "elements": "str", "required": True}, - # "user_json_file": {"type": "str", "required": False, "default": USER_JSON_FILE_DEFAULT}, - # "show_softwares_status": {"type": "bool", "required": False, "default": False}, - # "overall_status_dict": {"type": "dict","required": True}, - # "local_repo_config_path": {"type": "str", "required": False, "default": LOCAL_REPO_CONFIG_PATH_DEFAULT}, - # "arch": {"type": "str", "required": False}, - # "user_reg_cred_input": {"type": "str", "required": False, "default": USER_REG_CRED_INPUT}, - # "user_reg_key_path": {"type": "str", "required": False, "default": USER_REG_KEY_PATH}, - # "omnia_credentials_yaml_path": {"type": "str", "required": False, "default": OMNIA_CREDENTIALS_YAML_PATH}, - # "omnia_credentials_vault_path": {"type": "str", "required": False, "default": OMNIA_CREDENTIALS_VAULT_PATH} - # } module_args = { "tasks": {"type": "list", "required": True}, @@ -337,10 +356,19 @@ def main(): "user_json_file": {"type": "str", "required": False, "default": USER_JSON_FILE_DEFAULT}, "show_softwares_status": {"type": "bool", "required": False, "default": False}, "overall_status_dict": {"type": "dict","required": True}, - "local_repo_config_path": {"type": "str", "required": False, "default": LOCAL_REPO_CONFIG_PATH_DEFAULT}, + 
"local_repo_config_path": { + "type": "str", "required": False, + "default": LOCAL_REPO_CONFIG_PATH_DEFAULT + }, "arch": {"type": "str", "required": False}, - "omnia_credentials_yaml_path": {"type": "str", "required": False, "default": OMNIA_CREDENTIALS_YAML_PATH}, - "omnia_credentials_vault_path": {"type": "str", "required": False, "default": OMNIA_CREDENTIALS_VAULT_PATH} + "omnia_credentials_yaml_path": { + "type": "str", "required": False, + "default": OMNIA_CREDENTIALS_YAML_PATH + }, + "omnia_credentials_vault_path": { + "type": "str", "required": False, + "default": OMNIA_CREDENTIALS_VAULT_PATH + } } module = AnsibleModule(argument_spec=module_args, supports_check_mode=True) tasks = module.params["tasks"] @@ -386,24 +414,29 @@ def main(): cluster_os_type = user_data['cluster_os_type'] cluster_os_version = user_data['cluster_os_version'] - subgroup_dict, software_names = get_subgroup_dict(user_data,slogger) - version_variables = set_version_variables(user_data, software_names, cluster_os_version,slogger) + subgroup_dict, software_names = get_subgroup_dict(user_data, slogger) + version_variables = set_version_variables( + user_data, software_names, cluster_os_version, slogger + ) slogger.info(f"Cluster OS: {cluster_os_type}") slogger.info(f"Version Variables: {version_variables}") # gen_result = {} # if not os.path.isfile(user_reg_key_path): # gen_result = generate_vault_key(user_reg_key_path) # if gen_result is None: - # module.fail_json(msg=f"Unable to generate local_repo key at path: {user_reg_key_path}") + # module.fail_json( + # msg=f"Unable to generate local_repo key at path: {user_reg_key_path}" + # ) overall_status, task_results = execute_parallel( tasks, determine_function, nthreads, repo_store_path, csv_file_path, - log_dir, user_data, version_variables, arc, slogger, local_repo_config_path, - omnia_credentials_yaml_path, omnia_credentials_vault_path, timeout + log_dir, user_data, version_variables, arc, slogger, + local_repo_config_path, 
omnia_credentials_yaml_path, + omnia_credentials_vault_path, timeout ) # if not is_encrypted(user_reg_cred_input): - # process_file(user_reg_cred_input,user_reg_key_path,'encrypt') + # process_file(user_reg_cred_input, user_reg_key_path, 'encrypt') end_time = datetime.now() formatted_end_time = end_time.strftime("%I:%M:%S %p") @@ -442,7 +475,9 @@ def main(): except Exception as e: - result["table_output"] = table_output if "table_output" in locals() else "No table generated." + result["table_output"] = ( + table_output if "table_output" in locals() else "No table generated." + ) slogger.error(f"Execution failed: {str(e)}") module.fail_json(msg=f"Error during execution: {str(e)}", **result) diff --git a/common/library/modules/pulp_cleanup.py b/common/library/modules/pulp_cleanup.py index f3da3e2004..a3c155ebdb 100644 --- a/common/library/modules/pulp_cleanup.py +++ b/common/library/modules/pulp_cleanup.py @@ -137,7 +137,10 @@ def validate_container_format(image_name: str) -> Tuple[bool, str]: # Must contain at least one '/' to indicate registry/image format if '/' not in image_name: - return False, f"Invalid format '{image_name}'. Must include registry (e.g., registry.k8s.io/pause, docker.io/library/busybox)" + return False, ( + f"Invalid format '{image_name}'. Must include registry " + "(e.g., registry.k8s.io/pause, docker.io/library/busybox)" + ) # Must have a registry part (contains '.' or is a known registry) parts = image_name.split('/') @@ -145,7 +148,10 @@ def validate_container_format(image_name: str) -> Tuple[bool, str]: # Check if registry looks valid (contains dot or is localhost) if '.' not in registry and registry != 'localhost' and ':' not in registry: - return False, f"Invalid registry '{registry}' in '{image_name}'. Registry must be a domain (e.g., docker.io, registry.k8s.io)" + return False, ( + f"Invalid registry '{registry}' in '{image_name}'. 
" + "Registry must be a domain (e.g., docker.io, registry.k8s.io)" + ) return True, "" @@ -173,7 +179,9 @@ def detect_file_type(name: str) -> str: if '==' in name: return "pip_module" # Ansible Galaxy collection: contains . but no / or == (e.g., community.general, ansible.posix) - if '.' in name and '/' not in name and '==' not in name and any(x in name.lower() for x in ['ansible', 'community', 'galaxy']): + if '.' in name and '/' not in name and '==' not in name and any( + x in name.lower() for x in ['ansible', 'community', 'galaxy'] + ): return "ansible_galaxy_collection" if name.startswith('ansible_galaxy_collection'): return "ansible_galaxy_collection" @@ -296,7 +304,9 @@ def cleanup_container(user_input: str, base_path: str, logger) -> Dict[str, Any] # Check existence if not container_exists(pulp_name, logger): - result["message"] = f"Container not found in Pulp (looked for: {pulp_name})" + result["message"] = ( + f"Container not found in Pulp (looked for: {pulp_name})" + ) return result try: @@ -368,7 +378,8 @@ def delete_file_from_pulp(name: str, repo_name: str, content_href: str, logger) # 1. 
Remove content from repository if content_href: remove_result = run_cmd( - f"pulp file repository content remove --repository {repo_name} --href {content_href}", + f"pulp file repository content remove --repository {repo_name} " + f"--href {content_href}", logger ) if remove_result["rc"] == 0: @@ -376,7 +387,8 @@ def delete_file_from_pulp(name: str, repo_name: str, content_href: str, logger) else: # Try alternative: modify repository to remove content run_cmd( - f"pulp file repository content modify --repository {repo_name} --remove-content '[{{\"pulp_href\": \"{content_href}\"}}]'", + f"pulp file repository content modify --repository {repo_name} " + f"--remove-content '[{{\"pulp_href\": \"{content_href}\"}}]'", logger ) @@ -444,7 +456,9 @@ def cleanup_pip_module(name: str, base_path: str, repo_store_path: str, logger) messages.append("Orphan cleanup completed") else: # Try listing repos to find partial match - repo_list = run_cmd(pulp_python_commands["list_repositories"], logger) + repo_list = run_cmd( + pulp_python_commands["list_repositories"], logger + ) if repo_list["rc"] == 0: repos = safe_json_parse(repo_list["stdout"]) for repo in repos: @@ -533,7 +547,9 @@ def cleanup_file_repository(name: str, file_type: str, base_path: str, repo_stor messages.append("Repository deleted") else: # Try listing repos to find partial match - repo_list = run_cmd(pulp_file_commands["list_repositories"], logger) + repo_list = run_cmd( + pulp_file_commands["list_repositories"], logger + ) if repo_list["rc"] == 0: repos = safe_json_parse(repo_list["stdout"]) for repo in repos: @@ -569,7 +585,9 @@ def cleanup_file_repository(name: str, file_type: str, base_path: str, repo_stor mark_software_partial(affected, base_path, logger, file_type) # Clean up uploaded content from filesystem - fs_result = cleanup_content_directory(name, file_type, repo_store_path, logger) + fs_result = cleanup_content_directory( + name, file_type, repo_store_path, logger + ) if fs_result["status"] == 
"Success": content_removed = True messages.append(fs_result["message"]) @@ -673,67 +691,82 @@ def cleanup_content_directory(content_name: str, content_type: str, repo_store_p # STATUS FILE UPDATES # ============================================================================= -def remove_rpms_from_repository(repo_name: str, base_path: str, logger) -> List[str]: +def remove_rpms_from_repository(repo_name: str, base_path: str, logger) -> Dict[str, List[str]]: """Remove RPMs that belong to a specific repository from status files. - + Uses the repo_name column in status.csv to accurately identify RPMs from the repository. - + Now that all repo_names include architecture prefixes, the logic is simplified. + Args: - repo_name: Repository name (e.g., 'x86_64_appstream') + repo_name: Repository name (e.g., 'x86_64_appstream', 'aarch64_epel') base_path: Base path for status files logger: Logger instance - + Returns: - List of software names that were affected + Dict mapping architecture to list of affected software names """ - affected_software = [] + affected_software = {} logger.info(f"Removing RPMs from status.csv for repository: {repo_name}") - try: - for arch in ARCH_SUFFIXES: - for status_file in glob.glob(f"{base_path}/{arch}/*/status.csv"): - rows = [] - removed = False - has_repo_column = False - # Check if file has repo_name column - with open(status_file, 'r', encoding='utf-8') as f: - header = f.readline().strip().lower() - has_repo_column = "repo_name" in header + # Extract architecture from repo_name (all repo_names should now have arch prefixes) + target_arch = None + for arch in ARCH_SUFFIXES: + if repo_name.startswith(f"{arch}_"): + target_arch = arch + break + + if not target_arch: + logger.error(f"Repository name {repo_name} does not have architecture prefix") + return {} + + logger.info(f"Processing architecture: {target_arch}") + affected_software[target_arch] = [] + + try: + for status_file in glob.glob(f"{base_path}/{target_arch}/*/status.csv"): + 
rows = [] + removed = False + has_repo_column = False - with open(status_file, 'r', encoding='utf-8') as f: - reader = csv.DictReader(f) - fieldnames = reader.fieldnames - for row in reader: - name = row.get('name', '') - row_type = row.get('type', '') - rpm_repo = row.get('repo_name', '') - - logger.info(f"Processing row: {row}") - # For RPMs, check if they belong to the deleted repository - if row_type == 'rpm' or row_type == 'rpm_file': - if has_repo_column and rpm_repo == repo_name: - removed = True - logger.info(f"Removing RPM '{name}' from {status_file} (repo {repo_name} deleted)") - else: - rows.append(row) + # Check if file has repo_name column + with open(status_file, 'r', encoding='utf-8') as f: + header = f.readline().strip().lower() + has_repo_column = "repo_name" in header + + with open(status_file, 'r', encoding='utf-8') as f: + reader = csv.DictReader(f) + fieldnames = reader.fieldnames + for row in reader: + name = row.get('name', '') + row_type = row.get('type', '') + rpm_repo = row.get('repo_name', '') + + logger.info(f"Processing row: {row}") + # For RPMs, check if they belong to the deleted repository + if row_type in ('rpm', 'rpm_repo', 'rpm_file'): + if has_repo_column and rpm_repo == repo_name: + removed = True + logger.info(f"Removing RPM '{name}' from {status_file} (repo {repo_name} deleted)") else: rows.append(row) + else: + rows.append(row) - if removed and fieldnames: - with open(status_file, 'w', newline='', encoding='utf-8') as f: - writer = csv.DictWriter(f, fieldnames=fieldnames) - writer.writeheader() - writer.writerows(rows) + if removed and fieldnames: + with open(status_file, 'w', newline='', encoding='utf-8') as f: + writer = csv.DictWriter(f, fieldnames=fieldnames) + writer.writeheader() + writer.writerows(rows) - # Track affected software - software_name = os.path.basename(os.path.dirname(status_file)) - if software_name not in affected_software: - affected_software.append(software_name) + # Track affected software + 
software_name = os.path.basename(os.path.dirname(status_file)) + if software_name not in affected_software[target_arch]: + affected_software[target_arch].append(software_name) return affected_software except Exception as e: logger.error(f"Failed to remove RPMs from repository {repo_name}: {e}") - return [] + return {} def remove_from_status_files(artifact_name: str, artifact_type: str, base_path: str, logger) -> Dict[str, List[str]]: """Remove artifact from status.csv files and return affected software names by architecture. @@ -798,10 +831,10 @@ def remove_from_status_files(artifact_name: str, artifact_type: str, base_path: def mark_software_partial(affected_software, base_path: str, logger, artifact_type: str = None): """Mark software entries as partial in software.csv. - + Args: - affected_software: Either a List[str] of software names (from remove_rpms_from_repository) - or a Dict[str, List[str]] mapping arch to software names (from remove_from_status_files) + affected_software: Either a List[str] of software names (legacy support) + or a Dict[str, List[str]] mapping arch to software names base_path: Base path for software.csv logger: Logger instance artifact_type: Type of artifact being removed (for logging purposes) @@ -811,8 +844,11 @@ def mark_software_partial(affected_software, base_path: str, logger, artifact_ty logger.info("No affected software to mark as partial") return - # Normalize input: if a flat list is passed, apply to all architectures + # Normalize input: convert to arch_software_map if needed if isinstance(affected_software, list): + # Legacy list input - this should not happen with new remove_rpms_from_repository + # but we keep it for backward compatibility + logger.warning("Received list input to mark_software_partial, applying to all architectures (legacy behavior)") arch_software_map = {arch: affected_software for arch in ARCH_SUFFIXES} else: arch_software_map = affected_software @@ -869,7 +905,7 @@ def software_has_rpms(software_name: 
str, arch: str, base_path: str, logger) -> with open(status_file, 'r', encoding='utf-8') as f: reader = csv.DictReader(f) for row in reader: - if row.get('type', '').lower() == 'rpm': + if row.get('type', '').lower() in ('rpm', 'rpm_repo'): return True return False except OSError as e: @@ -892,7 +928,9 @@ def mark_all_software_partial(base_path: str, logger): try: for arch in ARCH_SUFFIXES: software_file = f"{base_path}/{arch}/software.csv" - logger.info(f"Processing software file: {software_file}") + logger.info( + f"Processing software file: {software_file}" + ) if not os.path.exists(software_file): logger.info(f"Software file not found: {software_file}") @@ -948,8 +986,12 @@ def run_module(): cleanup_repos=dict(type='list', elements='str', default=[]), cleanup_containers=dict(type='list', elements='str', default=[]), cleanup_files=dict(type='list', elements='str', default=[]), - base_path=dict(type='str', default=CLEANUP_BASE_PATH_DEFAULT), - repo_store_path=dict(type='str', default='/opt/omnia') + base_path=dict( + type='str', default=CLEANUP_BASE_PATH_DEFAULT + ), + repo_store_path=dict( + type='str', default='/opt/omnia' + ) ), supports_check_mode=True ) @@ -966,16 +1008,25 @@ def run_module(): logger = setup_standard_logger(log_dir) # Handle 'all' keyword for repositories only - cleanup_all_repos = cleanup_repos and len(cleanup_repos) == 1 and cleanup_repos[0].lower() == 'all' + cleanup_all_repos = ( + cleanup_repos and len(cleanup_repos) == 1 and + cleanup_repos[0].lower() == 'all' + ) #if cleanup_repos and len(cleanup_repos) == 1 and cleanup_repos[0].lower() == 'all': if cleanup_all_repos: logger.info("cleanup_repos='all' - fetching all repositories from Pulp") cleanup_repos = get_all_repositories(logger) if not cleanup_repos: - module.fail_json(msg="Failed to retrieve repository list from Pulp. Please check if Pulp services are running.") + module.fail_json( + msg="Failed to retrieve repository list from Pulp. 
" + "Please check if Pulp services are running." + ) logger.info(f"Found {len(cleanup_repos)} repositories to cleanup: {cleanup_repos}") - logger.info(f"Starting cleanup - repos: {cleanup_repos}, containers: {cleanup_containers}, files: {cleanup_files}") + logger.info( + f"Starting cleanup - repos: {cleanup_repos}, " + f"containers: {cleanup_containers}, files: {cleanup_files}" + ) all_results = [] diff --git a/local_repo/pulp_cleanup.yml b/local_repo/pulp_cleanup.yml index 93e379833b..6f54e5f45f 100644 --- a/local_repo/pulp_cleanup.yml +++ b/local_repo/pulp_cleanup.yml @@ -15,10 +15,15 @@ # Pulp Cleanup Playbook - Clean Architecture # # Usage: -# ansible-playbook pulp_cleanup_v2.yml -e '{"cleanup_repos": ["epel", "baseos"]}' -# ansible-playbook pulp_cleanup_v2.yml -e '{"cleanup_containers": ["nginx", "redis"]}' -# ansible-playbook pulp_cleanup_v2.yml -e '{"cleanup_files": ["git", "chart-0.48.0"]}' -# ansible-playbook pulp_cleanup_v2.yml -e '{"cleanup_repos": ["epel"], "cleanup_containers": ["nginx"]}' -e force=true +# # Repository cleanup (include architecture prefix) +# ansible-playbook pulp_cleanup.yml -e "cleanup_repos=x86_64_epel,aarch64_epel" +# ansible-playbook pulp_cleanup.yml -e "cleanup_repos=x86_64_appstream" +# ansible-playbook pulp_cleanup.yml -e "cleanup_containers=nginx,redis" +# ansible-playbook pulp_cleanup.yml -e "cleanup_files=git,chart-0.48.0" +# ansible-playbook pulp_cleanup.yml -e "cleanup_repos=x86_64_epel -e cleanup_containers=nginx -e force=true" +# +# # Examples: x86_64_epel, aarch64_epel, x86_64_appstream, aarch64_baseos +# # Note: Use architecture prefix (x86_64_ or aarch64_) for repository names - name: Pulp Cleanup hosts: localhost From 2898ff029a86ea9c326bea156f2162d9548e1d86 Mon Sep 17 00:00:00 2001 From: pullan1 Date: Thu, 12 Feb 2026 17:36:48 +0530 Subject: [PATCH 06/77] input config changes Signed-off-by: pullan1 --- input/config/aarch64/rhel/10.0/slurm_custom.json | 5 +---- input/config/x86_64/rhel/10.0/slurm_custom.json | 5 
+---- input/local_repo_config.yml | 4 +++- 3 files changed, 5 insertions(+), 9 deletions(-) diff --git a/input/config/aarch64/rhel/10.0/slurm_custom.json b/input/config/aarch64/rhel/10.0/slurm_custom.json index 2483775495..2bdfda0ab9 100644 --- a/input/config/aarch64/rhel/10.0/slurm_custom.json +++ b/input/config/aarch64/rhel/10.0/slurm_custom.json @@ -9,10 +9,7 @@ {"package": "pmix-devel", "type": "rpm", "repo_name": "aarch64_appstream"}, {"package": "nvcr.io/nvidia/hpc-benchmarks", "tag": "25.09", "type": "image"}, {"package": "apptainer", "type": "rpm", "repo_name": "epel" }, - {"package": "doca-ofed", - "type": "iso", - "url": "https://www.mellanox.com/downloads/DOCA/DOCA_v3.2.1/host/doca-host-3.2.1-044000_25.10_rhel10.aarch64.rpm" - } + {"package": "doca-ofed", "type": "rpm_repo", "repo_name": "doca" } ] }, "slurm_control_node": { diff --git a/input/config/x86_64/rhel/10.0/slurm_custom.json b/input/config/x86_64/rhel/10.0/slurm_custom.json index 9531239fd2..8781885cca 100644 --- a/input/config/x86_64/rhel/10.0/slurm_custom.json +++ b/input/config/x86_64/rhel/10.0/slurm_custom.json @@ -7,10 +7,7 @@ {"package": "pmix", "type": "rpm", "repo_name": "x86_64_appstream"}, {"package": "nvcr.io/nvidia/hpc-benchmarks", "tag": "25.09", "type": "image"}, {"package": "apptainer", "type": "rpm", "repo_name": "epel" }, - {"package": "doca-ofed", - "type": "iso", - "url": "https://www.mellanox.com/downloads/DOCA/DOCA_v3.2.1/host/doca-host-3.2.1-044000_25.10_rhel10.x86_64.rpm" - } + {"package": "doca-ofed", "type": "rpm_repo", "repo_name": "doca" } ] }, "slurm_control_node": { diff --git a/input/local_repo_config.yml b/input/local_repo_config.yml index 2f318f1deb..8428e6d94c 100644 --- a/input/local_repo_config.yml +++ b/input/local_repo_config.yml @@ -138,10 +138,12 @@ omnia_repo_url_rhel_x86_64: - { url: "https://download.docker.com/linux/centos/10/x86_64/stable/", gpgkey: "https://download.docker.com/linux/centos/gpg", name: "docker-ce"} - { url: 
"https://dl.fedoraproject.org/pub/epel/10/Everything/x86_64/", gpgkey: "https://dl.fedoraproject.org/pub/epel/RPM-GPG-KEY-EPEL-10", name: "epel"} - { url: "https://pkgs.k8s.io/core:/stable:/v1.34/rpm/", gpgkey: "https://pkgs.k8s.io/core:/stable:/v1.34/rpm/repodata/repomd.xml.key", name: "kubernetes"} - - { url: "https://download.opensuse.org/repositories/isv:/cri-o:/stable:/v1.34/rpm/", gpgkey: "https://download.opensuse.org/repositories/isv:/cri-o:/stable:/v1.34/rpm/repodata/repomd.xml.key'", name: "cri-o"} + - { url: "https://download.opensuse.org/repositories/isv:/cri-o:/stable:/v1.34/rpm/", gpgkey: "https://download.opensuse.org/repositories/isv:/cri-o:/stable:/v1.34/rpm/repodata/repomd.xml.key", name: "cri-o"} + - { url: "https://linux.mellanox.com/public/repo/doca/3.2.1/rhel10/x86_64/", gpgkey: "https://linux.mellanox.com/public/repo/doca/3.2.1/rhel10/x86_64/repodata/repomd.xml.key", name: "doca"} omnia_repo_url_rhel_aarch64: - { url: "https://download.docker.com/linux/centos/10/aarch64/stable/", gpgkey: "https://download.docker.com/linux/centos/gpg", name: "docker-ce"} - { url: "https://dl.fedoraproject.org/pub/epel/10/Everything/aarch64/", gpgkey: "https://dl.fedoraproject.org/pub/epel/RPM-GPG-KEY-EPEL-10", name: "epel"} + - { url: "https://linux.mellanox.com/public/repo/doca/3.2.1/rhel10/arm64-sbsa/", gpgkey: "https://linux.mellanox.com/public/repo/doca/3.2.1/rhel10/arm64-sbsa/repodata/repomd.xml.key", name: "doca"} # Example: # additional_repos_x86_64: # - { url: "https://rpm.grafana.com/", gpgkey: "", name: "grafana" } From 680aef3efb7c0249d2d88447e9f0d7f83541a80f Mon Sep 17 00:00:00 2001 From: mithileshreddy04 Date: Thu, 12 Feb 2026 17:44:57 +0530 Subject: [PATCH 07/77] Fixed ansible lint issues --- .../tasks/display_warnings.yml | 18 ++++------ .../restore_omnia_config_credentials.yml | 23 ++++++++----- .../restore_user_registry_credential.yml | 33 ++++++++++--------- .../import_input_parameters/vars/main.yml | 10 +++--- 4 files changed, 44 
insertions(+), 40 deletions(-) diff --git a/upgrade/roles/import_input_parameters/tasks/display_warnings.yml b/upgrade/roles/import_input_parameters/tasks/display_warnings.yml index ac1eb69998..2cc6dfed26 100644 --- a/upgrade/roles/import_input_parameters/tasks/display_warnings.yml +++ b/upgrade/roles/import_input_parameters/tasks/display_warnings.yml @@ -20,13 +20,11 @@ UPGRADE WARNINGS SUMMARY ================================= - {% if upgrade_warnings | length > 0 %} {{ upgrade_warnings | length }} warning{{ 's' if upgrade_warnings | length > 1 else '' }} detected. You will now be shown the detailed list. - {% else %} - No warnings detected. Upgrade completed successfully! - {% endif %} - when: upgrade_warnings is defined + when: + - upgrade_warnings is defined + - upgrade_warnings | length > 0 - name: Pause for user to review warnings @@ -36,7 +34,6 @@ ║ ⚠️ UPGRADE WARNINGS REVIEW ⚠️ ║ ╚════════════════════════════════════════════╝ - {% if upgrade_warnings | length > 0 %} {{ upgrade_warnings | length }} warning{{ 's' if upgrade_warnings | length > 1 else '' }} detected: {% for warning in upgrade_warnings %} @@ -45,9 +42,6 @@ Please review these warnings carefully. Press ENTER to continue or CTRL+C to abort. - {% else %} - No warnings detected. Upgrade completed successfully! - - Press ENTER to continue... 
- {% endif %} - when: upgrade_warnings is defined + when: + - upgrade_warnings is defined + - upgrade_warnings | length > 0 diff --git a/upgrade/roles/import_input_parameters/tasks/restore_omnia_config_credentials.yml b/upgrade/roles/import_input_parameters/tasks/restore_omnia_config_credentials.yml index 0abafee26b..71e8fb7db2 100644 --- a/upgrade/roles/import_input_parameters/tasks/restore_omnia_config_credentials.yml +++ b/upgrade/roles/import_input_parameters/tasks/restore_omnia_config_credentials.yml @@ -31,16 +31,21 @@ - not backup_omnia_config_credentials_stat.stat.exists - "'WARNING: omnia_config_credentials.yml not found in backup at' not in (upgrade_warnings | join(' '))" +- name: Check if backup file is encrypted + ansible.builtin.command: + cmd: cat "{{ backup_location }}/omnia_config_credentials.yml" + register: backup_omnia_config_credentials_content + changed_when: false + failed_when: false + no_log: true + when: backup_omnia_config_credentials_stat.stat.exists + - name: Process omnia_config_credentials.yml when present in backup + when: >- + backup_omnia_config_credentials_key_stat.stat.exists and + backup_omnia_config_credentials_content.stdout is defined and + '$ANSIBLE_VAULT;' in backup_omnia_config_credentials_content.stdout block: - - name: Check if backup file is encrypted - ansible.builtin.command: - cmd: cat "{{ backup_location }}/omnia_config_credentials.yml" - register: backup_omnia_config_credentials_content - changed_when: false - failed_when: false - no_log: true - - name: "Case 1: Key present and file encrypted - Process and update" block: - name: Copy encrypted omnia_config_credentials.yml from backup to temp location @@ -68,6 +73,7 @@ no_log: true register: vault_decrypt_result failed_when: vault_decrypt_result.rc != 0 + changed_when: false - name: Read decrypted content ansible.builtin.slurp: @@ -126,6 +132,7 @@ no_log: true register: vault_encrypt_result failed_when: vault_encrypt_result.rc != 0 + changed_when: false - name: Clean 
up temporary files ansible.builtin.file: diff --git a/upgrade/roles/import_input_parameters/tasks/restore_user_registry_credential.yml b/upgrade/roles/import_input_parameters/tasks/restore_user_registry_credential.yml index de337310b8..fe02a3d750 100644 --- a/upgrade/roles/import_input_parameters/tasks/restore_user_registry_credential.yml +++ b/upgrade/roles/import_input_parameters/tasks/restore_user_registry_credential.yml @@ -31,25 +31,26 @@ - name: Add warning for missing user_registry_credential.yml to list ansible.builtin.set_fact: upgrade_warnings: >- - {{ upgrade_warnings + [ - "WARNING: user_registry_credential.yml not found in backup at " + - backup_location + "/user_registry_credential.yml. " + - "This might be due to complete Omnia execution not being completed. " + - "Skipping restoration of this file." - ] }} - when: + {{ upgrade_warnings + [msg_user_registry_credential_missing] }} + when: - not backup_user_registry_credential_stat.stat.exists - "'WARNING: user_registry_credential.yml not found in backup at' not in (upgrade_warnings | join(' '))" +- name: Check if backup file is encrypted + ansible.builtin.command: + cmd: cat "{{ backup_location }}/user_registry_credential.yml" + register: backup_user_registry_content + changed_when: false + failed_when: false + no_log: true + when: backup_user_registry_credential_stat.stat.exists + - name: Process user_registry_credential.yml when present in backup + when: >- + backup_local_repo_credentials_key_stat.stat.exists and + backup_user_registry_content.stdout is defined and + '$ANSIBLE_VAULT;' in backup_user_registry_content.stdout block: - - name: Check if backup file is encrypted - ansible.builtin.command: - cmd: cat "{{ backup_location }}/user_registry_credential.yml" - register: backup_user_registry_content - changed_when: false - failed_when: false - no_log: true - name: "Case 1: Key present and file encrypted - Copy both" block: @@ -64,6 +65,7 @@ no_log: true register: vault_decrypt_result failed_when: 
vault_decrypt_result.rc != 0 + changed_when: false - name: Copy encrypted user_registry_credential.yml from backup ansible.builtin.copy: @@ -118,8 +120,7 @@ {% elif backup_local_repo_credentials_key_stat.stat.exists and backup_user_registry_content.stdout is defined and '$ANSIBLE_VAULT;' not in backup_user_registry_content.stdout %} - Key file exists but file is not encrypted {% endif %} - Please check the backup integrity and ensure both files are present - in consistent states. + Please check the backup integrity and ensure both files are present in consistent states. when: >- (not backup_local_repo_credentials_key_stat.stat.exists and backup_user_registry_content.stdout is defined and diff --git a/upgrade/roles/import_input_parameters/vars/main.yml b/upgrade/roles/import_input_parameters/vars/main.yml index 5eee4a2f50..9808da58bc 100644 --- a/upgrade/roles/import_input_parameters/vars/main.yml +++ b/upgrade/roles/import_input_parameters/vars/main.yml @@ -31,14 +31,16 @@ msg_upgrade_backup_dir_missing: "upgrade_backup_dir not found in /opt/omnia/.dat msg_restore_item_name_missing: "restore_item must define 'name'" msg_validation_failed: "Validation failed for {{ restore_item.name }}" msg_backup_file_missing: "Backup file missing: {{ restore_item.name }}" -msg_user_registry_credential_missing: |- - \033[93mWARNING: user_registry_credential.yml not found in backup at {{ backup_location }}/user_registry_credential.yml\033[0m +msg_user_registry_credential_missing: |- + WARNING: user_registry_credential.yml not found in backup at + {{ backup_location }}/user_registry_credential.yml This might be due to complete Omnia execution not being completed. Skipping restoration of this file. # Omnia config credentials messages -msg_omnia_config_credentials_missing: |- - WARNING: omnia_config_credentials.yml not found in backup at {{ backup_location }}/omnia_config_credentials.yml. 
+msg_omnia_config_credentials_missing: |- + WARNING: omnia_config_credentials.yml not found in backup at + {{ backup_location }}/omnia_config_credentials.yml. This might be due to complete Omnia execution not being completed. Skipping restoration of this file. From ad7a5c08a6cf917814aefea6bef04145ad485534 Mon Sep 17 00:00:00 2001 From: mithileshreddy04 Date: Thu, 12 Feb 2026 18:05:14 +0530 Subject: [PATCH 08/77] fixed lint issues --- .../restore_omnia_config_credentials.yml | 34 +++++++-------- .../restore_user_registry_credential.yml | 43 +++++++++++-------- .../import_input_parameters/vars/main.yml | 2 +- 3 files changed, 42 insertions(+), 37 deletions(-) diff --git a/upgrade/roles/import_input_parameters/tasks/restore_omnia_config_credentials.yml b/upgrade/roles/import_input_parameters/tasks/restore_omnia_config_credentials.yml index 71e8fb7db2..a129603dcc 100644 --- a/upgrade/roles/import_input_parameters/tasks/restore_omnia_config_credentials.yml +++ b/upgrade/roles/import_input_parameters/tasks/restore_omnia_config_credentials.yml @@ -27,7 +27,7 @@ ansible.builtin.set_fact: upgrade_warnings: >- {{ upgrade_warnings + [msg_omnia_config_credentials_missing] }} - when: + when: - not backup_omnia_config_credentials_stat.stat.exists - "'WARNING: omnia_config_credentials.yml not found in backup at' not in (upgrade_warnings | join(' '))" @@ -93,6 +93,10 @@ msg: "{{ msg_omnia_config_decrypt_error }}" - name: "Case 1.1: Apply template and encrypt" + when: > + backup_omnia_config_credentials_key_stat.stat.exists and + backup_omnia_config_credentials_content.stdout is defined and + '$ANSIBLE_VAULT;' in backup_omnia_config_credentials_content.stdout block: - name: Set template variables from credentials ansible.builtin.set_fact: @@ -150,29 +154,25 @@ - name: Fail with template/encryption error message ansible.builtin.fail: msg: "{{ msg_omnia_config_template_error }}\n{{ msg_omnia_config_encrypt_error }}" - when: >- - backup_omnia_config_credentials_key_stat.stat.exists 
and - backup_omnia_config_credentials_content.stdout is defined and - '$ANSIBLE_VAULT;' in backup_omnia_config_credentials_content.stdout - name: "Case 2: Both key and file missing - Add info warning" - ansible.builtin.set_fact: - upgrade_warnings: >- - {{ upgrade_warnings + [msg_omnia_config_credentials_info_missing] }} - when: >- + when: > not backup_omnia_config_credentials_key_stat.stat.exists and - (backup_omnia_config_credentials_content.stdout is not defined or + (backup_omnia_config_credentials_content.stdout is not defined or '$ANSIBLE_VAULT;' not in backup_omnia_config_credentials_content.stdout) and "'INFO: Both omnia_config_credentials.yml and .omnia_config_credentials_key' not in (upgrade_warnings | join(' '))" + ansible.builtin.set_fact: + upgrade_warnings: > + {{ upgrade_warnings + [msg_omnia_config_credentials_info_missing] }} - name: "Case 3: Error - Mismatched state" - ansible.builtin.fail: - msg: "{{ msg_omnia_config_credentials_error }}" - when: >- - (not backup_omnia_config_credentials_key_stat.stat.exists and - backup_omnia_config_credentials_content.stdout is defined and + when: > + (not backup_omnia_config_credentials_key_stat.stat.exists and + backup_omnia_config_credentials_content.stdout is defined and '$ANSIBLE_VAULT;' in backup_omnia_config_credentials_content.stdout) or - (backup_omnia_config_credentials_key_stat.stat.exists and - backup_omnia_config_credentials_content.stdout is defined and + (backup_omnia_config_credentials_key_stat.stat.exists and + backup_omnia_config_credentials_content.stdout is defined and '$ANSIBLE_VAULT;' not in backup_omnia_config_credentials_content.stdout) + ansible.builtin.fail: + msg: "{{ msg_omnia_config_credentials_error }}" when: backup_omnia_config_credentials_stat.stat.exists diff --git a/upgrade/roles/import_input_parameters/tasks/restore_user_registry_credential.yml b/upgrade/roles/import_input_parameters/tasks/restore_user_registry_credential.yml index fe02a3d750..69a6a391a2 100644 --- 
a/upgrade/roles/import_input_parameters/tasks/restore_user_registry_credential.yml +++ b/upgrade/roles/import_input_parameters/tasks/restore_user_registry_credential.yml @@ -53,6 +53,10 @@ block: - name: "Case 1: Key present and file encrypted - Copy both" + when: > + backup_local_repo_credentials_key_stat.stat.exists and + backup_user_registry_content.stdout is defined and + '$ANSIBLE_VAULT;' in backup_user_registry_content.stdout block: - name: Decrypt user_registry_credential.yml using the key ansible.builtin.shell: @@ -92,12 +96,13 @@ - name: Fail with decryption error message ansible.builtin.fail: msg: "{{ msg_user_registry_decrypt_error }}" - when: >- - backup_local_repo_credentials_key_stat.stat.exists and - backup_user_registry_content.stdout is defined and - '$ANSIBLE_VAULT;' in backup_user_registry_content.stdout - name: "Case 2: Both key and file missing - Add info warning" + when: >- + not backup_local_repo_credentials_key_stat.stat.exists and + (backup_user_registry_content.stdout is not defined or + '$ANSIBLE_VAULT;' not in backup_user_registry_content.stdout) and + "'INFO: Both user_registry_credential.yml and .local_repo_credentials_key' not in (upgrade_warnings | join(' '))" ansible.builtin.set_fact: upgrade_warnings: >- {{ upgrade_warnings + [ @@ -105,27 +110,27 @@ "are not present in backup. This is expected if registry credentials " + "were not configured in the source installation." 
] }} - when: >- - not backup_local_repo_credentials_key_stat.stat.exists and - (backup_user_registry_content.stdout is not defined or - '$ANSIBLE_VAULT;' not in backup_user_registry_content.stdout) and - "'INFO: Both user_registry_credential.yml and .local_repo_credentials_key' not in (upgrade_warnings | join(' '))" - name: "Case 3: Error - Mismatched state" + when: >- + (not backup_local_repo_credentials_key_stat.stat.exists and + backup_user_registry_content.stdout is defined and + '$ANSIBLE_VAULT;' in backup_user_registry_content.stdout) or + (backup_local_repo_credentials_key_stat.stat.exists and + backup_user_registry_content.stdout is defined and + '$ANSIBLE_VAULT;' not in backup_user_registry_content.stdout) ansible.builtin.fail: msg: | ERROR: Inconsistent state detected for user_registry_credential.yml: - {% if not backup_local_repo_credentials_key_stat.stat.exists and backup_user_registry_content.stdout is defined and '$ANSIBLE_VAULT;' in backup_user_registry_content.stdout %} + {% if not backup_local_repo_credentials_key_stat.stat.exists and + backup_user_registry_content.stdout is defined and + '$ANSIBLE_VAULT;' in backup_user_registry_content.stdout %} - File is encrypted but key file (.local_repo_credentials_key) is missing - {% elif backup_local_repo_credentials_key_stat.stat.exists and backup_user_registry_content.stdout is defined and '$ANSIBLE_VAULT;' not in backup_user_registry_content.stdout %} + {% elif backup_local_repo_credentials_key_stat.stat.exists and + backup_user_registry_content.stdout is defined and + '$ANSIBLE_VAULT;' not in backup_user_registry_content.stdout %} - Key file exists but file is not encrypted {% endif %} - Please check the backup integrity and ensure both files are present in consistent states. 
- when: >- - (not backup_local_repo_credentials_key_stat.stat.exists and - backup_user_registry_content.stdout is defined and - '$ANSIBLE_VAULT;' in backup_user_registry_content.stdout) or - (backup_local_repo_credentials_key_stat.stat.exists and - backup_user_registry_content.stdout is defined and - '$ANSIBLE_VAULT;' not in backup_user_registry_content.stdout) + Please check the backup integrity and ensure both files are present + in consistent states. when: backup_user_registry_credential_stat.stat.exists diff --git a/upgrade/roles/import_input_parameters/vars/main.yml b/upgrade/roles/import_input_parameters/vars/main.yml index 9808da58bc..3bdf596641 100644 --- a/upgrade/roles/import_input_parameters/vars/main.yml +++ b/upgrade/roles/import_input_parameters/vars/main.yml @@ -32,7 +32,7 @@ msg_restore_item_name_missing: "restore_item must define 'name'" msg_validation_failed: "Validation failed for {{ restore_item.name }}" msg_backup_file_missing: "Backup file missing: {{ restore_item.name }}" msg_user_registry_credential_missing: |- - WARNING: user_registry_credential.yml not found in backup at + WARNING: user_registry_credential.yml not found in backup at {{ backup_location }}/user_registry_credential.yml This might be due to complete Omnia execution not being completed. Skipping restoration of this file. 
From 31c5600391bad02cd31c9c2d3ad167100371f5d2 Mon Sep 17 00:00:00 2001 From: mithileshreddy04 Date: Thu, 12 Feb 2026 18:13:29 +0530 Subject: [PATCH 09/77] Fixed ansible lint issues --- .../restore_omnia_config_credentials.yml | 2 +- .../restore_user_registry_credential.yml | 2 +- .../import_input_parameters/vars/main.yml | 46 ++++++++++--------- 3 files changed, 27 insertions(+), 23 deletions(-) diff --git a/upgrade/roles/import_input_parameters/tasks/restore_omnia_config_credentials.yml b/upgrade/roles/import_input_parameters/tasks/restore_omnia_config_credentials.yml index a129603dcc..e04964e461 100644 --- a/upgrade/roles/import_input_parameters/tasks/restore_omnia_config_credentials.yml +++ b/upgrade/roles/import_input_parameters/tasks/restore_omnia_config_credentials.yml @@ -175,4 +175,4 @@ '$ANSIBLE_VAULT;' not in backup_omnia_config_credentials_content.stdout) ansible.builtin.fail: msg: "{{ msg_omnia_config_credentials_error }}" - when: backup_omnia_config_credentials_stat.stat.exists + diff --git a/upgrade/roles/import_input_parameters/tasks/restore_user_registry_credential.yml b/upgrade/roles/import_input_parameters/tasks/restore_user_registry_credential.yml index 69a6a391a2..47b62fedb1 100644 --- a/upgrade/roles/import_input_parameters/tasks/restore_user_registry_credential.yml +++ b/upgrade/roles/import_input_parameters/tasks/restore_user_registry_credential.yml @@ -133,4 +133,4 @@ {% endif %} Please check the backup integrity and ensure both files are present in consistent states. - when: backup_user_registry_credential_stat.stat.exists + diff --git a/upgrade/roles/import_input_parameters/vars/main.yml b/upgrade/roles/import_input_parameters/vars/main.yml index 3bdf596641..2bd20f0076 100644 --- a/upgrade/roles/import_input_parameters/vars/main.yml +++ b/upgrade/roles/import_input_parameters/vars/main.yml @@ -44,48 +44,52 @@ msg_omnia_config_credentials_missing: |- This might be due to complete Omnia execution not being completed. 
Skipping restoration of this file. -msg_omnia_config_credentials_info_missing: |- - INFO: Both omnia_config_credentials.yml and .omnia_config_credentials_key - are not present in backup. This is expected if credentials +msg_omnia_config_credentials_info_missing: |- + INFO: Both omnia_config_credentials.yml and .omnia_config_credentials_key + are not present in backup. This is expected if credentials were not configured in the source installation. -msg_omnia_config_credentials_success: |- +msg_omnia_config_credentials_success: |- omnia_config_credentials.yml restored and updated from backup. Backup: {{ backup_location }}/omnia_config_credentials.yml Target: {{ input_project_dir }}/omnia_config_credentials.yml Status: Updated with postgres credentials and re-encrypted (key file also restored) -msg_omnia_config_credentials_error: |- +msg_omnia_config_credentials_error: |- ERROR: Inconsistent state detected for omnia_config_credentials.yml: - {% if not backup_omnia_config_credentials_key_stat.stat.exists and backup_omnia_config_credentials_content.stdout is defined and '$ANSIBLE_VAULT;' in backup_omnia_config_credentials_content.stdout %} + {% if not backup_omnia_config_credentials_key_stat.stat.exists and + backup_omnia_config_credentials_content.stdout is defined and + '$ANSIBLE_VAULT;' in backup_omnia_config_credentials_content.stdout %} - File is encrypted but key file (.omnia_config_credentials_key) is missing - {% elif backup_omnia_config_credentials_key_stat.stat.exists and backup_omnia_config_credentials_content.stdout is defined and '$ANSIBLE_VAULT;' not in backup_omnia_config_credentials_content.stdout %} + {% elif backup_omnia_config_credentials_key_stat.stat.exists and + backup_omnia_config_credentials_content.stdout is defined and + '$ANSIBLE_VAULT;' not in backup_omnia_config_credentials_content.stdout %} - Key file exists but file is not encrypted {% endif %} Please check the backup integrity and ensure both files are present in consistent states. 
# Rescue warning messages -msg_user_registry_decrypt_error: |- - ERROR: Failed to decrypt user_registry_credential.yml. - The backup key file may be corrupted or incompatible. - Please check the backup integrity and ensure the key file +msg_user_registry_decrypt_error: |- + ERROR: Failed to decrypt user_registry_credential.yml. + The backup key file may be corrupted or incompatible. + Please check the backup integrity and ensure the key file matches the encrypted file. -msg_omnia_config_decrypt_error: |- - ERROR: Failed to decrypt omnia_config_credentials.yml. - The backup key file may be corrupted or incompatible. - Please check the backup integrity and ensure the key file +msg_omnia_config_decrypt_error: |- + ERROR: Failed to decrypt omnia_config_credentials.yml. + The backup key file may be corrupted or incompatible. + Please check the backup integrity and ensure the key file matches the encrypted file. -msg_omnia_config_template_error: |- - ERROR: Failed to generate updated omnia_config_credentials.yml. - Template processing may have failed due to invalid data format. +msg_omnia_config_template_error: |- + ERROR: Failed to generate updated omnia_config_credentials.yml. + Template processing may have failed due to invalid data format. Please check the backup file format and ensure it contains valid YAML. -msg_omnia_config_encrypt_error: |- - ERROR: Failed to encrypt updated omnia_config_credentials.yml. - The key file may be corrupted or there may be permission issues. +msg_omnia_config_encrypt_error: |- + ERROR: Failed to encrypt updated omnia_config_credentials.yml. + The key file may be corrupted or there may be permission issues. Please check the key file integrity and file permissions. msg_decryption_failed: "Decryption failed. Check warnings for details." 
From da5423411cb969b8ddfd41856c195c4e8e443ac1 Mon Sep 17 00:00:00 2001 From: mithileshreddy04 Date: Thu, 12 Feb 2026 18:21:52 +0530 Subject: [PATCH 10/77] fixed ansible lint issues --- .../tasks/restore_omnia_config_credentials.yml | 1 - .../tasks/restore_user_registry_credential.yml | 1 - 2 files changed, 2 deletions(-) diff --git a/upgrade/roles/import_input_parameters/tasks/restore_omnia_config_credentials.yml b/upgrade/roles/import_input_parameters/tasks/restore_omnia_config_credentials.yml index e04964e461..6a20f371f8 100644 --- a/upgrade/roles/import_input_parameters/tasks/restore_omnia_config_credentials.yml +++ b/upgrade/roles/import_input_parameters/tasks/restore_omnia_config_credentials.yml @@ -175,4 +175,3 @@ '$ANSIBLE_VAULT;' not in backup_omnia_config_credentials_content.stdout) ansible.builtin.fail: msg: "{{ msg_omnia_config_credentials_error }}" - diff --git a/upgrade/roles/import_input_parameters/tasks/restore_user_registry_credential.yml b/upgrade/roles/import_input_parameters/tasks/restore_user_registry_credential.yml index 47b62fedb1..158b029ed3 100644 --- a/upgrade/roles/import_input_parameters/tasks/restore_user_registry_credential.yml +++ b/upgrade/roles/import_input_parameters/tasks/restore_user_registry_credential.yml @@ -133,4 +133,3 @@ {% endif %} Please check the backup integrity and ensure both files are present in consistent states. 
- From cdaa98d829d7e32ee0a13955145a96c6b67f25db Mon Sep 17 00:00:00 2001 From: "balajikumaran.cs" Date: Thu, 12 Feb 2026 19:05:57 +0530 Subject: [PATCH 11/77] offline build-image and discovery updates (#3956) * Use Pulp-hosted builder images for x86_64 builds * added x86_64 image-builder image * Update default_packages.json Signed-off-by: balajikumaran.cs * Refine image build prereqs and regctl handling * Update omnia_metadata_file path to use variable Signed-off-by: balajikumaran.cs * Airgap: move telemetry/NFS prep offline and package installs to prepare_oim * added nolog true * Update prepare_oim_completion.yml Signed-off-by: balajikumaran.cs * Update aarch64_prereq.yml Signed-off-by: balajikumaran.cs * Update main.yml Signed-off-by: balajikumaran.cs * Update main.yml Signed-off-by: balajikumaran.cs * Update main.yml Signed-off-by: balajikumaran.cs * Replace command with podman_image module for image tasks Signed-off-by: balajikumaran.cs * Replace Podman command with Ansible module Signed-off-by: balajikumaran.cs * Align podman image pull with retries and tagging for x86_64 and aarch64 * Fix podman tagging for x86_64 and aarch64 images --------- Signed-off-by: balajikumaran.cs --- .../roles/image_creation/vars/main.yml | 5 +- .../roles/prepare_arm_node/tasks/main.yml | 58 ++++++++------ .../roles/prepare_arm_node/vars/main.yml | 10 ++- build_image_x86_64/build_image_x86_64.yml | 4 +- .../image_creation/tasks/build_image_tag.yml | 28 ------- .../tasks/prepare_pulp_image.yml | 79 +++++++++++++++++++ .../roles/image_creation/vars/main.yml | 10 ++- .../roles/nfs_client/tasks/nfs_client.yml | 5 -- discovery/roles/nfs_client/vars/main.yml | 7 -- discovery/roles/telemetry/tasks/main.yml | 4 + .../telemetry/tasks/telemetry_prereq.yml | 27 ++++--- .../tasks/update_ldms_agg_config.yml | 5 -- discovery/roles/telemetry/vars/main.yml | 14 ++-- .../x86_64/rhel/10.0/default_packages.json | 3 +- prepare_oim/prepare_oim.yml | 10 +++ .../common/tasks/aarch64_prereq.yml | 26 
++++++ .../deploy_containers/common/tasks/main.yml | 2 +- .../common/tasks/package_installation.yml | 29 +++++++ .../common/tasks/prepare_oim_completion.yml | 20 ++++- .../deploy_containers/common/vars/main.yml | 28 ++++++- 20 files changed, 272 insertions(+), 102 deletions(-) delete mode 100644 build_image_x86_64/roles/image_creation/tasks/build_image_tag.yml create mode 100644 build_image_x86_64/roles/image_creation/tasks/prepare_pulp_image.yml create mode 100644 prepare_oim/roles/deploy_containers/common/tasks/aarch64_prereq.yml create mode 100644 prepare_oim/roles/deploy_containers/common/tasks/package_installation.yml diff --git a/build_image_aarch64/roles/image_creation/vars/main.yml b/build_image_aarch64/roles/image_creation/vars/main.yml index 67d11422ef..984f2497d8 100644 --- a/build_image_aarch64/roles/image_creation/vars/main.yml +++ b/build_image_aarch64/roles/image_creation/vars/main.yml @@ -1,4 +1,4 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -17,6 +17,7 @@ input_project_dir: "{{ hostvars['localhost']['input_project_dir'] }}" omnia_metadata_file: "/opt/omnia/.data/oim_metadata.yml" dir_permissions_644: "0644" dir_permissions_755: "0755" +aarch64_local_tag: "aarch64-image-builder/ochami" openchami_dir: "/opt/omnia/openchami" openchami_clone_path: /opt/omnia/openchami/deployment-recipes job_retry: "120" @@ -32,7 +33,7 @@ ochami_compute_mounts: - -v {{ openchami_work_dir }}/images/rhel-{{ item.key }}-{{ rhel_tag }}.yaml:/home/builder/config.yaml:z ochami_aarch64_image: - --entrypoint /bin/bash - - localhost/arm-image/ochami + - "localhost/{{ aarch64_local_tag }}" ochami_base_command: - -c 'update-ca-trust extract && image-build --config /home/builder/config.yaml --log-level DEBUG' diff --git a/build_image_aarch64/roles/prepare_arm_node/tasks/main.yml b/build_image_aarch64/roles/prepare_arm_node/tasks/main.yml index 1801448611..4a9d150850 100644 --- a/build_image_aarch64/roles/prepare_arm_node/tasks/main.yml +++ b/build_image_aarch64/roles/prepare_arm_node/tasks/main.yml @@ -167,32 +167,42 @@ - name: Build full Podman image path ansible.builtin.set_fact: - pulp_aarch_image: "{{ hostvars['localhost']['oim_pxe_ip'] }}:2225/dellhpcomniaaisolution/image-build-aarch64:1.1" - -- name: Pull aarch64 image using Podman - ansible.builtin.command: - cmd: "podman pull {{ pulp_aarch_image }}" - register: podman_pull_result - ignore_errors: true - changed_when: false + pulp_aarch_image: "{{ hostvars['localhost']['oim_pxe_ip'] }}:2225/{{ pulp_aarch64_image_name }}" + +- name: Pull and tag aarch64 image + block: + - name: Pull aarch64 image using Podman + containers.podman.podman_image: + name: "{{ pulp_aarch_image }}" + state: present + register: podman_pull_result + retries: "{{ pull_image_retries }}" + delay: "{{ pull_image_delay }}" + until: podman_pull_result is not failed + changed_when: false + + - name: Tag pulled image + containers.podman.podman_tag: + image: "{{ pulp_aarch_image }}" + target_names: + - "{{ 
aarch64_local_tag }}" + changed_when: false + + rescue: + - name: Fail if Podman pull failed + ansible.builtin.fail: + msg: "Failed to pull image {{ pulp_aarch_image }}" + +- name: Check if regctl binary exists + ansible.builtin.stat: + path: "{{ ochami_aarch_64_dir }}/regctl" + register: regctl_stat + delegate_to: localhost -- name: Fail if Podman pull failed +- name: Fail if regctl binary not found ansible.builtin.fail: - msg: "{{ aarch64_image_fail_msg }}" - when: podman_pull_result.rc != 0 - -- name: Tag pulled image - ansible.builtin.command: - cmd: "podman tag {{ pulp_aarch_image }} arm-image/ochami" - when: podman_pull_result.rc == 0 - changed_when: false - -- name: Download regctl binary to NFS shared path - ansible.builtin.get_url: - url: "{{ aarch64_regctl_url }}" - dest: "{{ ochami_aarch_64_dir }}/regctl" - mode: "{{ hostvars['localhost']['dir_permissions_755'] }}" - delegate_to: localhost + msg: "{{ regctl_not_found_msg }}" + when: not regctl_stat.stat.exists - name: Copy regctl binary to /usr/local/bin on target host ansible.builtin.copy: diff --git a/build_image_aarch64/roles/prepare_arm_node/vars/main.yml b/build_image_aarch64/roles/prepare_arm_node/vars/main.yml index d240f27de4..c0ce2868aa 100644 --- a/build_image_aarch64/roles/prepare_arm_node/vars/main.yml +++ b/build_image_aarch64/roles/prepare_arm_node/vars/main.yml @@ -1,4 +1,4 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -15,10 +15,13 @@ # input files input_project_dir: "{{ hostvars['localhost']['input_project_dir'] }}" +pulp_aarch64_image_name: "dellhpcomniaaisolution/image-build-aarch64:1.1" +aarch64_local_tag: "aarch64-image-builder/ochami" +pull_image_retries: "3" +pull_image_delay: "10" network_spec: "{{ input_project_dir }}/network_spec.yml" ochami_aarch_64_dir: "/opt/omnia/openchami/aarch64" pulp_repo_store_path: "{{ ochami_aarch_64_dir }}/pulp.repo" -aarch64_regctl_url: "https://github.com/regclient/regclient/releases/latest/download/regctl-linux-arm64" pulp_repo_file_path: "/etc/yum.repos.d/pulp.repo" pulp_webserver_cert_path: "/opt/omnia/pulp/settings/certs/pulp_webserver.crt" anchors_path: "/etc/pki/ca-trust/source/anchors/pulp_webserver.crt" @@ -39,3 +42,6 @@ aarch64_image_fail_msg: > Unable to pull the Ochami aarch64 image builder image. Make sure you have added the default package for aarch64 in the software_config.json file and ran local_repo.yml. If not, add that package and rerun local_repo.yml. +regctl_not_found_msg: > + regctl binary not found at {{ ochami_aarch_64_dir }}/regctl. + Please run prepare_oim.yml playbook to download the regctl binary. diff --git a/build_image_x86_64/build_image_x86_64.yml b/build_image_x86_64/build_image_x86_64.yml index 85ecaf93cd..676d8adbd6 100644 --- a/build_image_x86_64/build_image_x86_64.yml +++ b/build_image_x86_64/build_image_x86_64.yml @@ -1,4 +1,4 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -80,7 +80,7 @@ - name: Tag OpenCHAMI image ansible.builtin.include_role: name: image_creation - tasks_from: build_image_tag.yml + tasks_from: prepare_pulp_image.yml - name: OpenCHAMI build image for x86_64 hosts: localhost diff --git a/build_image_x86_64/roles/image_creation/tasks/build_image_tag.yml b/build_image_x86_64/roles/image_creation/tasks/build_image_tag.yml deleted file mode 100644 index 0b7a56072d..0000000000 --- a/build_image_x86_64/roles/image_creation/tasks/build_image_tag.yml +++ /dev/null @@ -1,28 +0,0 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. ---- - -- name: Pull image-build image - ansible.builtin.command: - cmd: "podman pull {{ image_build_el10 }}" - register: pull_result - retries: "{{ pull_image_retries }}" - delay: "{{ pull_image_delay }}" - until: pull_result.rc == 0 - changed_when: "'Image is up to date' not in pull_result.stdout" - -- name: Fail if image not pulled successfully - ansible.builtin.fail: - msg: "{{ pull_result.stdout }}" - when: pull_result.rc != 0 diff --git a/build_image_x86_64/roles/image_creation/tasks/prepare_pulp_image.yml b/build_image_x86_64/roles/image_creation/tasks/prepare_pulp_image.yml new file mode 100644 index 0000000000..22f336b849 --- /dev/null +++ b/build_image_x86_64/roles/image_creation/tasks/prepare_pulp_image.yml @@ -0,0 +1,79 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +# Load network specification +- name: Load network spec file + ansible.builtin.include_vars: + file: "{{ network_spec }}" + register: include_network_spec + no_log: true + +- name: Fail if network spec cannot be loaded + ansible.builtin.fail: + msg: "{{ network_spec_syntax_fail_msg }} Error: {{ include_network_spec.message }}" + when: include_network_spec is failed + +# Parse network spec data +- name: Parse network spec + ansible.builtin.set_fact: + network_data: "{{ network_data | default({}) | combine({item.key: item.value}) }}" + with_dict: "{{ Networks }}" + +# Set PXE IP fact +- name: Set PXE IP fact + ansible.builtin.set_fact: + oim_pxe_ip: "{{ network_data.admin_network.primary_oim_admin_ip }}" + cacheable: true + +# Copy pulp certificate and update CA trust +- name: Copy pulp webserver certificate to anchors + ansible.builtin.copy: + src: "{{ pulp_webserver_cert_path }}" + dest: "{{ anchors_path }}" + mode: "{{ dir_permissions_644 }}" + become: true + +- name: Update CA trust + ansible.builtin.command: update-ca-trust + register: update_ca + changed_when: false + +- name: Build full Podman image path for x86_64 + ansible.builtin.set_fact: + pulp_x86_image: "{{ oim_pxe_ip }}:2225/{{ pulp_x86_64_image_name }}" + +- name: Pull and tag x86_64 image + block: + - name: Pull x86_64 image using Podman + containers.podman.podman_image: + name: "{{ pulp_x86_image }}" + state: present + register: pull_result + 
retries: "{{ pull_image_retries }}" + delay: "{{ pull_image_delay }}" + until: pull_result is not failed + changed_when: false + + - name: Tag pulled image for x86_64 build + containers.podman.podman_tag: + image: "{{ pulp_x86_image }}" + target_names: + - "{{ x86_64_local_tag }}" + changed_when: false + + rescue: + - name: Fail if Podman pull failed + ansible.builtin.fail: + msg: "Failed to pull image {{ pulp_x86_image }}." diff --git a/build_image_x86_64/roles/image_creation/vars/main.yml b/build_image_x86_64/roles/image_creation/vars/main.yml index a05a39d37d..60dcf0bc6f 100644 --- a/build_image_x86_64/roles/image_creation/vars/main.yml +++ b/build_image_x86_64/roles/image_creation/vars/main.yml @@ -12,7 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. --- -image_build_el10: "docker.io/dellhpcomniaaisolution/image-build-el10:1.0" +pulp_x86_64_image_name: "dellhpcomniaaisolution/image-build-el10:1.0" +x86_64_local_tag: "x86_64-image-builder/ochami" pull_image_retries: "3" pull_image_delay: "10" input_project_dir: "{{ hostvars['localhost']['input_project_dir'] }}" @@ -23,6 +24,9 @@ openchami_dir: "/opt/omnia/openchami" openchami_clone_path: /opt/omnia/openchami/deployment-recipes job_retry: "120" job_delay: "30" +network_spec: "{{ input_project_dir }}/network_spec.yml" +pulp_webserver_cert_path: "/opt/omnia/pulp/settings/certs/pulp_webserver.crt" +anchors_path: "/etc/pki/ca-trust/source/anchors/pulp_webserver.crt" openchami_work_dir: "{{ oim_shared_path }}/omnia/openchami/workdir" ochami_mounts: - --user 0 --privileged @@ -35,7 +39,7 @@ ochami_compute_mounts: ochami_x86_64_image: - --entrypoint /bin/bash - - docker.io/dellhpcomniaaisolution/image-build-el10:1.0 + - "localhost/{{ x86_64_local_tag }}" ochami_base_command: - -c 'update-ca-trust extract && image-build --config /home/builder/config.yaml --log-level DEBUG' @@ -54,3 +58,5 @@ compute_image_failure_msg: | # build_compute_image.yml 
openchami_compute_image_vars_template: "{{ role_path }}/templates/compute_images_templates.j2" openchami_compute_image_vars_path: "/opt/omnia/openchami/compute_images_template.yaml" + +network_spec_syntax_fail_msg: "Failed to load network_spec.yml due to syntax error" diff --git a/discovery/roles/nfs_client/tasks/nfs_client.yml b/discovery/roles/nfs_client/tasks/nfs_client.yml index 079933c26b..ca8a3c7660 100644 --- a/discovery/roles/nfs_client/tasks/nfs_client.yml +++ b/discovery/roles/nfs_client/tasks/nfs_client.yml @@ -32,11 +32,6 @@ nfs_server_ip: "{{ hostvars['127.0.0.1']['admin_nic_ip'] }}" when: item.server_ip == "localhost" -- name: Package installation for NFS - ansible.builtin.package: - name: "{{ nfs_packages[ansible_os_family] }}" - state: present - - name: Mount facts items to dict ansible.builtin.set_fact: nfs_src: "{{ nfs_server_ip }}:{{ item.server_share_path }}" diff --git a/discovery/roles/nfs_client/vars/main.yml b/discovery/roles/nfs_client/vars/main.yml index b5e01fd82a..a3c20c054c 100644 --- a/discovery/roles/nfs_client/vars/main.yml +++ b/discovery/roles/nfs_client/vars/main.yml @@ -20,13 +20,6 @@ software_config_file: "{{ hostvars['localhost']['input_project_dir'] }}/software # Usage: nfs_client.yml mounted_dir_perm: "0755" default_client_mount_options: "nosuid,rw,sync,hard,intr" -nfs_packages: - RedHat: - - nfs-utils - - nfs4-acl-tools - Debian: - - nfs-common - - nfs4-acl-tools slurm_nfs_fail_msg: "Failed to mount NFS share. Please check if the NFS server is reachable or NFS is configured properly." 
omnia_config_vars: "{{ hostvars['localhost']['input_project_dir'] }}/omnia_config.yml" diff --git a/discovery/roles/telemetry/tasks/main.yml b/discovery/roles/telemetry/tasks/main.yml index c5a3dbefba..825c3988d7 100644 --- a/discovery/roles/telemetry/tasks/main.yml +++ b/discovery/roles/telemetry/tasks/main.yml @@ -28,6 +28,10 @@ when: - hostvars['localhost']['idrac_telemetry_support'] or hostvars['localhost']['ldms_support'] block: + - name: Set NFS info fact + ansible.builtin.set_fact: + oim_shared_path: "{{ hostvars['localhost']['oim_shared_path'] }}" + - name: Service cluster prerequisite ansible.builtin.include_tasks: telemetry_prereq.yml diff --git a/discovery/roles/telemetry/tasks/telemetry_prereq.yml b/discovery/roles/telemetry/tasks/telemetry_prereq.yml index d720c57822..7eb45a89ab 100644 --- a/discovery/roles/telemetry/tasks/telemetry_prereq.yml +++ b/discovery/roles/telemetry/tasks/telemetry_prereq.yml @@ -47,23 +47,24 @@ state: directory mode: "{{ hostvars['localhost']['dir_permissions_755'] }}" -- name: Git clone for iDRAC Telemetry script +- name: Ensure iDRAC Telemetry scripting destination exists + ansible.builtin.file: + path: "{{ idrac_telemetry_scripting_git_clone_path }}" + state: directory + mode: "{{ hostvars['localhost']['dir_permissions_755'] }}" + +- name: Copy iDRAC Telemetry Scripting to NFS share block: - - name: Checkout iDRAC Telemetry GitHub repo - ansible.builtin.git: - repo: "{{ idrac_telemetry_scripting_repo }}" + - name: Copy pre-cloned iDRAC Telemetry Scripting directory + ansible.builtin.copy: + src: "{{ idrac_telemetry_scripting_src_path }}/" dest: "{{ idrac_telemetry_scripting_git_clone_path }}" - version: "{{ idrac_telemetry_scripting_stable_commit }}" - update: false - register: clone_idrac_script - until: clone_idrac_script is succeeded - retries: "{{ max_retries }}" - delay: "{{ delay_count }}" + remote_src: true + mode: preserve rescue: - - name: Fail if iDRAC telemetry Git clone fails + - name: Fail if iDRAC telemetry 
copy fails ansible.builtin.fail: - msg: "{{ idrac_script_git_clone_error_msg.splitlines() | join(' ') }}" - when: clone_idrac_script is failed + msg: "{{ idrac_telemetry_scripting_copy_fail_msg.splitlines() | join(' ') }}" - name: Set kafka_support to true ansible.builtin.set_fact: diff --git a/discovery/roles/telemetry/tasks/update_ldms_agg_config.yml b/discovery/roles/telemetry/tasks/update_ldms_agg_config.yml index db4d4b1d3f..ee6c0c7d75 100644 --- a/discovery/roles/telemetry/tasks/update_ldms_agg_config.yml +++ b/discovery/roles/telemetry/tasks/update_ldms_agg_config.yml @@ -13,11 +13,6 @@ # limitations under the License. --- -- name: Install make - ansible.builtin.package: - name: make - state: present - - name: Verify values.yaml exists ansible.builtin.stat: path: "{{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry/ldms/nersc-ldms-aggr/values.yaml" diff --git a/discovery/roles/telemetry/vars/main.yml b/discovery/roles/telemetry/vars/main.yml index 473fd74e19..5c5838ce29 100644 --- a/discovery/roles/telemetry/vars/main.yml +++ b/discovery/roles/telemetry/vars/main.yml @@ -1,4 +1,4 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -32,14 +32,12 @@ telemetry_namespace: "telemetry" idrac_telemetry_k8s_name: idrac-telemetry # iDRAC Telemetry scripting repository -idrac_telemetry_scripting_repo: "https://github.com/dell/iDRAC-Telemetry-Scripting.git" -idrac_telemetry_scripting_stable_commit: "f6999f5" +idrac_telemetry_scripting_src_path: "{{ oim_shared_path }}/omnia/telemetry/iDRAC-Telemetry-Scripting" idrac_telemetry_scripting_git_clone_path: "{{ service_cluster_idrac_telemetry_dir_path }}/iDRAC-Telemetry-Scripting" -idrac_script_git_clone_error_msg: | - Failed to clone iDRAC Telemetry GitHub repository from {{ idrac_telemetry_scripting_repo }} - to {{ idrac_telemetry_scripting_git_clone_path }} directory in NFS share. -max_retries: 10 -delay_count: 5 +idrac_telemetry_scripting_copy_fail_msg: | + Failed to copy iDRAC Telemetry Scripting from {{ idrac_telemetry_scripting_src_path }} + to {{ idrac_telemetry_scripting_git_clone_path }}. Please ensure prepare_oim.yml has been + executed successfully before running discovery. 
# Pre-built container images for iDRAC telemetry components # These default to your published images but can be overridden via telemetry_images diff --git a/input/config/x86_64/rhel/10.0/default_packages.json b/input/config/x86_64/rhel/10.0/default_packages.json index 813f9ad993..6002894568 100644 --- a/input/config/x86_64/rhel/10.0/default_packages.json +++ b/input/config/x86_64/rhel/10.0/default_packages.json @@ -34,7 +34,8 @@ {"package": "wget", "type": "rpm", "repo_name": "x86_64_appstream"}, {"package": "cloud-init", "type": "rpm", "repo_name": "x86_64_appstream"}, {"package": "glibc-langpack-en", "type": "rpm", "repo_name": "x86_64_baseos"}, - {"package": "gedit", "type": "rpm", "repo_name": "epel"} + {"package": "gedit", "type": "rpm", "repo_name": "epel"}, + {"package": "docker.io/dellhpcomniaaisolution/image-build-el10", "tag": "1.0", "type": "image" } ] } } diff --git a/prepare_oim/prepare_oim.yml b/prepare_oim/prepare_oim.yml index a78d21e8d9..50c48fd3e5 100644 --- a/prepare_oim/prepare_oim.yml +++ b/prepare_oim/prepare_oim.yml @@ -63,6 +63,11 @@ name: deploy_containers/common tasks_from: add_known_hosts.yml + - name: Download aarch64 prerequisites # noqa:role-name[path] + ansible.builtin.include_role: + name: deploy_containers/common + tasks_from: aarch64_prereq.yml + - name: OpenLDAP Pre_req generate ssha password hosts: localhost connection: local @@ -156,6 +161,11 @@ name: deploy_containers/common tasks_from: omnia_service.yml + - name: Install required packages # noqa:role-name[path] + ansible.builtin.include_role: + name: deploy_containers/common + tasks_from: package_installation.yml + - name: Prepare oim completion hosts: localhost connection: local diff --git a/prepare_oim/roles/deploy_containers/common/tasks/aarch64_prereq.yml b/prepare_oim/roles/deploy_containers/common/tasks/aarch64_prereq.yml new file mode 100644 index 0000000000..f5eae768bb --- /dev/null +++ b/prepare_oim/roles/deploy_containers/common/tasks/aarch64_prereq.yml @@ -0,0 +1,26 
@@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +- name: Create openchami aarch64 directory if not exists + ansible.builtin.file: + path: "{{ ochami_aarch64_dir }}" + state: directory + mode: "{{ dir_permissions_755 }}" + +- name: Download regctl binary (aarch64) + ansible.builtin.get_url: + url: "{{ regctl_aarch64_url }}" + dest: "{{ ochami_aarch64_dir }}/regctl" + mode: "{{ dir_permissions_755 }}" diff --git a/prepare_oim/roles/deploy_containers/common/tasks/main.yml b/prepare_oim/roles/deploy_containers/common/tasks/main.yml index 78c28e98ba..00287c628c 100644 --- a/prepare_oim/roles/deploy_containers/common/tasks/main.yml +++ b/prepare_oim/roles/deploy_containers/common/tasks/main.yml @@ -1,4 +1,4 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/prepare_oim/roles/deploy_containers/common/tasks/package_installation.yml b/prepare_oim/roles/deploy_containers/common/tasks/package_installation.yml new file mode 100644 index 0000000000..1d84877307 --- /dev/null +++ b/prepare_oim/roles/deploy_containers/common/tasks/package_installation.yml @@ -0,0 +1,29 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +- name: Install required packages + block: + - name: Install required packages + ansible.builtin.package: + name: "{{ item }}" + state: present + loop: "{{ oim_packages }}" + register: oim_pkg_result + rescue: + - name: Fail if required package installation fails + ansible.builtin.fail: + msg: >- + {{ prepare_oim_pkg_fail_msg.splitlines() | join(' ') }} + Failed package(s): {{ oim_pkg_result.results | selectattr('failed', 'defined') | selectattr('failed') | map(attribute='item') | list | join(', ') }} + Error: {{ (oim_pkg_result.results | selectattr('failed', 'defined') | selectattr('failed') | map(attribute='msg') | list | first) | default('') }} diff --git a/prepare_oim/roles/deploy_containers/common/tasks/prepare_oim_completion.yml b/prepare_oim/roles/deploy_containers/common/tasks/prepare_oim_completion.yml index 7c86cfaf6b..52e4009219 100644 --- a/prepare_oim/roles/deploy_containers/common/tasks/prepare_oim_completion.yml +++ b/prepare_oim/roles/deploy_containers/common/tasks/prepare_oim_completion.yml @@ -1,4 +1,4 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -32,6 +32,24 @@ mode: "{{ file_permissions }}" when: not bmc_group_data_status.stat.exists +- name: Clone iDRAC Telemetry Scripting repository + block: + - name: Checkout iDRAC Telemetry GitHub repo + ansible.builtin.git: + repo: "{{ idrac_telemetry_scripting_repo }}" + dest: "{{ idrac_telemetry_scripting_clone_dest }}" + version: "{{ idrac_telemetry_scripting_stable_commit }}" + update: false + register: clone_idrac_script + until: clone_idrac_script is succeeded + retries: "{{ max_retries }}" + delay: "{{ delay_count }}" + rescue: + - name: Fail if iDRAC telemetry Git clone fails + ansible.builtin.fail: + msg: "{{ idrac_script_git_clone_fail_msg.splitlines() | join(' ') }}" + when: clone_idrac_script is failed + - name: Prepare oim completion ansible.builtin.debug: msg: "{{ prepare_oim_completion_msg.splitlines() | join(' ') }}" diff --git a/prepare_oim/roles/deploy_containers/common/vars/main.yml b/prepare_oim/roles/deploy_containers/common/vars/main.yml index 30bb7b8125..855e7350b1 100644 --- a/prepare_oim/roles/deploy_containers/common/vars/main.yml +++ b/prepare_oim/roles/deploy_containers/common/vars/main.yml @@ -1,4 +1,4 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -28,12 +28,34 @@ internal_nfs_services: ntp_firewall_service: ntp +# Packages required on OIM +oim_packages: + - nfs-utils + - nfs4-acl-tools + - git + - make +prepare_oim_pkg_fail_msg: | + Failed to install required packages. Please ensure the repository is + configured on OIM and rerun the playbook. 
+ # Usage: prepare_oim_completion.yml telemetry_dir: "/opt/omnia/telemetry" dir_permissions_755: "0755" bmc_group_data_filename: "{{ telemetry_dir }}/bmc_group_data.csv" bmc_group_data_template: "bmc_group_data.j2" file_permissions: "0644" +idrac_telemetry_scripting_repo: "https://github.com/dell/iDRAC-Telemetry-Scripting.git" +idrac_telemetry_scripting_stable_commit: "f6999f5" +idrac_telemetry_scripting_clone_dest: "{{ telemetry_dir }}/iDRAC-Telemetry-Scripting" +max_retries: 10 +delay_count: 5 +git_install_timeout: 300 +git_install_fail_msg: | + Failed to install git. Please ensure the OS repository is configured on OIM. + Configure the repository and rerun the playbook. +idrac_script_git_clone_fail_msg: | + Failed to clone iDRAC Telemetry GitHub repository from {{ idrac_telemetry_scripting_repo }} + to {{ idrac_telemetry_scripting_clone_dest }}. Please check network connectivity and rerun the playbook. prepare_oim_completion_msg: | The playbook prepare_oim.yml has completed successfully. To create the offline repositories and registry for the cluster nodes, please execute the playbook local_repo/local_repo.yml as the next step. @@ -58,3 +80,7 @@ network_services: # Usage: configure_chrony.yml chrony_conf_path: "/etc/chrony.conf" chrony_no_sources_msg: "No chrony sources are reachable. Please give a valid NTP server configuration in network_spec.yml and re-run prepare_oim playbook." 
+ +# Usage: aarch64_prereq.yml +ochami_aarch64_dir: "/opt/omnia/openchami/aarch64" +regctl_aarch64_url: "https://github.com/regclient/regclient/releases/latest/download/regctl-linux-arm64" From 01dece90e8c421745419a1b81a46df85a3fa15eb Mon Sep 17 00:00:00 2001 From: Jagadeesh N V Date: Thu, 12 Feb 2026 19:24:06 +0530 Subject: [PATCH 12/77] Added flow if any munge key update, will be useful if munge key changes --- .../slurm_config/tasks/check_ctld_running.yml | 19 +---- discovery/roles/slurm_config/tasks/confs.yml | 2 +- .../slurm_config/tasks/create_slurm_dir.yml | 19 ++++- .../tasks/read_slurm_hostnames.yml | 1 + .../slurm_config/tasks/update_hosts_munge.yml | 84 +++++++++++++++++++ discovery/roles/slurm_config/vars/main.yml | 2 +- 6 files changed, 106 insertions(+), 21 deletions(-) create mode 100644 discovery/roles/slurm_config/tasks/update_hosts_munge.yml diff --git a/discovery/roles/slurm_config/tasks/check_ctld_running.yml b/discovery/roles/slurm_config/tasks/check_ctld_running.yml index 0c7626f3dd..5f2d41a904 100644 --- a/discovery/roles/slurm_config/tasks/check_ctld_running.yml +++ b/discovery/roles/slurm_config/tasks/check_ctld_running.yml @@ -61,22 +61,11 @@ ansible.builtin.set_fact: reachable_hosts: "{{ ip_map_ssh_check.results | rejectattr('failed', 'true') | map(attribute='host') | list }}" - - name: Update /etc/hosts with controller hostname and IP - ansible.builtin.lineinfile: - path: /etc/hosts - regexp: '^{{ host_entry.value }}\s+{{ host_entry.key }}' - line: "{{ host_entry.value }} {{ host_entry.key }}" - state: present - loop: "{{ reachable_hosts | product(ip_name_map | dict2items) | list }}" + - name: Update basics on reachable_hosts + ansible.builtin.include_tasks: update_hosts_munge.yml + loop: "{{ reachable_hosts }}" loop_control: - loop_var: host_combo - vars: - target_host: "{{ host_combo[0] }}" - host_entry: "{{ host_combo[1] }}" - delegate_to: "{{ target_host }}" - when: reachable_hosts | length > 0 - ignore_unreachable: true - 
failed_when: false + loop_var: slurmhost_ip - name: Trigger the scontrol reconfigure ansible.builtin.command: scontrol reconfigure diff --git a/discovery/roles/slurm_config/tasks/confs.yml b/discovery/roles/slurm_config/tasks/confs.yml index 12236d6ed8..799d4cd757 100644 --- a/discovery/roles/slurm_config/tasks/confs.yml +++ b/discovery/roles/slurm_config/tasks/confs.yml @@ -172,7 +172,7 @@ ansible.builtin.copy: content: "{{ item.ini_lines | join('\n') }}\n" dest: "{{ slurm_config_path }}/{{ ctld_list[0] }}/etc/slurm/{{ item.item.key }}.conf" - mode: "{{ conf_file_mode }}" + mode: "0640" owner: "{{ slurm_user }}" group: "{{ slurm_user_group }}" remote_src: "{{ copy_from_oim }}" diff --git a/discovery/roles/slurm_config/tasks/create_slurm_dir.yml b/discovery/roles/slurm_config/tasks/create_slurm_dir.yml index 81a08adfca..45e37ac243 100644 --- a/discovery/roles/slurm_config/tasks/create_slurm_dir.yml +++ b/discovery/roles/slurm_config/tasks/create_slurm_dir.yml @@ -84,11 +84,21 @@ share_prefix: "{{ slurm_config_path }}" when: conf_in_nfs -- name: Clear the share directory +- name: Clear Slurm-related files and directories ansible.builtin.file: - path: "{{ slurm_config_path }}" + path: "{{ slurm_config_path }}/{{ slurm_item }}" state: absent - when: clear_slurm_files + loop: "{{ (ctld_list | default([]) + + cmpt_list | default([]) + + login_list | default([]) + + compiler_login_list | default([]) + + dbd_list | default([]) + + ['munge.key']) | flatten }}" + loop_control: + loop_var: slurm_item + failed_when: false + when: + - clear_slurm_files - name: Create the slurm directory in share ansible.builtin.file: @@ -151,8 +161,9 @@ ansible.builtin.copy: src: "{{ slurm_config_path }}/munge.key" dest: "{{ slurm_config_path }}/{{ item }}/etc/munge/munge.key" - mode: "{{ common_mode }}" + mode: "0600" remote_src: true + register: munge_key_copy loop: "{{ (ctld_list | default([])) + (cmpt_list | default([])) + (compiler_login_list | default([])) + diff --git 
a/discovery/roles/slurm_config/tasks/read_slurm_hostnames.yml b/discovery/roles/slurm_config/tasks/read_slurm_hostnames.yml index df19821983..0f7b3a16b2 100644 --- a/discovery/roles/slurm_config/tasks/read_slurm_hostnames.yml +++ b/discovery/roles/slurm_config/tasks/read_slurm_hostnames.yml @@ -46,6 +46,7 @@ - name: Get bmc_ip ansible.builtin.set_fact: bmc_ip_map: "{{ node_yaml.nodes | items2dict(key_name='name', value_name='bmc_ip') }}" + name_ip_map: "{{ dict(ip_name_map.values() | zip(ip_name_map.keys())) }}" - name: Assign slurm lists ansible.builtin.set_fact: diff --git a/discovery/roles/slurm_config/tasks/update_hosts_munge.yml b/discovery/roles/slurm_config/tasks/update_hosts_munge.yml new file mode 100644 index 0000000000..ecaaad2beb --- /dev/null +++ b/discovery/roles/slurm_config/tasks/update_hosts_munge.yml @@ -0,0 +1,84 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +- name: Update /etc/hosts with controller hostname and IP + ansible.builtin.lineinfile: + path: /etc/hosts + regexp: '^{{ host_entry.value }}\s+{{ host_entry.key }}' + line: "{{ host_entry.value }} {{ host_entry.key }}" + state: present + loop: "{{ ip_name_map | dict2items | list }}" + loop_control: + loop_var: host_entry + ignore_unreachable: true + failed_when: false + delegate_to: "{{ slurmhost_ip }}" + +- name: Get munge changes + ansible.builtin.set_fact: + munge_key_changed: "{{ munge_key_copy.results | default([]) | rekey_on_member('item') }}" + when: munge_key_copy is defined + +- name: Block when munge key changed + when: + - munge_key_changed is defined + - munge_key_changed[name_ip_map[slurmhost_ip]]['changed'] | default(false) + - restart_slurm_services + delegate_to: "{{ slurmhost_ip }}" + ignore_errors: true + ignore_unreachable: true + block: + - name: Update munge key permissions + ansible.builtin.file: + path: /etc/munge/munge.key + owner: munge + group: munge + mode: '0600' + register: munge_key_permissions_result + + - name: Restart munge service if key changed + ansible.builtin.service: + name: munge + state: restarted + register: munge_restart_result + when: + - munge_key_permissions_result is defined + - munge_key_permissions_result is success + + - name: Restart slurmctld if munge restarted + ansible.builtin.service: + name: slurmctld + state: restarted + when: + - name_ip_map[slurmhost_ip] in ctld_list + - munge_restart_result is defined + - munge_restart_result is success + + - name: Restart slurmd if munge restarted + ansible.builtin.service: + name: slurmd + state: restarted + when: + - name_ip_map[slurmhost_ip] in (cmpt_list + login_list + compiler_login_list) + - munge_restart_result is defined + - munge_restart_result is success + + - name: Restart slurmdbd if munge restarted + ansible.builtin.service: + name: slurmdbd + state: restarted + when: + - name_ip_map[slurmhost_ip] in dbd_list + - munge_restart_result is defined + - 
munge_restart_result is success diff --git a/discovery/roles/slurm_config/vars/main.yml b/discovery/roles/slurm_config/vars/main.yml index 43ee995e5a..93aa0d2786 100644 --- a/discovery/roles/slurm_config/vars/main.yml +++ b/discovery/roles/slurm_config/vars/main.yml @@ -79,7 +79,7 @@ cluster_name: cluster # TODO: direct load vars omnia_config.yml slurm_uid: 6001 slurm_user: slurm slurm_user_group: slurm -restart_slurm_services: "{{ hostvars['localhost']['restart_slurm_services'] }}" +restart_slurm_services: "{{ hostvars['localhost']['restart_slurm_services'] | default(true) }}" slurm_db_username: "{{ hostvars['localhost']['slurm_db_username'] | default('dbuser') }}" slurm_db_password: "{{ hostvars['localhost']['slurm_db_password'] }}" slurm_db_host: "{{ hostvars['localhost']['slurm_db_host'] | default(false) }}" From 19a000cb663e94ed23a2e15c866c67b2bf4b7d26 Mon Sep 17 00:00:00 2001 From: Jagadeesh N V Date: Thu, 12 Feb 2026 19:44:38 +0530 Subject: [PATCH 13/77] lint issue fix --- discovery/roles/slurm_config/tasks/update_hosts_munge.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/discovery/roles/slurm_config/tasks/update_hosts_munge.yml b/discovery/roles/slurm_config/tasks/update_hosts_munge.yml index ecaaad2beb..a326fa820d 100644 --- a/discovery/roles/slurm_config/tasks/update_hosts_munge.yml +++ b/discovery/roles/slurm_config/tasks/update_hosts_munge.yml @@ -36,7 +36,6 @@ - munge_key_changed[name_ip_map[slurmhost_ip]]['changed'] | default(false) - restart_slurm_services delegate_to: "{{ slurmhost_ip }}" - ignore_errors: true ignore_unreachable: true block: - name: Update munge key permissions @@ -82,3 +81,7 @@ - name_ip_map[slurmhost_ip] in dbd_list - munge_restart_result is defined - munge_restart_result is success + rescue: + - name: Handle munge restart failure + ansible.builtin.debug: + msg: "Failed task {{ ansible_failed_task.name }} on {{ slurmhost_ip }}" From 471d4e781435703aa2dba6d55e41139ca9a8ede7 Mon Sep 17 00:00:00 2001 From: 
Katakam Rakesh Naga Sai <125246792+Katakam-Rakesh@users.noreply.github.com> Date: Thu, 12 Feb 2026 20:12:46 +0530 Subject: [PATCH 14/77] Update main.yml for copyright Signed-off-by: Katakam Rakesh Naga Sai <125246792+Katakam-Rakesh@users.noreply.github.com> --- discovery/roles/k8s_config/vars/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/discovery/roles/k8s_config/vars/main.yml b/discovery/roles/k8s_config/vars/main.yml index a80fb9b257..601cc07097 100644 --- a/discovery/roles/k8s_config/vars/main.yml +++ b/discovery/roles/k8s_config/vars/main.yml @@ -1,4 +1,4 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From 94a244fe9534c5feb3d950116c19e8f9b701aee9 Mon Sep 17 00:00:00 2001 From: mithileshreddy04 Date: Thu, 12 Feb 2026 21:55:11 +0530 Subject: [PATCH 15/77] centralize oim_metadata.yml path and remove static backup_location variable --- .../import_input_parameters/tasks/set_backup_location.yml | 2 +- upgrade/roles/import_input_parameters/vars/main.yml | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/upgrade/roles/import_input_parameters/tasks/set_backup_location.yml b/upgrade/roles/import_input_parameters/tasks/set_backup_location.yml index 4f6a96e83f..94156606e5 100644 --- a/upgrade/roles/import_input_parameters/tasks/set_backup_location.yml +++ b/upgrade/roles/import_input_parameters/tasks/set_backup_location.yml @@ -15,7 +15,7 @@ - name: Read oim_metadata.yml to get upgrade_backup_dir ansible.builtin.slurp: - src: /opt/omnia/.data/oim_metadata.yml + src: "{{ oim_metadata_path }}" register: oim_metadata_slurp - name: Parse oim_metadata.yml diff --git a/upgrade/roles/import_input_parameters/vars/main.yml b/upgrade/roles/import_input_parameters/vars/main.yml index 2bd20f0076..ebaa33e492 
100644 --- a/upgrade/roles/import_input_parameters/vars/main.yml +++ b/upgrade/roles/import_input_parameters/vars/main.yml @@ -15,7 +15,10 @@ # backup_location will be set from oim_metadata.yml upgrade_backup_dir # Format: /opt/omnia/backups/upgrade/version_2.0.0.0/input/project_default -backup_location: "" +# Set dynamically from metadata, no static variable needed + +# Path to oim_metadata.yml +oim_metadata_path: "/opt/omnia/.data/oim_metadata.yml" backup_dir_mode: '0755' default_file_mode: '0644' From b64916bd08990d83d4f5cf0cd6895604c20f7d14 Mon Sep 17 00:00:00 2001 From: SOWJANYAJAGADISH123 Date: Fri, 13 Feb 2026 10:02:03 +0530 Subject: [PATCH 16/77] Update omnia.sh --- omnia.sh | 77 ++++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 56 insertions(+), 21 deletions(-) diff --git a/omnia.sh b/omnia.sh index 9c46a04dc9..81e2094ccc 100755 --- a/omnia.sh +++ b/omnia.sh @@ -766,7 +766,7 @@ Description=${container_name^} Container [Container] ContainerName=${container_name} HostName=${container_name} -Image=${container_name}:1.1 +Image=${container_name}:2.1 Network=host # Capabilities @@ -1001,16 +1001,16 @@ install_omnia_core() { fi fi - local omnia_core_tag="1.1" + local omnia_core_tag="2.1" local omnia_core_registry="" - # Check if local omnia_core:1.1 exists + # Check if local omnia_core:2.1 exists if podman inspect omnia_core:${omnia_core_tag} >/dev/null 2>&1; then echo -e "${GREEN}✓ Omnia core image (omnia_core:${omnia_core_tag}) found locally.${NC}" # Check if latest exists for backward compatibility elif podman inspect omnia_core:latest >/dev/null 2>&1; then echo -e "${GREEN}✓ Omnia core image (omnia_core:latest) found locally.${NC}" - # Tag it as 1.1 for consistency + # Tag it as 2.1 for consistency podman tag omnia_core:latest omnia_core:${omnia_core_tag} else echo -e "${RED}ERROR: Omnia core image (omnia_core:${omnia_core_tag}) not found locally.${NC}" @@ -1018,11 +1018,11 @@ install_omnia_core() { echo "" echo -e "${YELLOW}One way to 
build the image locally:${NC}" echo -e "1. Clone the Omnia Artifactory repository:" - echo -e " git clone https://github.com/dell/omnia-artifactory -b omnia-container" + echo -e " git clone https://github.com/dell/omnia-artifactory -b omnia-container-" echo -e "2. Navigate to the repository directory:" echo -e " cd omnia-artifactory" echo -e "3. Build the core image locally (loads into local Podman by default):" - echo -e " ./build_images.sh core omnia_branch=" + echo -e " ./build_images.sh core core_tag=2.1 omnia_branch=" echo "" echo -e "${YELLOW}Then re-run:${NC}" echo -e " ./omnia.sh --install" @@ -1200,6 +1200,7 @@ phase1_validate() { if ! podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then echo "[ERROR] [ORCHESTRATOR] Prerequisite failed: omnia_core container is not running" + display_cleanup_instructions return 1 fi @@ -1249,9 +1250,19 @@ phase1_validate() { return 1 fi - if ! podman inspect "omnia_core:1.1" >/dev/null 2>&1; then - echo "[ERROR] [ORCHESTRATOR] Target image missing locally: omnia_core:1.1" - echo "[ERROR] [ORCHESTRATOR] Omnia does not pull from Docker Hub. Build/load the image locally and retry." + if ! podman inspect "omnia_core:2.1" >/dev/null 2>&1; then + echo "[ERROR] [ORCHESTRATOR] Target image missing locally: omnia_core:2.1" + echo "" + echo -e "${YELLOW}Omnia does not pull images from Docker Hub. Build/load the image locally and retry.${NC}" + echo "" + echo -e "${YELLOW}To build the core image locally:${NC}" + echo -e "1. Clone the Omnia Artifactory repository:" + echo -e " git clone https://github.com/dell/omnia-artifactory -b omnia-container-" + echo -e "2. Navigate to the repository directory:" + echo -e " cd omnia-artifactory" + echo -e "3. 
Build the core image locally (loads into local Podman by default):" + echo -e " ./build_images.sh core core_tag=2.1 omnia_branch=" + echo "" return 1 fi @@ -1267,7 +1278,7 @@ phase2_approval() { echo "OMNIA UPGRADE SUMMARY" echo "============================================" echo "Current Container Tag: 1.0" - echo "Target Container Tag: 1.1" + echo "Target Container Tag: 2.1" echo "Current Omnia Release: 2.0.0.0" echo "Target Omnia Release: 2.1.0.0" echo "New Features:" @@ -1386,17 +1397,17 @@ phase4_container_swap() { return 1 fi - echo "[INFO] [ORCHESTRATOR] Starting omnia_core 1.1 Quadlet unit" - if ! podman inspect "omnia_core:1.1" >/dev/null 2>&1; then - echo "[ERROR] [ORCHESTRATOR] Target image missing locally: omnia_core:1.1" - echo "[ERROR] [ORCHESTRATOR] Upgrade failed: 1.1 image not available" + echo "[INFO] [ORCHESTRATOR] Starting omnia_core 2.1 Quadlet unit" + if ! podman inspect "omnia_core:2.1" >/dev/null 2>&1; then + echo "[ERROR] [ORCHESTRATOR] Target image missing locally: omnia_core:2.1" + echo "[ERROR] [ORCHESTRATOR] Upgrade failed: 2.1 image not available" echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore 1.0 container..." rollback_omnia_core return 1 fi - if ! sed -i 's/^Image=omnia_core:.*/Image=omnia_core:1.1/' "$quadlet_file"; then - echo "[ERROR] [ORCHESTRATOR] Phase 4.3 failed: Failed to update Image to 1.1 in quadlet file" + if ! sed -i 's/^Image=omnia_core:.*/Image=omnia_core:2.1/' "$quadlet_file"; then + echo "[ERROR] [ORCHESTRATOR] Phase 4.3 failed: Failed to update Image to 2.1 in quadlet file" echo "[ERROR] [ORCHESTRATOR] Upgrade failed: Could not update container image tag" echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore 1.0 container..." 
rollback_omnia_core @@ -1413,13 +1424,13 @@ phase4_container_swap() { systemctl start omnia_core.service || { echo "[ERROR] [ORCHESTRATOR] Phase 4.3 failed: Failed to start omnia_core.service" - echo "[ERROR] [ORCHESTRATOR] Upgrade failed: Could not start 1.1 container" + echo "[ERROR] [ORCHESTRATOR] Upgrade failed: Could not start 2.1 container" echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore 1.0 container..." rollback_omnia_core return 1 } - echo "[INFO] [ORCHESTRATOR] Waiting for omnia_core 1.1 health check (60s)" + echo "[INFO] [ORCHESTRATOR] Waiting for omnia_core 2.1 health check (60s)" for i in $(seq 1 60); do if podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then break @@ -1429,7 +1440,7 @@ phase4_container_swap() { if ! podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then echo "[ERROR] [ORCHESTRATOR] Phase 4.4 failed: Container failed health check after swap" - echo "[ERROR] [ORCHESTRATOR] Upgrade failed: 1.1 container failed health check" + echo "[ERROR] [ORCHESTRATOR] Upgrade failed: 2.1 container failed health check" echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore 1.0 container..." rollback_omnia_core return 1 @@ -1607,6 +1618,23 @@ restore_from_backup() { return 0 } +# Display cleanup instructions for failed upgrade/rollback +display_cleanup_instructions() { + echo "" + echo -e "${RED}================================================================================${NC}" + echo -e "${RED} ROLLBACK FAILED${NC}" + echo -e "${RED}================================================================================${NC}" + echo "" + echo -e "${YELLOW}Rollback failed. Manual cleanup is required to restore a clean state before retrying.${NC}" + echo "" + echo -e "${YELLOW}Run the following on the OIM host:${NC}" + echo -e "${YELLOW}1. Clean Omnia shared path: rm -rf ${NC}" + echo -e "${YELLOW}2. Stop Omnia core system service: systemctl stop omnia_core${NC}" + echo -e "${YELLOW}3. 
Remove the Omnia core container: podman rm -f omnia_core${NC}" + echo -e "${YELLOW}4. Perform a fresh Omnia core install: ./omnia.sh --install${NC}" + echo "" +} + # Main rollback function rollback_omnia_core() { echo -e "${GREEN}================================================================================${NC}" @@ -1695,11 +1723,12 @@ rollback_omnia_core() { echo "" echo "[INFO] [ROLLBACK] Starting rollback process..." - # Step 1: Stop 1.1 container gracefully + # Step 1: Stop 2.1 container gracefully echo "" - echo "[INFO] [ROLLBACK] Step 1: Stopping Omnia core 1.1 container..." + echo "[INFO] [ROLLBACK] Step 1: Stopping Omnia core 2.1 container..." if ! stop_container_gracefully "omnia_core" 30; then echo -e "${RED}ERROR: Failed to stop container.${NC}" + display_cleanup_instructions exit 1 fi @@ -1715,6 +1744,7 @@ rollback_omnia_core() { podman tag omnia_core:latest omnia_core:1.0 else echo -e "${RED}ERROR: Omnia core 1.0 image not available. Please load the image first.${NC}" + display_cleanup_instructions exit 1 fi fi @@ -1725,6 +1755,7 @@ rollback_omnia_core() { systemctl daemon-reload if ! systemctl start omnia_core.service; then echo -e "${RED}ERROR: Failed to start container service.${NC}" + display_cleanup_instructions exit 1 fi @@ -1747,6 +1778,7 @@ rollback_omnia_core() { if [ $health_count -ge $health_timeout ]; then echo "" echo -e "${RED}ERROR: Container failed to become healthy within 60 seconds.${NC}" + display_cleanup_instructions exit 1 fi @@ -1755,6 +1787,7 @@ rollback_omnia_core() { echo "[INFO] [ROLLBACK] Step 5: Validating backup directory structure..." if ! validate_backup_directory "$selected_backup"; then echo -e "${RED}ERROR: Backup validation failed.${NC}" + display_cleanup_instructions exit 1 fi @@ -1763,6 +1796,7 @@ rollback_omnia_core() { echo "[INFO] [ROLLBACK] Step 6: Restoring files from backup..." if ! 
restore_from_backup "$selected_backup"; then echo -e "${RED}ERROR: Failed to restore from backup.${NC}" + display_cleanup_instructions exit 1 fi @@ -1773,6 +1807,7 @@ rollback_omnia_core() { if [ "$verify_version" != "$backup_version" ]; then echo -e "${RED}ERROR: Version verification failed. Expected: $backup_version, Found: $verify_version${NC}" + display_cleanup_instructions exit 1 fi From a39e26f82cbe954e492e6438a745dce13e042b1f Mon Sep 17 00:00:00 2001 From: Nagachandan-P Date: Fri, 13 Feb 2026 06:38:40 +0000 Subject: [PATCH 17/77] updating /etc/hosts entries --- .../discovery_validations/tasks/update_hosts.yml | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/discovery/roles/discovery_validations/tasks/update_hosts.yml b/discovery/roles/discovery_validations/tasks/update_hosts.yml index 43e7d3fc63..85c9ecf611 100644 --- a/discovery/roles/discovery_validations/tasks/update_hosts.yml +++ b/discovery/roles/discovery_validations/tasks/update_hosts.yml @@ -13,16 +13,22 @@ # limitations under the License. 
--- -- name: Add hosts file entry for cluster +- name: Ensure 127.0.0.1 localhost entry exists ansible.builtin.shell: | set -o pipefail - grep -qxF '{{ item.value.ADMIN_IP }} {{ item.value.HOSTNAME }}' {{ hosts_file_path }} || \ - echo '{{ item.value.ADMIN_IP }} {{ item.value.HOSTNAME }}' >> {{ hosts_file_path }} + grep -qxF '127.0.0.1 localhost.localdomain localhost' {{ hosts_file_path }} || echo '127.0.0.1 localhost.localdomain localhost' >> {{ hosts_file_path }} + changed_when: true + +- name: Remove stale entries for IPs that are being updated + ansible.builtin.shell: | + set -o pipefail + grep -v '^{{ item.value.ADMIN_IP }}\s' {{ hosts_file_path }} > {{ hosts_file_path }}.tmp && cat {{ hosts_file_path }}.tmp > {{ hosts_file_path }} && rm -f {{ hosts_file_path }}.tmp changed_when: true loop: "{{ read_mapping_file.dict | dict2items }}" -- name: Ensure 127.0.0.1 localhost entry exists uniquely using echo +- name: Add hosts file entry for cluster ansible.builtin.shell: | set -o pipefail - grep -qxF '127.0.0.1 localhost.localdomain localhost' {{ hosts_file_path }} || echo '127.0.0.1 localhost.localdomain localhost' >> {{ hosts_file_path }} + echo '{{ item.value.ADMIN_IP }} {{ item.value.HOSTNAME }}' >> {{ hosts_file_path }} changed_when: true + loop: "{{ read_mapping_file.dict | dict2items }}" From 00fd2e2942b97d2610cb720ba4b647bde3d876c6 Mon Sep 17 00:00:00 2001 From: Katakam Rakesh Naga Sai <125246792+Katakam-Rakesh@users.noreply.github.com> Date: Fri, 13 Feb 2026 12:43:26 +0530 Subject: [PATCH 18/77] Update service_k8s.json Signed-off-by: Katakam Rakesh Naga Sai <125246792+Katakam-Rakesh@users.noreply.github.com> --- input/config/x86_64/rhel/10.0/service_k8s.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/input/config/x86_64/rhel/10.0/service_k8s.json b/input/config/x86_64/rhel/10.0/service_k8s.json index afc073a19f..0ef4408a7f 100644 --- a/input/config/x86_64/rhel/10.0/service_k8s.json +++ b/input/config/x86_64/rhel/10.0/service_k8s.json 
@@ -33,7 +33,7 @@ { "package": "strimzi-kafka-operator-helm-3-chart-0.48.0", "type": "tarball", "url": "https://github.com/strimzi/strimzi-kafka-operator/releases/download/0.48.0/strimzi-kafka-operator-helm-3-chart-0.48.0.tgz" }, { "package": "quay.io/strimzi/kafka-bridge", "tag": "0.33.1", "type": "image" }, { "package": "apptainer", "type": "rpm", "repo_name": "epel" }, - {"package": "doca-ofed", "type": "iso", "url": "https://www.mellanox.com/downloads/DOCA/DOCA_v3.2.1/host/doca-host-3.2.1-044000_25.10_rhel10.x86_64.rpm"} + { "package": "doca-ofed", "type": "rpm_repo", "repo_name": "doca" } ] }, "service_kube_control_plane": { From 7b98e5ecd47d1d46b51aba587d4ee6eb99feeb7e Mon Sep 17 00:00:00 2001 From: Nagachandan-P Date: Fri, 13 Feb 2026 07:19:48 +0000 Subject: [PATCH 19/77] lint issue fixed --- discovery/roles/discovery_validations/tasks/update_hosts.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/discovery/roles/discovery_validations/tasks/update_hosts.yml b/discovery/roles/discovery_validations/tasks/update_hosts.yml index 85c9ecf611..f040dd997f 100644 --- a/discovery/roles/discovery_validations/tasks/update_hosts.yml +++ b/discovery/roles/discovery_validations/tasks/update_hosts.yml @@ -22,7 +22,9 @@ - name: Remove stale entries for IPs that are being updated ansible.builtin.shell: | set -o pipefail - grep -v '^{{ item.value.ADMIN_IP }}\s' {{ hosts_file_path }} > {{ hosts_file_path }}.tmp && cat {{ hosts_file_path }}.tmp > {{ hosts_file_path }} && rm -f {{ hosts_file_path }}.tmp + grep -v '^{{ item.value.ADMIN_IP }}\s' {{ hosts_file_path }} > {{ hosts_file_path }}.tmp + cat {{ hosts_file_path }}.tmp > {{ hosts_file_path }} + rm -f {{ hosts_file_path }}.tmp changed_when: true loop: "{{ read_mapping_file.dict | dict2items }}" From 6ff5423831736dc86ea5227bd1702b553ccf81af Mon Sep 17 00:00:00 2001 From: Vrinda_Marwah Date: Fri, 13 Feb 2026 07:26:03 +0000 Subject: [PATCH 20/77] Add user registry to crio.conf Signed-off-by: Vrinda_Marwah 
--- .../tasks/fetch_additional_images.yml | 9 +++++++++ ...ervice_kube_control_plane_first_x86_64.yaml.j2 | 15 ++++++++++++--- ...roup-service_kube_control_plane_x86_64.yaml.j2 | 15 ++++++++++++--- .../ci-group-service_kube_node_x86_64.yaml.j2 | 14 +++++++++++--- discovery/roles/configure_ochami/vars/main.yml | 1 + 5 files changed, 45 insertions(+), 9 deletions(-) diff --git a/discovery/roles/configure_ochami/tasks/fetch_additional_images.yml b/discovery/roles/configure_ochami/tasks/fetch_additional_images.yml index 2fecb895e8..ca13f0c414 100644 --- a/discovery/roles/configure_ochami/tasks/fetch_additional_images.yml +++ b/discovery/roles/configure_ochami/tasks/fetch_additional_images.yml @@ -42,3 +42,12 @@ ansible.builtin.debug: var: additional_images_dict verbosity: 2 + +- name: Read local_repo_config.yml + ansible.builtin.include_vars: + file: "{{ local_repo_config_path }}" + name: local_repo_config + +- name: Set fact for user_registry + ansible.builtin.set_fact: + user_registry: "{{ local_repo_config.user_registry | default([]) }}" \ No newline at end of file diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 index b8b71bf099..b98df53d7d 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 @@ -169,6 +169,16 @@ location = "gcr.io" [[registry.mirror]] location = "{{ pulp_mirror }}" +{% if user_registry | default([]) | length > 0 %} +{% for registry in user_registry %} + + [[registry]] + prefix = "{{ registry.host }}" + location = "{{ registry.host }}" + [[registry.mirror]] + location = "{{ pulp_mirror }}" +{% endfor %} +{% endif %} - path: /tmp/kube-vip.yaml owner: root:root @@ -415,13 +425,12 
@@ - update-ca-trust extract - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh + - mkdir -p /etc/containers/registries.conf.d + - mv /tmp/crio.conf /etc/containers/registries.conf.d/crio.conf - systemctl start crio.service - systemctl enable crio.service - sudo systemctl enable --now kubelet - - mv /tmp/crio.conf /etc/containers/registries.conf.d/crio.conf - mv /tmp/generate-control-plane-join.sh {{ k8s_client_mount_path }} - - systemctl daemon-reload - - systemctl restart crio - kubeadm config images pull --kubernetes-version={{ service_k8s_version }} {% set role_name = 'service_kube_control_plane_first' %} {% include 'pull_additional_images.yaml.j2' %} diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_x86_64.yaml.j2 index f3ba7a7330..922f63f852 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_x86_64.yaml.j2 @@ -147,6 +147,16 @@ location = "gcr.io" [[registry.mirror]] location = "{{ pulp_mirror }}" +{% if user_registry | default([]) | length > 0 %} +{% for registry in user_registry %} + + [[registry]] + prefix = "{{ registry.host }}" + location = "{{ registry.host }}" + [[registry.mirror]] + location = "{{ pulp_mirror }}" +{% endfor %} +{% endif %} - path: /tmp/kube-vip.yaml owner: root:root permissions: '0644' @@ -323,12 +333,11 @@ - update-ca-trust extract - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh + - mkdir -p /etc/containers/registries.conf.d + - mv /tmp/crio.conf /etc/containers/registries.conf.d/crio.conf - systemctl start crio.service - systemctl enable 
crio.service - sudo systemctl enable --now kubelet - - mv /tmp/crio.conf /etc/containers/registries.conf.d/crio.conf - - systemctl daemon-reload - - systemctl restart crio - kubeadm config images pull --kubernetes-version={{ service_k8s_version }} {% set role_name = 'service_kube_control_plane' %} {% include 'pull_additional_images.yaml.j2' %} diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_node_x86_64.yaml.j2 index b380030ddd..df98035baa 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_node_x86_64.yaml.j2 @@ -146,7 +146,16 @@ location = "gcr.io" [[registry.mirror]] location = "{{ pulp_mirror }}" +{% if user_registry | default([]) | length > 0 %} +{% for registry in user_registry %} + [[registry]] + prefix = "{{ registry.host }}" + location = "{{ registry.host }}" + [[registry.mirror]] + location = "{{ pulp_mirror }}" +{% endfor %} +{% endif %} runcmd: - /usr/local/bin/set-ssh.sh - "systemctl enable chronyd" @@ -226,12 +235,11 @@ - update-ca-trust extract - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh + - mkdir -p /etc/containers/registries.conf.d + - mv /tmp/crio.conf /etc/containers/registries.conf.d/crio.conf - systemctl start crio.service - systemctl enable crio.service - sudo systemctl enable --now kubelet - - mv /tmp/crio.conf /etc/containers/registries.conf.d/crio.conf - - systemctl daemon-reload - - systemctl restart crio - kubeadm config images pull --kubernetes-version={{ service_k8s_version }} {% set role_name = 'service_kube_node' %} {% include 'pull_additional_images.yaml.j2' %} diff --git a/discovery/roles/configure_ochami/vars/main.yml b/discovery/roles/configure_ochami/vars/main.yml 
index 7f75daa01d..053ee15c0d 100644 --- a/discovery/roles/configure_ochami/vars/main.yml +++ b/discovery/roles/configure_ochami/vars/main.yml @@ -108,3 +108,4 @@ cuda_runfile_aarch64: "{{ hostvars['oim']['cuda_runfile_aarch64'] | default('cud # Usage: fetch_additional_images.yml input_project_dir: "{{ hostvars['localhost']['input_project_dir'] }}" software_config_file_path: "{{ input_project_dir }}/software_config.json" +local_repo_config_path: "{{ input_project_dir }}/local_repo_config.yml" From a70b838c3a7e4707d0f0235b0c350e13d598c36f Mon Sep 17 00:00:00 2001 From: Nagachandan-P Date: Fri, 13 Feb 2026 08:14:30 +0000 Subject: [PATCH 21/77] duplicated hostnames --- discovery/roles/discovery_validations/tasks/update_hosts.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/discovery/roles/discovery_validations/tasks/update_hosts.yml b/discovery/roles/discovery_validations/tasks/update_hosts.yml index f040dd997f..bd046032bc 100644 --- a/discovery/roles/discovery_validations/tasks/update_hosts.yml +++ b/discovery/roles/discovery_validations/tasks/update_hosts.yml @@ -19,10 +19,11 @@ grep -qxF '127.0.0.1 localhost.localdomain localhost' {{ hosts_file_path }} || echo '127.0.0.1 localhost.localdomain localhost' >> {{ hosts_file_path }} changed_when: true -- name: Remove stale entries for IPs that are being updated +- name: Remove stale entries for IPs and hostnames that are being updated ansible.builtin.shell: | set -o pipefail - grep -v '^{{ item.value.ADMIN_IP }}\s' {{ hosts_file_path }} > {{ hosts_file_path }}.tmp + grep -v '^{{ item.value.ADMIN_IP }}\s' {{ hosts_file_path }} | \ + grep -v '\s{{ item.value.HOSTNAME }}$' > {{ hosts_file_path }}.tmp cat {{ hosts_file_path }}.tmp > {{ hosts_file_path }} rm -f {{ hosts_file_path }}.tmp changed_when: true From aba17ded12da3c66de984e0cabb6dce24f7ca1a4 Mon Sep 17 00:00:00 2001 From: SOWJANYAJAGADISH123 Date: Fri, 13 Feb 2026 14:05:55 +0530 Subject: [PATCH 22/77] Update omnia.sh --- omnia.sh | 6 +++--- 
1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/omnia.sh b/omnia.sh index 81e2094ccc..b7a086545d 100755 --- a/omnia.sh +++ b/omnia.sh @@ -164,7 +164,7 @@ setup_omnia_core() { # It removes the container and performs the necessary cleanup steps. cleanup_omnia_core() { # Block if critical service containers exist - critical_running=$(podman ps --format '{{.Names}}' | grep -E 'pulp|registry|minio-server|postgres|step-ca|hydra|smd|opaal-idp|bss|opaal|cloud-init-server|haproxy|coresmd') + critical_running=$(podman ps --format '{{.Names}}' | grep -E '^pulp$|^omnia_auth$|^minio-server$|^registry$|^step-ca$|^postgres$|^hydra$|^opaal-idp$|^smd$|^opaal$|^bss$|^cloud-init-server$|^haproxy$|^coresmd$|^omnia_build_stream$|^omnia_postgres$') if [ -n "$critical_running" ]; then echo -e "${RED}Failed to intiatiate omnia_core container cleanup. There are other critical service containers still running:${NC}" echo "$critical_running" @@ -272,7 +272,7 @@ cleanup_config(){ # Otherwise, it prints an error message. remove_container() { # Block if critical service containers exist - critical_running=$(podman ps --format '{{.Names}}' | grep -E 'pulp|registry|minio-server|postgres|step-ca|hydra|smd|opaal-idp|bss|opaal|cloud-init-server|haproxy|coresmd') + critical_running=$(podman ps --format '{{.Names}}' | grep -E '^pulp$|^omnia_auth$|^minio-server$|^registry$|^step-ca$|^postgres$|^hydra$|^opaal-idp$|^smd$|^opaal$|^bss$|^cloud-init-server$|^haproxy$|^coresmd$|^omnia_build_stream$|^omnia_postgres$') if [ -n "$critical_running" ]; then echo -e "${RED}Failed to intiatiate omnia_core container cleanup. 
There are other critical service containers still running:${NC}" echo "$critical_running" @@ -1083,7 +1083,7 @@ install_omnia_core() { # If the user wants to reinstall, call the remove_container function, and then call the setup_omnia_core function if [ "$choice" = "2" ]; then # Block if critical service containers exist - critical_running=$(podman ps --format '{{.Names}}' | grep -E 'pulp|registry|minio-server|postgres|step-ca|hydra|smd|opaal-idp|bss|opaal|cloud-init-server|haproxy|coresmd') + critical_running=$(podman ps --format '{{.Names}}' | grep -E '^pulp$|^omnia_auth$|^minio-server$|^registry$|^step-ca$|^postgres$|^hydra$|^opaal-idp$|^smd$|^opaal$|^bss$|^cloud-init-server$|^haproxy$|^coresmd$|^omnia_build_stream$|^omnia_postgres$') if [ -n "$critical_running" ]; then echo -e "${RED}Failed to intiatiate omnia_core container cleanup. There are other critical service containers still running:${NC}" echo "$critical_running" From 7c79b599c8fd89b75cdaf2eb082d9b95449cf84a Mon Sep 17 00:00:00 2001 From: Vrinda_Marwah Date: Fri, 13 Feb 2026 08:47:06 +0000 Subject: [PATCH 23/77] resolve input validation + lint Signed-off-by: Vrinda_Marwah --- .../validation_flows/common_validation.py | 13 +++++++++++++ .../tasks/fetch_additional_images.yml | 4 ++-- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/common/library/module_utils/input_validation/validation_flows/common_validation.py b/common/library/module_utils/input_validation/validation_flows/common_validation.py index 198c527440..f577a4e9b8 100644 --- a/common/library/module_utils/input_validation/validation_flows/common_validation.py +++ b/common/library/module_utils/input_validation/validation_flows/common_validation.py @@ -233,6 +233,19 @@ def validate_software_config( ) ) + # Check for required subgroups when specific software names are present + software_requiring_subgroups = ["additional_packages", "slurm_custom", "service_k8s"] + for software_name in software_requiring_subgroups: + if software_name 
in software_names: + if software_name not in data or not data[software_name]: + errors.append( + create_error_msg( + "Validation Error: ", + software_name, + f"is present in softwares but corresponding subgroup '{software_name}' is missing or empty in software_config.json. Please refer examples directory for the correct format." + ) + ) + for software_pkg in data['softwares']: software = software_pkg['name'] arch_list = software_pkg.get('arch') diff --git a/discovery/roles/configure_ochami/tasks/fetch_additional_images.yml b/discovery/roles/configure_ochami/tasks/fetch_additional_images.yml index ca13f0c414..d4e8425749 100644 --- a/discovery/roles/configure_ochami/tasks/fetch_additional_images.yml +++ b/discovery/roles/configure_ochami/tasks/fetch_additional_images.yml @@ -43,11 +43,11 @@ var: additional_images_dict verbosity: 2 -- name: Read local_repo_config.yml +- name: Read local_repo_config.yml ansible.builtin.include_vars: file: "{{ local_repo_config_path }}" name: local_repo_config - name: Set fact for user_registry ansible.builtin.set_fact: - user_registry: "{{ local_repo_config.user_registry | default([]) }}" \ No newline at end of file + user_registry: "{{ local_repo_config.user_registry | default([]) }}" From 40f1595cd15c9f59b4c653c679a0acfaa1eb6c57 Mon Sep 17 00:00:00 2001 From: Jagadeesh N V Date: Fri, 13 Feb 2026 16:09:23 +0530 Subject: [PATCH 24/77] Removed slurmd dependency issue where ssh key changes on slurmctld, live --- ...-group-login_compiler_node_aarch64.yaml.j2 | 8 +++-- ...i-group-login_compiler_node_x86_64.yaml.j2 | 8 +++-- .../ci-group-login_node_aarch64.yaml.j2 | 7 +++- .../ci-group-login_node_x86_64.yaml.j2 | 7 +++- .../ci-group-slurm_node_aarch64.yaml.j2 | 8 +++-- .../ci-group-slurm_node_x86_64.yaml.j2 | 7 ++-- .../slurm_config/tasks/check_ctld_running.yml | 32 +++++++++++++------ discovery/roles/slurm_config/tasks/confs.yml | 2 ++ .../slurm_config/tasks/create_slurm_dir.yml | 12 +------ .../slurm_config/tasks/update_hosts_munge.yml | 
1 + .../slurm_config/templates/slurmd.service.j2 | 22 ------------- 11 files changed, 62 insertions(+), 52 deletions(-) delete mode 100644 discovery/roles/slurm_config/templates/slurmd.service.j2 diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 index dc2ddf9dcd..8918f03050 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 @@ -209,6 +209,12 @@ {{ ip_name_map[key] }} {{ key }} {% endfor %} + - path: /etc/sysconfig/slurmd + owner: root:root + permissions: '0644' + content: | + SLURMD_OPTIONS="{{ conf_server }}" + - path: /usr/local/bin/check_slurm_controller_status.sh owner: root:root permissions: '{{ file_mode_755 }}' @@ -278,12 +284,10 @@ {% if hostvars['localhost']['ldms_support'] %} - echo " Starting LDMS setup " | tee -a /var/log/ldms-cloudinit.log - - /root/ldms_sampler.sh {% endif %} - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh - - yes | cp /etc/slurm/epilog.d/slurmd.service /usr/lib/systemd/system/ - /usr/local/bin/check_slurm_controller_status.sh - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_log_dir_effective }} - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_pid_dir_effective }} diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 index 2c23b868c0..51121a2e82 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 @@ -209,6 +209,12 @@ {{ ip_name_map[key] 
}} {{ key }} {% endfor %} + - path: /etc/sysconfig/slurmd + owner: root:root + permissions: '0644' + content: | + SLURMD_OPTIONS="{{ conf_server }}" + - path: /usr/local/bin/check_slurm_controller_status.sh owner: root:root permissions: '{{ file_mode_755 }}' @@ -282,12 +288,10 @@ {% if hostvars['localhost']['ldms_support'] %} - echo " Starting LDMS setup " | tee -a /var/log/ldms-cloudinit.log - - /root/ldms_sampler.sh {% endif %} - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh - - yes | cp /etc/slurm/epilog.d/slurmd.service /usr/lib/systemd/system/ - /usr/local/bin/check_slurm_controller_status.sh - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_log_dir_effective }} - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_pid_dir_effective }} diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 index 8b3d771592..4aacc2222d 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 @@ -102,6 +102,12 @@ {{ ip_name_map[key] }} {{ key }} {% endfor %} + - path: /etc/sysconfig/slurmd + owner: root:root + permissions: '0644' + content: | + SLURMD_OPTIONS="{{ conf_server }}" + - path: /usr/local/bin/check_slurm_controller_status.sh owner: root:root permissions: '{{ file_mode_755 }}' @@ -131,7 +137,6 @@ - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh - - yes | cp /etc/slurm/epilog.d/slurmd.service /usr/lib/systemd/system/ - /usr/local/bin/check_slurm_controller_status.sh - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_log_dir_effective }} - chown -R {{ 
slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_pid_dir_effective }} diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 index 4e68ba8d81..524553bd55 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 @@ -108,6 +108,12 @@ {{ ip_name_map[key] }} {{ key }} {% endfor %} + - path: /etc/sysconfig/slurmd + owner: root:root + permissions: '0644' + content: | + SLURMD_OPTIONS="{{ conf_server }}" + - path: /usr/local/bin/check_slurm_controller_status.sh owner: root:root permissions: '{{ file_mode_755 }}' @@ -142,7 +148,6 @@ - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh - - yes | cp /etc/slurm/epilog.d/slurmd.service /usr/lib/systemd/system/ - /usr/local/bin/check_slurm_controller_status.sh - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_log_dir_effective }} - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_pid_dir_effective }} diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 index 06a04a6068..dacade639b 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 @@ -277,8 +277,6 @@ echo "[INFO] ===== Starting slurmd setup (service file, directories, epilog) (aarch64) =====" - echo "[INFO] Copying slurmd.service into /usr/lib/systemd/system/" - yes | cp /etc/slurm/epilog.d/slurmd.service /usr/lib/systemd/system/ bash 
/usr/local/bin/check_slurm_controller_status.sh echo "[INFO] Setting ownership for Slurm directories" @@ -415,6 +413,12 @@ {{ ip_name_map[key] }} {{ key }} {% endfor %} + - path: /etc/sysconfig/slurmd + owner: root:root + permissions: '0644' + content: | + SLURMD_OPTIONS="{{ conf_server }}" + - path: /usr/local/bin/check_slurm_controller_status.sh owner: root:root permissions: '{{ file_mode_755 }}' diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 index c1b532908e..d21fcf9c5c 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 @@ -244,6 +244,11 @@ {% for key in ip_name_map | sort %} {{ ip_name_map[key] }} {{ key }} {% endfor %} + - path: /etc/sysconfig/slurmd + owner: root:root + permissions: '0644' + content: | + SLURMD_OPTIONS="{{ conf_server }}" - path: /usr/local/bin/configure_dirs_and_mounts.sh permissions: '{{ file_mode_755 }}' @@ -288,8 +293,6 @@ echo "[INFO] ===== Starting slurmd setup (service file, directories, epilog) =====" - echo "[INFO] Copying slurmd.service into /usr/lib/systemd/system/" - yes | cp /etc/slurm/epilog.d/slurmd.service /usr/lib/systemd/system/ bash /usr/local/bin/check_slurm_controller_status.sh echo "[INFO] Setting ownership for Slurm directories" diff --git a/discovery/roles/slurm_config/tasks/check_ctld_running.yml b/discovery/roles/slurm_config/tasks/check_ctld_running.yml index 5f2d41a904..7d908169ab 100644 --- a/discovery/roles/slurm_config/tasks/check_ctld_running.yml +++ b/discovery/roles/slurm_config/tasks/check_ctld_running.yml @@ -14,30 +14,37 @@ --- - name: Check if remote host is reachable via SSH ansible.builtin.wait_for: - host: "{{ item }}" + host: "{{ ctld }}" port: 22 # TODO: make it configurable timeout: 10 state: started delegate_to: 
localhost register: ssh_check ignore_errors: true - ignore_unreachable: true -- name: Block when ssh_check is success - when: ssh_check is success +- name: Enter slurm controller when pingable + when: + - ssh_check is success + ignore_unreachable: true block: - name: Initialize ctld_state dict ansible.builtin.set_fact: - ctld_state: "{{ ctld_state | default({}) | combine({item: false}) }}" + ctld_state: "{{ ctld_state | default({}) | combine({ctld: false}) }}" - name: Check if slurmctld is running on remote host ansible.builtin.service_facts: - delegate_to: "{{ item }}" + delegate_to: "{{ ctld }}" register: service_facts + ignore_unreachable: true + + - name: Fail if slurmctld is unreachable + ansible.builtin.fail: + msg: "Failed to connect to {{ ctld }}." + when: service_facts is unreachable - name: Update ctld_state if slurmctld is running ansible.builtin.set_fact: - ctld_state: "{{ ctld_state | combine({item: true}) }}" + ctld_state: "{{ ctld_state | combine({ctld: true}) }}" when: - service_facts is success - ansible_facts.services['slurmctld.service'] is defined @@ -72,6 +79,13 @@ changed_when: scontrol_reconfig.rc == 0 failed_when: false register: scontrol_reconfig - delegate_to: "{{ item }}" + delegate_to: "{{ ctld }}" when: - - ctld_state[item] is true + - ctld_state[ctld] is true + + rescue: + - name: Fail if slurmctld is not running on any host + ansible.builtin.debug: + msg: "Failed to 'scontrol reconfigure' on {{ ctld }}. + As task '{{ ansible_failed_task.name }}' failed. 
+ results: {{ ansible_failed_result }}" diff --git a/discovery/roles/slurm_config/tasks/confs.yml b/discovery/roles/slurm_config/tasks/confs.yml index 799d4cd757..c5f7953b0d 100644 --- a/discovery/roles/slurm_config/tasks/confs.yml +++ b/discovery/roles/slurm_config/tasks/confs.yml @@ -197,3 +197,5 @@ - ctld_list - ctld_conf_files is changed loop: "{{ ctld_list }}" + loop_control: + loop_var: ctld diff --git a/discovery/roles/slurm_config/tasks/create_slurm_dir.yml b/discovery/roles/slurm_config/tasks/create_slurm_dir.yml index 45e37ac243..e4ac760d77 100644 --- a/discovery/roles/slurm_config/tasks/create_slurm_dir.yml +++ b/discovery/roles/slurm_config/tasks/create_slurm_dir.yml @@ -194,17 +194,7 @@ group: "{{ root_group }}" mode: "{{ common_mode }}" when: cmpt_list - loop: "{{ cmpt_list | product(['logout_user.sh', 'slurmd.service']) }}" - -- name: Create logout_user.sh and slurmd.service in login and login_compiler - ansible.builtin.template: - src: "{{ item.1 }}.j2" - dest: "{{ slurm_config_path }}/{{ item.0 }}/etc/slurm/epilog.d/{{ item.1 }}" - owner: "{{ root_user }}" - group: "{{ root_group }}" - mode: "{{ conf_file_mode }}" - when: login_list or compiler_login_list - loop: "{{ (login_list + compiler_login_list) | product(['slurmd.service']) }}" + loop: "{{ cmpt_list | product(['logout_user.sh']) }}" - name: Get the slurm NFS path ansible.builtin.debug: diff --git a/discovery/roles/slurm_config/tasks/update_hosts_munge.yml b/discovery/roles/slurm_config/tasks/update_hosts_munge.yml index a326fa820d..64c36dbeaf 100644 --- a/discovery/roles/slurm_config/tasks/update_hosts_munge.yml +++ b/discovery/roles/slurm_config/tasks/update_hosts_munge.yml @@ -30,6 +30,7 @@ munge_key_changed: "{{ munge_key_copy.results | default([]) | rekey_on_member('item') }}" when: munge_key_copy is defined +# TODO: Clean unreachable handling - name: Block when munge key changed when: - munge_key_changed is defined diff --git a/discovery/roles/slurm_config/templates/slurmd.service.j2 
b/discovery/roles/slurm_config/templates/slurmd.service.j2 deleted file mode 100644 index 294d1fda75..0000000000 --- a/discovery/roles/slurm_config/templates/slurmd.service.j2 +++ /dev/null @@ -1,22 +0,0 @@ -[Unit] -Description=Slurm node daemon -After=munge.service network-online.target remote-fs.target sssd.service -Wants=network-online.target - -[Service] -Type=notify -EnvironmentFile=-/etc/sysconfig/slurmd -EnvironmentFile=-/etc/default/slurmd -RuntimeDirectory=slurm -RuntimeDirectoryMode=0755 -ExecStart=/usr/sbin/slurmd --systemd $SLURMD_OPTIONS {{ conf_server }} -ExecReload=/bin/kill -HUP $MAINPID -KillMode=process -LimitNOFILE=131072 -LimitMEMLOCK=infinity -LimitSTACK=infinity -Delegate=yes -TasksMax=infinity - -[Install] -WantedBy=multi-user.target \ No newline at end of file From 80a512650ed5146ed55b5a716fa855928d80b1cb Mon Sep 17 00:00:00 2001 From: mithileshreddy04 Date: Fri, 13 Feb 2026 17:43:19 +0530 Subject: [PATCH 25/77] Added user guidance messages in rollback_omnia.yml and upgrade_cluster.yml --- upgrade/roles/upgrade_cluster/tasks/main.yml | 49 ++++++++++++++++-- upgrade/rollback_omnia.yml | 53 ++++++++++++++++++++ 2 files changed, 99 insertions(+), 3 deletions(-) create mode 100644 upgrade/rollback_omnia.yml diff --git a/upgrade/roles/upgrade_cluster/tasks/main.yml b/upgrade/roles/upgrade_cluster/tasks/main.yml index 196366870b..92aa87e2a3 100644 --- a/upgrade/roles/upgrade_cluster/tasks/main.yml +++ b/upgrade/roles/upgrade_cluster/tasks/main.yml @@ -13,6 +13,49 @@ # limitations under the License. --- -- name: Include import input parameters - ansible.builtin.include_role: - name: import_input_parameters + +- name: Display cluster reprovision guidance + ansible.builtin.pause: + prompt: "{{ '\x1b[32m' }}=================================================== + CLUSTER REPROVISION REQUIRED + =========================================================== + + Cluster reprovisioning is required after upgrade to enable new features. 
+ + Review and update new 2.1 input fields before reprovisioning: + + 1. local_repo_config.yml + + - Set additional_repos_x86_64 (list of extra repo URLs or file paths for x86_64) + + - Set additional_repos_aarch64 (list of extra repo URLs or file paths for aarch64) + + 2. network_spec.yml (ib_network section) + + - Define InfiniBand fabric settings (subnet manager/BMC, IP ranges, VLAN if applicable) + + - Ensure host IB interfaces map to the IB network entries + + 3. omnia_config.yml (slurm_cluster.config_source) + + - Use the new structure: config_source: { type: , location: } + + - Populate location to point to your Slurm config bundle (local path or remote URL) + + 4. NFS cleanup (if NFS share is used for k8s/slurm) + + - Clean stale mounts and ensure the NFS share is accessible before reprovision + + - Remove any leftover cluster state on the NFS share that could conflict with fresh deployment + + + Run the following playbooks in sequence from the Omnia root directory to reprovision the cluster: + + 1. ansible-playbook local_repo/local_repo.yml + + 2. ansible-playbook build_image_x86_64/build_image_x86_64.yml + + 3. ansible-playbook discovery/discovery.yml + + {{ '\x1b[0m' }}" + seconds: 1 diff --git a/upgrade/rollback_omnia.yml b/upgrade/rollback_omnia.yml new file mode 100644 index 0000000000..fc33ab4a2e --- /dev/null +++ b/upgrade/rollback_omnia.yml @@ -0,0 +1,53 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +--- +- name: Rollback Omnia guidance + hosts: localhost + connection: local + gather_facts: false + vars: + oim_metadata_path: "/opt/omnia/.data/oim_metadata.yml" + tasks: + - name: Read oim_metadata.yml for backup details + ansible.builtin.slurp: + src: "{{ oim_metadata_path }}" + register: oim_metadata_slurp + ignore_errors: true + + - name: Parse oim_metadata.yml + ansible.builtin.set_fact: + oim_metadata: "{{ oim_metadata_slurp.content | b64decode | from_yaml }}" + when: oim_metadata_slurp is defined and oim_metadata_slurp.content is defined + + - name: Derive backup_version from upgrade_backup_dir + ansible.builtin.set_fact: + backup_version: "{{ (oim_metadata.upgrade_backup_dir | regex_search('version_([^/]+)', '\\1')) + | default('previous version', true) }}" + when: oim_metadata is defined and oim_metadata.upgrade_backup_dir is defined + + - name: Display rollback guidance + ansible.builtin.debug: + msg: >- + The rollback function restores the Omnia core to the last backup version + created during upgrade, including configs and container state. + + To return to the previous Omnia version + {{("(version " ~ backup_version[0] ~ ")") if backup_version is defined and backup_version }} + captured in the backup, + run the rollback from the OIM host: + 1) If you are inside the Omnia core container, exit the container shell. 
+ 2) On the OIM host prompt, execute: ./omnia.sh --rollback + + - name: End play + ansible.builtin.meta: end_play From 3f516a3dd38d4923dd318e9600fb110f457700cf Mon Sep 17 00:00:00 2001 From: pullan1 Date: Fri, 13 Feb 2026 20:32:27 +0530 Subject: [PATCH 26/77] Fix for local repo is failing as cuda run package download issue Signed-off-by: pullan1 --- .../local_repo/parse_and_download.py | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/common/library/module_utils/local_repo/parse_and_download.py b/common/library/module_utils/local_repo/parse_and_download.py index 72efd4566b..c8b8278eef 100644 --- a/common/library/module_utils/local_repo/parse_and_download.py +++ b/common/library/module_utils/local_repo/parse_and_download.py @@ -64,6 +64,26 @@ def execute_command(cmd_string, logger, type_json=False): stderr=subprocess.PIPE, shell=True, ) + status["returncode"] = cmd.returncode + status["stdout"] = cmd.stdout.strip() if cmd.stdout else None + status["stderr"] = cmd.stderr.strip() if cmd.stderr else None + + if cmd.returncode != 0: + logger.error(f"Command failed with return code {cmd.returncode}") + logger.error(f"Error: {status['stderr']}") + return False + + if type_json: + if not status["stdout"]: + logger.error("Command succeeded but returned empty output when JSON was expected") + return False + try: + status["stdout"] = json.loads(status["stdout"]) + except json.JSONDecodeError as error: + logger.error(f"Failed to parse JSON output: {error}") + logger.error(f"Raw output was: {status['stdout']}") + return False + logger.info(f"Command succeeded: {cmd_string}") return True except subprocess.CalledProcessError as e: From d138e3a75271e9653c4827899f0bdade8f00cb1e Mon Sep 17 00:00:00 2001 From: mithileshreddy04 Date: Mon, 16 Feb 2026 12:19:33 +0530 Subject: [PATCH 27/77] Modification of Rollback guidance message --- upgrade/roles/upgrade_cluster/tasks/main.yml | 8 +++---- upgrade/rollback_omnia.yml | 25 ++++++++++---------- 2 files changed, 17 
insertions(+), 16 deletions(-) diff --git a/upgrade/roles/upgrade_cluster/tasks/main.yml b/upgrade/roles/upgrade_cluster/tasks/main.yml index 92aa87e2a3..1b70dc9561 100644 --- a/upgrade/roles/upgrade_cluster/tasks/main.yml +++ b/upgrade/roles/upgrade_cluster/tasks/main.yml @@ -22,7 +22,7 @@ Cluster reprovisioning is required after upgrade to enable new features. - Review and update new 2.1 input fields before reprovisioning: + Review and update new 2.1 input fields present at /opt/omnia/input/project_default/ directory before reprovisioning: 1. local_repo_config.yml @@ -42,11 +42,11 @@ - Populate location to point to your Slurm config bundle (local path or remote URL) - 4. NFS cleanup (if NFS share is used for k8s/slurm) + Do NFS cleanup (if NFS share is used for k8s/slurm) - - Clean stale mounts and ensure the NFS share is accessible before reprovision + - Clean stale mounts and ensure the NFS share is accessible before reprovision - - Remove any leftover cluster state on the NFS share that could conflict with fresh deployment + - Remove any leftover cluster state on the NFS share that could conflict with fresh deployment Run the following playbooks in sequence from the Omnia root directory to reprovision the cluster: diff --git a/upgrade/rollback_omnia.yml b/upgrade/rollback_omnia.yml index fc33ab4a2e..c0d5080c22 100644 --- a/upgrade/rollback_omnia.yml +++ b/upgrade/rollback_omnia.yml @@ -36,18 +36,19 @@ | default('previous version', true) }}" when: oim_metadata is defined and oim_metadata.upgrade_backup_dir is defined - - name: Display rollback guidance + - name: Display rollback guidance (green) ansible.builtin.debug: - msg: >- - The rollback function restores the Omnia core to the last backup version - created during upgrade, including configs and container state. 
- - To return to the previous Omnia version - {{("(version " ~ backup_version[0] ~ ")") if backup_version is defined and backup_version }} - captured in the backup, - run the rollback from the OIM host: - 1) If you are inside the Omnia core container, exit the container shell. - 2) On the OIM host prompt, execute: ./omnia.sh --rollback - + msg: + - "=================================" + - " OMNIA ROLLBACK" + - "=================================" + - "" + - "[Rollback Actions]" + - "1. Purpose: restore Omnia core to the last backup version (includes configs and container state)." + - "2. Target version: {{ backup_version | default('previous version from the backup location') }}." + - "3. How to run:" + - " - Exit the Omnia core container shell if you are inside it." + - " - From the OIM host prompt, execute: ./omnia.sh --rollback" + - "4. Note: ensure the backup location is accessible on the OIM host before running rollback." - name: End play ansible.builtin.meta: end_play From 3d5fa5b3f06c7dd41fbf8bf88c976eb25a0e348b Mon Sep 17 00:00:00 2001 From: mithileshreddy04 Date: Mon, 16 Feb 2026 13:10:44 +0530 Subject: [PATCH 28/77] Update main.yml --- upgrade/roles/upgrade_cluster/tasks/main.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/upgrade/roles/upgrade_cluster/tasks/main.yml b/upgrade/roles/upgrade_cluster/tasks/main.yml index 1b70dc9561..6165997a47 100644 --- a/upgrade/roles/upgrade_cluster/tasks/main.yml +++ b/upgrade/roles/upgrade_cluster/tasks/main.yml @@ -54,8 +54,15 @@ 1. ansible-playbook local_repo/local_repo.yml 2. ansible-playbook build_image_x86_64/build_image_x86_64.yml + + If the user is using aarch64 nodes, run the below playbook after build_image_x86_64: + + ansible-playbook build_image_aarch64/build_image_aarch64.yml 3. ansible-playbook discovery/discovery.yml + + Please follow the omnia documentation for steps in more detail. 
+ {{ '\x1b[0m' }}" seconds: 1 From 53a1d1c62f303c4615e4d34dc2fe02a013e7269a Mon Sep 17 00:00:00 2001 From: mithileshreddy04 Date: Mon, 16 Feb 2026 13:14:47 +0530 Subject: [PATCH 29/77] Update main.yml --- upgrade/roles/upgrade_cluster/tasks/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/upgrade/roles/upgrade_cluster/tasks/main.yml b/upgrade/roles/upgrade_cluster/tasks/main.yml index 6165997a47..751be68d73 100644 --- a/upgrade/roles/upgrade_cluster/tasks/main.yml +++ b/upgrade/roles/upgrade_cluster/tasks/main.yml @@ -61,7 +61,7 @@ 3. ansible-playbook discovery/discovery.yml - + Please follow the omnia documentation for steps in more detail. {{ '\x1b[0m' }}" From f370a252b786df319a1a8feeb4a7cec08a0511db Mon Sep 17 00:00:00 2001 From: mithileshreddy04 Date: Mon, 16 Feb 2026 13:19:58 +0530 Subject: [PATCH 30/77] Update main.yml --- upgrade/roles/upgrade_cluster/tasks/main.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/upgrade/roles/upgrade_cluster/tasks/main.yml b/upgrade/roles/upgrade_cluster/tasks/main.yml index 751be68d73..a45be3f885 100644 --- a/upgrade/roles/upgrade_cluster/tasks/main.yml +++ b/upgrade/roles/upgrade_cluster/tasks/main.yml @@ -56,12 +56,12 @@ 2. ansible-playbook build_image_x86_64/build_image_x86_64.yml If the user is using aarch64 nodes, run the below playbook after build_image_x86_64: - + ansible-playbook build_image_aarch64/build_image_aarch64.yml 3. ansible-playbook discovery/discovery.yml - + Please follow the omnia documentation for steps in more detail. 
{{ '\x1b[0m' }}" From de653020056aed145a14421592f2bdf676ed5cb8 Mon Sep 17 00:00:00 2001 From: mithileshreddy04 Date: Mon, 16 Feb 2026 13:23:25 +0530 Subject: [PATCH 31/77] Update main.yml --- upgrade/roles/upgrade_cluster/tasks/main.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/upgrade/roles/upgrade_cluster/tasks/main.yml b/upgrade/roles/upgrade_cluster/tasks/main.yml index a45be3f885..a64df8feff 100644 --- a/upgrade/roles/upgrade_cluster/tasks/main.yml +++ b/upgrade/roles/upgrade_cluster/tasks/main.yml @@ -54,14 +54,12 @@ 1. ansible-playbook local_repo/local_repo.yml 2. ansible-playbook build_image_x86_64/build_image_x86_64.yml - If the user is using aarch64 nodes, run the below playbook after build_image_x86_64: ansible-playbook build_image_aarch64/build_image_aarch64.yml 3. ansible-playbook discovery/discovery.yml - Please follow the omnia documentation for steps in more detail. {{ '\x1b[0m' }}" From 37358c9cc5e25e25cb8e86ce974d85a7e318f615 Mon Sep 17 00:00:00 2001 From: mithileshreddy04 Date: Mon, 16 Feb 2026 14:17:55 +0530 Subject: [PATCH 32/77] Update main.yml --- upgrade/roles/upgrade_cluster/tasks/main.yml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/upgrade/roles/upgrade_cluster/tasks/main.yml b/upgrade/roles/upgrade_cluster/tasks/main.yml index a64df8feff..76c90b21bd 100644 --- a/upgrade/roles/upgrade_cluster/tasks/main.yml +++ b/upgrade/roles/upgrade_cluster/tasks/main.yml @@ -54,11 +54,12 @@ 1. ansible-playbook local_repo/local_repo.yml 2. ansible-playbook build_image_x86_64/build_image_x86_64.yml - If the user is using aarch64 nodes, run the below playbook after build_image_x86_64: - ansible-playbook build_image_aarch64/build_image_aarch64.yml + 3. Only if the user is using aarch64 nodes, run the below playbook after build_image_x86_64: - 3. ansible-playbook discovery/discovery.yml + - ansible-playbook build_image_aarch64/build_image_aarch64.yml + + 4. 
ansible-playbook discovery/discovery.yml Please follow the omnia documentation for steps in more detail. From 1be86a2a250f6d9169fb46f30a1a0bd7ec338267 Mon Sep 17 00:00:00 2001 From: mithileshreddy04 Date: Mon, 16 Feb 2026 14:21:58 +0530 Subject: [PATCH 33/77] Update main.yml --- upgrade/roles/upgrade_cluster/tasks/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/upgrade/roles/upgrade_cluster/tasks/main.yml b/upgrade/roles/upgrade_cluster/tasks/main.yml index 76c90b21bd..90b25611b5 100644 --- a/upgrade/roles/upgrade_cluster/tasks/main.yml +++ b/upgrade/roles/upgrade_cluster/tasks/main.yml @@ -57,7 +57,7 @@ 3. Only if the user is using aarch64 nodes, run the below playbook after build_image_x86_64: - - ansible-playbook build_image_aarch64/build_image_aarch64.yml + ansible-playbook build_image_aarch64/build_image_aarch64.yml 4. ansible-playbook discovery/discovery.yml From f531576a0a3ff35bb969225716f15b73c1329ce7 Mon Sep 17 00:00:00 2001 From: mithileshreddy04 Date: Mon, 16 Feb 2026 14:27:10 +0530 Subject: [PATCH 34/77] Addition of user guidance messages for cluster reprovisioning and rollback after upgrade to 2.1 (#3978) * Added user guidance messages in rollback_omnia.yml and upgrade_cluster.yml * Modification of Rollback guidance message * Update main.yml * Update main.yml * Update main.yml * Update main.yml * Update main.yml * Update main.yml --- upgrade/roles/upgrade_cluster/tasks/main.yml | 55 ++++++++++++++++++-- upgrade/rollback_omnia.yml | 54 +++++++++++++++++++ 2 files changed, 106 insertions(+), 3 deletions(-) create mode 100644 upgrade/rollback_omnia.yml diff --git a/upgrade/roles/upgrade_cluster/tasks/main.yml b/upgrade/roles/upgrade_cluster/tasks/main.yml index 196366870b..90b25611b5 100644 --- a/upgrade/roles/upgrade_cluster/tasks/main.yml +++ b/upgrade/roles/upgrade_cluster/tasks/main.yml @@ -13,6 +13,55 @@ # limitations under the License. 
--- -- name: Include import input parameters - ansible.builtin.include_role: - name: import_input_parameters + +- name: Display cluster reprovision guidance + ansible.builtin.pause: + prompt: "{{ '\x1b[32m' }}=================================================== + CLUSTER REPROVISION REQUIRED + =========================================================== + + Cluster reprovisioning is required after upgrade to enable new features. + + Review and update new 2.1 input fields present at /opt/omnia/input/project_default/ directory before reprovisioning: + + 1. local_repo_config.yml + + - Set additional_repos_x86_64 (list of extra repo URLs or file paths for x86_64) + + - Set additional_repos_aarch64 (list of extra repo URLs or file paths for aarch64) + + 2. network_spec.yml (ib_network section) + + - Define InfiniBand fabric settings (subnet manager/BMC, IP ranges, VLAN if applicable) + + - Ensure host IB interfaces map to the IB network entries + + 3. omnia_config.yml (slurm_cluster.config_source) + + - Use the new structure: config_source: { type: , location: } + + - Populate location to point to your Slurm config bundle (local path or remote URL) + + Do NFS cleanup (if NFS share is used for k8s/slurm) + + - Clean stale mounts and ensure the NFS share is accessible before reprovision + + - Remove any leftover cluster state on the NFS share that could conflict with fresh deployment + + + Run the following playbooks in sequence from the Omnia root directory to reprovision the cluster: + + 1. ansible-playbook local_repo/local_repo.yml + + 2. ansible-playbook build_image_x86_64/build_image_x86_64.yml + + 3. Only if the user is using aarch64 nodes, run the below playbook after build_image_x86_64: + + ansible-playbook build_image_aarch64/build_image_aarch64.yml + + 4. ansible-playbook discovery/discovery.yml + + Please follow the omnia documentation for steps in more detail. 
+ + {{ '\x1b[0m' }}" + seconds: 1 diff --git a/upgrade/rollback_omnia.yml b/upgrade/rollback_omnia.yml new file mode 100644 index 0000000000..c0d5080c22 --- /dev/null +++ b/upgrade/rollback_omnia.yml @@ -0,0 +1,54 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +- name: Rollback Omnia guidance + hosts: localhost + connection: local + gather_facts: false + vars: + oim_metadata_path: "/opt/omnia/.data/oim_metadata.yml" + tasks: + - name: Read oim_metadata.yml for backup details + ansible.builtin.slurp: + src: "{{ oim_metadata_path }}" + register: oim_metadata_slurp + ignore_errors: true + + - name: Parse oim_metadata.yml + ansible.builtin.set_fact: + oim_metadata: "{{ oim_metadata_slurp.content | b64decode | from_yaml }}" + when: oim_metadata_slurp is defined and oim_metadata_slurp.content is defined + + - name: Derive backup_version from upgrade_backup_dir + ansible.builtin.set_fact: + backup_version: "{{ (oim_metadata.upgrade_backup_dir | regex_search('version_([^/]+)', '\\1')) + | default('previous version', true) }}" + when: oim_metadata is defined and oim_metadata.upgrade_backup_dir is defined + + - name: Display rollback guidance (green) + ansible.builtin.debug: + msg: + - "=================================" + - " OMNIA ROLLBACK" + - "=================================" + - "" + - "[Rollback Actions]" + - "1. 
Purpose: restore Omnia core to the last backup version (includes configs and container state)." + - "2. Target version: {{ backup_version | default('previous version from the backup location') }}." + - "3. How to run:" + - " - Exit the Omnia core container shell if you are inside it." + - " - From the OIM host prompt, execute: ./omnia.sh --rollback" + - "4. Note: ensure the backup location is accessible on the OIM host before running rollback." + - name: End play + ansible.builtin.meta: end_play From 8066a19d5542f2acaaf042e8dd5ccb92cdbb9b32 Mon Sep 17 00:00:00 2001 From: Vrinda_Marwah Date: Mon, 16 Feb 2026 12:04:27 +0000 Subject: [PATCH 35/77] fix status return in execute command Signed-off-by: Vrinda_Marwah --- common/library/module_utils/local_repo/parse_and_download.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/library/module_utils/local_repo/parse_and_download.py b/common/library/module_utils/local_repo/parse_and_download.py index c8b8278eef..15bed1efb3 100644 --- a/common/library/module_utils/local_repo/parse_and_download.py +++ b/common/library/module_utils/local_repo/parse_and_download.py @@ -85,7 +85,7 @@ def execute_command(cmd_string, logger, type_json=False): return False logger.info(f"Command succeeded: {cmd_string}") - return True + return status except subprocess.CalledProcessError as e: logger.error(f"Command failed: {cmd_string} - {e}") return False From f0928443075d08a01973bb8b6f3921d9b16c0ea4 Mon Sep 17 00:00:00 2001 From: Nethravathi M G <146437298+nethramg@users.noreply.github.com> Date: Mon, 16 Feb 2026 23:12:44 +0530 Subject: [PATCH 36/77] Initial iDRAC Telemetry Node addition and deletion changes (#3972) * Initial set of changes for iDRAC Telemetry add and remove node * Ansible link and pylint fixes * Ansible lint fixes * Updated Copyrights to 2026 * Addressed the comments --- .../modules/delete_idracips_from_mysqldb.py | 251 ++++++++++++++++++ .../modules/disable_idrac_telemetry.py | 184 +++++++++++++ 
.../initiate_telemetry_service_cluster.yml | 5 +- .../tasks/remove_deleted_nodes.yml | 101 +++++++ .../templates/telemetry_report.j2 | 18 ++ telemetry/roles/idrac_telemetry/vars/main.yml | 24 +- 6 files changed, 581 insertions(+), 2 deletions(-) create mode 100644 common/library/modules/delete_idracips_from_mysqldb.py create mode 100644 common/library/modules/disable_idrac_telemetry.py create mode 100644 telemetry/roles/idrac_telemetry/tasks/remove_deleted_nodes.yml diff --git a/common/library/modules/delete_idracips_from_mysqldb.py b/common/library/modules/delete_idracips_from_mysqldb.py new file mode 100644 index 0000000000..cd81b943e2 --- /dev/null +++ b/common/library/modules/delete_idracips_from_mysqldb.py @@ -0,0 +1,251 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#!/usr/bin/python +"""Module to delete iDRAC IPs from MySQL database. +This module connects to a Kubernetes pod running MySQL and deletes iDRAC IPs +that are not present in bmc_data.csv. 
It handles retries and delays for robustness.""" + +import time +from ansible.module_utils.basic import AnsibleModule +from kubernetes import client, config +from kubernetes.stream import stream +from kubernetes.config.config_exception import ConfigException + + +def load_kube_context(): + """Load Kubernetes configuration for accessing the cluster.""" + try: + config.load_kube_config() + except ConfigException: + config.load_incluster_config() + + +def run_mysql_query_in_pod(namespace, pod, container, mysql_user, mysql_password, query): + """Run a MySQL query in the specified pod. + + Args: + namespace: Kubernetes namespace + pod: Pod name + container: Container name + mysql_user: MySQL username + mysql_password: MySQL password + query: MySQL query to execute + + Returns: + dict: Result containing return code and output + """ + core_v1 = client.CoreV1Api() + mysql_command = [ + "mysql", + "-u", mysql_user, + "-N", "-B", + f"-p{mysql_password}", + "-e", query + ] + + try: + ws = stream( + core_v1.connect_get_namespaced_pod_exec, + name=pod, + namespace=namespace, + container=container, + command=mysql_command, + stderr=True, + stdin=False, + stdout=True, + tty=False, + _preload_content=False + ) + + stdout = "" + stderr = "" + + while ws.is_open(): + ws.update(timeout=1) + if ws.peek_stdout(): + stdout += ws.read_stdout() + if ws.peek_stderr(): + stderr += ws.read_stderr() + ws.close() + + rc = ws.returncode + + if rc != 0: + return { + "rc": rc, + "result": stderr.strip() if stderr else "Unknown error" + } + + query_result = [ + line.strip() for line in stdout.strip().splitlines() + if line.strip() and not line.strip().startswith("mysql:") + ] + + return { + "rc": rc, + "result": query_result + } + + except (ConfigException, OSError) as e: + return { + "rc": 1, + "result": str(e) + } + + +def delete_idrac_from_mysql( + namespace, + pod, + container, + mysqldb_name, + mysql_user, + mysql_password, + ip_to_delete, + retries=3, + delay=3 +): + """Delete a single iDRAC 
IP from MySQL database. + + Args: + namespace: Kubernetes namespace + pod: Pod name + container: Container name + mysqldb_name: MySQL database name + mysql_user: MySQL username + mysql_password: MySQL password + ip_to_delete: IP address to delete + retries: Number of retry attempts + delay: Delay between retries in seconds + + Returns: + dict: Result containing success status and message + """ + query = ( + f"DELETE FROM {mysqldb_name}.services " + f"WHERE ip = '{ip_to_delete}';" + ) + + for attempt in range(retries): + result = run_mysql_query_in_pod( + namespace=namespace, + pod=pod, + container=container, + mysql_user=mysql_user, + mysql_password=mysql_password, + query=query + ) + + if result.get("rc") == 0: + return { + "success": True, + "ip": ip_to_delete, + "msg": f"Successfully deleted iDRAC IP {ip_to_delete} from MySQL." + } + + if attempt < retries - 1: + time.sleep(delay) + + return { + "success": False, + "ip": ip_to_delete, + "msg": f"Failed to delete iDRAC IP {ip_to_delete} after {retries} attempts: {result.get('result')}" + } + + +def main(): + """Main function to execute the module logic.""" + module_args = { + "telemetry_namespace": {"type": "str", "required": True}, + "idrac_podnames": {"type": "list", "required": True}, + "mysqldb_k8s_name": {"type": "str", "required": True}, + "mysqldb_name": {"type": "str", "required": True}, + "mysqldb_user": {"type": "str", "required": True, "no_log": True}, + "mysqldb_password": {"type": "str", "required": True, "no_log": True}, + "ips_to_delete": {"type": "list", "required": True}, + "pod_to_db_idrac_ips": {"type": "dict", "required": True}, + "db_retries": {"type": "int", "default": 3}, + "db_delay": {"type": "int", "default": 3}, + } + + module = AnsibleModule(argument_spec=module_args, supports_check_mode=True) + + telemetry_namespace = module.params["telemetry_namespace"] + idrac_podnames = module.params["idrac_podnames"] + mysqldb_k8s_name = module.params["mysqldb_k8s_name"] + mysqldb_name = 
module.params["mysqldb_name"] + mysqldb_user = module.params["mysqldb_user"] + mysqldb_password = module.params["mysqldb_password"] + ips_to_delete = module.params["ips_to_delete"] + pod_to_db_idrac_ips = module.params["pod_to_db_idrac_ips"] + db_retries = module.params["db_retries"] + db_delay = module.params["db_delay"] + + load_kube_context() + + deleted_ips = [] + failed_ips = [] + changed = False + + try: + for pod in idrac_podnames: + pod_ips = pod_to_db_idrac_ips.get(pod, []) + ips_to_delete_from_pod = list(set(pod_ips) & set(ips_to_delete)) + + if not ips_to_delete_from_pod: + module.warn(f"No IPs to delete from pod {pod}. Skipping.") + continue + + module.warn(f"Deleting IPs from pod {pod}: {ips_to_delete_from_pod}") + + for ip in ips_to_delete_from_pod: + result = delete_idrac_from_mysql( + namespace=telemetry_namespace, + pod=pod, + container=mysqldb_k8s_name, + mysqldb_name=mysqldb_name, + mysql_user=mysqldb_user, + mysql_password=mysqldb_password, + ip_to_delete=ip, + retries=db_retries, + delay=db_delay + ) + + if result.get("success"): + deleted_ips.append(ip) + changed = True + else: + failed_ips.append({ + "pod": pod, + "ip": ip, + "msg": result.get("msg", "Unknown error") + }) + + module.exit_json( + changed=changed, + deleted_ips=deleted_ips, + failed_ips=failed_ips, + msg=f"Deleted {len(deleted_ips)} iDRAC IPs from MySQL database." + ) + + except (OSError, ValueError) as e: + module.fail_json( + msg=f"An error occurred while deleting iDRAC IPs from MySQL: {str(e)}", + deleted_ips=deleted_ips, + failed_ips=failed_ips + ) + + +if __name__ == "__main__": + main() diff --git a/common/library/modules/disable_idrac_telemetry.py b/common/library/modules/disable_idrac_telemetry.py new file mode 100644 index 0000000000..cb7b885e1e --- /dev/null +++ b/common/library/modules/disable_idrac_telemetry.py @@ -0,0 +1,184 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#!/usr/bin/python +"""Module to disable telemetry on iDRAC nodes via Redfish API. +This module connects to iDRAC nodes and disables telemetry collection +by sending PATCH requests to the Redfish API endpoint.""" + +import requests +import urllib3 +from ansible.module_utils.basic import AnsibleModule + +urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) + + +def disable_telemetry_on_idrac(idrac_ip, username, password, timeout=30): + """ + Disable telemetry on a single iDRAC node using Redfish API. 
+ + Args: + idrac_ip: IP address of the iDRAC + username: iDRAC username + password: iDRAC password + timeout: Request timeout in seconds + + Returns: + dict: Result containing success status and message + """ + url = ( + f"https://{idrac_ip}/redfish/v1/Managers/" + f"iDRAC.Embedded.1/Attributes" + ) + + # Try different telemetry property names in order of preference + telemetry_properties = [ + "Telemetry.1.EnableTelemetry", + "TelemetryService.1.EnableTelemetry", + "Telemetry.2.EnableTelemetry", + "Redfish.1.TelemetryServiceEnabled" + ] + + headers = { + "Content-Type": "application/json" + } + + for property_name in telemetry_properties: + payload = { + "Attributes": { + property_name: "Disabled" + } + } + + try: + response = requests.patch( + url, + json=payload, + headers=headers, + auth=(username, password), + verify=False, + timeout=timeout + ) + + if response.status_code in [200, 202, 204]: + return { + "success": True, + "ip": idrac_ip, + "status_code": response.status_code, + "msg": f"Successfully disabled telemetry on iDRAC {idrac_ip} using {property_name}" + } + elif response.status_code == 400: + # Property not supported, try next one + continue + else: + return { + "success": False, + "ip": idrac_ip, + "status_code": response.status_code, + "msg": ( + f"Failed to disable telemetry on iDRAC {idrac_ip}. 
" + f"Status: {response.status_code}, Response: {response.text}" + ) + } + + except requests.exceptions.Timeout: + return { + "success": False, + "ip": idrac_ip, + "msg": f"Timeout while connecting to iDRAC {idrac_ip}" + } + + except requests.exceptions.ConnectionError: + return { + "success": False, + "ip": idrac_ip, + "msg": f"Connection error while connecting to iDRAC {idrac_ip}" + } + + except (requests.exceptions.RequestException, OSError) as e: + return { + "success": False, + "ip": idrac_ip, + "msg": f"Error disabling telemetry on iDRAC {idrac_ip}: {str(e)}" + } + + # All properties failed + return { + "success": False, + "ip": idrac_ip, + "msg": ( + f"Failed to disable telemetry on iDRAC {idrac_ip}. " + f"None of the supported telemetry properties were found: {', '.join(telemetry_properties)}" + ) + } + + +def main(): + """Main function to execute the module logic.""" + module_args = { + "idrac_ips": {"type": "list", "required": True, "elements": "str"}, + "username": {"type": "str", "required": True, "no_log": True}, + "password": {"type": "str", "required": True, "no_log": True}, + "timeout": {"type": "int", "default": 30}, + } + + module = AnsibleModule( + argument_spec=module_args, + supports_check_mode=True + ) + + idrac_ips = module.params["idrac_ips"] + username = module.params["username"] + password = module.params["password"] + timeout = module.params["timeout"] + + disabled_ips = [] + failed_ips = [] + changed = False + + try: + for idrac_ip in idrac_ips: + result = disable_telemetry_on_idrac( + idrac_ip=idrac_ip, + username=username, + password=password, + timeout=timeout + ) + + if result.get("success"): + disabled_ips.append(idrac_ip) + changed = True + else: + failed_ips.append({ + "ip": idrac_ip, + "msg": result.get("msg", "Unknown error") + }) + + module.exit_json( + changed=changed, + disabled_ips=disabled_ips, + failed_ips=failed_ips, + msg=f"Disabled telemetry on {len(disabled_ips)} iDRAC nodes." 
+ ) + + except (requests.exceptions.RequestException, OSError) as e: + module.fail_json( + msg=f"An error occurred while disabling telemetry: {str(e)}", + disabled_ips=disabled_ips, + failed_ips=failed_ips + ) + + +if __name__ == "__main__": + main() diff --git a/telemetry/roles/idrac_telemetry/tasks/initiate_telemetry_service_cluster.yml b/telemetry/roles/idrac_telemetry/tasks/initiate_telemetry_service_cluster.yml index 8615897205..7078a2f056 100644 --- a/telemetry/roles/idrac_telemetry/tasks/initiate_telemetry_service_cluster.yml +++ b/telemetry/roles/idrac_telemetry/tasks/initiate_telemetry_service_cluster.yml @@ -1,4 +1,4 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -87,6 +87,9 @@ ansible.builtin.debug: msg: "Filtered BMC IPs: {{ filtered_bmc_ip_list }}" +- name: Remove deleted nodes from telemetry (nodes not in bmc_data.csv) + ansible.builtin.include_tasks: remove_deleted_nodes.yml + - name: Convert filtered_bmc_ip_list to a dictionary with bmc_ip ansible.builtin.set_fact: filtered_bmc_ip_dict_list: "{{ filtered_bmc_ip_list | map('community.general.dict_kv', 'bmc_ip') | list }}" diff --git a/telemetry/roles/idrac_telemetry/tasks/remove_deleted_nodes.yml b/telemetry/roles/idrac_telemetry/tasks/remove_deleted_nodes.yml new file mode 100644 index 0000000000..4c82abf9e1 --- /dev/null +++ b/telemetry/roles/idrac_telemetry/tasks/remove_deleted_nodes.yml @@ -0,0 +1,101 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +- name: Identify iDRAC IPs to remove (present in DB but not in bmc_data.csv) + ansible.builtin.set_fact: + ips_to_remove: "{{ db_idrac_ip_list | difference(bmc_ips) }}" + +- name: Show iDRAC IPs to be removed + ansible.builtin.debug: + msg: "iDRAC IPs to be removed: {{ ips_to_remove }}" + when: ips_to_remove | length > 0 + +- name: Skip removal if no IPs to remove + ansible.builtin.debug: + msg: "{{ no_idracips_to_remove_msg }}" + when: ips_to_remove | length == 0 + +- name: Disable telemetry on iDRAC nodes before removal + when: ips_to_remove | length > 0 + block: + - name: Disable telemetry service on iDRAC nodes + disable_idrac_telemetry: + idrac_ips: "{{ ips_to_remove }}" + username: "{{ hostvars['localhost']['bmc_username'] }}" + password: "{{ hostvars['localhost']['bmc_password'] }}" + timeout: "{{ redfish_timeout }}" + register: disable_telemetry_result + ignore_errors: true + + - name: Show successfully disabled telemetry IPs + ansible.builtin.debug: + msg: "Successfully disabled telemetry on: {{ disable_telemetry_result.disabled_ips | default([]) }}" + when: + - disable_telemetry_result.disabled_ips is defined + - disable_telemetry_result.disabled_ips | length > 0 + + - name: Show failed to disable telemetry IPs + ansible.builtin.debug: + msg: "Failed to disable telemetry on: {{ disable_telemetry_result.failed_ips | default([]) }}" + when: + - disable_telemetry_result.failed_ips is defined + - disable_telemetry_result.failed_ips | length > 0 + +- name: Remove iDRAC IPs from MySQL database + when: ips_to_remove | length > 0 + block: + - 
name: Delete iDRAC IPs from mysqldb + delete_idracips_from_mysqldb: + telemetry_namespace: "{{ telemetry_namespace }}" + idrac_podnames: "{{ idrac_podname_idracips.idrac_podname_ips.keys() | list }}" + mysqldb_k8s_name: "{{ mysqldb_k8s_name }}" + mysqldb_name: "{{ mysqldb_name }}" + mysqldb_user: "{{ hostvars['localhost']['mysqldb_user'] }}" + mysqldb_password: "{{ hostvars['localhost']['mysqldb_password'] }}" + ips_to_delete: "{{ ips_to_remove }}" + pod_to_db_idrac_ips: "{{ existing_pod_to_db_idrac_ips }}" + db_retries: "{{ db_retries }}" + db_delay: "{{ db_delay }}" + register: delete_idrac_result + rescue: + - name: Failed to delete iDRAC IPs from mysqldb + ansible.builtin.fail: + msg: "{{ mysqldb_delete_fail_msg }}" + +- name: Show deleted iDRAC IPs + ansible.builtin.debug: + msg: "Successfully deleted iDRAC IPs from mysqldb: {{ delete_idrac_result.deleted_ips | default([]) }}" + when: + - ips_to_remove | length > 0 + - delete_idrac_result.deleted_ips is defined + - delete_idrac_result.deleted_ips | length > 0 + +- name: Show failed to delete iDRAC IPs + ansible.builtin.debug: + msg: "Failed to delete iDRAC IPs from mysqldb: {{ delete_idrac_result.failed_ips | default([]) }}" + when: + - ips_to_remove | length > 0 + - delete_idrac_result.failed_ips is defined + - delete_idrac_result.failed_ips | length > 0 + +- name: Update telemetry report variables with deletion info + ansible.builtin.set_fact: + deleted_idrac_count: "{{ delete_idrac_result.deleted_ips | default([]) | length }}" + deleted_idrac_ips: "{{ delete_idrac_result.deleted_ips | default([]) }}" + failed_delete_count: "{{ delete_idrac_result.failed_ips | default([]) | length }}" + failed_delete_ips: "{{ delete_idrac_result.failed_ips | default([]) }}" + disabled_telemetry_count: "{{ disable_telemetry_result.disabled_ips | default([]) | length }}" + disabled_telemetry_ips: "{{ disable_telemetry_result.disabled_ips | default([]) }}" + when: ips_to_remove | length > 0 diff --git 
a/telemetry/roles/idrac_telemetry/templates/telemetry_report.j2 b/telemetry/roles/idrac_telemetry/templates/telemetry_report.j2 index 4d8554cab3..06bf230980 100644 --- a/telemetry/roles/idrac_telemetry/templates/telemetry_report.j2 +++ b/telemetry/roles/idrac_telemetry/templates/telemetry_report.j2 @@ -14,5 +14,23 @@ Telemetry not supported IPs List: - {{ item }} {% endfor %} +{% if deleted_idrac_count is defined and deleted_idrac_count | int > 0 %} +----- Node Deletion Report ----- + +Total IP count removed from telemetry (not in bmc_data.csv): {{ deleted_idrac_count | int }} +Removed IPs List: +{% for item in deleted_idrac_ips %} + - {{ item }} +{% endfor %} + +{% if disabled_telemetry_count is defined and disabled_telemetry_count | int > 0 %} +IPs with telemetry disabled via Redfish: {{ disabled_telemetry_count | int }} +Disabled telemetry IPs List: +{% for item in disabled_telemetry_ips %} + - {{ item }} +{% endfor %} +{% endif %} +{% endif %} + ===== Telemetry Report End ===== diff --git a/telemetry/roles/idrac_telemetry/vars/main.yml b/telemetry/roles/idrac_telemetry/vars/main.yml index d2696f4ac8..7fe6730789 100644 --- a/telemetry/roles/idrac_telemetry/vars/main.yml +++ b/telemetry/roles/idrac_telemetry/vars/main.yml @@ -1,4 +1,4 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -67,6 +67,13 @@ idrac_telemetry_statefulset_restart_failure_msg: | Failed to restart the {{ idrac_telemetry_k8s_name }} StatefulSet. Please check the logs using the command kubectl logs -n {{ telemetry_namespace }} {{ idrac_telemetry_k8s_name }}- and try again. +# Usage: remove_deleted_nodes.yml +redfish_timeout: 30 +mysqldb_delete_fail_msg: | + Failed to delete iDRAC IPs from the mysql database. 
+ This could be due to the tables in the mysqldb not being accessible at the moment. Please try running the playbook again after some time. +no_idracips_to_remove_msg: "No iDRAC IPs to remove. All DB entries are present in bmc_data.csv." + # Usage: create_telemetry_report.yml telemetry_report_path: "/opt/omnia/telemetry/idrac_telemetry_report.yml" telemetry_report_template: "telemetry_report.j2" @@ -75,6 +82,9 @@ telemetry_report: | IP count with Telemetry not supported: {{ failed_idrac_count | int + invalid_idrac_count | int }} IP count with Telemetry activated in current execution: {{ telemetry_idrac_count | int }} + {% if deleted_idrac_count is defined %} + IP count removed from telemetry (not in bmc_data.csv): {{ deleted_idrac_count | int }} + {% endif %} {% if (failed_idrac_count | int + invalid_idrac_count | int) > 0 %} Potential reasons for telemetry not being initiated include Redfish connectivity problems, timeout issues, @@ -105,3 +115,15 @@ telemetry_report: | - {{ item }} {% endfor %} {% endif %} + {% if deleted_idrac_ips is defined and deleted_idrac_ips | length > 0 %} + IPs removed from telemetry database (not present in bmc_data.csv): + {% for item in deleted_idrac_ips %} + - {{ item }} + {% endfor %} + {% endif %} + {% if disabled_telemetry_ips is defined and disabled_telemetry_ips | length > 0 %} + IPs with telemetry disabled via Redfish: + {% for item in disabled_telemetry_ips %} + - {{ item }} + {% endfor %} + {% endif %} From 128cac669d133c7c6eb1f52b37b1d201e1a3810a Mon Sep 17 00:00:00 2001 From: SOWJANYAJAGADISH123 Date: Tue, 17 Feb 2026 08:37:15 +0530 Subject: [PATCH 37/77] support multiple Omnia versions (2.1.0.0, 2.1.0.1) using a single core container tag (2.1) (#3983) --- omnia.sh | 782 +++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 622 insertions(+), 160 deletions(-) diff --git a/omnia.sh b/omnia.sh index b7a086545d..3b320b0bf6 100755 --- a/omnia.sh +++ b/omnia.sh @@ -52,11 +52,226 @@ is_local_ip() { fi } +# 
Version configuration variables +OMNIA_CORE_CONTAINER_TAG="2.1" # Default container tag +OMNIA_VERSION="" # Will be read from metadata +TARGET_OMNIA_VERSION="" # Target version for upgrade +TARGET_CONTAINER_TAG="" # Target container tag for upgrade + +# Centralized version list (in chronological order) +ALL_OMNIA_VERSIONS=("2.0.0.0" "2.1.0.0") + # Container-side paths (used inside podman exec commands) CONTAINER_INPUT_DIR="/opt/omnia/input" CONTAINER_BACKUPS_DIR="/opt/omnia/backups" CONTAINER_METADATA_FILE="/opt/omnia/.data/oim_metadata.yml" +# Function to get available upgrade versions (higher than current) +get_available_upgrade_versions() { + local current_version="$1" + local available_versions=() + local version_descriptions=() + + # Find versions higher than current + local found_current=false + for version in "${ALL_OMNIA_VERSIONS[@]}"; do + if [ "$version" = "$current_version" ]; then + found_current=true + continue + fi + + if [ "$found_current" = true ]; then + available_versions+=("$version") + + # Generate description based on upgrade type + local current_tag=$(get_container_tag_from_version "$current_version") + local target_tag=$(get_container_tag_from_version "$version") + + if [ "$current_tag" = "$target_tag" ]; then + version_descriptions+=("Patch upgrade to $version (container restart only)") + else + version_descriptions+=("Major upgrade to $version (container swap required)") + fi + fi + done + + # Return arrays + printf '%s\n' "${available_versions[@]}" + printf '%s\n' "${version_descriptions[@]}" +} + +# Function to get available rollback versions (lower than current) +get_available_rollback_versions() { + local current_version="$1" + local available_versions=() + + # Find versions lower than current + for version in "${ALL_OMNIA_VERSIONS[@]}"; do + if [ "$version" = "$current_version" ]; then + break + fi + available_versions+=("$version") + done + + # Return array (reverse order for rollback - newest first) + local reversed_versions=() + for 
((i=${#available_versions[@]}-1; i>=0; i--)); do + reversed_versions+=("${available_versions[$i]}") + done + + printf '%s\n' "${reversed_versions[@]}" +} + +# Function to perform same-tag rollback (container restart only) +rollback_same_tag() { + local target_version="$1" + local current_version="$2" + + echo "[INFO] [ROLLBACK] Phase: Same-Tag Rollback" + echo "[INFO] [ROLLBACK] Rolling back to $target_version within same container tag" + + # Verify container is running + if ! podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then + echo "[ERROR] [ROLLBACK] Container is not running for same-tag rollback" + return 1 + fi + + echo "[INFO] [ROLLBACK] Updating metadata to version $target_version" + + # Update version metadata + if ! podman exec -u root omnia_core bash -c " + set -e + if [ ! -f '$CONTAINER_METADATA_FILE' ]; then + echo '[ERROR] Metadata file not found inside container: $CONTAINER_METADATA_FILE' >&2 + exit 1 + fi + if grep -q '^omnia_version:' '$CONTAINER_METADATA_FILE'; then + sed -i 's/^omnia_version:.*/omnia_version: $target_version/' '$CONTAINER_METADATA_FILE' + else + echo 'omnia_version: $target_version' >> '$CONTAINER_METADATA_FILE' + fi + "; then + echo "[ERROR] [ROLLBACK] Failed to update metadata version" + echo "[ERROR] [ROLLBACK] Rollback failed: Could not update version metadata" + return 1 + fi + + echo "[INFO] [ROLLBACK] Restarting container to apply changes..." + + # Restart container to apply changes + if ! 
systemctl restart omnia_core.service; then + echo "[ERROR] [ROLLBACK] Failed to restart container service" + echo "[ERROR] [ROLLBACK] Rollback failed: Container restart failed" + return 1 + fi + + # Wait for container to be healthy after restart + echo "[INFO] [ROLLBACK] Waiting for container health check after restart (30s)" + local health_timeout=30 + local health_count=0 + + while [ $health_count -lt $health_timeout ]; do + if podman ps --format '{{.Names}} {{.Status}}' | grep -E "omnia_core.*Up" | grep -q "healthy\|Up"; then + echo "[INFO] [ROLLBACK] Container is healthy after restart" + break + fi + sleep 1 + health_count=$((health_count + 1)) + echo -n "." + done + + if [ $health_count -ge $health_timeout ]; then + echo "" + echo "[ERROR] [ROLLBACK] Container failed to become healthy within 30 seconds after restart" + echo "[ERROR] [ROLLBACK] Rollback failed: Container health check failed" + return 1 + fi + + # Verify version update + local updated_version=$(get_current_omnia_version) + if [ "$updated_version" != "$target_version" ]; then + echo "[ERROR] [ROLLBACK] Version update verification failed" + echo "[ERROR] [ROLLBACK] Expected: $target_version, Found: $updated_version" + return 1 + fi + + echo "[INFO] [ROLLBACK] Same-tag rollback completed successfully" + echo "[INFO] [ROLLBACK] Version rolled back to: $target_version" + return 0 +} + +# Function to validate container image availability and show build instructions +validate_container_image() { + local target_version="$1" + local target_container_tag="$2" + local operation="${3:-upgrade}" + + echo -e "${BLUE}Validating target container image: omnia_core:$target_container_tag${NC}" + if ! podman inspect "omnia_core:$target_container_tag" >/dev/null 2>&1; then + echo -e "${RED}ERROR: Target image missing locally: omnia_core:$target_container_tag${NC}" + echo -e "${YELLOW}Omnia does not pull images from Docker Hub. Build/load the image locally and retry.${NC}" + echo -e "1. 
Clone the Omnia Artifactory repository:" + echo -e " git clone https://github.com/dell/omnia-artifactory -b omnia-container-$target_version" + echo -e "2. Navigate to the repository directory:" + echo -e " cd omnia-artifactory" + echo -e "3. Build the core image locally (loads into local Podman by default):" + echo -e " ./build_images.sh core core_tag=$target_container_tag omnia_branch=$target_version" + echo -e "Then re-run:" + echo -e " ./omnia.sh --$operation" + return 1 + fi + + echo -e "${GREEN}✓ Target image available locally: omnia_core:$target_container_tag${NC}" + return 0 +} + +# Function to get container tag from omnia version +get_container_tag_from_version() { + local version="$1" + case "$version" in + 2.0.*) + echo "1.0" + ;; + *) + echo "$(echo "$version" | awk -F. '{print $1"."$2}')" + ;; + esac +} + +# Function to read current omnia version from metadata +get_current_omnia_version() { + if podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then + podman exec omnia_core cat /opt/omnia/.data/oim_metadata.yml 2>/dev/null | grep "omnia_version:" | awk '{print $2}' | tr -d '"' + else + echo "" + fi +} + +show_post_upgrade_instructions() { + local upgraded_version="$1" + + echo "" + echo -e "${YELLOW}================================================================================${NC}" + echo -e "${YELLOW} IMPORTANT POST-UPGRADE STEP${NC}" + echo -e "${YELLOW}================================================================================${NC}" + echo "" + echo -e "${GREEN}✓ Omnia core container has been successfully upgraded${NC}" + echo -e "${GREEN}✓ Version updated to: $upgraded_version${NC}" + echo "" + echo -e "${BLUE}NEXT REQUIRED ACTION:${NC}" + echo -e "${YELLOW}You must now run the upgrade playbook inside the omnia_core container:${NC}" + echo "" + echo -e "${GREEN}podman exec -it omnia_core ansible-playbook /omnia/upgrade/upgrade_omnia.yml${NC}" + echo "" + echo -e "${BLUE}This playbook will:${NC}" + echo -e "• Update input files" + echo 
-e "• Update internal configurations" + echo "" + echo -e "${YELLOW}Note: Run this command after the container is fully healthy and stable${NC}" + echo -e "${YELLOW}================================================================================${NC}" + echo "" +} + # Host-side paths (initialized dynamically after omnia_path is set) OMNIA_INPUT_DIR="" OMNIA_METADATA_DIR="" @@ -1004,29 +1219,9 @@ install_omnia_core() { local omnia_core_tag="2.1" local omnia_core_registry="" - # Check if local omnia_core:2.1 exists - if podman inspect omnia_core:${omnia_core_tag} >/dev/null 2>&1; then + # Check if local omnia_core image exists using validate function + if validate_container_image "" "$omnia_core_tag" "install"; then echo -e "${GREEN}✓ Omnia core image (omnia_core:${omnia_core_tag}) found locally.${NC}" - # Check if latest exists for backward compatibility - elif podman inspect omnia_core:latest >/dev/null 2>&1; then - echo -e "${GREEN}✓ Omnia core image (omnia_core:latest) found locally.${NC}" - # Tag it as 2.1 for consistency - podman tag omnia_core:latest omnia_core:${omnia_core_tag} - else - echo -e "${RED}ERROR: Omnia core image (omnia_core:${omnia_core_tag}) not found locally.${NC}" - echo -e "${YELLOW}Omnia no longer pulls images from Docker Hub. Build/load the image locally and retry.${NC}" - echo "" - echo -e "${YELLOW}One way to build the image locally:${NC}" - echo -e "1. Clone the Omnia Artifactory repository:" - echo -e " git clone https://github.com/dell/omnia-artifactory -b omnia-container-" - echo -e "2. Navigate to the repository directory:" - echo -e " cd omnia-artifactory" - echo -e "3. 
Build the core image locally (loads into local Podman by default):" - echo -e " ./build_images.sh core core_tag=2.1 omnia_branch=" - echo "" - echo -e "${YELLOW}Then re-run:${NC}" - echo -e " ./omnia.sh --install" - exit 1 fi # Check if any other containers with 'omnia' in their name are running @@ -1148,9 +1343,6 @@ install_omnia_core() { # If core container is not present else - - # Start the container setup - echo -e "${GREEN}Starting Omnia core container setup.${NC}" setup_omnia_core fi } @@ -1216,16 +1408,6 @@ phase1_validate() { return 1 fi - if [ "$previous_omnia_version" = "2.1.0.0" ]; then - echo "[ERROR] [ORCHESTRATOR] Upgrade already performed. Current Omnia version is 2.1.0.0. No further upgrade required." - return 1 - fi - - if [ "$previous_omnia_version" != "2.0.0.0" ]; then - echo "[ERROR] [ORCHESTRATOR] Previous Omnia version mismatch: expected 2.0.0.0, got: $previous_omnia_version" - return 1 - fi - shared_path=$(echo "$core_config" | grep "^oim_shared_path:" | cut -d':' -f2- | tr -d ' \t\n\r') if [ -z "$shared_path" ]; then echo "[ERROR] [ORCHESTRATOR] oim_shared_path not found in oim_metadata.yml" @@ -1244,28 +1426,6 @@ phase1_validate() { return 1 fi - current_image=$(podman inspect omnia_core --format '{{.ImageName}}' 2>/dev/null) - if [ -z "$current_image" ]; then - echo "[ERROR] [ORCHESTRATOR] Unable to inspect omnia_core container image" - return 1 - fi - - if ! podman inspect "omnia_core:2.1" >/dev/null 2>&1; then - echo "[ERROR] [ORCHESTRATOR] Target image missing locally: omnia_core:2.1" - echo "" - echo -e "${YELLOW}Omnia does not pull images from Docker Hub. Build/load the image locally and retry.${NC}" - echo "" - echo -e "${YELLOW}To build the core image locally:${NC}" - echo -e "1. Clone the Omnia Artifactory repository:" - echo -e " git clone https://github.com/dell/omnia-artifactory -b omnia-container-" - echo -e "2. Navigate to the repository directory:" - echo -e " cd omnia-artifactory" - echo -e "3. 
Build the core image locally (loads into local Podman by default):" - echo -e " ./build_images.sh core core_tag=2.1 omnia_branch=" - echo "" - return 1 - fi - echo "[INFO] [ORCHESTRATOR] Phase 1: Validation passed" return 0 } @@ -1277,13 +1437,18 @@ phase2_approval() { echo "============================================" echo "OMNIA UPGRADE SUMMARY" echo "============================================" - echo "Current Container Tag: 1.0" - echo "Target Container Tag: 2.1" - echo "Current Omnia Release: 2.0.0.0" - echo "Target Omnia Release: 2.1.0.0" - echo "New Features:" - echo " - Add and remove node for slurm cluster" - echo " - Additional Package Installation" + echo "Current Container Tag: $OMNIA_CORE_CONTAINER_TAG" + echo "Target Container Tag: $TARGET_CONTAINER_TAG" + echo "Current Omnia Release: $OMNIA_VERSION" + echo "Target Omnia Release: $TARGET_OMNIA_VERSION" + + # Show upgrade type + if [ "$OMNIA_CORE_CONTAINER_TAG" = "$TARGET_CONTAINER_TAG" ]; then + echo "Upgrade Type: Same-tag upgrade (container restart)" + else + echo "Upgrade Type: Cross-tag upgrade (container swap)" + fi + echo "============================================" current_omnia_version=$(podman exec -u root omnia_core /bin/bash -c "grep '^omnia_version:' '$CONTAINER_METADATA_FILE' | cut -d':' -f2 | tr -d ' \t\n\r'" 2>/dev/null) @@ -1367,6 +1532,85 @@ phase3_backup_creation() { return 0 } +phase4_same_tag_upgrade() { + local target_version="$1" + + echo "[INFO] [ORCHESTRATOR] Phase 4: Same-Tag Upgrade" + echo "[INFO] [ORCHESTRATOR] Upgrading to $target_version within same container tag" + + # Verify container is running + if ! podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then + echo "[ERROR] [ORCHESTRATOR] Container is not running for same-tag upgrade" + return 1 + fi + + echo "[INFO] [ORCHESTRATOR] Updating metadata to version $target_version" + + # Update version metadata + if ! podman exec -u root omnia_core bash -c " + set -e + if [ ! 
-f '$CONTAINER_METADATA_FILE' ]; then + echo '[ERROR] Metadata file not found inside container: $CONTAINER_METADATA_FILE' >&2 + exit 1 + fi + if grep -q '^omnia_version:' '$CONTAINER_METADATA_FILE'; then + sed -i 's/^omnia_version:.*/omnia_version: $target_version/' '$CONTAINER_METADATA_FILE' + else + echo 'omnia_version: $target_version' >> '$CONTAINER_METADATA_FILE' + fi + "; then + echo "[ERROR] [ORCHESTRATOR] Failed to update metadata version" + echo "[ERROR] [ORCHESTRATOR] Upgrade failed: Could not update version metadata" + return 1 + fi + + echo "[INFO] [ORCHESTRATOR] Restarting container to apply changes..." + + # Restart container to apply changes + if ! systemctl restart omnia_core.service; then + echo "[ERROR] [ORCHESTRATOR] Failed to restart container service" + echo "[ERROR] [ORCHESTRATOR] Upgrade failed: Container restart failed" + return 1 + fi + + # Wait for container to be healthy after restart + echo "[INFO] [ORCHESTRATOR] Waiting for container health check after restart (30s)" + local health_timeout=30 + local health_count=0 + + while [ $health_count -lt $health_timeout ]; do + if podman ps --format '{{.Names}} {{.Status}}' | grep -E "omnia_core.*Up" | grep -q "healthy\|Up"; then + echo "[INFO] [ORCHESTRATOR] Container is healthy after restart" + break + fi + sleep 1 + health_count=$((health_count + 1)) + echo -n "." 
+ done + + if [ $health_count -ge $health_timeout ]; then + echo "" + echo "[ERROR] [ORCHESTRATOR] Container failed to become healthy within 30 seconds after restart" + echo "[ERROR] [ORCHESTRATOR] Upgrade failed: Container health check failed" + return 1 + fi + + # Verify version update + local updated_version=$(get_current_omnia_version) + if [ "$updated_version" != "$target_version" ]; then + echo "[ERROR] [ORCHESTRATOR] Version update verification failed" + echo "[ERROR] [ORCHESTRATOR] Expected: $target_version, Found: $updated_version" + return 1 + fi + + echo "[INFO] [ORCHESTRATOR] Same-tag upgrade completed successfully" + echo "[INFO] [ORCHESTRATOR] Version updated to: $target_version" + + show_post_upgrade_instructions "$target_version" + + return 0 +} + phase4_container_swap() { local quadlet_file="/etc/containers/systemd/omnia_core.container" local i @@ -1376,12 +1620,12 @@ phase4_container_swap() { if [ ! -f "$quadlet_file" ]; then echo "[ERROR] [ORCHESTRATOR] Phase 4.3 failed: Quadlet file not found: $quadlet_file" echo "[ERROR] [ORCHESTRATOR] Upgrade failed: Quadlet configuration file missing" - echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore 1.0 container..." + echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore container..." rollback_omnia_core return 1 fi - echo "[INFO] [ORCHESTRATOR] Stopping omnia_core 1.0 container" + echo "[INFO] [ORCHESTRATOR] Stopping omnia_core $OMNIA_CORE_CONTAINER_TAG container" systemctl stop omnia_core.service >/dev/null 2>&1 || true if podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then @@ -1391,25 +1635,25 @@ phase4_container_swap() { if podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then echo "[ERROR] [ORCHESTRATOR] Failed to stop omnia_core container" - echo "[ERROR] [ORCHESTRATOR] Upgrade failed: Could not stop 1.0 container" - echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore 1.0 container..." 
+ echo "[ERROR] [ORCHESTRATOR] Upgrade failed: Could not stop $OMNIA_CORE_CONTAINER_TAG container" + echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore container..." rollback_omnia_core return 1 fi - echo "[INFO] [ORCHESTRATOR] Starting omnia_core 2.1 Quadlet unit" - if ! podman inspect "omnia_core:2.1" >/dev/null 2>&1; then - echo "[ERROR] [ORCHESTRATOR] Target image missing locally: omnia_core:2.1" - echo "[ERROR] [ORCHESTRATOR] Upgrade failed: 2.1 image not available" - echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore 1.0 container..." + echo "[INFO] [ORCHESTRATOR] Starting omnia_core $TARGET_CONTAINER_TAG Quadlet unit" + if ! podman inspect "omnia_core:$TARGET_CONTAINER_TAG" >/dev/null 2>&1; then + echo "[ERROR] [ORCHESTRATOR] Target image missing locally: omnia_core:$TARGET_CONTAINER_TAG" + echo "[ERROR] [ORCHESTRATOR] Upgrade failed: $TARGET_CONTAINER_TAG image not available" + echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore container..." rollback_omnia_core return 1 fi - if ! sed -i 's/^Image=omnia_core:.*/Image=omnia_core:2.1/' "$quadlet_file"; then - echo "[ERROR] [ORCHESTRATOR] Phase 4.3 failed: Failed to update Image to 2.1 in quadlet file" + if ! sed -i "s/^Image=omnia_core:.*/Image=omnia_core:$TARGET_CONTAINER_TAG/" "$quadlet_file"; then + echo "[ERROR] [ORCHESTRATOR] Phase 4.3 failed: Failed to update Image to $TARGET_CONTAINER_TAG in quadlet file" echo "[ERROR] [ORCHESTRATOR] Upgrade failed: Could not update container image tag" - echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore 1.0 container..." + echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore container..." rollback_omnia_core return 1 fi @@ -1417,20 +1661,20 @@ phase4_container_swap() { systemctl daemon-reload || { echo "[ERROR] [ORCHESTRATOR] Phase 4.3 failed: systemctl daemon-reload failed" echo "[ERROR] [ORCHESTRATOR] Upgrade failed: System daemon reload failed" - echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore 1.0 container..." 
+ echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore container..." rollback_omnia_core return 1 } systemctl start omnia_core.service || { echo "[ERROR] [ORCHESTRATOR] Phase 4.3 failed: Failed to start omnia_core.service" - echo "[ERROR] [ORCHESTRATOR] Upgrade failed: Could not start 2.1 container" - echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore 1.0 container..." + echo "[ERROR] [ORCHESTRATOR] Upgrade failed: Could not start $TARGET_CONTAINER_TAG container" + echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore container..." rollback_omnia_core return 1 } - echo "[INFO] [ORCHESTRATOR] Waiting for omnia_core 2.1 health check (60s)" + echo "[INFO] [ORCHESTRATOR] Waiting for omnia_core $TARGET_CONTAINER_TAG health check (60s)" for i in $(seq 1 60); do if podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then break @@ -1440,13 +1684,13 @@ phase4_container_swap() { if ! podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then echo "[ERROR] [ORCHESTRATOR] Phase 4.4 failed: Container failed health check after swap" - echo "[ERROR] [ORCHESTRATOR] Upgrade failed: 2.1 container failed health check" - echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore 1.0 container..." + echo "[ERROR] [ORCHESTRATOR] Upgrade failed: $TARGET_CONTAINER_TAG container failed health check" + echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore container..." rollback_omnia_core return 1 fi - echo "[INFO] [ORCHESTRATOR] Updating metadata omnia_version to 2.1.0.0" + echo "[INFO] [ORCHESTRATOR] Updating metadata omnia_version to $TARGET_OMNIA_VERSION" if ! podman exec -u root omnia_core bash -c " set -e if [ ! 
-f '$CONTAINER_METADATA_FILE' ]; then @@ -1454,14 +1698,14 @@ phase4_container_swap() { exit 1 fi if grep -q '^omnia_version:' '$CONTAINER_METADATA_FILE'; then - sed -i 's/^omnia_version:.*/omnia_version: 2.1.0.0/' '$CONTAINER_METADATA_FILE' + sed -i 's/^omnia_version:.*/omnia_version: $TARGET_OMNIA_VERSION/' '$CONTAINER_METADATA_FILE' else - echo 'omnia_version: 2.1.0.0' >> '$CONTAINER_METADATA_FILE' + echo 'omnia_version: $TARGET_OMNIA_VERSION' >> '$CONTAINER_METADATA_FILE' fi "; then echo "[ERROR] [ORCHESTRATOR] Phase 4.5 failed: Failed to update metadata version" echo "[ERROR] [ORCHESTRATOR] Upgrade failed: Could not update version metadata" - echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore 1.0 container..." + echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore container..." rollback_omnia_core return 1 fi @@ -1471,21 +1715,129 @@ phase4_container_swap() { } upgrade_omnia_core() { - local lock_file="/var/lock/omnia_core_upgrade.lock" - local backup_base - - if [ -e "$lock_file" ]; then - echo -e "${RED}ERROR: Upgrade lock exists at $lock_file. 
Another upgrade may be running.${NC}" + echo -e "${BLUE}=================== Omnia Core Upgrade ====================${NC}" + echo -e "${BLUE}This script will upgrade Omnia core container.${NC}" + echo -e "${BLUE}Current version will be backed up and upgraded to target version.${NC}" + echo -e "${BLUE}=============================================================${NC}" + + # Read current version + OMNIA_VERSION=$(get_current_omnia_version) + if [ -z "$OMNIA_VERSION" ]; then + echo -e "${RED}ERROR: Could not determine current Omnia version${NC}" + echo -e "${YELLOW}Please ensure omnia_core container is running and metadata is accessible${NC}" exit 1 fi - - mkdir -p "$(dirname "$lock_file")" 2>/dev/null || true - echo "$$" > "$lock_file" || { - echo -e "${RED}ERROR: Failed to create lock file: $lock_file${NC}" + + # Get current container tag + OMNIA_CORE_CONTAINER_TAG=$(get_container_tag_from_version "$OMNIA_VERSION") + + echo -e "${GREEN}Current Omnia version: $OMNIA_VERSION${NC}" + echo -e "${GREEN}Current container tag: $OMNIA_CORE_CONTAINER_TAG${NC}" + + # Show available upgrade options + echo "" + echo "Available upgrade options:" + echo "=========================" + + # Get available upgrade versions dynamically + local upgrade_output + upgrade_output=$(get_available_upgrade_versions "$OMNIA_VERSION") + + # Parse output into versions and descriptions + local available_versions=() + local version_descriptions=() + local line_count=0 + local total_lines + + # Count total lines + total_lines=$(echo "$upgrade_output" | wc -l) + + # Split into versions and descriptions (first half = versions, second half = descriptions) + local mid_line=$((total_lines / 2)) + local line_num=0 + + while IFS= read -r line; do + line_num=$((line_num + 1)) + if [ $line_num -le $mid_line ]; then + available_versions+=("$line") + else + version_descriptions+=("$line") + fi + done <<< "$upgrade_output" + + # Check if any upgrade options are available + if [ ${#available_versions[@]} -eq 0 ]; 
then + echo -e "${GREEN}Already at latest version $OMNIA_VERSION${NC}" + echo "No upgrade options available." + exit 0 + fi + + # Display upgrade options + for i in "${!available_versions[@]}"; do + local target_version="${available_versions[$i]}" + local target_container_tag=$(get_container_tag_from_version "$target_version") + + # Check if target image exists locally + local image_status="✓ Available" + if ! podman inspect "omnia_core:$target_container_tag" >/dev/null 2>&1; then + image_status="✗ Missing (build required)" + fi + + echo "$((i+1)). Upgrade to $target_version (container tag: $target_container_tag) [$image_status]" + done + + # Prompt user to select upgrade version + echo -n "Select upgrade option (1-${#available_versions[@]}) or press Enter to cancel: " + read -r selection + + # Validate selection + if [ -z "$selection" ]; then + echo "Upgrade cancelled by user." + exit 0 + fi + + if ! [[ "$selection" =~ ^[0-9]+$ ]] || [ "$selection" -lt 1 ] || [ "$selection" -gt ${#available_versions[@]} ]; then + echo -e "${RED}ERROR: Invalid selection.${NC}" exit 1 - } + fi + + # Set target version based on user selection + TARGET_OMNIA_VERSION="${available_versions[$((selection-1))]}" + TARGET_CONTAINER_TAG=$(get_container_tag_from_version "$TARGET_OMNIA_VERSION") + + # Pre-validation: Check if target container image exists locally + if ! 
validate_container_image "$TARGET_OMNIA_VERSION" "$TARGET_CONTAINER_TAG" "upgrade"; then + exit 1 + fi + + echo -e "${GREEN}Target Omnia version: $TARGET_OMNIA_VERSION${NC}" + echo -e "${GREEN}Target container tag: $TARGET_CONTAINER_TAG${NC}" + + # Check if container tag change is needed + if [ "$OMNIA_CORE_CONTAINER_TAG" = "$TARGET_CONTAINER_TAG" ]; then + echo -e "${BLUE}Upgrade within same container tag ($TARGET_CONTAINER_TAG)${NC}" + echo -e "${BLUE}Will restart container instead of swapping${NC}" + SAME_TAG_UPGRADE=true + else + echo -e "${BLUE}Container tag change required ($OMNIA_CORE_CONTAINER_TAG -> $TARGET_CONTAINER_TAG)${NC}" + echo -e "${BLUE}Will perform full container swap${NC}" + SAME_TAG_UPGRADE=false + fi + + # Pre-validation: Check if target container image exists locally + if ! validate_container_image "$TARGET_OMNIA_VERSION" "$TARGET_CONTAINER_TAG" "upgrade"; then + exit 1 + fi + local lock_file="/tmp/omnia_upgrade.lock" + if [ -f "$lock_file" ]; then + echo -e "${RED}ERROR: Another upgrade process is already running${NC}" + echo -e "${YELLOW}If this is incorrect, remove the lock file: rm -f $lock_file${NC}" + exit 1 + fi + touch "$lock_file" trap 'rm -f "$lock_file"' EXIT + # Run upgrade phases if ! phase1_validate; then echo "[ERROR] [ORCHESTRATOR] Upgrade failed in Phase 1" exit 1 @@ -1495,7 +1847,7 @@ upgrade_omnia_core() { exit 0 fi - backup_base="$OMNIA_UPGRADE_BACKUP_PATH" + local backup_base="$OMNIA_UPGRADE_BACKUP_PATH" if [ -z "$backup_base" ]; then echo "[ERROR] [ORCHESTRATOR] Backup path is empty" exit 1 @@ -1506,13 +1858,26 @@ upgrade_omnia_core() { exit 1 fi - if ! phase4_container_swap; then - echo "[ERROR] [ORCHESTRATOR] Upgrade failed in Phase 4" - exit 1 + # Choose upgrade path based on container tag + if [ "$SAME_TAG_UPGRADE" = "true" ]; then + if ! phase4_same_tag_upgrade "$TARGET_OMNIA_VERSION"; then + echo "[ERROR] [ORCHESTRATOR] Upgrade failed in same-tag upgrade" + exit 1 + fi + else + if ! 
phase4_container_swap; then + echo "[ERROR] [ORCHESTRATOR] Upgrade failed in Phase 4" + exit 1 + fi fi echo "[INFO] [ORCHESTRATOR] Upgrade completed successfully" echo "[INFO] [ORCHESTRATOR] Backup location (inside omnia_core container): $backup_base" + + show_post_upgrade_instructions "$TARGET_OMNIA_VERSION" + + # Initialize SSH config and start container session + init_ssh_config start_container_session exit 0 } @@ -1622,16 +1987,31 @@ restore_from_backup() { display_cleanup_instructions() { echo "" echo -e "${RED}================================================================================${NC}" - echo -e "${RED} ROLLBACK FAILED${NC}" + echo -e "${RED} UPGRADE/ROLLBACK FAILED${NC}" echo -e "${RED}================================================================================${NC}" echo "" - echo -e "${YELLOW}Rollback failed. Manual cleanup is required to restore a clean state before retrying.${NC}" + echo -e "${YELLOW}Operation failed. Manual cleanup is required to restore a clean state before retrying.${NC}" + echo "" + echo -e "${BLUE}Choose the appropriate cleanup scenario:${NC}" + echo "" + echo -e "${GREEN}CASE 1: If you can log into omnia_core container:${NC}" + echo -e "${YELLOW}1. Enter omnia_core container: podman exec -it omnia_core bash${NC}" + echo -e "${YELLOW}2. Run oim cleanup: ansible-playbook /omnia/oim_cleanup.yml${NC}" + echo -e "${YELLOW}3. Run uninstall inside container: ./omnia.sh --uninstall${NC}" + echo -e "${YELLOW}4. Exit container: exit${NC}" + echo -e "${YELLOW}5. Clean shared path: rm -rf ${NC}" + echo -e "${YELLOW}6. Install required version: ./omnia.sh --install${NC}" echo "" - echo -e "${YELLOW}Run the following on the OIM host:${NC}" - echo -e "${YELLOW}1. Clean Omnia shared path: rm -rf ${NC}" - echo -e "${YELLOW}2. Stop Omnia core system service: systemctl stop omnia_core${NC}" - echo -e "${YELLOW}3. Remove the Omnia core container: podman rm -f omnia_core${NC}" - echo -e "${YELLOW}4. 
Perform a fresh Omnia core install: ./omnia.sh --install${NC}" + echo -e "${GREEN}CASE 2: If you cannot log into omnia_core container (but other containers are running):${NC}" + echo -e "${YELLOW}1. Remove all container definitions: cd /etc/containers/systemd${NC}" + echo -e "${YELLOW}2. Delete all container files: rm -rf *${NC}" + echo -e "${YELLOW}3. Reload systemd daemon: systemctl daemon-reload${NC}" + echo -e "${YELLOW}4. Stop all containers: podman stop \$(podman ps -aq)${NC}" + echo -e "${YELLOW}5. Remove all containers: podman rm -f \$(podman ps -aq)${NC}" + echo -e "${YELLOW}6. Clean shared path: rm -rf ${NC}" + echo -e "${YELLOW}7. Install required version: ./omnia.sh --install${NC}" + echo "" + echo -e "${BLUE}Note: Replace with your actual Omnia shared path.${NC}" echo "" } @@ -1652,6 +2032,27 @@ rollback_omnia_core() { exit 1 fi + # Create lock file to prevent concurrent rollbacks + local lock_file="/tmp/omnia_rollback.lock" + if [ -f "$lock_file" ]; then + local existing_pid + existing_pid=$(cat "$lock_file" 2>/dev/null | tr -d ' \t\n\r') + + if [ -n "$existing_pid" ] && kill -0 "$existing_pid" >/dev/null 2>&1; then + echo -e "${RED}ERROR: Another rollback process is already running (PID: $existing_pid)${NC}" + echo -e "${YELLOW}If this is incorrect, remove the lock file: rm -f $lock_file${NC}" + exit 1 + fi + + if [ -n "$existing_pid" ]; then + echo -e "${YELLOW}[WARN] Stale rollback lock file found (PID: $existing_pid); removing: $lock_file${NC}" + fi + rm -f "$lock_file" >/dev/null 2>&1 || true + fi + + echo "$$" > "$lock_file" + trap 'rm -f "$lock_file"' EXIT INT TERM + # Get current version if ! 
podman exec -u root omnia_core test -f "/opt/omnia/.data/oim_metadata.yml"; then echo -e "${RED}ERROR: Metadata file not found: /opt/omnia/.data/oim_metadata.yml${NC}" @@ -1659,48 +2060,56 @@ rollback_omnia_core() { fi local current_version=$(podman exec -u root omnia_core grep '^omnia_version:' /opt/omnia/.data/oim_metadata.yml 2>/dev/null | cut -d':' -f2 | tr -d ' \t\n\r') - if [ "$current_version" != "2.1.0.0" ]; then - echo -e "${RED}ERROR: Cannot rollback from version $current_version. Rollback is only supported from version 2.1.0.0.${NC}" - exit 1 - fi - # List available backups - echo "[INFO] [ROLLBACK] Scanning for available backups..." - local backup_dirs=() + # Get available rollback versions dynamically + local rollback_versions + rollback_versions=$(get_available_rollback_versions "$current_version") + + # Convert to array + local available_versions=() while IFS= read -r line; do - backup_dirs+=("$line") - done < <(podman exec -u root omnia_core find /opt/omnia/backups/upgrade -maxdepth 1 -type d -name "version_*" 2>/dev/null | sort -r) + available_versions+=("$line") + done <<< "$rollback_versions" - if [ ${#backup_dirs[@]} -eq 0 ]; then - echo -e "${RED}ERROR: No backup directories found.${NC}" + # Check if any rollback options are available + if [ ${#available_versions[@]} -eq 0 ]; then + echo -e "${RED}ERROR: No rollback versions available from $current_version.${NC}" exit 1 fi echo "" - echo "Available backup versions:" - for i in "${!backup_dirs[@]}"; do - local version=$(basename "${backup_dirs[$i]}" | sed 's/version_//') - local backup_date=$(podman exec -u root omnia_core stat -c '%y' "${backup_dirs[$i]}" 2>/dev/null | cut -d' ' -f1,2 | cut -d'.' -f1) - echo " $((i+1)). 
Version $version (created: $backup_date)" + echo "Available rollback versions:" + echo "===========================" + for i in "${!available_versions[@]}"; do + local version="${available_versions[$i]}" + local container_tag=$(get_container_tag_from_version "$version") + + # Check if target image exists locally + local image_status="✓ Available" + if ! podman inspect "omnia_core:$container_tag" >/dev/null 2>&1; then + image_status="✗ Missing (build required)" + fi + + echo " $((i+1)). Rollback to version $version (container tag: $container_tag) [$image_status]" done - # Prompt for backup selection + # Prompt for rollback selection echo "" - echo -n "Select backup to restore from (1-${#backup_dirs[@]}): " + echo -n "Select rollback version (1-${#available_versions[@]}): " read -r selection # Validate selection - if ! [[ "$selection" =~ ^[0-9]+$ ]] || [ "$selection" -lt 1 ] || [ "$selection" -gt ${#backup_dirs[@]} ]; then + if ! [[ "$selection" =~ ^[0-9]+$ ]] || [ "$selection" -lt 1 ] || [ "$selection" -gt ${#available_versions[@]} ]; then echo -e "${RED}ERROR: Invalid selection.${NC}" exit 1 fi - local selected_backup="${backup_dirs[$((selection-1))]}" - local backup_version=$(basename "$selected_backup" | sed 's/version_//') + local selected_version="${available_versions[$((selection-1))]}" + local selected_container_tag=$(get_container_tag_from_version "$selected_version") echo "" - echo "Selected backup: Version $backup_version" - echo -n "Are you sure you want to rollback to version $backup_version? [y/N]: " + echo "Selected rollback: Version $selected_version" + echo -n "Are you sure you want to rollback to version $selected_version? [y/N]: " read -r confirm if [[ ! "$confirm" =~ ^[yY] ]]; then @@ -1708,50 +2117,99 @@ rollback_omnia_core() { exit 0 fi - # Validate selected backup - only check if directory exists without podman exec - if ! 
podman exec -u root omnia_core test -d "$selected_backup" 2>/dev/null; then - # Try to check on host if container check fails - # Get shared path from metadata to check on host - local shared_path=$(podman exec -u root omnia_core grep '^oim_shared_path:' /opt/omnia/.data/oim_metadata.yml 2>/dev/null | cut -d':' -f2- | tr -d ' \t\n\r') - local host_backup_path="${selected_backup#/opt/omnia}" - if [ -z "$shared_path" ] || [ ! -d "$shared_path$host_backup_path" ]; then - echo -e "${RED}ERROR: Backup directory does not exist: $selected_backup${NC}" + # Pre-validation: Check if target container image exists locally + if ! validate_container_image "$selected_version" "$selected_container_tag" "rollback"; then + exit 1 + fi + + # Check if container tag change is needed + local current_container_tag=$(get_container_tag_from_version "$current_version") + if [ "$current_container_tag" = "$selected_container_tag" ]; then + echo -e "${BLUE}Rollback within same container tag ($selected_container_tag)${NC}" + echo -e "${BLUE}Will restart container instead of swapping${NC}" + + # Perform same-tag rollback (container restart only) + if ! rollback_same_tag "$selected_version" "$current_version"; then + echo "[ERROR] [ROLLBACK] Rollback failed in same-tag rollback" exit 1 fi + + echo "[INFO] [ROLLBACK] Rollback completed successfully" + echo "[INFO] [ROLLBACK] Version rolled back to: $selected_version" + exit 0 + else + echo -e "${BLUE}Container tag change required ($current_container_tag -> $selected_container_tag)${NC}" + echo -e "${BLUE}Will perform full container swap${NC}" + # Continue with existing container swap logic + fi + + # List available backups for selected version + echo "[INFO] [ROLLBACK] Scanning for available backups for version $selected_version..." 
+ local backup_dirs=() + while IFS= read -r line; do + backup_dirs+=("$line") + done < <(podman exec -u root omnia_core find /opt/omnia/backups/upgrade -maxdepth 1 -type d -name "version_${selected_version}*" 2>/dev/null | sort -r) + + if [ ${#backup_dirs[@]} -eq 0 ]; then + echo -e "${RED}ERROR: No backup directories found for version $selected_version.${NC}" + exit 1 + fi + + echo "" + echo "Available backups for version $selected_version:" + for i in "${!backup_dirs[@]}"; do + local backup_path="${backup_dirs[$i]}" + local backup_date=$(podman exec -u root omnia_core stat -c '%y' "$backup_path" 2>/dev/null | cut -d' ' -f1,2 | cut -d'.' -f1) + echo " $((i+1)). Backup created: $backup_date" + done + + # Prompt for backup selection + echo "" + echo -n "Select backup to restore from (1-${#backup_dirs[@]}): " + read -r backup_selection + + # Validate backup selection + if ! [[ "$backup_selection" =~ ^[0-9]+$ ]] || [ "$backup_selection" -lt 1 ] || [ "$backup_selection" -gt ${#backup_dirs[@]} ]; then + echo -e "${RED}ERROR: Invalid backup selection.${NC}" + exit 1 + fi + + local selected_backup="${backup_dirs[$((backup_selection-1))]}" + + # Validate selected backup exists + if ! podman exec -u root omnia_core test -d "$selected_backup" 2>/dev/null; then + echo -e "${RED}ERROR: Backup directory does not exist: $selected_backup${NC}" + exit 1 fi echo "" echo "[INFO] [ROLLBACK] Starting rollback process..." - # Step 1: Stop 2.1 container gracefully + # Step 1: Stop current container gracefully echo "" - echo "[INFO] [ROLLBACK] Step 1: Stopping Omnia core 2.1 container..." + echo "[INFO] [ROLLBACK] Step 1: Stopping Omnia core $current_container_tag container..." if ! 
stop_container_gracefully "omnia_core" 30; then echo -e "${RED}ERROR: Failed to stop container.${NC}" display_cleanup_instructions exit 1 fi - # Step 2: Check for 1.0 image + # Step 2: Update Quadlet file to use target container tag echo "" - echo "[INFO] [ROLLBACK] Step 2: Checking for Omnia core 1.0 image..." - if ! podman inspect omnia_core:1.0 >/dev/null 2>&1; then - echo -e "${YELLOW}WARNING: Omnia core 1.0 image not found locally.${NC}" - echo -e "${YELLOW}Attempting to tag image...${NC}" - - # Try to tag latest as 1.0 if available - if podman inspect omnia_core:latest >/dev/null 2>&1; then - podman tag omnia_core:latest omnia_core:1.0 - else - echo -e "${RED}ERROR: Omnia core 1.0 image not available. Please load the image first.${NC}" - display_cleanup_instructions - exit 1 - fi + echo "[INFO] [ROLLBACK] Step 2: Updating Quadlet file to use container tag $selected_container_tag..." + local quadlet_file="/etc/containers/systemd/omnia_core.container" + + if ! sed -i "s/^Image=omnia_core:.*/Image=omnia_core:$selected_container_tag/" "$quadlet_file"; then + echo -e "${RED}ERROR: Failed to update Image to $selected_container_tag in quadlet file${NC}" + display_cleanup_instructions + exit 1 fi - # Step 3: Start 1.0 container + echo "[INFO] [ROLLBACK] Quadlet file updated to use omnia_core:$selected_container_tag" + + # Step 3: Start target container echo "" - echo "[INFO] [ROLLBACK] Step 3: Starting Omnia core 1.0 container..." + echo "[INFO] [ROLLBACK] Step 3: Starting Omnia core $selected_container_tag container..." systemctl daemon-reload if ! systemctl start omnia_core.service; then echo -e "${RED}ERROR: Failed to start container service.${NC}" @@ -1805,8 +2263,8 @@ rollback_omnia_core() { echo "[INFO] [ROLLBACK] Step 7: Verifying container version..." 
local verify_version=$(podman exec -u root omnia_core grep '^omnia_version:' /opt/omnia/.data/oim_metadata.yml 2>/dev/null | cut -d':' -f2 | tr -d ' \t\n\r') - if [ "$verify_version" != "$backup_version" ]; then - echo -e "${RED}ERROR: Version verification failed. Expected: $backup_version, Found: $verify_version${NC}" + if [ "$verify_version" != "$selected_version" ]; then + echo -e "${RED}ERROR: Version verification failed. Expected: $selected_version, Found: $verify_version${NC}" display_cleanup_instructions exit 1 fi @@ -1814,18 +2272,22 @@ rollback_omnia_core() { # Audit log end local rollback_end=$(date -Iseconds) echo "[AUDIT] Rollback operation completed at: $rollback_end" - echo "[AUDIT] Rolled back from version $current_version to $backup_version" + echo "[AUDIT] Rolled back from version $current_version to $selected_version" echo "" echo -e "${GREEN}================================================================================${NC}" echo -e "${GREEN} ROLLBACK COMPLETED SUCCESSFULLY${NC}" echo -e "${GREEN}================================================================================${NC}" echo "" - echo -e "${GREEN}✓ Omnia core has been rolled back to version $backup_version${NC}" + echo -e "${GREEN}✓ Omnia core has been rolled back to version $selected_version${NC}" echo -e "${GREEN}✓ Container is running and healthy${NC}" echo -e "${GREEN}✓ Configuration restored from backup${NC}" echo "" + # Clean up lock file before starting long-running ssh session + rm -f "$lock_file" >/dev/null 2>&1 || true + echo "[INFO] Rollback lock file removed before starting container session" + # Initialize SSH config and start container session init_ssh_config start_container_session From 2078496e82aa5525bfc6255373f8f42ca4a51fa2 Mon Sep 17 00:00:00 2001 From: sakshi-singla-1735 Date: Tue, 17 Feb 2026 12:35:42 +0530 Subject: [PATCH 38/77] LDMS Slurm node add /delete (#3976) * LDMS slurm node add/delete * pr review comments update --- 
.../telemetry/tasks/check_pxe_changes.yml | 88 ++++++++++ discovery/roles/telemetry/tasks/main.yml | 10 ++ .../telemetry/tasks/restart_ldms_configs.yml | 151 ++++++++++++++++++ discovery/roles/telemetry/vars/main.yml | 21 +++ 4 files changed, 270 insertions(+) create mode 100644 discovery/roles/telemetry/tasks/check_pxe_changes.yml create mode 100644 discovery/roles/telemetry/tasks/restart_ldms_configs.yml diff --git a/discovery/roles/telemetry/tasks/check_pxe_changes.yml b/discovery/roles/telemetry/tasks/check_pxe_changes.yml new file mode 100644 index 0000000000..398c831961 --- /dev/null +++ b/discovery/roles/telemetry/tasks/check_pxe_changes.yml @@ -0,0 +1,88 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- + +- name: Check if current PXE mapping file exists + ansible.builtin.stat: + path: "{{ hostvars['localhost']['pxe_mapping_file_path'] }}" + delegate_to: localhost + register: current_pxe_file + +- name: Check if backup PXE mapping file exists + ansible.builtin.stat: + path: "{{ backup_pxe_mapping_ldms_path }}" + delegate_to: localhost + register: backup_pxe_file + +- name: Handle first discovery run (no backup exists) + when: + - current_pxe_file.stat.exists + - not backup_pxe_file.stat.exists + block: + - name: Create backup of PXE mapping file + ansible.builtin.copy: + src: "{{ hostvars['localhost']['pxe_mapping_file_path'] }}" + dest: "{{ backup_pxe_mapping_ldms_path }}" + remote_src: true + mode: preserve + delegate_to: localhost + + - name: Set pxe_changed to false for first run + ansible.builtin.set_fact: + pxe_changed: false + + - name: Display first run message + ansible.builtin.debug: + msg: "{{ pxe_first_run_msg }}" + +- name: Compare PXE mapping files when backup exists + when: + - current_pxe_file.stat.exists + - backup_pxe_file.stat.exists + block: + - name: Get checksum of current PXE mapping file + ansible.builtin.stat: + path: "{{ hostvars['localhost']['pxe_mapping_file_path'] }}" + checksum_algorithm: sha256 + delegate_to: localhost + register: current_pxe_checksum + + - name: Get checksum of backup PXE mapping file + ansible.builtin.stat: + path: "{{ backup_pxe_mapping_ldms_path }}" + checksum_algorithm: sha256 + delegate_to: localhost + register: backup_pxe_checksum + + - name: Set pxe_changed based on checksum comparison + ansible.builtin.set_fact: + pxe_changed: "{{ current_pxe_checksum.stat.checksum != backup_pxe_checksum.stat.checksum }}" + + - name: Update backup PXE mapping file when changed + ansible.builtin.copy: + src: "{{ hostvars['localhost']['pxe_mapping_file_path'] }}" + dest: "{{ backup_pxe_mapping_ldms_path }}" + remote_src: true + mode: preserve + delegate_to: localhost + when: pxe_changed | bool + + - name: Display PXE 
change status + ansible.builtin.debug: + msg: "{{ pxe_changed_msg if (pxe_changed | bool) else pxe_no_change_msg }}" + +- name: Set pxe_changed to false when PXE file is missing + ansible.builtin.set_fact: + pxe_changed: false + when: not current_pxe_file.stat.exists diff --git a/discovery/roles/telemetry/tasks/main.yml b/discovery/roles/telemetry/tasks/main.yml index 825c3988d7..e4e3d1846a 100644 --- a/discovery/roles/telemetry/tasks/main.yml +++ b/discovery/roles/telemetry/tasks/main.yml @@ -55,3 +55,13 @@ - name: Update ldms agg configuration ansible.builtin.include_tasks: update_ldms_agg_config.yml when: hostvars['localhost']['ldms_support'] + +- name: Check if PXE mapping has changed since last run + ansible.builtin.include_tasks: check_pxe_changes.yml + when: hostvars['localhost']['ldms_support'] + +- name: Restart LDMS configs for node addition and deletion + ansible.builtin.include_tasks: restart_ldms_configs.yml + when: + - hostvars['localhost']['ldms_support'] + - pxe_changed | default(false) | bool diff --git a/discovery/roles/telemetry/tasks/restart_ldms_configs.yml b/discovery/roles/telemetry/tasks/restart_ldms_configs.yml new file mode 100644 index 0000000000..0a176118f0 --- /dev/null +++ b/discovery/roles/telemetry/tasks/restart_ldms_configs.yml @@ -0,0 +1,151 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +- name: Load high availability config + ansible.builtin.include_vars: + file: "{{ hostvars['localhost']['input_project_dir'] }}/high_availability_config.yml" + name: ha_config + +- name: Set kube_vip fact + ansible.builtin.set_fact: + kube_vip: "{{ ha_config.service_k8s_cluster_ha[0].virtual_ip_address | default('') }}" + +- name: Test SSH connectivity to kube VIP only when PXE has changed + when: + - kube_vip | length > 0 + - pxe_changed | default(false) | bool + block: + - name: SSH test to kube VIP + ansible.builtin.command: + cmd: "ssh -o StrictHostKeyChecking=no -o ConnectTimeout=10 -o BatchMode=yes {{ kube_vip }} echo reachable" + delegate_to: localhost + register: kube_vip_ssh_check + changed_when: false + + - name: Set kube VIP reachable fact + ansible.builtin.set_fact: + kube_vip_reachable: "{{ kube_vip_ssh_check.rc == 0 }}" + + rescue: + - name: Display kube VIP unreachable message + ansible.builtin.debug: + msg: "{{ kube_vip_unreachable_msg }}" + + - name: Set kube VIP reachable fact to false + ansible.builtin.set_fact: + kube_vip_reachable: false + +- name: Restart LDMS aggregator when PXE has changed + when: pxe_changed | default(false) | bool + block: + - name: Check if LDMS aggregator is running on service k8s cluster + kubernetes.core.k8s_info: + api_version: apps/v1 + kind: StatefulSet + name: nersc-ldms-aggr + namespace: "{{ telemetry_namespace }}" + delegate_to: "{{ kube_vip }}" + register: ldms_statefulset_info + failed_when: false + when: + - kube_vip_reachable | bool + + - name: Set LDMS running state + ansible.builtin.set_fact: + ldms_running: "{{ ldms_statefulset_info.resources is defined and ldms_statefulset_info.resources | length > 0 }}" + when: + - kube_vip_reachable | bool + + - name: Check if LDMS conf ConfigMap file exists + ansible.builtin.stat: + path: "{{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry/ldms/nersc-ldms-aggr/nersc-ldms-aggr/templates/cm.nersc-ldms-conf.yaml" + register: ldms_conf_file + when: 
ldms_running | default(false) | bool + + - name: Check if LDMS bin ConfigMap file exists + ansible.builtin.stat: + path: "{{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry/ldms/nersc-ldms-aggr/nersc-ldms-aggr/templates/cm.nersc-ldms-bin.yaml" + register: ldms_bin_file + when: ldms_running | default(false) | bool + + - name: Apply LDMS configuration ConfigMap + kubernetes.core.k8s: + state: present + src: "{{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry/ldms/nersc-ldms-aggr/nersc-ldms-aggr/templates/cm.nersc-ldms-conf.yaml" + namespace: "{{ telemetry_namespace }}" + delegate_to: "{{ kube_vip }}" + failed_when: false + when: + - ldms_running | default(false) | bool + - ldms_conf_file.stat.exists | default(false) + + - name: Apply LDMS scripts ConfigMap + kubernetes.core.k8s: + state: present + src: "{{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry/ldms/nersc-ldms-aggr/nersc-ldms-aggr/templates/cm.nersc-ldms-bin.yaml" + namespace: "{{ telemetry_namespace }}" + delegate_to: "{{ kube_vip }}" + failed_when: false + when: + - ldms_running | default(false) | bool + - ldms_bin_file.stat.exists | default(false) + + - name: Restart LDMS aggregator StatefulSet + kubernetes.core.k8s: + state: present + definition: + apiVersion: apps/v1 + kind: StatefulSet + metadata: + name: nersc-ldms-aggr + namespace: "{{ telemetry_namespace }}" + spec: + template: + metadata: + annotations: + kubectl.kubernetes.io/restartedAt: "{{ ansible_date_time.iso8601 }}" + delegate_to: "{{ kube_vip }}" + failed_when: false + when: + - ldms_running | default(false) | bool + - ldms_conf_file.stat.exists | default(false) + - ldms_bin_file.stat.exists | default(false) + + - name: Wait for LDMS aggregator pod to be ready after restart + kubernetes.core.k8s_info: + api_version: v1 + kind: Pod + namespace: "{{ telemetry_namespace }}" + label_selectors: + - "app=nersc-ldms-aggr" + wait: true + wait_condition: + type: Ready + status: "True" + wait_timeout: 120 + 
delegate_to: "{{ kube_vip }}" + register: ldms_pod_ready + failed_when: false + when: + - ldms_running | default(false) | bool + - ldms_conf_file.stat.exists | default(false) + - ldms_bin_file.stat.exists | default(false) + + - name: Display LDMS aggregator restart status + ansible.builtin.debug: + msg: "{{ ldms_pod_ready_msg if (ldms_pod_ready.resources | default([]) | length > 0) else ldms_pod_not_ready_msg }}" + when: + - ldms_running | default(false) | bool + - ldms_conf_file.stat.exists | default(false) + - ldms_bin_file.stat.exists | default(false) diff --git a/discovery/roles/telemetry/vars/main.yml b/discovery/roles/telemetry/vars/main.yml index 5c5838ce29..69b0c0c0ac 100644 --- a/discovery/roles/telemetry/vars/main.yml +++ b/discovery/roles/telemetry/vars/main.yml @@ -252,3 +252,24 @@ common_templates: skip_when: "{{ cluster_id_present | default(false) }}" - src: 'telemetry/kustomization.yaml.j2' dest: 'kustomization.yaml' + +# Usage: check_pxe_changes.yml +backup_pxe_mapping_ldms_path: "/opt/omnia/telemetry/backup_pxe_mapping_ldms.csv" +pxe_first_run_msg: "First discovery run detected. Saving PXE mapping backup. LDMS restart not required." +pxe_no_change_msg: "PXE mapping file has not changed since last run. Skipping LDMS restart." +pxe_changed_msg: "PXE mapping file has changed. LDMS restart will be triggered." + +# Usage: restart_ldms_configs.yml +kube_vip_unreachable_msg: >- + Kube VIP ({{ kube_vip }}) is not reachable via SSH. + There might be issues with the k8s cluster. + LDMS aggregator restart will be skipped. + + After discovery completes, manually restart the LDMS aggregator pod with: + + ssh {{ kube_vip }} + kubectl rollout restart statefulset nersc-ldms-aggr -n {{ telemetry_namespace }} + kubectl get pods -n {{ telemetry_namespace }} -l app=nersc-ldms-aggr -w + +ldms_pod_ready_msg: "LDMS aggregator pod is ready." +ldms_pod_not_ready_msg: "WARNING: LDMS aggregator pod did not become ready within 120s." 
From 7953e3c519a8bd5ba72e820832ef596f062b1357 Mon Sep 17 00:00:00 2001 From: Jagadeesh N V Date: Wed, 18 Feb 2026 00:31:55 +0530 Subject: [PATCH 39/77] Node drain logic for deletion --- .../slurm_config/tasks/check_ctld_running.yml | 12 ++- .../tasks/drain_and_remove_node.yml | 102 ++++++++++++++++++ discovery/roles/slurm_config/vars/main.yml | 3 + 3 files changed, 116 insertions(+), 1 deletion(-) create mode 100644 discovery/roles/slurm_config/tasks/drain_and_remove_node.yml diff --git a/discovery/roles/slurm_config/tasks/check_ctld_running.yml b/discovery/roles/slurm_config/tasks/check_ctld_running.yml index 7d908169ab..ce27d3c362 100644 --- a/discovery/roles/slurm_config/tasks/check_ctld_running.yml +++ b/discovery/roles/slurm_config/tasks/check_ctld_running.yml @@ -22,6 +22,16 @@ register: ssh_check ignore_errors: true +- name: Drain and remove nodes if any + ansible.builtin.include_tasks: drain_and_remove_node.yml + loop: "{{ nodes_in_normal_not_in_cmpt }}" + loop_control: + loop_var: node_to_remove + when: + - ssh_check is success + - nodes_in_normal_not_in_cmpt is defined + - nodes_in_normal_not_in_cmpt | length > 0 + - name: Enter slurm controller when pingable when: - ssh_check is success @@ -37,7 +47,7 @@ register: service_facts ignore_unreachable: true - - name: Fail if slurmctld is unreachable + - name: Check slurmctld is reachable ansible.builtin.fail: msg: "Failed to connect to {{ ctld }}." when: service_facts is unreachable diff --git a/discovery/roles/slurm_config/tasks/drain_and_remove_node.yml b/discovery/roles/slurm_config/tasks/drain_and_remove_node.yml new file mode 100644 index 0000000000..7b40363808 --- /dev/null +++ b/discovery/roles/slurm_config/tasks/drain_and_remove_node.yml @@ -0,0 +1,102 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +- name: Check if node exists in Slurm cluster + ansible.builtin.command: scontrol show node {{ node_to_remove }} + register: node_exists_check + failed_when: false + ignore_unreachable: true + changed_when: false + delegate_to: "{{ ctld }}" + +- name: Skip if node does not exist + ansible.builtin.debug: + msg: "Node {{ node_to_remove }} not found in cluster, skipping removal" + when: + - node_exists_check is reachable + - node_exists_check.rc != 0 + +- name: Process node removal + when: + - node_exists_check is reachable + - node_exists_check.rc == 0 + ignore_unreachable: true + block: + - name: Get current job count on node + ansible.builtin.command: squeue -w {{ node_to_remove }} -h | wc -l + register: current_jobs + changed_when: false + delegate_to: "{{ ctld }}" + + - name: Display job information + ansible.builtin.debug: + msg: "Node {{ node_to_remove }} currently has {{ current_jobs.stdout }} running job(s)" + + - name: Drain the node to prevent new job assignments + ansible.builtin.command: > + scontrol update NodeName={{ node_to_remove }} + State=DRAIN + Reason="Scheduled removal - waiting for jobs to complete" + changed_when: true + delegate_to: "{{ ctld }}" + + - name: Wait for all jobs to complete on the node + ansible.builtin.command: squeue -w {{ node_to_remove }} -h | wc -l + register: job_count_check + until: job_count_check.stdout | int == 0 + retries: "{{ (node_drain_timeout / node_drain_delay) | int }}" + delay: "{{ node_drain_delay }}" + changed_when: false + delegate_to: "{{ ctld }}" + when: current_jobs.stdout | int > 0 + + - 
name: Confirm jobs completed + ansible.builtin.debug: + msg: "All jobs on {{ node_to_remove }} have completed" + when: current_jobs.stdout | int > 0 + + - name: Log node removal + ansible.builtin.debug: + msg: "Node {{ node_to_remove }} has been drained, jobs completed, and set to DOWN state" + + rescue: + - name: Log node removal failure + ansible.builtin.debug: + msg: "Failed to drain node {{ node_to_remove }}" + + - name: Prompt for node with running job after timeout + ansible.builtin.pause: + prompt: | + Jobs are still running on {{ node_to_remove }}. + Options: + 1. Press Ctrl+C then 'A' to abort + 2. Press Enter to force removal (jobs will be killed) + when: not force_scancel_node + + - name: Force cancel jobs if timeout reached + ansible.builtin.command: scancel -f -w {{ node_to_remove }} + changed_when: true + failed_when: false + delegate_to: "{{ ctld }}" + + always: + - name: Set node to DOWN state + ansible.builtin.command: > + scontrol update NodeName={{ node_to_remove }} + State=DOWN + Reason="Node removed from cluster" + changed_when: true + failed_when: false + delegate_to: "{{ ctld }}" + when: node_exists_check.rc == 0 diff --git a/discovery/roles/slurm_config/vars/main.yml b/discovery/roles/slurm_config/vars/main.yml index 1593f791cb..39311ca64d 100644 --- a/discovery/roles/slurm_config/vars/main.yml +++ b/discovery/roles/slurm_config/vars/main.yml @@ -117,6 +117,9 @@ munge_dir_mode: "0700" common_mode: "0755" slurm_dbd_mode: "0600" slurm_db_cnf_mode: "0600" +node_drain_timeout: 900 +node_drain_delay: 30 +force_scancel_node: false dbd_slurm_conf: AccountingStoragePort: "{{ slurm_dbd_port }}" AccountingStorageType: accounting_storage/slurmdbd From 4dbc6a978fdbcbd74c7a7c62e75ab47c399784be Mon Sep 17 00:00:00 2001 From: Vrinda_Marwah Date: Wed, 18 Feb 2026 07:32:41 +0000 Subject: [PATCH 40/77] mask docker credentials in local_repo logs Signed-off-by: Vrinda_Marwah --- .../library/module_utils/local_repo/parse_and_download.py | 8 ++++---- 1 file 
changed, 4 insertions(+), 4 deletions(-) diff --git a/common/library/module_utils/local_repo/parse_and_download.py b/common/library/module_utils/local_repo/parse_and_download.py index 15bed1efb3..d5192e2bbe 100644 --- a/common/library/module_utils/local_repo/parse_and_download.py +++ b/common/library/module_utils/local_repo/parse_and_download.py @@ -84,16 +84,16 @@ def execute_command(cmd_string, logger, type_json=False): logger.error(f"Raw output was: {status['stdout']}") return False - logger.info(f"Command succeeded: {cmd_string}") + logger.info(f"Command succeeded: {safe_cmd_string}") return status except subprocess.CalledProcessError as e: - logger.error(f"Command failed: {cmd_string} - {e}") + logger.error(f"Command failed: {safe_cmd_string} - {e}") return False except subprocess.TimeoutExpired as e: - logger.error(f"Command timed out: {cmd_string} - {e}") + logger.error(f"Command timed out: {safe_cmd_string} - {e}") return False except OSError as e: - logger.error(f"OS error during command: {cmd_string} - {e}") + logger.error(f"OS error during command: {safe_cmd_string} - {e}") return False finally: From f657612bbaab6891b2b6342cbb186fecf45cf43f Mon Sep 17 00:00:00 2001 From: Jagadeesh N V Date: Wed, 18 Feb 2026 13:19:27 +0530 Subject: [PATCH 41/77] Shell instead of command for piping --- discovery/roles/slurm_config/tasks/drain_and_remove_node.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/discovery/roles/slurm_config/tasks/drain_and_remove_node.yml b/discovery/roles/slurm_config/tasks/drain_and_remove_node.yml index 7b40363808..2de076a5a0 100644 --- a/discovery/roles/slurm_config/tasks/drain_and_remove_node.yml +++ b/discovery/roles/slurm_config/tasks/drain_and_remove_node.yml @@ -34,7 +34,7 @@ ignore_unreachable: true block: - name: Get current job count on node - ansible.builtin.command: squeue -w {{ node_to_remove }} -h | wc -l + ansible.builtin.shell: squeue -w {{ node_to_remove }} -h | wc -l register: current_jobs 
changed_when: false delegate_to: "{{ ctld }}" @@ -52,7 +52,7 @@ delegate_to: "{{ ctld }}" - name: Wait for all jobs to complete on the node - ansible.builtin.command: squeue -w {{ node_to_remove }} -h | wc -l + ansible.builtin.shell: squeue -w {{ node_to_remove }} -h | wc -l register: job_count_check until: job_count_check.stdout | int == 0 retries: "{{ (node_drain_timeout / node_drain_delay) | int }}" From d8bbd64b31daa7d6a540ea514662f639ea8a1641 Mon Sep 17 00:00:00 2001 From: Jagadeesh N V Date: Wed, 18 Feb 2026 13:35:41 +0530 Subject: [PATCH 42/77] lint fixes --- .../slurm_config/tasks/drain_and_remove_node.yml | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/discovery/roles/slurm_config/tasks/drain_and_remove_node.yml b/discovery/roles/slurm_config/tasks/drain_and_remove_node.yml index 2de076a5a0..da1c41d3fe 100644 --- a/discovery/roles/slurm_config/tasks/drain_and_remove_node.yml +++ b/discovery/roles/slurm_config/tasks/drain_and_remove_node.yml @@ -34,7 +34,10 @@ ignore_unreachable: true block: - name: Get current job count on node - ansible.builtin.shell: squeue -w {{ node_to_remove }} -h | wc -l + ansible.builtin.shell: + cmd: | + set -o pipefail + squeue -w {{ node_to_remove }} -h | wc -l register: current_jobs changed_when: false delegate_to: "{{ ctld }}" @@ -52,7 +55,10 @@ delegate_to: "{{ ctld }}" - name: Wait for all jobs to complete on the node - ansible.builtin.shell: squeue -w {{ node_to_remove }} -h | wc -l + ansible.builtin.shell: + cmd: | + set -o pipefail + squeue -w {{ node_to_remove }} -h | wc -l register: job_count_check until: job_count_check.stdout | int == 0 retries: "{{ (node_drain_timeout / node_drain_delay) | int }}" @@ -75,10 +81,11 @@ ansible.builtin.debug: msg: "Failed to drain node {{ node_to_remove }}" - - name: Prompt for node with running job after timeout + - name: Remove slurm node with running job after timeout ansible.builtin.pause: prompt: | - Jobs are still running on {{ node_to_remove }}. 
+ Node {{ node_to_remove }} has been DRAINED to prevent new job assignments. + Jobs are still running on {{ node_to_remove }} after wait of {{ node_drain_timeout }} seconds. Options: 1. Press Ctrl+C then 'A' to abort 2. Press Enter to force removal (jobs will be killed) From 76d7f3cd0c9c77fd0467a18249be12edc4236b34 Mon Sep 17 00:00:00 2001 From: Nethravathi M G <146437298+nethramg@users.noreply.github.com> Date: Thu, 19 Feb 2026 13:04:12 +0530 Subject: [PATCH 43/77] Removing the IP's from the Activated IP list (#3992) --- telemetry/roles/idrac_telemetry/templates/telemetry_report.j2 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/telemetry/roles/idrac_telemetry/templates/telemetry_report.j2 b/telemetry/roles/idrac_telemetry/templates/telemetry_report.j2 index 06bf230980..54986f418f 100644 --- a/telemetry/roles/idrac_telemetry/templates/telemetry_report.j2 +++ b/telemetry/roles/idrac_telemetry/templates/telemetry_report.j2 @@ -2,9 +2,9 @@ ----- Telemetry Report for Cluster ----- -Total IP count with Telemetry activated: {{ (db_idrac_ip_list | length) + (telemetry_idrac | length) }} +Total IP count with Telemetry activated: {{ ((db_idrac_ip_list + telemetry_idrac) | difference(deleted_idrac_ips | default([]))) | length }} Telemetry activated IPs List: -{% for item in db_idrac_ip_list + telemetry_idrac %} +{% for item in (db_idrac_ip_list + telemetry_idrac) | difference(deleted_idrac_ips | default([])) %} - {{ item }} {% endfor %} From 272bfb51c94fe7283bc3256c32894882b7b032e8 Mon Sep 17 00:00:00 2001 From: pullan1 Date: Thu, 19 Feb 2026 14:41:35 +0530 Subject: [PATCH 44/77] Fix for local_repo.yml allows passes even with invalid package names in JSON files. 
Signed-off-by: pullan1 --- .../library/module_utils/local_repo/config.py | 6 +- .../local_repo/container_repo_utils.py | 161 ++++++++++-------- .../module_utils/local_repo/download_rpm.py | 89 +++++++++- 3 files changed, 178 insertions(+), 78 deletions(-) diff --git a/common/library/module_utils/local_repo/config.py b/common/library/module_utils/local_repo/config.py index a731c8528d..7bfea4b301 100644 --- a/common/library/module_utils/local_repo/config.py +++ b/common/library/module_utils/local_repo/config.py @@ -64,6 +64,10 @@ "x86_64": ["dnf", "download", "--resolve", "--alldeps", "--arch=x86_64,noarch"], "aarch64": ["dnf", "download", "--forcearch", "aarch64", "--resolve", "--alldeps", "--exclude=*.x86_64"] } +DNF_INFO_COMMANDS = { + "x86_64": ["dnf", "info", "--quiet"], + "aarch64": ["dnf", "info", "--quiet", "--forcearch=aarch64"] +} # ---------------------------- # Used by download_common.py @@ -222,7 +226,7 @@ # Naming convention: _omnia-additional to match existing filter patterns # ---------------------------- ADDITIONAL_REPOS_KEY = "additional_repos" -AGGREGATED_REPO_NAME_TEMPLATE = "{arch}_omnia-additional-repo" +AGGREGATED_REPO_NAME_TEMPLATE = "{arch}_omnia-additional" AGGREGATED_REMOTE_NAME_TEMPLATE = "{arch}_omnia-additional-{name}" AGGREGATED_DISTRIBUTION_NAME_TEMPLATE = "{arch}_omnia-additional" AGGREGATED_BASE_PATH_TEMPLATE = "opt/omnia/offline_repo/cluster/{arch}/rhel/10.0/rpms/omnia-additional" diff --git a/common/library/module_utils/local_repo/container_repo_utils.py b/common/library/module_utils/local_repo/container_repo_utils.py index 0a4abb35fb..e3f47869af 100644 --- a/common/library/module_utils/local_repo/container_repo_utils.py +++ b/common/library/module_utils/local_repo/container_repo_utils.py @@ -13,6 +13,13 @@ # limitations under the License. #pylint: disable=import-error,no-name-in-module +""" +Container repository utilities for Pulp operations. 
+ +This module provides functions for creating, syncing, and managing +container repositories and distributions in Pulp. +""" + import multiprocessing from ansible.module_utils.local_repo.parse_and_download import execute_command from ansible.module_utils.local_repo.config import ( @@ -114,109 +121,119 @@ def sync_container_repository(repo_name, remote_name, package_content, logger, t logger.info(f"Getting repository version before sync for {repo_name}") verify_command = pulp_container_commands["show_container_repo"] % repo_name verify_result_before = execute_command(verify_command, logger, type_json=True) - + version_before = None - if verify_result_before and isinstance(verify_result_before, dict) and "stdout" in verify_result_before: + if (verify_result_before and isinstance(verify_result_before, dict) and + "stdout" in verify_result_before): repo_data_before = verify_result_before["stdout"] if isinstance(repo_data_before, dict): version_before = repo_data_before.get("latest_version_href") logger.info(f"Repository version before sync: {version_before}") - + command = pulp_container_commands["sync_container_repository"] % (repo_name, remote_name) result = execute_command(command,logger) if result is False or (isinstance(result, dict) and result.get("returncode", 1) != 0): logger.error(f"Sync command failed for repository {repo_name}") return False - + logger.info(f"Validating sync result for repository {repo_name}") verify_result_after = execute_command(verify_command, logger, type_json=True) - - if verify_result_after and isinstance(verify_result_after, dict) and "stdout" in verify_result_after: + + if (verify_result_after and isinstance(verify_result_after, dict) and + "stdout" in verify_result_after): repo_data_after = verify_result_after["stdout"] if isinstance(repo_data_after, dict): version_after = repo_data_after.get("latest_version_href") logger.info(f"Repository version after sync: {version_after}") - + if not version_after or 
version_after.endswith("/versions/0/"): logger.error(f"Sync completed but no content was downloaded for {repo_name}. " f"The specified image tag likely does not exist in the upstream registry.") return False - + if version_before and version_after and version_before == version_after: # Check if tag actually exists using precise Pulp commands try: # Step 1: Get distribution to find repository href dist_command = f"pulp container distribution show --name {repo_name}" dist_result = execute_command(dist_command, logger, type_json=True) - + if not dist_result or not isinstance(dist_result, dict) or "stdout" not in dist_result: - logger.error(f"Failed to get distribution info for {repo_name}. Assuming tag doesn't exist.") - return False - - dist_data = dist_result["stdout"] - if not isinstance(dist_data, dict) or "repository" not in dist_data: - logger.error(f"Invalid distribution data for {repo_name}. Assuming tag doesn't exist.") - return False - - repo_href = dist_data["repository"] - logger.info(f"Found repository href: {repo_href}") - - # Step 2: Get repository version href - repo_command = f"pulp container repository show --href {repo_href}" - repo_result = execute_command(repo_command, logger, type_json=True) - - if not repo_result or not isinstance(repo_result, dict) or "stdout" not in repo_result: - logger.error(f"Failed to get repository info for {repo_href}. Assuming tag doesn't exist.") - return False - - repo_data = repo_result["stdout"] - if not isinstance(repo_data, dict) or "latest_version_href" not in repo_data: - logger.error(f"Invalid repository data for {repo_href}. 
Assuming tag doesn't exist.") - return False - - repo_ver_href = repo_data["latest_version_href"] - logger.info(f"Found repository version href: {repo_ver_href}") - - # Step 3: Check if tag exists in content - tags_command = f"pulp show --href '/pulp/api/v3/content/container/tags/?repository_version={repo_ver_href}'" - tags_result = execute_command(tags_command, logger, type_json=True) - - if not tags_result or not isinstance(tags_result, dict) or "stdout" not in tags_result: - logger.error(f"Failed to get content tags for {repo_ver_href}. Assuming tag doesn't exist.") - return False - - tags_data = tags_result["stdout"] - if not isinstance(tags_data, dict) or "results" not in tags_data: - logger.error(f"Invalid tags data for {repo_ver_href}. Assuming tag doesn't exist.") - return False - - tags = tags_data["results"] - tag_exists = False - - # Use the tag parameter if provided, otherwise fall back to checking package_content - tag_to_check = tag if tag else package_content - - for tag_item in tags: - if isinstance(tag_item, dict) and "name" in tag_item and tag_item["name"] == tag_to_check: - tag_exists = True - break - - if tag_exists: - logger.info(f"Tag '{tag_to_check}' already exists in Pulp repository {repo_name}. No sync needed - image is already available.") + logger.info(f"Distribution {repo_name} does not exist yet - skipping tag validation, will create distribution") + # Skip tag validation but continue to create distribution at line 221 else: - logger.error(f"Sync completed but repository version did not change for {repo_name}. " - f"Version remained at {version_after}. " - f"Tag '{tag_to_check}' does not exist in Pulp repository content. " - f"This indicates the tag likely does not exist in the upstream registry.") - return False + # Distribution exists, validate the tag + dist_data = dist_result["stdout"] + if not isinstance(dist_data, dict) or "repository" not in dist_data: + logger.error(f"Invalid distribution data for {repo_name}. 
Assuming tag doesn't exist.") + return False + repo_href = dist_data["repository"] + logger.info(f"Found repository href: {repo_href}") + + # Step 2: Get repository version href + repo_command = f"pulp container repository show --href {repo_href}" + repo_result = execute_command(repo_command, logger, type_json=True) + + if not repo_result or not isinstance(repo_result, dict) or "stdout" not in repo_result: + logger.error(f"Failed to get repository info for {repo_href}. Assuming tag doesn't exist.") + return False + + repo_data = repo_result["stdout"] + if not isinstance(repo_data, dict) or "latest_version_href" not in repo_data: + logger.error(f"Invalid repository data for {repo_href}. Assuming tag doesn't exist.") + return False + + repo_ver_href = repo_data["latest_version_href"] + logger.info(f"Found repository version href: {repo_ver_href}") + + # Step 3: Check if tag exists in content + tags_command = ( + f"pulp show --href " + f"'/pulp/api/v3/content/container/tags/" + f"?repository_version={repo_ver_href}'" + ) + tags_result = execute_command(tags_command, logger, type_json=True) + + if not tags_result or not isinstance(tags_result, dict) or "stdout" not in tags_result: + logger.error(f"Failed to get content tags for {repo_ver_href}. Assuming tag doesn't exist.") + return False + + tags_data = tags_result["stdout"] + if not isinstance(tags_data, dict) or "results" not in tags_data: + logger.error(f"Invalid tags data for {repo_ver_href}. Assuming tag doesn't exist.") + return False + + tags = tags_data["results"] + tag_exists = False + + # Use the tag parameter if provided, otherwise fall back to checking package_content + tag_to_check = tag if tag else package_content + + for tag_item in tags: + if isinstance(tag_item, dict) and "name" in tag_item and tag_item["name"] == tag_to_check: + tag_exists = True + break + + if tag_exists: + logger.info(f"Tag '{tag_to_check}' already exists in Pulp repository {repo_name}. 
No sync needed - image is already available.") + else: + logger.error(f"Sync completed but repository version did not change for {repo_name}. " + f"Version remained at {version_after}. " + f"Tag '{tag_to_check}' does not exist in Pulp repository content. " + f"This indicates the tag likely does not exist in the upstream registry.") + return False except Exception as e: - logger.error(f"Error checking repository tag existence: {e}. Assuming tag doesn't exist.") + logger.error( + f"Error checking repository tag existence: {e}. Assuming tag doesn't exist." + ) return False - - logger.info(f"Sync validation successful: repository {repo_name} version changed from {version_before} to {version_after}") - - result = create_container_distribution(repo_name,package_content,logger) + + logger.info( + f"Sync validation successful: repository {repo_name} version changed " + f"from {version_before} to {version_after}" + ) + result = create_container_distribution(repo_name, package_content, logger) return result except Exception as e: logger.error(f"Failed to synchronize repository {repo_name} with remote {remote_name}. 
Error: {e}") diff --git a/common/library/module_utils/local_repo/download_rpm.py b/common/library/module_utils/local_repo/download_rpm.py index 95b354dd6b..44b56c1799 100644 --- a/common/library/module_utils/local_repo/download_rpm.py +++ b/common/library/module_utils/local_repo/download_rpm.py @@ -20,7 +20,8 @@ import shutil from pathlib import Path from ansible.module_utils.local_repo.config import ( - DNF_COMMANDS + DNF_COMMANDS, + DNF_INFO_COMMANDS ) from multiprocessing import Lock from ansible.module_utils.local_repo.parse_and_download import write_status_to_file @@ -95,11 +96,30 @@ def process_rpm(package, repo_store_path, status_file_path, cluster_os_type, for pkg in rpm_list: # Get repo_name for this specific RPM from mapping pkg_repo_name = repo_mapping.get(pkg, "") - if any(pkg in line and ".rpm" in line for line in stdout_lines + stderr_lines): + # Check if package was downloaded successfully + # Look for "Already downloaded" or actual .rpm file in output + pkg_downloaded = False + for line in stdout_lines + stderr_lines: + if pkg in line and (".rpm" in line or "Already downloaded" in line): + pkg_downloaded = True + break + + # Also check for "No match for argument" or "No package" errors + pkg_not_found = False + for line in stderr_lines: + if pkg in line and ("No match for argument" in line or + "No package" in line or + "not found" in line.lower()): + pkg_not_found = True + break + + if pkg_downloaded and not pkg_not_found: downloaded.append(pkg) write_status_to_file(status_file_path, pkg, "rpm", "Success", logger, file_lock, pkg_repo_name) else: failed.append(pkg) + if pkg_not_found: + logger.warning(f"Package '{pkg}' not found in configured repositories") # Retry failed ones individually if failed: @@ -110,6 +130,15 @@ def process_rpm(package, repo_store_path, status_file_path, cluster_os_type, # Get repo_name for this specific RPM from mapping pkg_repo_name = repo_mapping.get(pkg, "") + # Check for package not found errors + retry_stderr = 
retry_res.stderr.lower() + pkg_invalid = any(err in retry_stderr for err in [ + "no match for argument", + "no package", + "not found", + "unable to find a match" + ]) + if retry_res.returncode == 0 and ".rpm" in retry_res.stdout + retry_res.stderr: downloaded.append(pkg) failed.remove(pkg) @@ -117,7 +146,10 @@ def process_rpm(package, repo_store_path, status_file_path, cluster_os_type, logger.info(f"Package '{pkg}' downloaded successfully on retry.") else: write_status_to_file(status_file_path, pkg, "rpm", "Failed", logger, file_lock, pkg_repo_name) - logger.error(f"Package '{pkg}' still failed after retry.") + if pkg_invalid: + logger.error(f"Package '{pkg}' does not exist in configured repositories.") + else: + logger.error(f"Package '{pkg}' still failed after retry.") # Determine final status if not failed: @@ -128,12 +160,59 @@ def process_rpm(package, repo_store_path, status_file_path, cluster_os_type, status = "Failed" else: - status = "Success" logger.info("RPM won't be downloaded when repo_config is partial or never") + logger.info("Validating package availability using dnf info...") + + arch_key = "x86_64" if arc.lower() in ("x86_64") else "aarch64" + valid_packages = [] + invalid_packages = [] + for pkg in package["rpm_list"]: + # Validate package using dnf info + dnf_info_command = DNF_INFO_COMMANDS[arch_key] + [ + "--repo=*", # Search all enabled repositories + pkg + ] + result = subprocess.run( + dnf_info_command, + check=False, + capture_output=True, + text=True + ) # Get repo_name for this specific RPM from mapping pkg_repo_name = repo_mapping.get(pkg, "") - write_status_to_file(status_file_path, pkg, "rpm", "Success", logger, file_lock, pkg_repo_name) + if result.returncode == 0: + # Package exists and is available + valid_packages.append(pkg) + write_status_to_file( + status_file_path, pkg, "rpm", "Success", + logger, file_lock, pkg_repo_name + ) + logger.info(f"Package '{pkg}' validated successfully") + else: + # Package not found or invalid + 
invalid_packages.append(pkg) + write_status_to_file( + status_file_path, pkg, "rpm", "Failed", + logger, file_lock, pkg_repo_name + ) + logger.error( + f"Package '{pkg}' validation failed. " + f"Package may not exist in configured repositories." + ) + + # Determine final status based on validation results + if not invalid_packages: + status = "Success" + elif valid_packages: + status = "Partial" + else: + status = "Failed" + + logger.info( + f"Validation complete - Valid: {len(valid_packages)}, " + f"Invalid: {len(invalid_packages)}" + ) except Exception as e: logger.error(f"Exception occurred: {e}") From 5a03ffcf03c7dc7612ccd59f501b495fa105d4d1 Mon Sep 17 00:00:00 2001 From: Vrinda_Marwah Date: Thu, 19 Feb 2026 09:40:22 +0000 Subject: [PATCH 45/77] checkmarx fixes Signed-off-by: Vrinda_Marwah --- .../module_utils/local_repo/software_utils.py | 38 ++++++++++++++++++- .../local_repo/user_image_utility.py | 8 ++-- 2 files changed, 42 insertions(+), 4 deletions(-) diff --git a/common/library/module_utils/local_repo/software_utils.py b/common/library/module_utils/local_repo/software_utils.py index 3e06ddc7cd..126020f930 100644 --- a/common/library/module_utils/local_repo/software_utils.py +++ b/common/library/module_utils/local_repo/software_utils.py @@ -21,6 +21,7 @@ import json import csv import re +import shlex import yaml from jinja2 import Template import requests @@ -526,6 +527,37 @@ def get_failed_software(file_path): ] return failed_software +def _sanitize_shell_arg(value, logger, field_name="value"): + """ + Sanitize a value before using it in a shell command to prevent argument injection. + + Validates the value against a strict allowlist of characters that are safe + for shell interpolation, then applies shlex.quote for safe shell escaping. + + Args: + value (str): The value to sanitize. + logger (logging.Logger): Logger instance. + field_name (str): Name of the field being sanitized (for logging). + + Returns: + str: The sanitized, shell-quoted value. 
+ + Raises: + ValueError: If the value contains disallowed characters. + """ + if not isinstance(value, str) or not value: + raise ValueError(f"Invalid {field_name}: must be a non-empty string") + value = value.strip().strip('"') + safe_pattern = re.compile(r'^[a-zA-Z0-9._\-/:@=?&\[\]]+$') + if not safe_pattern.match(value): + logger.error("Potentially unsafe characters detected in %s: %s", field_name, value) + raise ValueError( + f"Invalid {field_name}{value}: contains disallowed characters. " + f"Only alphanumeric characters and ._-/:@=?&[] are allowed." + ) + return shlex.quote(value) + + def check_additional_image_in_pulp(image_entry, logger): """ Checks if image present in additional_packages.json is configured in Pulp. @@ -536,6 +568,8 @@ def check_additional_image_in_pulp(image_entry, logger): logger.info("Checking if %s is present in Pulp", image_name) + _sanitize_shell_arg(image_name, logger, "image_name") + dist_name_prefix = "container_repo_" transformed_dist_name = (f"{dist_name_prefix}{image_name.replace('/', '_').replace(':', '_')}") @@ -543,7 +577,7 @@ def check_additional_image_in_pulp(image_entry, logger): latest_version_href_result = None tags_output_result = None - show_dist_cmd = (pulp_container_commands["container_distribution_show"] % transformed_dist_name) + show_dist_cmd = (pulp_container_commands["container_distribution_show"] % shlex.quote(transformed_dist_name)) repo_href_result = execute_command(show_dist_cmd, logger) logger.info("repo_href_result: %s", repo_href_result) @@ -557,6 +591,7 @@ def check_additional_image_in_pulp(image_entry, logger): else: logger.info("Distribution %s found in Pulp", transformed_dist_name) repo_href = repo_href_result["stdout"] + repo_href = _sanitize_shell_arg(repo_href, logger, "repo_href") show_repo_cmd = (pulp_container_commands["show_repository_version"] % repo_href) latest_version_href_result = execute_command(show_repo_cmd, logger) logger.info("latest_version_href_result: %s", 
latest_version_href_result) @@ -570,6 +605,7 @@ def check_additional_image_in_pulp(image_entry, logger): else: logger.info("Repository version found in Pulp") latest_version_href = latest_version_href_result["stdout"] + latest_version_href = _sanitize_shell_arg(latest_version_href, logger, "latest_version_href") show_tags_cmd = (pulp_container_commands["list_image_tags"] % latest_version_href) tags_output_result = execute_command(show_tags_cmd, logger, type_json=True) logger.info("tags_output_result: %s", tags_output_result) diff --git a/common/library/module_utils/local_repo/user_image_utility.py b/common/library/module_utils/local_repo/user_image_utility.py index e97e9411dd..4c68cd1803 100644 --- a/common/library/module_utils/local_repo/user_image_utility.py +++ b/common/library/module_utils/local_repo/user_image_utility.py @@ -58,9 +58,11 @@ def check_image_in_registry( """ if not host.startswith(("http://", "https://")): - protocol = "https" if (cacert and key) else "http" - host = f"{protocol}://{host}" - image_url = f"{host}/v2/{image}/manifests/{tag}" + if cacert and key: + image_url = f"https://{host}/v2/{image}/manifests/{tag}" + else: + image_url = f"http://{host}/v2/{image}/manifests/{tag}" + logger.info(f"Checking image existence at: {image_url}") try: From fa0cd325ee7b38bafedc794c6bb47242d88323f1 Mon Sep 17 00:00:00 2001 From: Jagadeesh N V Date: Thu, 19 Feb 2026 17:19:13 +0530 Subject: [PATCH 46/77] Updated permission for slurmdbd Added new force_conf option for allowing confs pass through validation --- .../input_validation/schema/omnia_config.json | 4 ++ .../validation_flows/common_validation.py | 9 ++- .../slurm_config/tasks/build_slurm_conf.yml | 5 ++ discovery/roles/slurm_config/tasks/confs.yml | 14 ++-- .../slurm_config/tasks/create_slurm_dir.yml | 1 + .../tasks/handle_forced_confs.yml | 64 +++++++++++++++++++ .../roles/slurm_config/tasks/remove_node.yml | 2 +- discovery/roles/slurm_config/vars/main.yml | 3 +- input/omnia_config.yml | 10 +++ 9 
files changed, 102 insertions(+), 10 deletions(-) create mode 100644 discovery/roles/slurm_config/tasks/handle_forced_confs.yml diff --git a/common/library/module_utils/input_validation/schema/omnia_config.json b/common/library/module_utils/input_validation/schema/omnia_config.json index f53485770f..f7771d9441 100644 --- a/common/library/module_utils/input_validation/schema/omnia_config.json +++ b/common/library/module_utils/input_validation/schema/omnia_config.json @@ -19,6 +19,10 @@ "minLength": 1, "description": "Name of the nfs storage in storage_config.yml" }, + "force_conf": { + "type": "boolean", + "description": "Variable indicates whether a specific configuration file path under config_sources should be used as-is without merging" + }, "config_sources": { "type": "object", "description": "Config can be a file path or inline mapping", diff --git a/common/library/module_utils/input_validation/validation_flows/common_validation.py b/common/library/module_utils/input_validation/validation_flows/common_validation.py index f577a4e9b8..7726df24fb 100644 --- a/common/library/module_utils/input_validation/validation_flows/common_validation.py +++ b/common/library/module_utils/input_validation/validation_flows/common_validation.py @@ -1074,9 +1074,12 @@ def validate_omnia_config( "slurm NFS not provided", f"NFS name {', '.join(diff_set)} required for slurm is not defined in {storage_config}" )) - cnfg_src = [clst.get('config_sources', {}) for clst in data.get('slurm_cluster')] + skip_conf_validation = os.path.exists("/opt/omnia/input/.skip_slurm_conf_validation") - for cfg_path_dict in cnfg_src: + cnfg_src = [clst.get('config_sources', {}) for clst in data.get('slurm_cluster')] + force_conf_list = [clst.get('force_conf', False) for clst in data.get('slurm_cluster')] + for idx, cfg_path_dict in enumerate(cnfg_src): + force_conf = force_conf_list[idx] for k,v in cfg_path_dict.items(): conf_dict = None if isinstance(v, str): @@ -1086,7 +1089,7 @@ def 
validate_omnia_config( f"provided conf path for {k} - {v} does not exist")) continue else: # path exists - if not skip_conf_validation: + if not force_conf and not skip_conf_validation: conf_dict, duplicate_keys = parse_slurm_conf(v, k, False) if duplicate_keys: errors.append( diff --git a/discovery/roles/slurm_config/tasks/build_slurm_conf.yml b/discovery/roles/slurm_config/tasks/build_slurm_conf.yml index 9d5d0f0944..40b6137172 100644 --- a/discovery/roles/slurm_config/tasks/build_slurm_conf.yml +++ b/discovery/roles/slurm_config/tasks/build_slurm_conf.yml @@ -12,6 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. --- +- name: Read NodeName parameters from iDRAC + ansible.builtin.include_tasks: read_node_idrac.yml + when: cmpt_list + loop: "{{ cmpt_list }}" + - name: Append node_params list into NodeName list ansible.builtin.set_fact: apply_config: "{{ apply_config | default({}) diff --git a/discovery/roles/slurm_config/tasks/confs.yml b/discovery/roles/slurm_config/tasks/confs.yml index c5f7953b0d..3764ecc18a 100644 --- a/discovery/roles/slurm_config/tasks/confs.yml +++ b/discovery/roles/slurm_config/tasks/confs.yml @@ -17,13 +17,16 @@ apply_config: "{{ __default_config }}" no_log: true -- name: Read NodeName parameters - ansible.builtin.include_tasks: read_node_idrac.yml - when: cmpt_list - loop: "{{ cmpt_list }}" +- name: Remove keys from conf_files if they have string values in configs_input (when force_conf is true) + ansible.builtin.set_fact: + conf_files: "{{ conf_files | difference(configs_input | dict2items | selectattr('value', 'string') | map(attribute='key') | list) }}" + when: + - force_conf | default(false) + - configs_input is defined - name: Build slurm.conf ansible.builtin.include_tasks: build_slurm_conf.yml + when: "'slurm' in conf_files" - name: Slurm dbd opts ansible.builtin.set_fact: @@ -167,12 +170,13 @@ - name: Generate slurmd opts for Configless # TODO: Move to $SLURMD_OPTIONS 
/etc/default/slurmd ansible.builtin.set_fact: conf_server: "--conf-server {{ ctld_list | map('regex_replace', '$', ':' ~ (slurm_conf_dict.get('SlurmctldPort', '6817') | string)) | join(',') }}" + when: slurm_conf_dict is defined - name: Write merged .conf ansible.builtin.copy: content: "{{ item.ini_lines | join('\n') }}\n" dest: "{{ slurm_config_path }}/{{ ctld_list[0] }}/etc/slurm/{{ item.item.key }}.conf" - mode: "0640" + mode: "{{ slurm_dbd_mode if item.item.key == 'slurmdbd' else slurm_mode }}" owner: "{{ slurm_user }}" group: "{{ slurm_user_group }}" remote_src: "{{ copy_from_oim }}" diff --git a/discovery/roles/slurm_config/tasks/create_slurm_dir.yml b/discovery/roles/slurm_config/tasks/create_slurm_dir.yml index e4ac760d77..f2182db18e 100644 --- a/discovery/roles/slurm_config/tasks/create_slurm_dir.yml +++ b/discovery/roles/slurm_config/tasks/create_slurm_dir.yml @@ -60,6 +60,7 @@ ansible.builtin.set_fact: cluster_name: "{{ slurm_cluster[0].cluster_name }}" configs_input: "{{ slurm_cluster[0].config_sources | default({}) | dict2items | rejectattr('value', 'falsy') | list | items2dict }}" + force_conf: "{{ slurm_cluster[0].force_conf | default(false) }}" slurm_config_path: "{{ share_path }}/{{ slurm_dir_name }}" controller_trackfile_path: "{{ share_path }}/ctld_track" diff --git a/discovery/roles/slurm_config/tasks/handle_forced_confs.yml b/discovery/roles/slurm_config/tasks/handle_forced_confs.yml new file mode 100644 index 0000000000..1862359cb1 --- /dev/null +++ b/discovery/roles/slurm_config/tasks/handle_forced_confs.yml @@ -0,0 +1,64 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +- name: Parse forced conf file from localhost + slurm_conf: + op: parse + conf_name: "{{ forced_conf }}" + path: "{{ configs_input[forced_conf] }}" + delegate_to: localhost + register: forced_conf_parsed + no_log: true + when: + - configs_input[forced_conf] is string + +- name: Use forced conf dict directly + ansible.builtin.set_fact: + forced_conf_dict: "{{ configs_input[forced_conf] }}" + no_log: true + when: + - configs_input[forced_conf] is mapping + +- name: Use parsed forced conf dict + ansible.builtin.set_fact: + forced_conf_dict: "{{ forced_conf_parsed.conf_dict }}" + no_log: true + when: + - configs_input[forced_conf] is string + - forced_conf_parsed is success + +- name: Convert forced conf to ini format + slurm_conf: + op: merge + conf_sources: "{{ [forced_conf_dict] }}" + conf_name: "{{ forced_conf }}" + register: forced_conf_result + delegate_to: localhost + no_log: true + when: + - forced_conf_dict is defined + +- name: Write forced .conf file as-is + ansible.builtin.copy: + content: "{{ forced_conf_result.ini_lines | join('\n') }}\n" + dest: "{{ slurm_config_path }}/{{ ctld_list[0] }}/etc/slurm/{{ forced_conf }}.conf" + mode: "0640" + owner: "{{ slurm_user }}" + group: "{{ slurm_user_group }}" + remote_src: "{{ copy_from_oim }}" + register: forced_conf_written + no_log: true + when: + - forced_conf_result is defined + - forced_conf_result.ini_lines is defined diff --git a/discovery/roles/slurm_config/tasks/remove_node.yml b/discovery/roles/slurm_config/tasks/remove_node.yml index 4dc0217559..ba93bb086a 100644 --- 
a/discovery/roles/slurm_config/tasks/remove_node.yml +++ b/discovery/roles/slurm_config/tasks/remove_node.yml @@ -30,7 +30,7 @@ - name: Update normal partition Nodes to match cmpt_list ansible.builtin.set_fact: updated_partitions: "{{ updated_partitions | default([]) - + [item | combine({'Nodes': cmpt_list | join(',')}) if item.PartitionName == slurm_partition_name else item] }}" + + [item | combine({'Nodes': (cmpt_list | join(',')) if cmpt_list | length > 0 else 'ALL'}) if item.PartitionName == slurm_partition_name else item] }}" loop: "{{ slurm_conf_dict.PartitionName | default([]) }}" when: - "'slurm' in conf_merge_dict" diff --git a/discovery/roles/slurm_config/vars/main.yml b/discovery/roles/slurm_config/vars/main.yml index 39311ca64d..d708eb0777 100644 --- a/discovery/roles/slurm_config/vars/main.yml +++ b/discovery/roles/slurm_config/vars/main.yml @@ -68,6 +68,7 @@ gpu_slurm_conf: SlurmdParameters: l3cache_as_socket innodb_buffer_pool_size: 4G innodb_lock_wait_timeout: 900 +conf_server: "--conf-server {{ ctld_list | join(',') }}" # TODO tmp nodes_yaml: "{{ hostvars['localhost']['oim_shared_path'] }}/omnia/openchami/workdir/nodes/nodes.yaml" bmc_username: "{{ hostvars['localhost']['bmc_username'] }}" @@ -125,7 +126,7 @@ dbd_slurm_conf: AccountingStorageType: accounting_storage/slurmdbd partition_params: PartitionName: "{{ slurm_partition_name }}" - Nodes: "{{ cmpt_list | join(',') }}" + Nodes: "{{ cmpt_list | join(',') if cmpt_list else 'ALL' }}" MaxTime: "INFINITE" State: "UP" Default: "YES" diff --git a/input/omnia_config.yml b/input/omnia_config.yml index bb5a4f06fa..75cc599c81 100644 --- a/input/omnia_config.yml +++ b/input/omnia_config.yml @@ -27,6 +27,15 @@ # Storage name corresponding to the NFS share to be used by slurm cluster # This should match with exactly with a entry in storage_config.yml +# force_conf +# Variable indicates whether a specific configuration file path +# under config_sources should be used as-is without merging +# If force_conf is 
set to true for a configuration source path, +# that configuration file will be applied directly +# without merging with defaults or existing configurations +# It accepts true and false values +# Default value is false + # config_sources # defines how the Slurm configuration files are provided to the cluster. # : @@ -50,6 +59,7 @@ slurm_cluster: - cluster_name: slurm_cluster nfs_storage_name: nfs_slurm + # force_conf: true # config_sources: # slurm: # SlurmctldTimeout: 60 From 123df9514617a76a2fc6b376baca3070d92cf951 Mon Sep 17 00:00:00 2001 From: Jagadeesh N V Date: Thu, 19 Feb 2026 17:24:44 +0530 Subject: [PATCH 47/77] removede new file --- .../tasks/handle_forced_confs.yml | 64 ------------------- 1 file changed, 64 deletions(-) delete mode 100644 discovery/roles/slurm_config/tasks/handle_forced_confs.yml diff --git a/discovery/roles/slurm_config/tasks/handle_forced_confs.yml b/discovery/roles/slurm_config/tasks/handle_forced_confs.yml deleted file mode 100644 index 1862359cb1..0000000000 --- a/discovery/roles/slurm_config/tasks/handle_forced_confs.yml +++ /dev/null @@ -1,64 +0,0 @@ -# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
---- -- name: Parse forced conf file from localhost - slurm_conf: - op: parse - conf_name: "{{ forced_conf }}" - path: "{{ configs_input[forced_conf] }}" - delegate_to: localhost - register: forced_conf_parsed - no_log: true - when: - - configs_input[forced_conf] is string - -- name: Use forced conf dict directly - ansible.builtin.set_fact: - forced_conf_dict: "{{ configs_input[forced_conf] }}" - no_log: true - when: - - configs_input[forced_conf] is mapping - -- name: Use parsed forced conf dict - ansible.builtin.set_fact: - forced_conf_dict: "{{ forced_conf_parsed.conf_dict }}" - no_log: true - when: - - configs_input[forced_conf] is string - - forced_conf_parsed is success - -- name: Convert forced conf to ini format - slurm_conf: - op: merge - conf_sources: "{{ [forced_conf_dict] }}" - conf_name: "{{ forced_conf }}" - register: forced_conf_result - delegate_to: localhost - no_log: true - when: - - forced_conf_dict is defined - -- name: Write forced .conf file as-is - ansible.builtin.copy: - content: "{{ forced_conf_result.ini_lines | join('\n') }}\n" - dest: "{{ slurm_config_path }}/{{ ctld_list[0] }}/etc/slurm/{{ forced_conf }}.conf" - mode: "0640" - owner: "{{ slurm_user }}" - group: "{{ slurm_user_group }}" - remote_src: "{{ copy_from_oim }}" - register: forced_conf_written - no_log: true - when: - - forced_conf_result is defined - - forced_conf_result.ini_lines is defined From d78a74a04c8fb9e0555196a6be0287d0e2f4326d Mon Sep 17 00:00:00 2001 From: mithileshreddy04 Date: Thu, 19 Feb 2026 17:40:37 +0530 Subject: [PATCH 48/77] Lock Mechanism added for Upgrade Sequence Integrity --- build_image_aarch64/build_image_aarch64.yml | 3 + build_image_x86_64/build_image_x86_64.yml | 3 + discovery/discovery.yml | 3 + local_repo/local_repo.yml | 3 + omnia.sh | 87 ++++++++++++++----- prepare_oim/prepare_oim.yml | 3 + .../tasks/display_warnings.yml | 2 + upgrade/upgrade_omnia.yml | 10 +++ utils/upgrade_checkup.yml | 33 +++++++ 9 files changed, 125 insertions(+), 22 
deletions(-) create mode 100644 utils/upgrade_checkup.yml diff --git a/build_image_aarch64/build_image_aarch64.yml b/build_image_aarch64/build_image_aarch64.yml index 08ee0b4ad8..d5dc76a82d 100644 --- a/build_image_aarch64/build_image_aarch64.yml +++ b/build_image_aarch64/build_image_aarch64.yml @@ -13,6 +13,9 @@ # limitations under the License. --- +- name: Check if upgrade is in progress + ansible.builtin.import_playbook: ../utils/upgrade_checkup.yml + - name: Set_fact for fetch omnia config credentials hosts: localhost connection: local diff --git a/build_image_x86_64/build_image_x86_64.yml b/build_image_x86_64/build_image_x86_64.yml index 676d8adbd6..8f56b86ef6 100644 --- a/build_image_x86_64/build_image_x86_64.yml +++ b/build_image_x86_64/build_image_x86_64.yml @@ -13,6 +13,9 @@ # limitations under the License. --- +- name: Check if upgrade is in progress + ansible.builtin.import_playbook: ../utils/upgrade_checkup.yml + - name: Set_fact for fetch omnia config credentials hosts: localhost connection: local diff --git a/discovery/discovery.yml b/discovery/discovery.yml index 75efadb47c..40fd00123c 100644 --- a/discovery/discovery.yml +++ b/discovery/discovery.yml @@ -12,6 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. --- +- name: Check if upgrade is in progress + ansible.builtin.import_playbook: ../utils/upgrade_checkup.yml + - name: Include input project directory when: not project_dir_status | default(false) | bool ansible.builtin.import_playbook: ../utils/include_input_dir.yml diff --git a/local_repo/local_repo.yml b/local_repo/local_repo.yml index 3a743c3f47..963715b5e3 100644 --- a/local_repo/local_repo.yml +++ b/local_repo/local_repo.yml @@ -13,6 +13,9 @@ # limitations under the License. 
--- +- name: Check if upgrade is in progress + ansible.builtin.import_playbook: ../utils/upgrade_checkup.yml + - name: Set_fact for fetch omnia config credentials hosts: localhost connection: local diff --git a/omnia.sh b/omnia.sh index 3b320b0bf6..25cfb01dec 100755 --- a/omnia.sh +++ b/omnia.sh @@ -398,6 +398,19 @@ cleanup_omnia_core() { # Fetch the configuration from the Omnia core container. fetch_config + # Clear upgrade guard lock if present (shared path visible to container and host) + local upgrade_guard_lock_container="/opt/omnia/.data/upgrade_in_progress.lock" + local upgrade_guard_lock_host + upgrade_guard_lock_host=$(podman exec -u root omnia_core grep '^oim_shared_path:' /opt/omnia/.data/oim_metadata.yml 2>/dev/null | cut -d':' -f2- | tr -d ' \t\n\r') + if [ -n "$upgrade_guard_lock_host" ]; then + upgrade_guard_lock_host="$upgrade_guard_lock_host/omnia/.data/upgrade_in_progress.lock" + else + upgrade_guard_lock_host="$upgrade_guard_lock_container" + fi + + rm -f "$upgrade_guard_lock_host" >/dev/null 2>&1 || true + echo "[INFO] [CLEANUP] Cleared upgrade guard lock (if present): $upgrade_guard_lock_host" + # Remove the container remove_container @@ -1837,6 +1850,22 @@ upgrade_omnia_core() { touch "$lock_file" trap 'rm -f "$lock_file"' EXIT + # Create upgrade guard lock in shared path so other playbooks can block during upgrade + local upgrade_guard_lock_container="/opt/omnia/.data/upgrade_in_progress.lock" + local upgrade_guard_lock_host + upgrade_guard_lock_host=$(podman exec -u root omnia_core grep '^oim_shared_path:' /opt/omnia/.data/oim_metadata.yml 2>/dev/null | cut -d':' -f2- | tr -d ' \t\n\r') + if [ -n "$upgrade_guard_lock_host" ]; then + upgrade_guard_lock_host="$upgrade_guard_lock_host/omnia/.data/upgrade_in_progress.lock" + else + upgrade_guard_lock_host="$upgrade_guard_lock_container" + fi + + mkdir -p "$(dirname "$upgrade_guard_lock_host")" 2>/dev/null || true + echo "Upgrade in progress. Complete upgrade_omnia.yml or rollback to clear." 
> "$upgrade_guard_lock_host" || { + echo -e "${RED}ERROR: Failed to create upgrade guard lock: $upgrade_guard_lock_host${NC}" + exit 1 + } + # Run upgrade phases if ! phase1_validate; then echo "[ERROR] [ORCHESTRATOR] Upgrade failed in Phase 1" @@ -1874,8 +1903,10 @@ upgrade_omnia_core() { echo "[INFO] [ORCHESTRATOR] Upgrade completed successfully" echo "[INFO] [ORCHESTRATOR] Backup location (inside omnia_core container): $backup_base" + # Seed inputs and defaults after upgrade + post_setup_config + show_post_upgrade_instructions "$TARGET_OMNIA_VERSION" - # Initialize SSH config and start container session init_ssh_config start_container_session @@ -1885,15 +1916,15 @@ upgrade_omnia_core() { # Validate backup directory structure and files validate_backup_directory() { local backup_path="$1" - + echo "[INFO] [ROLLBACK] Validating backup directory: $backup_path" - + # Check if backup directory exists if ! podman exec -u root omnia_core test -d "$backup_path"; then echo "[ERROR] [ROLLBACK] Backup directory does not exist: $backup_path" return 1 fi - + # Check for required subdirectories for subdir in input metadata configs; do if ! podman exec -u root omnia_core test -d "$backup_path/$subdir"; then @@ -1901,24 +1932,24 @@ validate_backup_directory() { return 1 fi done - + # Check for required files if ! podman exec -u root omnia_core test -f "$backup_path/metadata/oim_metadata.yml"; then echo "[ERROR] [ROLLBACK] Missing metadata file: $backup_path/metadata/oim_metadata.yml" return 1 fi - + if ! podman exec -u root omnia_core test -f "$backup_path/configs/omnia_core.container"; then echo "[ERROR] [ROLLBACK] Missing container config: $backup_path/configs/omnia_core.container" return 1 fi - + # Verify metadata contains version information if ! 
podman exec -u root omnia_core grep -q "^omnia_version:" "$backup_path/metadata/oim_metadata.yml"; then echo "[ERROR] [ROLLBACK] Metadata file does not contain version information" return 1 fi - + echo "[INFO] [ROLLBACK] Backup validation successful" return 0 } @@ -1927,15 +1958,15 @@ validate_backup_directory() { stop_container_gracefully() { local container_name="$1" local timeout="${2:-30}" - + echo "[INFO] [ROLLBACK] Stopping $container_name container gracefully..." - + # Try graceful stop first if podman stop -t "$timeout" "$container_name" >/dev/null 2>&1; then echo "[INFO] [ROLLBACK] Container stopped gracefully" return 0 fi - + # Check if container is still running if podman ps --format '{{.Names}}' | grep -qw "$container_name"; then echo "[WARN] [ROLLBACK] Graceful stop failed, force stopping container..." @@ -1947,16 +1978,16 @@ stop_container_gracefully() { return 1 fi fi - + return 0 } # Restore files from backup restore_from_backup() { local backup_path="$1" - + echo "[INFO] [ROLLBACK] Restoring from backup: $backup_path" - + # Restore input files if ! podman exec -u root omnia_core bash -c " set -e @@ -1966,19 +1997,19 @@ restore_from_backup() { echo "[ERROR] [ROLLBACK] Failed to restore input files" return 1 fi - + # Restore metadata if ! podman exec -u root omnia_core cp -a "$backup_path/metadata/oim_metadata.yml" /opt/omnia/.data/; then echo "[ERROR] [ROLLBACK] Failed to restore metadata" return 1 fi - + # Restore container config on host if ! podman cp "omnia_core:$backup_path/configs/omnia_core.container" /etc/containers/systemd/; then echo "[ERROR] [ROLLBACK] Failed to restore container config" return 1 fi - + echo "[INFO] [ROLLBACK] Files restored successfully" return 0 } @@ -2006,8 +2037,8 @@ display_cleanup_instructions() { echo -e "${YELLOW}1. Remove all container definitions: cd /etc/containers/systemd${NC}" echo -e "${YELLOW}2. Delete all container files: rm -rf *${NC}" echo -e "${YELLOW}3. 
Reload systemd daemon: systemctl daemon-reload${NC}" - echo -e "${YELLOW}4. Stop all containers: podman stop \$(podman ps -aq)${NC}" - echo -e "${YELLOW}5. Remove all containers: podman rm -f \$(podman ps -aq)${NC}" + echo -e "${YELLOW}4. Stop all containers: podman stop $(podman ps -aq)${NC}" + echo -e "${YELLOW}5. Remove all containers: podman rm -f $(podman ps -aq)${NC}" echo -e "${YELLOW}6. Clean shared path: rm -rf ${NC}" echo -e "${YELLOW}7. Install required version: ./omnia.sh --install${NC}" echo "" @@ -2015,7 +2046,6 @@ display_cleanup_instructions() { echo "" } -# Main rollback function rollback_omnia_core() { echo -e "${GREEN}================================================================================${NC}" echo -e "${GREEN} OMNIA CORE ROLLBACK${NC}" @@ -2287,7 +2317,20 @@ rollback_omnia_core() { # Clean up lock file before starting long-running ssh session rm -f "$lock_file" >/dev/null 2>&1 || true echo "[INFO] Rollback lock file removed before starting container session" - + + # Clear upgrade guard lock if it exists (shared path visible to container and host) + local upgrade_guard_lock_container="/opt/omnia/.data/upgrade_in_progress.lock" + local upgrade_guard_lock_host + upgrade_guard_lock_host=$(podman exec -u root omnia_core grep '^oim_shared_path:' /opt/omnia/.data/oim_metadata.yml 2>/dev/null | cut -d':' -f2- | tr -d ' \t\n\r') + if [ -n "$upgrade_guard_lock_host" ]; then + upgrade_guard_lock_host="$upgrade_guard_lock_host/omnia/.data/upgrade_in_progress.lock" + else + upgrade_guard_lock_host="$upgrade_guard_lock_container" + fi + + rm -f "$upgrade_guard_lock_host" >/dev/null 2>&1 || true + echo "[INFO] [ROLLBACK] Cleared upgrade guard lock: $upgrade_guard_lock_host" + # Initialize SSH config and start container session init_ssh_config start_container_session @@ -2325,4 +2368,4 @@ main() { } # Call the main function -main "$1" +main "$1" \ No newline at end of file diff --git a/prepare_oim/prepare_oim.yml b/prepare_oim/prepare_oim.yml index 
50c48fd3e5..f5ea607994 100644 --- a/prepare_oim/prepare_oim.yml +++ b/prepare_oim/prepare_oim.yml @@ -13,6 +13,9 @@ # limitations under the License. --- +- name: Check if upgrade is in progress + ansible.builtin.import_playbook: ../utils/upgrade_checkup.yml + - name: Set_fact for fetch omnia config credentials hosts: localhost connection: local diff --git a/upgrade/roles/import_input_parameters/tasks/display_warnings.yml b/upgrade/roles/import_input_parameters/tasks/display_warnings.yml index 2cc6dfed26..444869291b 100644 --- a/upgrade/roles/import_input_parameters/tasks/display_warnings.yml +++ b/upgrade/roles/import_input_parameters/tasks/display_warnings.yml @@ -29,6 +29,7 @@ - name: Pause for user to review warnings ansible.builtin.pause: + seconds: 30 prompt: | ╔════════════════════════════════════════════╗ ║ ⚠️ UPGRADE WARNINGS REVIEW ⚠️ ║ @@ -42,6 +43,7 @@ Please review these warnings carefully. Press ENTER to continue or CTRL+C to abort. + Continuing automatically in 30 seconds... when: - upgrade_warnings is defined - upgrade_warnings | length > 0 diff --git a/upgrade/upgrade_omnia.yml b/upgrade/upgrade_omnia.yml index 61050ec244..ade6b1f173 100644 --- a/upgrade/upgrade_omnia.yml +++ b/upgrade/upgrade_omnia.yml @@ -18,3 +18,13 @@ - name: Upgrade cluster tasks ansible.builtin.import_playbook: upgrade_cluster.yml + +- name: Clear upgrade guard lock + hosts: localhost + connection: local + gather_facts: false + tasks: + - name: Remove upgrade guard lock + ansible.builtin.file: + path: /opt/omnia/.data/upgrade_in_progress.lock + state: absent diff --git a/utils/upgrade_checkup.yml b/utils/upgrade_checkup.yml new file mode 100644 index 0000000000..5fb8582000 --- /dev/null +++ b/utils/upgrade_checkup.yml @@ -0,0 +1,33 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +- name: "Guard: block if upgrade is in progress" + hosts: localhost + connection: local + gather_facts: false + tasks: + - name: Check upgrade lock file + ansible.builtin.stat: + path: /opt/omnia/.data/upgrade_in_progress.lock + register: upgrade_lock + + - name: Block playbook while upgrade is in progress + ansible.builtin.fail: + msg: >- + Upgrade is not completed fully. + Please run upgrade_omnia.yml to complete upgrade before running any other playbook using the below command: + "ansible-playbook /omnia/upgrade/upgrade_omnia.yml" + If you don't require input files to be migrated, reconfigure the default input files, remove the lock file using the following command + "rm /opt/omnia/.data/upgrade_in_progress.lock" and then proceed. 
+ when: upgrade_lock.stat.exists From 3a9ef0a8a231bf308ab5953450d01014dcab518f Mon Sep 17 00:00:00 2001 From: Jagadeesh N V Date: Thu, 19 Feb 2026 17:58:30 +0530 Subject: [PATCH 49/77] Renamed force_conf to skip_merge --- .../input_validation/schema/omnia_config.json | 2 +- .../validation_flows/common_validation.py | 8 ++++---- discovery/roles/slurm_config/tasks/confs.yml | 4 ++-- discovery/roles/slurm_config/tasks/create_slurm_dir.yml | 2 +- input/omnia_config.yml | 6 +++--- 5 files changed, 11 insertions(+), 11 deletions(-) diff --git a/common/library/module_utils/input_validation/schema/omnia_config.json b/common/library/module_utils/input_validation/schema/omnia_config.json index f7771d9441..ca7266124c 100644 --- a/common/library/module_utils/input_validation/schema/omnia_config.json +++ b/common/library/module_utils/input_validation/schema/omnia_config.json @@ -19,7 +19,7 @@ "minLength": 1, "description": "Name of the nfs storage in storage_config.yml" }, - "force_conf": { + "skip_merge": { "type": "boolean", "description": "Variable indicates whether a specific configuration file path under config_sources should be used as-is without merging" }, diff --git a/common/library/module_utils/input_validation/validation_flows/common_validation.py b/common/library/module_utils/input_validation/validation_flows/common_validation.py index 7726df24fb..36f55130d4 100644 --- a/common/library/module_utils/input_validation/validation_flows/common_validation.py +++ b/common/library/module_utils/input_validation/validation_flows/common_validation.py @@ -1074,12 +1074,12 @@ def validate_omnia_config( "slurm NFS not provided", f"NFS name {', '.join(diff_set)} required for slurm is not defined in {storage_config}" )) - + skip_conf_validation = os.path.exists("/opt/omnia/input/.skip_slurm_conf_validation") cnfg_src = [clst.get('config_sources', {}) for clst in data.get('slurm_cluster')] - force_conf_list = [clst.get('force_conf', False) for clst in data.get('slurm_cluster')] + 
skip_merge_list = [clst.get('skip_merge', False) for clst in data.get('slurm_cluster')] for idx, cfg_path_dict in enumerate(cnfg_src): - force_conf = force_conf_list[idx] + skip_merge = skip_merge_list[idx] for k,v in cfg_path_dict.items(): conf_dict = None if isinstance(v, str): @@ -1089,7 +1089,7 @@ def validate_omnia_config( f"provided conf path for {k} - {v} does not exist")) continue else: # path exists - if not force_conf and not skip_conf_validation: + if not skip_merge and not skip_conf_validation: conf_dict, duplicate_keys = parse_slurm_conf(v, k, False) if duplicate_keys: errors.append( diff --git a/discovery/roles/slurm_config/tasks/confs.yml b/discovery/roles/slurm_config/tasks/confs.yml index 3764ecc18a..1e5a4e507e 100644 --- a/discovery/roles/slurm_config/tasks/confs.yml +++ b/discovery/roles/slurm_config/tasks/confs.yml @@ -17,11 +17,11 @@ apply_config: "{{ __default_config }}" no_log: true -- name: Remove keys from conf_files if they have string values in configs_input (when force_conf is true) +- name: Remove keys from conf_files if they have string values in configs_input (when skip_merge is true) ansible.builtin.set_fact: conf_files: "{{ conf_files | difference(configs_input | dict2items | selectattr('value', 'string') | map(attribute='key') | list) }}" when: - - force_conf | default(false) + - skip_merge | default(false) - configs_input is defined - name: Build slurm.conf diff --git a/discovery/roles/slurm_config/tasks/create_slurm_dir.yml b/discovery/roles/slurm_config/tasks/create_slurm_dir.yml index f2182db18e..b68bcbbded 100644 --- a/discovery/roles/slurm_config/tasks/create_slurm_dir.yml +++ b/discovery/roles/slurm_config/tasks/create_slurm_dir.yml @@ -60,7 +60,7 @@ ansible.builtin.set_fact: cluster_name: "{{ slurm_cluster[0].cluster_name }}" configs_input: "{{ slurm_cluster[0].config_sources | default({}) | dict2items | rejectattr('value', 'falsy') | list | items2dict }}" - force_conf: "{{ slurm_cluster[0].force_conf | default(false) }}" + 
skip_merge: "{{ slurm_cluster[0].skip_merge | default(false) }}" slurm_config_path: "{{ share_path }}/{{ slurm_dir_name }}" controller_trackfile_path: "{{ share_path }}/ctld_track" diff --git a/input/omnia_config.yml b/input/omnia_config.yml index 75cc599c81..943d70e530 100644 --- a/input/omnia_config.yml +++ b/input/omnia_config.yml @@ -27,10 +27,10 @@ # Storage name corresponding to the NFS share to be used by slurm cluster # This should match with exactly with a entry in storage_config.yml -# force_conf +# skip_merge # Variable indicates whether a specific configuration file path # under config_sources should be used as-is without merging -# If force_conf is set to true for a configuration source path, +# If skip_merge is set to true for a configuration source path, # that configuration file will be applied directly # without merging with defaults or existing configurations # It accepts true and false values @@ -59,7 +59,7 @@ slurm_cluster: - cluster_name: slurm_cluster nfs_storage_name: nfs_slurm - # force_conf: true + # skip_merge: true # config_sources: # slurm: # SlurmctldTimeout: 60 From f12996cec9561ad0b027a3ff149468e284d760ed Mon Sep 17 00:00:00 2001 From: mithileshreddy04 Date: Fri, 20 Feb 2026 10:49:42 +0530 Subject: [PATCH 50/77] Update omnia.sh --- omnia.sh | 59 ++++++++++++++++++++++++-------------------------------- 1 file changed, 25 insertions(+), 34 deletions(-) diff --git a/omnia.sh b/omnia.sh index 25cfb01dec..530c168e7d 100755 --- a/omnia.sh +++ b/omnia.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright © 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -299,7 +299,18 @@ update_metadata_upgrade_backup_dir() { " } - +# Resolve the upgrade guard lock path (container or host shared path) +get_upgrade_guard_lock_path() { + local upgrade_guard_lock_container="/opt/omnia/.data/upgrade_in_progress.lock" + local upgrade_guard_lock_host + upgrade_guard_lock_host=$(podman exec -u root omnia_core grep '^oim_shared_path:' /opt/omnia/.data/oim_metadata.yml 2>/dev/null | cut -d':' -f2- | tr -d ' \t\n\r') + if [ -n "$upgrade_guard_lock_host" ]; then + upgrade_guard_lock_host="$upgrade_guard_lock_host/omnia/.data/upgrade_in_progress.lock" + else + upgrade_guard_lock_host="$upgrade_guard_lock_container" + fi + echo "$upgrade_guard_lock_host" +} check_internal_nfs_export() { nfs_server_ip=$1 @@ -399,17 +410,9 @@ cleanup_omnia_core() { fetch_config # Clear upgrade guard lock if present (shared path visible to container and host) - local upgrade_guard_lock_container="/opt/omnia/.data/upgrade_in_progress.lock" - local upgrade_guard_lock_host - upgrade_guard_lock_host=$(podman exec -u root omnia_core grep '^oim_shared_path:' /opt/omnia/.data/oim_metadata.yml 2>/dev/null | cut -d':' -f2- | tr -d ' \t\n\r') - if [ -n "$upgrade_guard_lock_host" ]; then - upgrade_guard_lock_host="$upgrade_guard_lock_host/omnia/.data/upgrade_in_progress.lock" - else - upgrade_guard_lock_host="$upgrade_guard_lock_container" - fi - - rm -f "$upgrade_guard_lock_host" >/dev/null 2>&1 || true - echo "[INFO] [CLEANUP] Cleared upgrade guard lock (if present): $upgrade_guard_lock_host" + local upgrade_guard_lock_path=$(get_upgrade_guard_lock_path) + rm -f "$upgrade_guard_lock_path" >/dev/null 2>&1 || true + echo "[INFO] [CLEANUP] Cleared upgrade guard lock (if present): $upgrade_guard_lock_path" # Remove the container remove_container @@ -1851,18 +1854,12 @@ upgrade_omnia_core() { trap 'rm -f "$lock_file"' EXIT # Create upgrade guard lock in shared path so other playbooks can block during upgrade - local 
upgrade_guard_lock_container="/opt/omnia/.data/upgrade_in_progress.lock" - local upgrade_guard_lock_host - upgrade_guard_lock_host=$(podman exec -u root omnia_core grep '^oim_shared_path:' /opt/omnia/.data/oim_metadata.yml 2>/dev/null | cut -d':' -f2- | tr -d ' \t\n\r') - if [ -n "$upgrade_guard_lock_host" ]; then - upgrade_guard_lock_host="$upgrade_guard_lock_host/omnia/.data/upgrade_in_progress.lock" - else - upgrade_guard_lock_host="$upgrade_guard_lock_container" - fi + local upgrade_guard_lock_path + upgrade_guard_lock_path=$(get_upgrade_guard_lock_path) - mkdir -p "$(dirname "$upgrade_guard_lock_host")" 2>/dev/null || true - echo "Upgrade in progress. Complete upgrade_omnia.yml or rollback to clear." > "$upgrade_guard_lock_host" || { - echo -e "${RED}ERROR: Failed to create upgrade guard lock: $upgrade_guard_lock_host${NC}" + mkdir -p "$(dirname "$upgrade_guard_lock_path")" 2>/dev/null || true + echo "Upgrade in progress. Complete upgrade_omnia.yml or rollback to clear." > "$upgrade_guard_lock_path" || { + echo -e "${RED}ERROR: Failed to create upgrade guard lock: $upgrade_guard_lock_path${NC}" exit 1 } @@ -2319,17 +2316,11 @@ rollback_omnia_core() { echo "[INFO] Rollback lock file removed before starting container session" # Clear upgrade guard lock if it exists (shared path visible to container and host) - local upgrade_guard_lock_container="/opt/omnia/.data/upgrade_in_progress.lock" - local upgrade_guard_lock_host - upgrade_guard_lock_host=$(podman exec -u root omnia_core grep '^oim_shared_path:' /opt/omnia/.data/oim_metadata.yml 2>/dev/null | cut -d':' -f2- | tr -d ' \t\n\r') - if [ -n "$upgrade_guard_lock_host" ]; then - upgrade_guard_lock_host="$upgrade_guard_lock_host/omnia/.data/upgrade_in_progress.lock" - else - upgrade_guard_lock_host="$upgrade_guard_lock_container" - fi + local upgrade_guard_lock_path + upgrade_guard_lock_path=$(get_upgrade_guard_lock_path) - rm -f "$upgrade_guard_lock_host" >/dev/null 2>&1 || true - echo "[INFO] [ROLLBACK] Cleared 
upgrade guard lock: $upgrade_guard_lock_host" + rm -f "$upgrade_guard_lock_path" >/dev/null 2>&1 || true + echo "[INFO] [ROLLBACK] Cleared upgrade guard lock: $upgrade_guard_lock_path" # Initialize SSH config and start container session init_ssh_config From 08dd3e9c06dd86f6b3314cc0f96572218cd42a5f Mon Sep 17 00:00:00 2001 From: Nagachandan-P Date: Fri, 20 Feb 2026 05:30:41 +0000 Subject: [PATCH 51/77] auto-backup of slurm-confs --- discovery/roles/slurm_config/tasks/confs.yml | 57 ++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/discovery/roles/slurm_config/tasks/confs.yml b/discovery/roles/slurm_config/tasks/confs.yml index c5f7953b0d..1885347260 100644 --- a/discovery/roles/slurm_config/tasks/confs.yml +++ b/discovery/roles/slurm_config/tasks/confs.yml @@ -191,6 +191,63 @@ loop_control: loop_var: extra_conf +- name: Backup Slurm configuration files when changed + when: + - ctld_conf_files is changed + - ctld_list is defined + - ctld_list | length > 0 + block: + - name: Set backup timestamp + ansible.builtin.set_fact: + backup_timestamp: "{{ ansible_date_time.date }}_{{ ansible_date_time.time | replace(':', '-') }}" + backup_base_name: "auto_backup_discovery" + + - name: Set backup name suffix + ansible.builtin.set_fact: + backup_name_suffix: "{{ backup_base_name ~ '_' ~ backup_timestamp }}" + + - name: Set backup directories + ansible.builtin.set_fact: + slurm_backups_root: "{{ share_path }}/slurm_backups" + backup_dir: "{{ share_path }}/slurm_backups/{{ backup_base_name ~ '_' ~ backup_timestamp }}" + + - name: Ensure slurm backups root exists + ansible.builtin.file: + path: "{{ slurm_backups_root }}" + state: directory + mode: '0755' + + - name: Create backup directory + ansible.builtin.file: + path: "{{ backup_dir }}" + state: directory + mode: '0755' + + - name: Create backup config directories + ansible.builtin.file: + path: "{{ backup_dir }}/{{ ctld_list[0] }}/{{ item }}" + state: directory + mode: '0755' + loop: + - etc/slurm + - etc/munge 
+ - etc/my.cnf.d + + - name: Backup controller config directories + ansible.builtin.command: >- + cp -a "{{ slurm_config_path }}/{{ ctld_list[0] }}/{{ item }}/." "{{ backup_dir }}/{{ ctld_list[0] }}/{{ item }}/" + loop: + - etc/slurm + - etc/munge + - etc/my.cnf.d + changed_when: true + failed_when: false + + - name: Display backup location + ansible.builtin.debug: + msg: "Slurm config backup created at: {{ backup_dir }}/{{ ctld_list[0] }}" + + - name: Check if cluster running ansible.builtin.include_tasks: check_ctld_running.yml when: From 7169855081ecefbf6183cc218cd41896683ce49b Mon Sep 17 00:00:00 2001 From: mithileshreddy04 Date: Fri, 20 Feb 2026 15:09:06 +0530 Subject: [PATCH 52/77] upgrade utility added to oim_cleanup and credential utility --- utils/credential_utility/get_config_credentials.yml | 4 ++++ utils/oim_cleanup.yml | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/utils/credential_utility/get_config_credentials.yml b/utils/credential_utility/get_config_credentials.yml index 0e4c323b94..b77ba14b9b 100644 --- a/utils/credential_utility/get_config_credentials.yml +++ b/utils/credential_utility/get_config_credentials.yml @@ -13,6 +13,10 @@ # limitations under the License. --- +- name: Check if upgrade is in progress + ansible.builtin.import_playbook: ../upgrade_checkup.yml + tags: always + - name: Include input project directory when: not project_dir_status | default(false) | bool ansible.builtin.import_playbook: ../include_input_dir.yml diff --git a/utils/oim_cleanup.yml b/utils/oim_cleanup.yml index edb9cfb207..4d959d5ea4 100644 --- a/utils/oim_cleanup.yml +++ b/utils/oim_cleanup.yml @@ -13,6 +13,10 @@ # limitations under the License. 
--- +- name: Check if upgrade is in progress + ansible.builtin.import_playbook: upgrade_checkup.yml + tags: always + - name: Include input project directory when: not project_dir_status | default(false) | bool ansible.builtin.import_playbook: include_input_dir.yml From c42782c8481703c5d0c10ba3e36ee7e242bd0304 Mon Sep 17 00:00:00 2001 From: mithileshreddy04 Date: Fri, 20 Feb 2026 17:54:24 +0530 Subject: [PATCH 53/77] Lock Mechanism for Upgrade Sequence Integrity (#3994) --- build_image_aarch64/build_image_aarch64.yml | 3 + build_image_x86_64/build_image_x86_64.yml | 3 + discovery/discovery.yml | 3 + local_repo/local_repo.yml | 3 + omnia.sh | 82 +++++++++++++------ prepare_oim/prepare_oim.yml | 3 + .../tasks/display_warnings.yml | 2 + upgrade/upgrade_omnia.yml | 10 +++ .../get_config_credentials.yml | 4 + utils/oim_cleanup.yml | 4 + utils/upgrade_checkup.yml | 33 ++++++++ 11 files changed, 126 insertions(+), 24 deletions(-) create mode 100644 utils/upgrade_checkup.yml diff --git a/build_image_aarch64/build_image_aarch64.yml b/build_image_aarch64/build_image_aarch64.yml index 08ee0b4ad8..d5dc76a82d 100644 --- a/build_image_aarch64/build_image_aarch64.yml +++ b/build_image_aarch64/build_image_aarch64.yml @@ -13,6 +13,9 @@ # limitations under the License. --- +- name: Check if upgrade is in progress + ansible.builtin.import_playbook: ../utils/upgrade_checkup.yml + - name: Set_fact for fetch omnia config credentials hosts: localhost connection: local diff --git a/build_image_x86_64/build_image_x86_64.yml b/build_image_x86_64/build_image_x86_64.yml index 676d8adbd6..8f56b86ef6 100644 --- a/build_image_x86_64/build_image_x86_64.yml +++ b/build_image_x86_64/build_image_x86_64.yml @@ -13,6 +13,9 @@ # limitations under the License. 
--- +- name: Check if upgrade is in progress + ansible.builtin.import_playbook: ../utils/upgrade_checkup.yml + - name: Set_fact for fetch omnia config credentials hosts: localhost connection: local diff --git a/discovery/discovery.yml b/discovery/discovery.yml index 75efadb47c..40fd00123c 100644 --- a/discovery/discovery.yml +++ b/discovery/discovery.yml @@ -12,6 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. --- +- name: Check if upgrade is in progress + ansible.builtin.import_playbook: ../utils/upgrade_checkup.yml + - name: Include input project directory when: not project_dir_status | default(false) | bool ansible.builtin.import_playbook: ../utils/include_input_dir.yml diff --git a/local_repo/local_repo.yml b/local_repo/local_repo.yml index 3a743c3f47..963715b5e3 100644 --- a/local_repo/local_repo.yml +++ b/local_repo/local_repo.yml @@ -13,6 +13,9 @@ # limitations under the License. --- +- name: Check if upgrade is in progress + ansible.builtin.import_playbook: ../utils/upgrade_checkup.yml + - name: Set_fact for fetch omnia config credentials hosts: localhost connection: local diff --git a/omnia.sh b/omnia.sh index 3b320b0bf6..530c168e7d 100755 --- a/omnia.sh +++ b/omnia.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright © 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -299,7 +299,18 @@ update_metadata_upgrade_backup_dir() { " } - +# Resolve the upgrade guard lock path (container or host shared path) +get_upgrade_guard_lock_path() { + local upgrade_guard_lock_container="/opt/omnia/.data/upgrade_in_progress.lock" + local upgrade_guard_lock_host + upgrade_guard_lock_host=$(podman exec -u root omnia_core grep '^oim_shared_path:' /opt/omnia/.data/oim_metadata.yml 2>/dev/null | cut -d':' -f2- | tr -d ' \t\n\r') + if [ -n "$upgrade_guard_lock_host" ]; then + upgrade_guard_lock_host="$upgrade_guard_lock_host/omnia/.data/upgrade_in_progress.lock" + else + upgrade_guard_lock_host="$upgrade_guard_lock_container" + fi + echo "$upgrade_guard_lock_host" +} check_internal_nfs_export() { nfs_server_ip=$1 @@ -398,6 +409,11 @@ cleanup_omnia_core() { # Fetch the configuration from the Omnia core container. fetch_config + # Clear upgrade guard lock if present (shared path visible to container and host) + local upgrade_guard_lock_path=$(get_upgrade_guard_lock_path) + rm -f "$upgrade_guard_lock_path" >/dev/null 2>&1 || true + echo "[INFO] [CLEANUP] Cleared upgrade guard lock (if present): $upgrade_guard_lock_path" + # Remove the container remove_container @@ -1837,6 +1853,16 @@ upgrade_omnia_core() { touch "$lock_file" trap 'rm -f "$lock_file"' EXIT + # Create upgrade guard lock in shared path so other playbooks can block during upgrade + local upgrade_guard_lock_path + upgrade_guard_lock_path=$(get_upgrade_guard_lock_path) + + mkdir -p "$(dirname "$upgrade_guard_lock_path")" 2>/dev/null || true + echo "Upgrade in progress. Complete upgrade_omnia.yml or rollback to clear." > "$upgrade_guard_lock_path" || { + echo -e "${RED}ERROR: Failed to create upgrade guard lock: $upgrade_guard_lock_path${NC}" + exit 1 + } + # Run upgrade phases if ! 
phase1_validate; then echo "[ERROR] [ORCHESTRATOR] Upgrade failed in Phase 1" @@ -1874,8 +1900,10 @@ upgrade_omnia_core() { echo "[INFO] [ORCHESTRATOR] Upgrade completed successfully" echo "[INFO] [ORCHESTRATOR] Backup location (inside omnia_core container): $backup_base" + # Seed inputs and defaults after upgrade + post_setup_config + show_post_upgrade_instructions "$TARGET_OMNIA_VERSION" - # Initialize SSH config and start container session init_ssh_config start_container_session @@ -1885,15 +1913,15 @@ upgrade_omnia_core() { # Validate backup directory structure and files validate_backup_directory() { local backup_path="$1" - + echo "[INFO] [ROLLBACK] Validating backup directory: $backup_path" - + # Check if backup directory exists if ! podman exec -u root omnia_core test -d "$backup_path"; then echo "[ERROR] [ROLLBACK] Backup directory does not exist: $backup_path" return 1 fi - + # Check for required subdirectories for subdir in input metadata configs; do if ! podman exec -u root omnia_core test -d "$backup_path/$subdir"; then @@ -1901,24 +1929,24 @@ validate_backup_directory() { return 1 fi done - + # Check for required files if ! podman exec -u root omnia_core test -f "$backup_path/metadata/oim_metadata.yml"; then echo "[ERROR] [ROLLBACK] Missing metadata file: $backup_path/metadata/oim_metadata.yml" return 1 fi - + if ! podman exec -u root omnia_core test -f "$backup_path/configs/omnia_core.container"; then echo "[ERROR] [ROLLBACK] Missing container config: $backup_path/configs/omnia_core.container" return 1 fi - + # Verify metadata contains version information if ! 
podman exec -u root omnia_core grep -q "^omnia_version:" "$backup_path/metadata/oim_metadata.yml"; then echo "[ERROR] [ROLLBACK] Metadata file does not contain version information" return 1 fi - + echo "[INFO] [ROLLBACK] Backup validation successful" return 0 } @@ -1927,15 +1955,15 @@ validate_backup_directory() { stop_container_gracefully() { local container_name="$1" local timeout="${2:-30}" - + echo "[INFO] [ROLLBACK] Stopping $container_name container gracefully..." - + # Try graceful stop first if podman stop -t "$timeout" "$container_name" >/dev/null 2>&1; then echo "[INFO] [ROLLBACK] Container stopped gracefully" return 0 fi - + # Check if container is still running if podman ps --format '{{.Names}}' | grep -qw "$container_name"; then echo "[WARN] [ROLLBACK] Graceful stop failed, force stopping container..." @@ -1947,16 +1975,16 @@ stop_container_gracefully() { return 1 fi fi - + return 0 } # Restore files from backup restore_from_backup() { local backup_path="$1" - + echo "[INFO] [ROLLBACK] Restoring from backup: $backup_path" - + # Restore input files if ! podman exec -u root omnia_core bash -c " set -e @@ -1966,19 +1994,19 @@ restore_from_backup() { echo "[ERROR] [ROLLBACK] Failed to restore input files" return 1 fi - + # Restore metadata if ! podman exec -u root omnia_core cp -a "$backup_path/metadata/oim_metadata.yml" /opt/omnia/.data/; then echo "[ERROR] [ROLLBACK] Failed to restore metadata" return 1 fi - + # Restore container config on host if ! podman cp "omnia_core:$backup_path/configs/omnia_core.container" /etc/containers/systemd/; then echo "[ERROR] [ROLLBACK] Failed to restore container config" return 1 fi - + echo "[INFO] [ROLLBACK] Files restored successfully" return 0 } @@ -2006,8 +2034,8 @@ display_cleanup_instructions() { echo -e "${YELLOW}1. Remove all container definitions: cd /etc/containers/systemd${NC}" echo -e "${YELLOW}2. Delete all container files: rm -rf *${NC}" echo -e "${YELLOW}3. 
Reload systemd daemon: systemctl daemon-reload${NC}" - echo -e "${YELLOW}4. Stop all containers: podman stop \$(podman ps -aq)${NC}" - echo -e "${YELLOW}5. Remove all containers: podman rm -f \$(podman ps -aq)${NC}" + echo -e "${YELLOW}4. Stop all containers: podman stop $(podman ps -aq)${NC}" + echo -e "${YELLOW}5. Remove all containers: podman rm -f $(podman ps -aq)${NC}" echo -e "${YELLOW}6. Clean shared path: rm -rf ${NC}" echo -e "${YELLOW}7. Install required version: ./omnia.sh --install${NC}" echo "" @@ -2015,7 +2043,6 @@ display_cleanup_instructions() { echo "" } -# Main rollback function rollback_omnia_core() { echo -e "${GREEN}================================================================================${NC}" echo -e "${GREEN} OMNIA CORE ROLLBACK${NC}" @@ -2287,7 +2314,14 @@ rollback_omnia_core() { # Clean up lock file before starting long-running ssh session rm -f "$lock_file" >/dev/null 2>&1 || true echo "[INFO] Rollback lock file removed before starting container session" - + + # Clear upgrade guard lock if it exists (shared path visible to container and host) + local upgrade_guard_lock_path + upgrade_guard_lock_path=$(get_upgrade_guard_lock_path) + + rm -f "$upgrade_guard_lock_path" >/dev/null 2>&1 || true + echo "[INFO] [ROLLBACK] Cleared upgrade guard lock: $upgrade_guard_lock_path" + # Initialize SSH config and start container session init_ssh_config start_container_session @@ -2325,4 +2359,4 @@ main() { } # Call the main function -main "$1" +main "$1" \ No newline at end of file diff --git a/prepare_oim/prepare_oim.yml b/prepare_oim/prepare_oim.yml index 50c48fd3e5..f5ea607994 100644 --- a/prepare_oim/prepare_oim.yml +++ b/prepare_oim/prepare_oim.yml @@ -13,6 +13,9 @@ # limitations under the License. 
--- +- name: Check if upgrade is in progress + ansible.builtin.import_playbook: ../utils/upgrade_checkup.yml + - name: Set_fact for fetch omnia config credentials hosts: localhost connection: local diff --git a/upgrade/roles/import_input_parameters/tasks/display_warnings.yml b/upgrade/roles/import_input_parameters/tasks/display_warnings.yml index 2cc6dfed26..444869291b 100644 --- a/upgrade/roles/import_input_parameters/tasks/display_warnings.yml +++ b/upgrade/roles/import_input_parameters/tasks/display_warnings.yml @@ -29,6 +29,7 @@ - name: Pause for user to review warnings ansible.builtin.pause: + seconds: 30 prompt: | ╔════════════════════════════════════════════╗ ║ ⚠️ UPGRADE WARNINGS REVIEW ⚠️ ║ @@ -42,6 +43,7 @@ Please review these warnings carefully. Press ENTER to continue or CTRL+C to abort. + Continuing automatically in 30 seconds... when: - upgrade_warnings is defined - upgrade_warnings | length > 0 diff --git a/upgrade/upgrade_omnia.yml b/upgrade/upgrade_omnia.yml index 61050ec244..ade6b1f173 100644 --- a/upgrade/upgrade_omnia.yml +++ b/upgrade/upgrade_omnia.yml @@ -18,3 +18,13 @@ - name: Upgrade cluster tasks ansible.builtin.import_playbook: upgrade_cluster.yml + +- name: Clear upgrade guard lock + hosts: localhost + connection: local + gather_facts: false + tasks: + - name: Remove upgrade guard lock + ansible.builtin.file: + path: /opt/omnia/.data/upgrade_in_progress.lock + state: absent diff --git a/utils/credential_utility/get_config_credentials.yml b/utils/credential_utility/get_config_credentials.yml index 0e4c323b94..b77ba14b9b 100644 --- a/utils/credential_utility/get_config_credentials.yml +++ b/utils/credential_utility/get_config_credentials.yml @@ -13,6 +13,10 @@ # limitations under the License. 
--- +- name: Check if upgrade is in progress + ansible.builtin.import_playbook: ../upgrade_checkup.yml + tags: always + - name: Include input project directory when: not project_dir_status | default(false) | bool ansible.builtin.import_playbook: ../include_input_dir.yml diff --git a/utils/oim_cleanup.yml b/utils/oim_cleanup.yml index edb9cfb207..4d959d5ea4 100644 --- a/utils/oim_cleanup.yml +++ b/utils/oim_cleanup.yml @@ -13,6 +13,10 @@ # limitations under the License. --- +- name: Check if upgrade is in progress + ansible.builtin.import_playbook: upgrade_checkup.yml + tags: always + - name: Include input project directory when: not project_dir_status | default(false) | bool ansible.builtin.import_playbook: include_input_dir.yml diff --git a/utils/upgrade_checkup.yml b/utils/upgrade_checkup.yml new file mode 100644 index 0000000000..5fb8582000 --- /dev/null +++ b/utils/upgrade_checkup.yml @@ -0,0 +1,33 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +- name: "Guard: block if upgrade is in progress" + hosts: localhost + connection: local + gather_facts: false + tasks: + - name: Check upgrade lock file + ansible.builtin.stat: + path: /opt/omnia/.data/upgrade_in_progress.lock + register: upgrade_lock + + - name: Block playbook while upgrade is in progress + ansible.builtin.fail: + msg: >- + Upgrade is not completed fully. 
+ Please run upgrade_omnia.yml to complete upgrade before running any other playbook using the below command: + "ansible-playbook /omnia/upgrade/upgrade_omnia.yml" + If you don't require input files to be migrated, reconfigure the default input files, remove the lock file using the following command + "rm /opt/omnia/.data/upgrade_in_progress.lock" and then proceed. + when: upgrade_lock.stat.exists From d11fde8e868837f3c5403bd3b55f36b72ee60ae5 Mon Sep 17 00:00:00 2001 From: Jagadeesh N V <39791839+jagadeeshnv@users.noreply.github.com> Date: Fri, 20 Feb 2026 18:14:34 +0530 Subject: [PATCH 54/77] Slurm delete node - drain node before delete - skip_merge new option (#3986) * Node drain logic for deletion * Shell instead of command for piping * lint fixes * Updated permission for slurmdbd Added new force_conf option for allowing confs pass through validation * removede new file * Renamed force_conf to skip_merge --- .../input_validation/schema/omnia_config.json | 4 + .../validation_flows/common_validation.py | 9 +- .../slurm_config/tasks/build_slurm_conf.yml | 5 + .../slurm_config/tasks/check_ctld_running.yml | 12 +- discovery/roles/slurm_config/tasks/confs.yml | 14 ++- .../slurm_config/tasks/create_slurm_dir.yml | 1 + .../tasks/drain_and_remove_node.yml | 109 ++++++++++++++++++ .../roles/slurm_config/tasks/remove_node.yml | 2 +- discovery/roles/slurm_config/vars/main.yml | 6 +- input/omnia_config.yml | 10 ++ 10 files changed, 161 insertions(+), 11 deletions(-) create mode 100644 discovery/roles/slurm_config/tasks/drain_and_remove_node.yml diff --git a/common/library/module_utils/input_validation/schema/omnia_config.json b/common/library/module_utils/input_validation/schema/omnia_config.json index f53485770f..ca7266124c 100644 --- a/common/library/module_utils/input_validation/schema/omnia_config.json +++ b/common/library/module_utils/input_validation/schema/omnia_config.json @@ -19,6 +19,10 @@ "minLength": 1, "description": "Name of the nfs storage in 
storage_config.yml" }, + "skip_merge": { + "type": "boolean", + "description": "Variable indicates whether a specific configuration file path under config_sources should be used as-is without merging" + }, "config_sources": { "type": "object", "description": "Config can be a file path or inline mapping", diff --git a/common/library/module_utils/input_validation/validation_flows/common_validation.py b/common/library/module_utils/input_validation/validation_flows/common_validation.py index f577a4e9b8..36f55130d4 100644 --- a/common/library/module_utils/input_validation/validation_flows/common_validation.py +++ b/common/library/module_utils/input_validation/validation_flows/common_validation.py @@ -1074,9 +1074,12 @@ def validate_omnia_config( "slurm NFS not provided", f"NFS name {', '.join(diff_set)} required for slurm is not defined in {storage_config}" )) - cnfg_src = [clst.get('config_sources', {}) for clst in data.get('slurm_cluster')] + skip_conf_validation = os.path.exists("/opt/omnia/input/.skip_slurm_conf_validation") - for cfg_path_dict in cnfg_src: + cnfg_src = [clst.get('config_sources', {}) for clst in data.get('slurm_cluster')] + skip_merge_list = [clst.get('skip_merge', False) for clst in data.get('slurm_cluster')] + for idx, cfg_path_dict in enumerate(cnfg_src): + skip_merge = skip_merge_list[idx] for k,v in cfg_path_dict.items(): conf_dict = None if isinstance(v, str): @@ -1086,7 +1089,7 @@ def validate_omnia_config( f"provided conf path for {k} - {v} does not exist")) continue else: # path exists - if not skip_conf_validation: + if not skip_merge and not skip_conf_validation: conf_dict, duplicate_keys = parse_slurm_conf(v, k, False) if duplicate_keys: errors.append( diff --git a/discovery/roles/slurm_config/tasks/build_slurm_conf.yml b/discovery/roles/slurm_config/tasks/build_slurm_conf.yml index 9d5d0f0944..40b6137172 100644 --- a/discovery/roles/slurm_config/tasks/build_slurm_conf.yml +++ b/discovery/roles/slurm_config/tasks/build_slurm_conf.yml @@ 
-12,6 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. --- +- name: Read NodeName parameters from iDRAC + ansible.builtin.include_tasks: read_node_idrac.yml + when: cmpt_list + loop: "{{ cmpt_list }}" + - name: Append node_params list into NodeName list ansible.builtin.set_fact: apply_config: "{{ apply_config | default({}) diff --git a/discovery/roles/slurm_config/tasks/check_ctld_running.yml b/discovery/roles/slurm_config/tasks/check_ctld_running.yml index 7d908169ab..ce27d3c362 100644 --- a/discovery/roles/slurm_config/tasks/check_ctld_running.yml +++ b/discovery/roles/slurm_config/tasks/check_ctld_running.yml @@ -22,6 +22,16 @@ register: ssh_check ignore_errors: true +- name: Drain and remove nodes if any + ansible.builtin.include_tasks: drain_and_remove_node.yml + loop: "{{ nodes_in_normal_not_in_cmpt }}" + loop_control: + loop_var: node_to_remove + when: + - ssh_check is success + - nodes_in_normal_not_in_cmpt is defined + - nodes_in_normal_not_in_cmpt | length > 0 + - name: Enter slurm controller when pingable when: - ssh_check is success @@ -37,7 +47,7 @@ register: service_facts ignore_unreachable: true - - name: Fail if slurmctld is unreachable + - name: Check slurmctld is reachable ansible.builtin.fail: msg: "Failed to connect to {{ ctld }}." 
when: service_facts is unreachable diff --git a/discovery/roles/slurm_config/tasks/confs.yml b/discovery/roles/slurm_config/tasks/confs.yml index c5f7953b0d..1e5a4e507e 100644 --- a/discovery/roles/slurm_config/tasks/confs.yml +++ b/discovery/roles/slurm_config/tasks/confs.yml @@ -17,13 +17,16 @@ apply_config: "{{ __default_config }}" no_log: true -- name: Read NodeName parameters - ansible.builtin.include_tasks: read_node_idrac.yml - when: cmpt_list - loop: "{{ cmpt_list }}" +- name: Remove keys from conf_files if they have string values in configs_input (when skip_merge is true) + ansible.builtin.set_fact: + conf_files: "{{ conf_files | difference(configs_input | dict2items | selectattr('value', 'string') | map(attribute='key') | list) }}" + when: + - skip_merge | default(false) + - configs_input is defined - name: Build slurm.conf ansible.builtin.include_tasks: build_slurm_conf.yml + when: "'slurm' in conf_files" - name: Slurm dbd opts ansible.builtin.set_fact: @@ -167,12 +170,13 @@ - name: Generate slurmd opts for Configless # TODO: Move to $SLURMD_OPTIONS /etc/default/slurmd ansible.builtin.set_fact: conf_server: "--conf-server {{ ctld_list | map('regex_replace', '$', ':' ~ (slurm_conf_dict.get('SlurmctldPort', '6817') | string)) | join(',') }}" + when: slurm_conf_dict is defined - name: Write merged .conf ansible.builtin.copy: content: "{{ item.ini_lines | join('\n') }}\n" dest: "{{ slurm_config_path }}/{{ ctld_list[0] }}/etc/slurm/{{ item.item.key }}.conf" - mode: "0640" + mode: "{{ slurm_dbd_mode if item.item.key == 'slurmdbd' else slurm_mode }}" owner: "{{ slurm_user }}" group: "{{ slurm_user_group }}" remote_src: "{{ copy_from_oim }}" diff --git a/discovery/roles/slurm_config/tasks/create_slurm_dir.yml b/discovery/roles/slurm_config/tasks/create_slurm_dir.yml index e4ac760d77..b68bcbbded 100644 --- a/discovery/roles/slurm_config/tasks/create_slurm_dir.yml +++ b/discovery/roles/slurm_config/tasks/create_slurm_dir.yml @@ -60,6 +60,7 @@ 
ansible.builtin.set_fact: cluster_name: "{{ slurm_cluster[0].cluster_name }}" configs_input: "{{ slurm_cluster[0].config_sources | default({}) | dict2items | rejectattr('value', 'falsy') | list | items2dict }}" + skip_merge: "{{ slurm_cluster[0].skip_merge | default(false) }}" slurm_config_path: "{{ share_path }}/{{ slurm_dir_name }}" controller_trackfile_path: "{{ share_path }}/ctld_track" diff --git a/discovery/roles/slurm_config/tasks/drain_and_remove_node.yml b/discovery/roles/slurm_config/tasks/drain_and_remove_node.yml new file mode 100644 index 0000000000..da1c41d3fe --- /dev/null +++ b/discovery/roles/slurm_config/tasks/drain_and_remove_node.yml @@ -0,0 +1,109 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +- name: Check if node exists in Slurm cluster + ansible.builtin.command: scontrol show node {{ node_to_remove }} + register: node_exists_check + failed_when: false + ignore_unreachable: true + changed_when: false + delegate_to: "{{ ctld }}" + +- name: Skip if node does not exist + ansible.builtin.debug: + msg: "Node {{ node_to_remove }} not found in cluster, skipping removal" + when: + - node_exists_check is reachable + - node_exists_check.rc != 0 + +- name: Process node removal + when: + - node_exists_check is reachable + - node_exists_check.rc == 0 + ignore_unreachable: true + block: + - name: Get current job count on node + ansible.builtin.shell: + cmd: | + set -o pipefail + squeue -w {{ node_to_remove }} -h | wc -l + register: current_jobs + changed_when: false + delegate_to: "{{ ctld }}" + + - name: Display job information + ansible.builtin.debug: + msg: "Node {{ node_to_remove }} currently has {{ current_jobs.stdout }} running job(s)" + + - name: Drain the node to prevent new job assignments + ansible.builtin.command: > + scontrol update NodeName={{ node_to_remove }} + State=DRAIN + Reason="Scheduled removal - waiting for jobs to complete" + changed_when: true + delegate_to: "{{ ctld }}" + + - name: Wait for all jobs to complete on the node + ansible.builtin.shell: + cmd: | + set -o pipefail + squeue -w {{ node_to_remove }} -h | wc -l + register: job_count_check + until: job_count_check.stdout | int == 0 + retries: "{{ (node_drain_timeout / node_drain_delay) | int }}" + delay: "{{ node_drain_delay }}" + changed_when: false + delegate_to: "{{ ctld }}" + when: current_jobs.stdout | int > 0 + + - name: Confirm jobs completed + ansible.builtin.debug: + msg: "All jobs on {{ node_to_remove }} have completed" + when: current_jobs.stdout | int > 0 + + - name: Log node removal + ansible.builtin.debug: + msg: "Node {{ node_to_remove }} has been drained, jobs completed, and set to DOWN state" + + rescue: + - name: Log node removal failure + ansible.builtin.debug: 
+ msg: "Failed to drain node {{ node_to_remove }}" + + - name: Remove slurm node with running job after timeout + ansible.builtin.pause: + prompt: | + Node {{ node_to_remove }} has been DRAINED to prevent new job assignments. + Jobs are still running on {{ node_to_remove }} after wait of {{ node_drain_timeout }} seconds. + Options: + 1. Press Ctrl+C then 'A' to abort + 2. Press Enter to force removal (jobs will be killed) + when: not force_scancel_node + + - name: Force cancel jobs if timeout reached + ansible.builtin.command: scancel -f -w {{ node_to_remove }} + changed_when: true + failed_when: false + delegate_to: "{{ ctld }}" + + always: + - name: Set node to DOWN state + ansible.builtin.command: > + scontrol update NodeName={{ node_to_remove }} + State=DOWN + Reason="Node removed from cluster" + changed_when: true + failed_when: false + delegate_to: "{{ ctld }}" + when: node_exists_check.rc == 0 diff --git a/discovery/roles/slurm_config/tasks/remove_node.yml b/discovery/roles/slurm_config/tasks/remove_node.yml index 4dc0217559..ba93bb086a 100644 --- a/discovery/roles/slurm_config/tasks/remove_node.yml +++ b/discovery/roles/slurm_config/tasks/remove_node.yml @@ -30,7 +30,7 @@ - name: Update normal partition Nodes to match cmpt_list ansible.builtin.set_fact: updated_partitions: "{{ updated_partitions | default([]) - + [item | combine({'Nodes': cmpt_list | join(',')}) if item.PartitionName == slurm_partition_name else item] }}" + + [item | combine({'Nodes': (cmpt_list | join(',')) if cmpt_list | length > 0 else 'ALL'}) if item.PartitionName == slurm_partition_name else item] }}" loop: "{{ slurm_conf_dict.PartitionName | default([]) }}" when: - "'slurm' in conf_merge_dict" diff --git a/discovery/roles/slurm_config/vars/main.yml b/discovery/roles/slurm_config/vars/main.yml index 1593f791cb..d708eb0777 100644 --- a/discovery/roles/slurm_config/vars/main.yml +++ b/discovery/roles/slurm_config/vars/main.yml @@ -68,6 +68,7 @@ gpu_slurm_conf: SlurmdParameters: 
l3cache_as_socket innodb_buffer_pool_size: 4G innodb_lock_wait_timeout: 900 +conf_server: "--conf-server {{ ctld_list | join(',') }}" # TODO tmp nodes_yaml: "{{ hostvars['localhost']['oim_shared_path'] }}/omnia/openchami/workdir/nodes/nodes.yaml" bmc_username: "{{ hostvars['localhost']['bmc_username'] }}" @@ -117,12 +118,15 @@ munge_dir_mode: "0700" common_mode: "0755" slurm_dbd_mode: "0600" slurm_db_cnf_mode: "0600" +node_drain_timeout: 900 +node_drain_delay: 30 +force_scancel_node: false dbd_slurm_conf: AccountingStoragePort: "{{ slurm_dbd_port }}" AccountingStorageType: accounting_storage/slurmdbd partition_params: PartitionName: "{{ slurm_partition_name }}" - Nodes: "{{ cmpt_list | join(',') }}" + Nodes: "{{ cmpt_list | join(',') if cmpt_list else 'ALL' }}" MaxTime: "INFINITE" State: "UP" Default: "YES" diff --git a/input/omnia_config.yml b/input/omnia_config.yml index bb5a4f06fa..943d70e530 100644 --- a/input/omnia_config.yml +++ b/input/omnia_config.yml @@ -27,6 +27,15 @@ # Storage name corresponding to the NFS share to be used by slurm cluster # This should match with exactly with a entry in storage_config.yml +# skip_merge +# Variable indicates whether a specific configuration file path +# under config_sources should be used as-is without merging +# If skip_merge is set to true for a configuration source path, +# that configuration file will be applied directly +# without merging with defaults or existing configurations +# It accepts true and false values +# Default value is false + # config_sources # defines how the Slurm configuration files are provided to the cluster. 
# : @@ -50,6 +59,7 @@ slurm_cluster: - cluster_name: slurm_cluster nfs_storage_name: nfs_slurm + # skip_merge: true # config_sources: # slurm: # SlurmctldTimeout: 60 From 8e08eabe0f7787a33c5d3d27c211e479e4627aed Mon Sep 17 00:00:00 2001 From: Vrinda_Marwah Date: Sat, 21 Feb 2026 14:51:35 +0000 Subject: [PATCH 55/77] checkmarx fixes - II Signed-off-by: Vrinda_Marwah --- .../module_utils/local_repo/registry_utils.py | 13 ++++++++++--- .../module_utils/local_repo/user_image_utility.py | 12 ++++++------ 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/common/library/module_utils/local_repo/registry_utils.py b/common/library/module_utils/local_repo/registry_utils.py index 2e7da2f659..6abd75b6dc 100644 --- a/common/library/module_utils/local_repo/registry_utils.py +++ b/common/library/module_utils/local_repo/registry_utils.py @@ -27,14 +27,21 @@ def is_https(host, timeout=1): context.check_hostname = False context.verify_mode = ssl.CERT_NONE + sock = None + wrapped_sock = None try: - with socket.create_connection((ip, port), timeout=timeout) as sock: - with context.wrap_socket(sock, server_hostname=ip): - return True + sock = socket.create_connection((ip, port), timeout=timeout) + wrapped_sock = context.wrap_socket(sock, server_hostname=ip) + return True except ssl.SSLError: return False except Exception: return False + finally: + if wrapped_sock: + wrapped_sock.close() + if sock: + sock.close() def validate_user_registry(user_registry): """ diff --git a/common/library/module_utils/local_repo/user_image_utility.py b/common/library/module_utils/local_repo/user_image_utility.py index 4c68cd1803..d50ea41df7 100644 --- a/common/library/module_utils/local_repo/user_image_utility.py +++ b/common/library/module_utils/local_repo/user_image_utility.py @@ -58,11 +58,12 @@ def check_image_in_registry( """ if not host.startswith(("http://", "https://")): - if cacert and key: - image_url = f"https://{host}/v2/{image}/manifests/{tag}" - else: - image_url = 
f"http://{host}/v2/{image}/manifests/{tag}" - + # Checkmarx: Communication_Over_HTTP + # HTTP is intentionally allowed here because this function must support + # insecure user registries. + protocol = "https" if (cacert and key) else "http" + host = f"{protocol}://{host}" + image_url = f"{host}/v2/{image}/manifests/{tag}" logger.info(f"Checking image existence at: {image_url}") try: @@ -409,4 +410,3 @@ def handle_user_image_registry(package, package_content, version_variables, user logger.info("#" * 30 + f" {handle_user_image_registry.__name__} end " + "#" * 30) return result, package_info - From 62529b64bebdbaeccf9a54fbeb0be5f0b14c2885 Mon Sep 17 00:00:00 2001 From: Vrinda_Marwah Date: Sun, 22 Feb 2026 07:32:32 +0000 Subject: [PATCH 56/77] fix for security issue - improper resource shutdown issue Signed-off-by: Vrinda_Marwah --- .../module_utils/local_repo/registry_utils.py | 48 +++++++++++++++---- 1 file changed, 38 insertions(+), 10 deletions(-) diff --git a/common/library/module_utils/local_repo/registry_utils.py b/common/library/module_utils/local_repo/registry_utils.py index 6abd75b6dc..965a0880f8 100644 --- a/common/library/module_utils/local_repo/registry_utils.py +++ b/common/library/module_utils/local_repo/registry_utils.py @@ -19,29 +19,57 @@ from ansible.module_utils.local_repo.common_functions import is_file_exists def is_https(host, timeout=1): + """ + Check whether the given host is serving HTTPS (TLS). + + Attempts a TLS handshake without verifying the server certificate. + + Args: + host (str): The host address in "ip:port" format. + timeout (int, optional): Connection timeout in seconds. Defaults to 1. + + Returns: + bool: True if the host supports HTTPS/TLS, False otherwise. 
+ """ ip, port = host.rsplit(":", 1) port = int(port) - # Don't verify server cert; just see if TLS works context = ssl.create_default_context() context.check_hostname = False context.verify_mode = ssl.CERT_NONE + result = False sock = None wrapped_sock = None + try: sock = socket.create_connection((ip, port), timeout=timeout) wrapped_sock = context.wrap_socket(sock, server_hostname=ip) - return True - except ssl.SSLError: - return False - except Exception: - return False + result = True + + except (ssl.SSLError, OSError): + result = False + finally: - if wrapped_sock: - wrapped_sock.close() - if sock: - sock.close() + # Close wrapped socket first + if wrapped_sock is not None: + try: + wrapped_sock.shutdown(socket.SHUT_RDWR) + except Exception: + pass + try: + wrapped_sock.close() + except Exception: + pass + + # Then explicitly close original socket + if sock is not None: + try: + sock.close() + except Exception: + pass + + return result def validate_user_registry(user_registry): """ From 95429340d3a67c102043cfced6b097d837b4de47 Mon Sep 17 00:00:00 2001 From: Abhishek S A Date: Mon, 23 Feb 2026 14:09:02 +0530 Subject: [PATCH 57/77] mapping file update --- examples/pxe_mapping_file.csv | 8 ++++---- input/pxe_mapping_file.csv | 10 +++++----- input/software_config.json | 1 + 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/examples/pxe_mapping_file.csv b/examples/pxe_mapping_file.csv index f9dfdf0cee..4d1c4775ed 100644 --- a/examples/pxe_mapping_file.csv +++ b/examples/pxe_mapping_file.csv @@ -1,11 +1,11 @@ FUNCTIONAL_GROUP_NAME,GROUP_NAME,SERVICE_TAG,PARENT_SERVICE_TAG,HOSTNAME,ADMIN_MAC,ADMIN_IP,BMC_MAC,BMC_IP slurm_control_node_x86_64,grp0,ABCD12,,slurm-control-node1,xx:yy:zz:aa:bb:cc,172.16.107.52,xx:yy:zz:aa:bb:dd,172.17.107.52 -slurm_node_x86_64,grp1,ABCD34,ABFL82,slurm-node1,aa:bb:cc:dd:ee:ff,172.16.107.43,aa:bb:cc:dd:ee:gg,172.17.107.43 
-slurm_node_x86_64,grp1,ABFG34,ABKD88,slurm-node2,aa:bb:cc:dd:ee:ff,172.16.107.44,aa:bb:cc:dd:ff:gg,172.17.107.44 -login_compiler_node_x86_64,grp8,ABCD78,,login-compiler-node1,aa:bb:cc:dd:ee:gg,172.16.107.41,aa:bb:cc:dd:ee:bb,172.17.107.41 +slurm_node_aarch64,grp1,ABCD34,ABFL82,slurm-node1,aa:bb:cc:dd:ee:ff,172.16.107.43,aa:bb:cc:dd:ee:gg,172.17.107.43 +slurm_node_aarch64,grp2,ABFG34,ABKD88,slurm-node2,aa:bb:cc:dd:ee:ff,172.16.107.44,aa:bb:cc:dd:ff:gg,172.17.107.44 +login_compiler_node_aarch64,grp8,ABCD78,,login-compiler-node1,aa:bb:cc:dd:ee:gg,172.16.107.41,aa:bb:cc:dd:ee:bb,172.17.107.41 login_node_x86_64,grp9,ABFG78,,login-compiler-node2,aa:bb:cc:dd:ee:gg,172.16.107.42,aa:bb:cc:dd:ee:bb,172.17.107.42 service_kube_control_plane_x86_64,grp3,ABFG79,,service-kube-control-plane1,aa:bb:cc:dd:ee:ff,172.16.107.53,xx:yy:zz:aa:bb:ff,172.17.107.53 service_kube_control_plane_x86_64,grp4,ABFH78,,service-kube-control-plane2,aa:bb:cc:dd:ee:hh,172.16.107.54,xx:yy:zz:aa:bb:hh,172.17.107.54 service_kube_control_plane_x86_64,grp4,ABFH80,,service-kube-control-plane3,aa:bb:cc:dd:ee:ii,172.16.107.55,xx:yy:zz:aa:bb:ii,172.17.107.55 service_kube_node_x86_64,grp5,ABFL82,,service-kube-node1,aa:bb:cc:dd:ee:jj,172.16.107.56,xx:yy:zz:aa:bb:jj,172.17.107.56 -service_kube_node_x86_64,grp5,ABKD88,,service-kube-node2,aa:bb:cc:dd:ee:kk,172.16.107.57,xx:yy:zz:aa:bb:ff,172.17.107.57 +service_kube_node_x86_64,grp5,ABKD88,,service-kube-node2,aa:bb:cc:dd:ee:kk,172.16.107.57,xx:yy:zz:aa:bb:ff,172.17.107.57 \ No newline at end of file diff --git a/input/pxe_mapping_file.csv b/input/pxe_mapping_file.csv index 849e3a2168..4d1c4775ed 100644 --- a/input/pxe_mapping_file.csv +++ b/input/pxe_mapping_file.csv @@ -1,11 +1,11 @@ FUNCTIONAL_GROUP_NAME,GROUP_NAME,SERVICE_TAG,PARENT_SERVICE_TAG,HOSTNAME,ADMIN_MAC,ADMIN_IP,BMC_MAC,BMC_IP slurm_control_node_x86_64,grp0,ABCD12,,slurm-control-node1,xx:yy:zz:aa:bb:cc,172.16.107.52,xx:yy:zz:aa:bb:dd,172.17.107.52 
-slurm_node_x86_64,grp1,ABCD34,ABFL82,slurm-node1,aa:bb:cc:dd:ee:ff,172.16.107.43,aa:bb:cc:dd:ee:gg,172.17.107.43 -slurm_node_x86_64,grp1,ABFG34,ABKD88,slurm-node2,aa:bb:cc:dd:ee:ff,172.16.107.44,aa:bb:cc:dd:ff:gg,172.17.107.44 -login_compiler_node_x86_64,grp8,ABCD78,,login-compiler-node1,aa:bb:cc:dd:ee:gg,172.16.107.41,aa:bb:cc:dd:ee:bb,172.17.107.41 -login_compiler_node_x86_64,grp8,ABFG78,,login-compiler-node2,aa:bb:cc:dd:ee:gg,172.16.107.42,aa:bb:cc:dd:ee:bb,172.17.107.42 +slurm_node_aarch64,grp1,ABCD34,ABFL82,slurm-node1,aa:bb:cc:dd:ee:ff,172.16.107.43,aa:bb:cc:dd:ee:gg,172.17.107.43 +slurm_node_aarch64,grp2,ABFG34,ABKD88,slurm-node2,aa:bb:cc:dd:ee:ff,172.16.107.44,aa:bb:cc:dd:ff:gg,172.17.107.44 +login_compiler_node_aarch64,grp8,ABCD78,,login-compiler-node1,aa:bb:cc:dd:ee:gg,172.16.107.41,aa:bb:cc:dd:ee:bb,172.17.107.41 +login_node_x86_64,grp9,ABFG78,,login-compiler-node2,aa:bb:cc:dd:ee:gg,172.16.107.42,aa:bb:cc:dd:ee:bb,172.17.107.42 service_kube_control_plane_x86_64,grp3,ABFG79,,service-kube-control-plane1,aa:bb:cc:dd:ee:ff,172.16.107.53,xx:yy:zz:aa:bb:ff,172.17.107.53 service_kube_control_plane_x86_64,grp4,ABFH78,,service-kube-control-plane2,aa:bb:cc:dd:ee:hh,172.16.107.54,xx:yy:zz:aa:bb:hh,172.17.107.54 service_kube_control_plane_x86_64,grp4,ABFH80,,service-kube-control-plane3,aa:bb:cc:dd:ee:ii,172.16.107.55,xx:yy:zz:aa:bb:ii,172.17.107.55 service_kube_node_x86_64,grp5,ABFL82,,service-kube-node1,aa:bb:cc:dd:ee:jj,172.16.107.56,xx:yy:zz:aa:bb:jj,172.17.107.56 -service_kube_node_x86_64,grp5,ABKD88,,service-kube-node2,aa:bb:cc:dd:ee:kk,172.16.107.57,xx:yy:zz:aa:bb:ff,172.17.107.57 +service_kube_node_x86_64,grp5,ABKD88,,service-kube-node2,aa:bb:cc:dd:ee:kk,172.16.107.57,xx:yy:zz:aa:bb:ff,172.17.107.57 \ No newline at end of file diff --git a/input/software_config.json b/input/software_config.json index 0d7f62acc3..4683376057 100644 --- a/input/software_config.json +++ b/input/software_config.json @@ -9,6 +9,7 @@ {"name": "service_k8s","version": "1.34.1", 
"arch": ["x86_64"]}, {"name": "slurm_custom", "arch": ["x86_64","aarch64"]}, {"name": "ldms", "arch": ["x86_64","aarch64"]}, + {"name": "csi_driver_powerscale", "version":"v2.15.0", "arch": ["x86_64"]}, {"name": "additional_packages", "arch": ["x86_64","aarch64"]} ], "slurm_custom": [ From 1499181edd3186ea2c26f6d8842dd1f7fe2db02c Mon Sep 17 00:00:00 2001 From: pullan1 Date: Mon, 23 Feb 2026 14:28:59 +0530 Subject: [PATCH 58/77] To configure per-repository sync and caching policies in localrepo Signed-off-by: pullan1 --- .../schema/local_repo_config.json | 278 ++++++++++++++++++ .../validation_flows/local_repo_validation.py | 30 +- .../library/module_utils/local_repo/config.py | 11 +- .../module_utils/local_repo/software_utils.py | 59 +++- input/local_repo_config.yml | 60 +++- .../tasks/configure_rhel_os_urls.yml | 104 ++++++- .../deploy_containers/pulp/vars/main.yml | 9 +- 7 files changed, 521 insertions(+), 30 deletions(-) diff --git a/common/library/module_utils/input_validation/schema/local_repo_config.json b/common/library/module_utils/input_validation/schema/local_repo_config.json index e44cf44df7..587851d0b3 100644 --- a/common/library/module_utils/input_validation/schema/local_repo_config.json +++ b/common/library/module_utils/input_validation/schema/local_repo_config.json @@ -1136,6 +1136,284 @@ ] }, "description": "Optional list of additional repository URLs for aarch64 architecture. These repos are aggregated into a single Pulp repository." 
+ }, + "rhel_subscription_repo_config_x86_64": { + "type": [ + "array", + "null" + ], + "items": { + "type": "object", + "properties": { + "url": { + "type": "string", + "minLength": 1, + "pattern": "^(https?:\\/\\/).+" + }, + "gpgkey": { + "type": "string", + "pattern": "^(|[a-zA-Z][a-zA-Z0-9+.-]*:\\/\\/\\S+)$" + }, + "name": { + "type": "string", + "minLength": 1, + "pattern": "^(?!\\s*$).+" + }, + "policy": { + "type": "string", + "enum": [ + "always", + "partial" + ] + }, + "caching": { + "type": "boolean" + }, + "sslcacert": { + "type": [ + "string", + "null" + ] + }, + "sslclientkey": { + "type": [ + "string", + "null" + ] + }, + "sslclientcert": { + "type": [ + "string", + "null" + ] + } + }, + "required": [ + "url", + "gpgkey", + "name" + ], + "allOf": [ + { + "if": { + "required": [ + "sslcacert" + ], + "properties": { + "sslcacert": { + "minLength": 1 + } + } + }, + "then": { + "required": [ + "sslclientkey", + "sslclientcert" + ], + "properties": { + "sslclientkey": { + "minLength": 1 + }, + "sslclientcert": { + "minLength": 1 + } + } + } + }, + { + "if": { + "required": [ + "sslclientkey" + ], + "properties": { + "sslclientkey": { + "minLength": 1 + } + } + }, + "then": { + "required": [ + "sslcacert", + "sslclientcert" + ], + "properties": { + "sslcacert": { + "minLength": 1 + }, + "sslclientcert": { + "minLength": 1 + } + } + } + }, + { + "if": { + "required": [ + "sslclientcert" + ], + "properties": { + "sslclientcert": { + "minLength": 1 + } + } + }, + "then": { + "required": [ + "sslcacert", + "sslclientkey" + ], + "properties": { + "sslcacert": { + "minLength": 1 + }, + "sslclientkey": { + "minLength": 1 + } + } + } + } + ] + }, + "description": "Optional configuration for overriding policy and caching settings for RHEL subscription-based repositories on x86_64 architecture." 
+ }, + "rhel_subscription_repo_config_aarch64": { + "type": [ + "array", + "null" + ], + "items": { + "type": "object", + "properties": { + "url": { + "type": "string", + "minLength": 1, + "pattern": "^(https?:\\/\\/).+" + }, + "gpgkey": { + "type": "string", + "pattern": "^(|[a-zA-Z][a-zA-Z0-9+.-]*:\\/\\/\\S+)$" + }, + "name": { + "type": "string", + "minLength": 1, + "pattern": "^(?!\\s*$).+" + }, + "policy": { + "type": "string", + "enum": [ + "always", + "partial" + ] + }, + "caching": { + "type": "boolean" + }, + "sslcacert": { + "type": [ + "string", + "null" + ] + }, + "sslclientkey": { + "type": [ + "string", + "null" + ] + }, + "sslclientcert": { + "type": [ + "string", + "null" + ] + } + }, + "required": [ + "url", + "gpgkey", + "name" + ], + "allOf": [ + { + "if": { + "required": [ + "sslcacert" + ], + "properties": { + "sslcacert": { + "minLength": 1 + } + } + }, + "then": { + "required": [ + "sslclientkey", + "sslclientcert" + ], + "properties": { + "sslclientkey": { + "minLength": 1 + }, + "sslclientcert": { + "minLength": 1 + } + } + } + }, + { + "if": { + "required": [ + "sslclientkey" + ], + "properties": { + "sslclientkey": { + "minLength": 1 + } + } + }, + "then": { + "required": [ + "sslcacert", + "sslclientcert" + ], + "properties": { + "sslcacert": { + "minLength": 1 + }, + "sslclientcert": { + "minLength": 1 + } + } + } + }, + { + "if": { + "required": [ + "sslclientcert" + ], + "properties": { + "sslclientcert": { + "minLength": 1 + } + } + }, + "then": { + "required": [ + "sslcacert", + "sslclientkey" + ], + "properties": { + "sslcacert": { + "minLength": 1 + }, + "sslclientkey": { + "minLength": 1 + } + } + } + } + ] + }, + "description": "Optional configuration for overriding policy and caching settings for RHEL subscription-based repositories on aarch64 architecture." 
} }, "required": [ diff --git a/common/library/module_utils/input_validation/validation_flows/local_repo_validation.py b/common/library/module_utils/input_validation/validation_flows/local_repo_validation.py index 343a4f3de1..88e02845d2 100644 --- a/common/library/module_utils/input_validation/validation_flows/local_repo_validation.py +++ b/common/library/module_utils/input_validation/validation_flows/local_repo_validation.py @@ -137,19 +137,41 @@ def validate_local_repo_config(input_file_path, data, arch_repo_names = [] arch_list = url_list + [url+'_'+arch for url in url_list] # define base repos dynamically for this arch if subscription registered - if sub_result: - base_repo_names = [f"{arch}_baseos",f"{arch}_appstream",f"{arch}_codeready-builder"] - logger.info(f"Adding base repos for {arch}: {base_repo_names}") + if sub_result: + base_subscription_repos = [f"{arch}_baseos", f"{arch}_appstream", f"{arch}_codeready-builder"] + logger.info(f"Base subscription repos for {arch}: {base_subscription_repos}") + + # Collect repo names from standard repo lists for repurl in arch_list: repos = data.get(repurl) if repos: arch_repo_names = arch_repo_names + [x.get('name') for x in repos] + + # Handle rhel_subscription_repo_config separately + # Only add non-base repos to the name list (base repos are overrides, not duplicates) + subscription_config_key = f"rhel_subscription_repo_config_{arch}" + subscription_config = data.get(subscription_config_key, []) + if subscription_config: + for repo in subscription_config: + repo_name = repo.get('name') + if repo_name and repo_name not in base_subscription_repos: + # This is a new repo, not an override of base repos + arch_repo_names.append(repo_name) + logger.info(f"Adding new subscription config repo: {repo_name}") + else: + logger.info(f"Skipping base repo override from duplicate check: {repo_name}") + # Add additional_repos names for this arch additional_repos_key = f"additional_repos_{arch}" additional_repos = 
data.get(additional_repos_key) if additional_repos: arch_repo_names = arch_repo_names + [x.get('name') for x in additional_repos] - repo_names[arch] = repo_names.get(arch, []) + arch_repo_names + base_repo_names + + # Add base subscription repos to the final list (they will be dynamically generated) + if sub_result: + arch_repo_names = arch_repo_names + base_subscription_repos + + repo_names[arch] = arch_repo_names logger.info(f"Total repos for {arch}: {repo_names[arch]}") for k,v in repo_names.items(): diff --git a/common/library/module_utils/local_repo/config.py b/common/library/module_utils/local_repo/config.py index 7bfea4b301..3e812a6e47 100644 --- a/common/library/module_utils/local_repo/config.py +++ b/common/library/module_utils/local_repo/config.py @@ -58,8 +58,17 @@ RHEL_OS_URL = "rhel_os_url" SOFTWARES_KEY = "softwares" USER_REPO_URL = "user_repo_url" -REPO_CONFIG = { "always": "on_demand", "partial": "on_demand", "never": "streamed" } ARCH_SUFFIXES = {"x86_64", "aarch64"} +DEFAULT_POLICY = "on_demand" +DEFAULT_CACHING = True +POLICY_CACHING_MAP = { + ("always", False): "immediate", + ("always", True): "on_demand", + ("partial", False): "streamed", + ("partial", True): "on_demand", + ("never", False): "streamed", + ("never", True): "streamed" +} DNF_COMMANDS = { "x86_64": ["dnf", "download", "--resolve", "--alldeps", "--arch=x86_64,noarch"], "aarch64": ["dnf", "download", "--forcearch", "aarch64", "--resolve", "--alldeps", "--exclude=*.x86_64"] diff --git a/common/library/module_utils/local_repo/software_utils.py b/common/library/module_utils/local_repo/software_utils.py index 3e06ddc7cd..90faf88a7e 100644 --- a/common/library/module_utils/local_repo/software_utils.py +++ b/common/library/module_utils/local_repo/software_utils.py @@ -36,7 +36,9 @@ RPM_LABEL_TEMPLATE, RHEL_OS_URL, SOFTWARES_KEY, - REPO_CONFIG, + POLICY_CACHING_MAP, + DEFAULT_POLICY, + DEFAULT_CACHING, ARCH_SUFFIXES, ADDITIONAL_REPOS_KEY, pulp_container_commands @@ -210,6 +212,32 @@ def 
transform_package_dict(data, arch_val,logger): logger.info("Transformation complete for arch '%s'. Final result keys: %s", arch_val, list(final_result.keys())) return final_result +def resolve_pulp_policy(policy_str, caching_val, logger=None): + """ + Resolve user-facing policy and caching into Pulp download policy. + Args: + policy_str (str): User policy ('always', 'on_demand', 'partial'). + caching_val: Caching flag (bool, str 'true'/'false', or None). + logger: Optional logger instance. + Returns: + str: Pulp download policy ('immediate', 'on_demand', 'streamed'). + """ + policy = str(policy_str).lower() if policy_str else DEFAULT_POLICY + if isinstance(caching_val, str): + caching = caching_val.lower() in ('true', '1', 'yes') + elif isinstance(caching_val, bool): + caching = caching_val + else: + caching = DEFAULT_CACHING + pulp_policy = POLICY_CACHING_MAP.get( + (policy, caching), "on_demand" + ) + if logger: + logger.info( + f"Resolved policy='{policy}', caching={caching}" + f" -> pulp_policy='{pulp_policy}'" + ) + return pulp_policy def parse_repo_urls(repo_config, local_repo_config_path, version_variables, vault_key_path, sub_urls,logger,sw_archs=None): @@ -271,7 +299,10 @@ def parse_repo_urls(repo_config, local_repo_config_path, client_key = url_.get("sslclientkey", "") client_cert = url_.get("sslclientcert", "") policy_given = url_.get("policy", repo_config) - policy = REPO_CONFIG.get(policy_given) + caching_given = url_.get("caching", True) + policy = resolve_pulp_policy( + policy_given, caching_given, logger + ) logger.info(f"Processing user repo '{name}' for arch '{arch}' - URL: {url}") @@ -302,7 +333,7 @@ def parse_repo_urls(repo_config, local_repo_config_path, logger.info(f"Added user repo entry: {name}") - # Handle RHEL repositories + # Handle RHEL repositories (includes subscription-based repos) for arch, repo_list in rhel_repo_entry.items(): for url_ in repo_list: name = url_.get("name", "unknown") @@ -312,7 +343,10 @@ def 
parse_repo_urls(repo_config, local_repo_config_path, client_key = url_.get("sslclientkey", "") client_cert = url_.get("sslclientcert", "") policy_given = url_.get("policy", repo_config) - policy = REPO_CONFIG.get(policy_given) + caching_given = url_.get("caching", True) + policy = resolve_pulp_policy( + policy_given, caching_given, logger + ) logger.info(f"Processing RHEL repo '{name}' for arch '{arch}' - URL: {url}") @@ -357,7 +391,10 @@ def parse_repo_urls(repo_config, local_repo_config_path, url = repo.get("url", "") gpgkey = repo.get("gpgkey", "") policy_given = repo.get("policy", repo_config) - policy = REPO_CONFIG.get(policy_given) + caching_given = repo.get("caching", True) + policy = resolve_pulp_policy( + policy_given, caching_given, logger + ) logger.info(f"Processing OMNIA repo '{name}' for arch '{arch}' - Template URL: {url}") # Find unresolved template vars in URL @@ -476,17 +513,11 @@ def get_subgroup_dict(user_data,logger): def get_csv_software(file_name): """ - Retrieves a list of software names from a CSV file. - Parameters: - file_name (str): The name of the CSV file. - Returns: - list: A list of software names. 
- """ csv_software = [] @@ -892,7 +923,9 @@ def parse_additional_repos(local_repo_config_path, repo_config, vault_key_path, local_yaml = load_yaml(local_repo_config_path) additional_repos_config = {} - policy = REPO_CONFIG.get(repo_config, "on_demand") + global_policy = resolve_pulp_policy( + repo_config, True, logger + ) vault_key_full_path = os.path.join(vault_key_path, ".local_repo_credentials_key") @@ -949,7 +982,7 @@ def parse_additional_repos(local_repo_config_path, repo_config, vault_key_path, "ca_cert": ca_cert, "client_key": client_key, "client_cert": client_cert, - "policy": policy, + "policy": global_policy, "arch": arch }) logger.info(f"Added additional repo entry: {name}") diff --git a/input/local_repo_config.yml b/input/local_repo_config.yml index 8428e6d94c..f81d62640c 100644 --- a/input/local_repo_config.yml +++ b/input/local_repo_config.yml @@ -43,7 +43,10 @@ # sslcacert : Path to SSL CA certificate (if using SSL) # sslclientkey: Path to SSL client key (if using SSL) # sslclientcert: Path to SSL client certificate (if using SSL) -# policy : Repository policy (always, partial) +# policy : Repository sync policy. Allowed values: always, partial (OPTIONAL) +# If not provided, uses repo_config from software_config.json +# caching : Enable or disable local caching. Allowed values: true, false (OPTIONAL) +# If not provided, defaults to true # Notes: # - Do not use Jinja variables in this configuration. # - Omit SSL fields entirely if SSL is not in use. @@ -63,7 +66,10 @@ # sslcacert : Path to SSL CA certificate (if using SSL) # sslclientkey: Path to SSL client key (if using SSL) # sslclientcert: Path to SSL client certificate (if using SSL) -# policy : Repository policy if mentioned allowed values (always, partial). IF not mentioned will consider from software_config.json +# policy : Repository policy if mentioned allowed values (always, partial). +# If not provided, uses repo_config from software_config.json +# caching : Enable or disable local caching. 
Allowed values: true, false (OPTIONAL) +# If not provided, defaults to true # name : Name of the repository [ Allowed repo names _codeready-builder, _appstream, _baseos # Notes: # - Do not use Jinja variables in this configuration. @@ -75,8 +81,35 @@ #---------------------------- # Same as above but for aarch64 architecture. # +# 6. rhel_subscription_repo_config_x86_64 +#------------------------------------------- +# Optional configuration for overriding policy and caching settings for RHEL +# subscription-based repositories on x86_64 architecture. +# When subscription is enabled, this config takes precedence over dynamically +# generated URLs for matching repositories and adds any additional repositories. +# Fields: +# url : Base URL of the repository (REQUIRED) +# gpgkey : GPG key URL (REQUIRED, can be empty to disable gpgcheck) +# name : Repository name for matching (REQUIRED) +# policy : Repository sync policy. Allowed values: always, partial (OPTIONAL) +# If not provided, uses repo_config from software_config.json +# caching : Enable or disable local caching. Allowed values: true, false (OPTIONAL) +# If not provided, defaults to true +# sslcacert : Path to SSL CA certificate (optional) +# sslclientkey: Path to SSL client key (optional) +# sslclientcert: Path to SSL client certificate (optional) +# Notes: +# - Do not use Jinja variables in this configuration. +# - Omit SSL fields entirely if SSL is not in use. +# - Matching is done by repository name (e.g., x86_64_appstream) +# - Non-matching repositories are added as additional repos +# +# 7. rhel_subscription_repo_config_aarch64 +#-------------------------------------------- +# Same as above but for aarch64 architecture. +# #### ADVANCE CONFIGURATIONS FOR LOCAL REPO ### -# 6. omnia_repo_url_rhel_x86_64 +# 8. omnia_repo_url_rhel_x86_64 #------------------------------- # Mandatory repository URLs for downloading RPMS for Omnia features on RHEL x86_64. # Each entry includes url, gpgkey, and name. 
@@ -88,12 +121,15 @@ # gpgkey : URL of the GPG key for the repository. # If left empty, gpgcheck=0 for that repository. # name : A unique identifier for the repository or registry. -# -# 7. omnia_repo_url_rhel_aarch64 +# policy : Repository sync policy. Allowed values: always, partial (OPTIONAL) +# If not provided, uses repo_config from software_config.json +# caching : Enable or disable local caching. Allowed values: true, false (OPTIONAL) +# If not provided, defaults to true +# 9. omnia_repo_url_rhel_aarch64 #-------------------------------- # Same as above but for RHEL aarch64. # -# 8. additional_repos_x86_64 +# 10. additional_repos_x86_64 #---------------------------- # Optional list of additional repository URLs for x86_64 architecture. # These repos are aggregated into a single Pulp repository, allowing dynamic @@ -105,6 +141,10 @@ # sslcacert : Path to SSL CA certificate (optional) # sslclientkey : Path to SSL client key (optional) # sslclientcert : Path to SSL client certificate (optional) +# policy : Repository sync policy. Allowed values: always, partial (OPTIONAL) +# If not provided, uses repo_config from software_config.json +# caching : Enable or disable local caching. Allowed values: true, false (OPTIONAL) +# If not provided, defaults to true # Notes: # - All repos are synced into a single aggregated Pulp repository # - Compute nodes are configured once with a fixed URL that never changes @@ -112,7 +152,7 @@ # - Name must be unique within this list and must not conflict with names in other repo keys # - Packages from these repos can only be used via additional_packages.json # -# 9. additional_repos_aarch64 +# 11. additional_repos_aarch64 #----------------------------- # Same as above but for aarch64 architecture. 
@@ -133,6 +173,12 @@ user_repo_url_aarch64: # - { url: "http://AppStream.com/AppStream/x86_64/os/", gpgkey: "http://AppStream.com/AppStream/x86_64/os/RPM-GPG-KEY", sslcacert: "", sslclientkey: "", sslclientcert: "", name: "x86_64_appstream" } rhel_os_url_x86_64: rhel_os_url_aarch64: +# Example: +# rhel_subscription_repo_config_x86_64: +# - { url: "https://example.com/appstream", gpgkey: "", sslcacert: "", sslclientkey: "", sslclientcert: "", name: "x86_64_appstream", policy: "always", caching: true } +# - { url: "https://cdn.redhat.com/content/dist/rhel10/10.0/x86_64/supplementary/os/", gpgkey: "file:///etc/pki/rpm-gpg/RPM-GPG-KEY-redhat-release", sslcacert: "", sslclientkey: "", sslclientcert: "", name: "x86_64_supplementary", policy: "always", caching: false } +rhel_subscription_repo_config_x86_64: +rhel_subscription_repo_config_aarch64: # Making incorrect changes to this variable can cause omnia failure. Please edit cautiously. omnia_repo_url_rhel_x86_64: - { url: "https://download.docker.com/linux/centos/10/x86_64/stable/", gpgkey: "https://download.docker.com/linux/centos/gpg", name: "docker-ce"} diff --git a/local_repo/roles/validation/tasks/configure_rhel_os_urls.yml b/local_repo/roles/validation/tasks/configure_rhel_os_urls.yml index ec6766f2a3..9464284758 100644 --- a/local_repo/roles/validation/tasks/configure_rhel_os_urls.yml +++ b/local_repo/roles/validation/tasks/configure_rhel_os_urls.yml @@ -1,4 +1,4 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -51,6 +51,11 @@ sslclientkey: "{{ lookup('pipe', 'ls {{ omnia_rhel_cert_dir }}/*-key.pem | head -n1') }}" sslclientcert: "{{ lookup('pipe', 'ls {{ omnia_rhel_cert_dir }}/*.pem | grep -v -- -key.pem | head -n1') }}" sub_rhel_x86_64_urls: [] + sub_rhel_aarch64_urls: [] + sub_policy_default: "{{ sw_config.repo_config | default('on_demand') }}" + sub_caching_default: true + sub_x86_64_override_config: "{{ local_config.rhel_subscription_repo_config_x86_64 | default([]) }}" + sub_aarch64_override_config: "{{ local_config.rhel_subscription_repo_config_aarch64 | default([]) }}" - name: Append repo entries to x86_64 list ansible.builtin.set_fact: @@ -64,7 +69,8 @@ 'sslcacert' : sslcacert, 'sslclientkey' : sslclientkey, 'sslclientcert': sslclientcert, - 'policy': 'partial', + 'policy': sub_policy_default, + 'caching': sub_caching_default, 'name': ( arch ~ '_appstream' if 'appstream' in repo_url else arch ~ '_baseos' if 'baseos' in repo_url else @@ -83,12 +89,106 @@ 'name': item.name | replace('x86_64', 'aarch64'), 'gpgkey': item.gpgkey, 'policy': item.policy, + 'caching': item.caching, 'sslcacert': item.sslcacert, 'sslclientcert': item.sslclientcert, 'sslclientkey': item.sslclientkey}] }}" loop: "{{ sub_rhel_x86_64_urls }}" loop_control: loop_var: item + + # 3️ Apply override configurations and merge additional repositories + - name: Create name mapping for x86_64 dynamic repos + ansible.builtin.set_fact: + x86_64_dynamic_names: "{{ sub_rhel_x86_64_urls | map(attribute='name') | list }}" + + - name: Apply x86_64 overrides to matching repos + ansible.builtin.set_fact: + sub_rhel_x86_64_urls: >- + {%- set result = [] -%} + {%- for repo in sub_rhel_x86_64_urls -%} + {%- set override = (sub_x86_64_override_config | selectattr('name', 'equalto', repo.name) | first | default({})) -%} + {%- set updated_repo = repo | combine({ + 'policy': override.policy | default(repo.policy), + 'caching': override.caching | default(repo.caching), + 'url': override.url | default(repo.url), + 
'gpgkey': override.gpgkey | default(repo.gpgkey) + }) -%} + {%- set _ = result.append(updated_repo) -%} + {%- endfor -%} + {{ result }} + + - name: Identify non-matching x86_64 override repos + ansible.builtin.set_fact: + additional_x86_64_repos: >- + {{ + sub_x86_64_override_config | rejectattr('name', 'in', x86_64_dynamic_names) | list + }} + + - name: Add non-matching x86_64 override repos as additional + ansible.builtin.set_fact: + sub_rhel_x86_64_urls: >- + {%- set result = sub_rhel_x86_64_urls -%} + {%- for repo in additional_x86_64_repos -%} + {%- set new_repo = { + 'url': repo.url, + 'gpgkey': repo.gpgkey | default(''), + 'name': repo.name, + 'policy': repo.policy | default(sub_policy_default), + 'caching': repo.caching | default(sub_caching_default), + 'sslcacert': sslcacert, + 'sslclientcert': sslclientcert, + 'sslclientkey': sslclientkey + } -%} + {%- set _ = result.append(new_repo) -%} + {%- endfor -%} + {{ result }} + + - name: Apply aarch64 overrides to matching repos + ansible.builtin.set_fact: + sub_rhel_aarch64_urls: >- + {%- set result = [] -%} + {%- for repo in sub_rhel_aarch64_urls -%} + {%- set override = (sub_aarch64_override_config | selectattr('name', 'equalto', repo.name) | first | default({})) -%} + {%- set updated_repo = repo | combine({ + 'policy': override.policy | default(repo.policy), + 'caching': override.caching | default(repo.caching), + 'url': override.url | default(repo.url), + 'gpgkey': override.gpgkey | default(repo.gpgkey) + }) -%} + {%- set _ = result.append(updated_repo) -%} + {%- endfor -%} + {{ result }} + + - name: Identify non-matching aarch64 override repos + ansible.builtin.set_fact: + aarch64_dynamic_names: "{{ sub_rhel_aarch64_urls | map(attribute='name') | list }}" + additional_aarch64_repos: >- + {{ + sub_aarch64_override_config | rejectattr('name', 'in', aarch64_dynamic_names) | list + }} + when: "'aarch64' in archs" + + - name: Add non-matching aarch64 override repos as additional + ansible.builtin.set_fact: + 
sub_rhel_aarch64_urls: >- + {%- set result = sub_rhel_aarch64_urls -%} + {%- for repo in additional_aarch64_repos -%} + {%- set new_repo = { + 'url': repo.url, + 'gpgkey': repo.gpgkey | default(''), + 'name': repo.name, + 'policy': repo.policy | default(sub_policy_default), + 'caching': repo.caching | default(sub_caching_default), + 'sslcacert': sslcacert, + 'sslclientcert': sslclientcert, + 'sslclientkey': sslclientkey + } -%} + {%- set _ = result.append(new_repo) -%} + {%- endfor -%} + {{ result }} + when: "'aarch64' in archs" + - name: Build final repo dict ansible.builtin.set_fact: sub_final_repo_urls: diff --git a/prepare_oim/roles/deploy_containers/pulp/vars/main.yml b/prepare_oim/roles/deploy_containers/pulp/vars/main.yml index 26dbec2dae..bcca679033 100644 --- a/prepare_oim/roles/deploy_containers/pulp/vars/main.yml +++ b/prepare_oim/roles/deploy_containers/pulp/vars/main.yml @@ -40,9 +40,12 @@ arg_list: - "-e PULP_API_WORKERS_MAX_REQUESTS_JITTER=50" pulp_deployed_msg: "The {{ pulp_container_name }} container has been successfully deployed." pulp_deployed_fail_msg: - The deployment of the {{ pulp_container_name }} container has failed. To resolve this issue, - please run the utility/oim_cleanup.yml playbook to clean up any existing OIM resources. - After the cleanup, you can re-run the original playbook to deploy the {{ pulp_container_name }} container successfully. + "The {{ pulp_container_name }} container deployment failed. Common causes: + • Missing or inaccessible pulp container image + • Pulp service not starting successfully + • NFS storage not reachable or not mounted + Run utility/oim_cleanup.yml to cleanup, then re-run the playbook to deploy the {{ pulp_container_name }} + container successfully." 
retries_var: 8 delay_var: 30 delay_var_sixty: 30 From 381a64055d4734e6b29b0c1432919e86bb9466ea Mon Sep 17 00:00:00 2001 From: mithileshreddy04 Date: Mon, 23 Feb 2026 15:48:27 +0530 Subject: [PATCH 59/77] SSH pemission and access issue fix for upgrade after prepare_oim --- omnia.sh | 38 ++++---- .../prepare_oim_validation/tasks/main.yml | 3 + .../tasks/validate_ssh_permissions.yml | 93 +++++++++++++++++++ .../templates/network_spec.j2 | 4 +- .../templates/omnia_config.j2 | 2 + 5 files changed, 121 insertions(+), 19 deletions(-) create mode 100644 prepare_oim/roles/prepare_oim_validation/tasks/validate_ssh_permissions.yml diff --git a/omnia.sh b/omnia.sh index 530c168e7d..a9d35defd6 100755 --- a/omnia.sh +++ b/omnia.sh @@ -381,6 +381,8 @@ setup_omnia_core() { # Post container setup configuration post_setup_config + remove_container_omnia_sh + # Start the container start_container_session } @@ -1102,8 +1104,6 @@ EOF firewall-cmd --permanent --zone=public --add-port=2222/tcp firewall-cmd --reload } - -# This function sets up the configuration for the Omnia core. # post_setup_config is a function that sets up the configuration for the Omnia core. # It creates the necessary directories and files, copies input files from the Omnia container, # and creates the oim_metadata.yml file. @@ -1117,7 +1117,6 @@ post_setup_config() { mkdir -p "$OMNIA_INPUT_DIR/" # Create the default.yml file if it does not exist. - # This file contains the name of the project. if [ ! 
-f "$OMNIA_INPUT_DIR/default.yml" ]; then echo -e "${BLUE} Creating default.yml file.${NC}" { @@ -1140,33 +1139,38 @@ post_setup_config() { } validate_nfs_server() { - - # Validate NFS server permission if [ "$share_option" = "NFS" ]; then - # Create a temporary file inside $omnia_path - temp_file="$omnia_path/temp_file" + local temp_file="$omnia_path/temp_file" touch "$temp_file" - # Check if the file can be chown to root if chown root:root "$temp_file"; then - rm "$temp_file" + rm -f "$temp_file" else echo "Error: Unable to chown file to root in $omnia_path. NFS server permission validation failed. Please ensure no_root_squash option is enabled in the NFS export configuration." exit 1 fi + if [ "`ls -ld $omnia_path/omnia/ssh_config/.ssh/id_rsa | awk '{print $3 ":" $4}'`" != "root:root" ]; then echo "Error: The $omnia_path/omnia/ssh_config/.ssh/id_rsa file should be owned by root:root. NFS server permission validation failed. Please verify the NFS export configuration." exit 1 fi fi +} +refresh_known_hosts() { + local ssh_port=2222 + + mkdir -p "$HOME/.ssh" + touch "$HOME/.ssh/known_hosts" + ssh-keygen -R "[localhost]:$ssh_port" >/dev/null 2>&1 || true + ssh-keyscan -p "$ssh_port" localhost 2>/dev/null | grep -v "^#" >> "$HOME/.ssh/known_hosts" || true } init_ssh_config() { - mkdir -p "$HOME/.ssh" - touch $HOME/.ssh/known_hosts - # Add entry to /root/.ssh/known_hosts file to prevent errors caused by Known host - ssh-keygen -R "[localhost]:2222" >/dev/null 2>&1 # Remove existing entry if it exists - ssh-keyscan -p 2222 localhost 2>/dev/null | grep -v "^#" >> $HOME/.ssh/known_hosts # Scan and add the new key + refresh_known_hosts +} + +remove_container_omnia_sh() { + podman exec -u root omnia_core bash -c 'if [ -f /omnia/omnia.sh ]; then rm -f /omnia/omnia.sh; fi' >/dev/null 2>&1 || true } start_container_session() { @@ -1213,8 +1217,8 @@ show_help() { echo "Usage: $0 [--install | --uninstall | --upgrade | --rollback | --version | --help]" echo " -i, --install 
Install and start the Omnia core container" echo " -u, --uninstall Uninstall the Omnia core container and clean up configuration" - echo " --upgrade Upgrade the Omnia core container to newer version - echo " --rollback Rollback the Omnia core container to previous version + echo " --upgrade Upgrade the Omnia core container to newer version" + echo " --rollback Rollback the Omnia core container to previous version" echo " -v, --version Display Omnia version information" echo " -h, --help More information about usage" } @@ -1906,6 +1910,7 @@ upgrade_omnia_core() { show_post_upgrade_instructions "$TARGET_OMNIA_VERSION" # Initialize SSH config and start container session init_ssh_config + remove_container_omnia_sh start_container_session exit 0 } @@ -2324,6 +2329,7 @@ rollback_omnia_core() { # Initialize SSH config and start container session init_ssh_config + remove_container_omnia_sh start_container_session } diff --git a/prepare_oim/roles/prepare_oim_validation/tasks/main.yml b/prepare_oim/roles/prepare_oim_validation/tasks/main.yml index 5a252a6114..7da0078e44 100644 --- a/prepare_oim/roles/prepare_oim_validation/tasks/main.yml +++ b/prepare_oim/roles/prepare_oim_validation/tasks/main.yml @@ -13,6 +13,9 @@ # limitations under the License. --- +- name: Validate SSH permissions and ownership + ansible.builtin.include_tasks: validate_ssh_permissions.yml + - name: Validate passwordless ssh host ansible.builtin.include_tasks: validate_passwordless_ssh_oim.yml diff --git a/prepare_oim/roles/prepare_oim_validation/tasks/validate_ssh_permissions.yml b/prepare_oim/roles/prepare_oim_validation/tasks/validate_ssh_permissions.yml new file mode 100644 index 0000000000..aa5a019b93 --- /dev/null +++ b/prepare_oim/roles/prepare_oim_validation/tasks/validate_ssh_permissions.yml @@ -0,0 +1,93 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +- name: Ensure SSH critical paths have safe ownership and permissions + block: + - name: Define SSH critical paths + ansible.builtin.set_fact: + ssh_critical_paths: + - { path: "/root/.ssh", state: "directory", mode: "0700" } + - { path: "/root/.ssh/authorized_keys", state: "file", mode: "0600" } + - { path: "/root/.ssh/id_rsa", state: "file", mode: "0600" } + - { path: "/root/.ssh/id_rsa.pub", state: "file", mode: "0644" } + - { path: "/root/.ssh/known_hosts", state: "file", mode: "0644" } + - { path: "/root/.ssh/config", state: "file", mode: "0600" } + + - name: Ensure SSH directory exists with secure mode + ansible.builtin.file: + path: "/root/.ssh" + state: directory + mode: "0700" + owner: root + group: root + register: ssh_dir_result + + - name: Stat SSH critical paths + ansible.builtin.stat: + path: "{{ item.path }}" + get_checksum: false + register: ssh_path_stats + loop: "{{ ssh_critical_paths }}" + loop_control: + label: "{{ item.path }}" + + - name: Enforce SSH ownership and permissions for existing files + ansible.builtin.file: + path: "{{ item.item.path }}" + state: "{{ item.item.state }}" + mode: "{{ item.item.mode }}" + owner: root + group: root + loop: "{{ ssh_path_stats.results }}" + loop_control: + label: "{{ item.item.path }}" + when: + - item.stat.exists | default(false) + - item.item.state == 'file' + register: ssh_path_fixes + + - name: Log SSH permission adjustments + ansible.builtin.debug: 
+ msg: "Adjusted SSH permissions for {{ item.item.path }} to mode {{ item.item.mode }} with owner root:root" + loop: "{{ (ssh_path_fixes.results | default([]))\ + | selectattr('item', 'defined')\ + | selectattr('changed', 'defined')\ + | selectattr('changed')\ + | list }}" + loop_control: + label: "{{ item.item.path | default('unknown path') }}" + + - name: Log SSH directory adjustments + ansible.builtin.debug: + msg: "Ensured SSH directory {{ ssh_dir_result.path | default('/root/.ssh') }} mode 0700 owner root:root" + when: ssh_dir_result.changed | default(false) + + - name: Validate SSH permission state + ansible.builtin.assert: + that: + - not (item.stat.exists | default(false)) or (item.stat.pw_name == 'root' and item.stat.gr_name == 'root') + - not (item.stat.exists | default(false)) or (item.stat.mode | string)[-4:] == item_mode_expected[item.item.path] + fail_msg: "SSH path {{ item.item.path }} has invalid ownership or mode. Expected root:root with mode {{ item_mode_expected[item.item.path] }}. Fix manually or rerun prepare_oim." + vars: + item_mode_expected: "{{ dict(ssh_critical_paths | map(attribute='path') | zip(ssh_critical_paths | map(attribute='mode'))) }}" + loop: "{{ ssh_path_stats.results }}" + loop_control: + label: "{{ item.item.path }}" + when: item.stat.exists | default(false) + + rescue: + - name: Fail upgrade due to SSH permission issues + ansible.builtin.fail: + msg: "SSH permission validation failed: {{ ansible_failed_result.msg | default('Unknown error') }}. Correct SSH file permissions/ownership and rerun prepare_oim." 
diff --git a/upgrade/roles/import_input_parameters/templates/network_spec.j2 b/upgrade/roles/import_input_parameters/templates/network_spec.j2 index d9e41ba469..564c057db4 100644 --- a/upgrade/roles/import_input_parameters/templates/network_spec.j2 +++ b/upgrade/roles/import_input_parameters/templates/network_spec.j2 @@ -43,9 +43,7 @@ Networks: oim_nic_name: "{{ admin_network.oim_nic_name | default('') }}" netmask_bits: "{{ admin_network.netmask_bits | default('24') }}" primary_oim_admin_ip: "{{ admin_network.primary_oim_admin_ip | default('') }}" -{% if (admin_network.primary_oim_bmc_ip is defined) and ((admin_network.primary_oim_bmc_ip | string | trim) != '') %} - primary_oim_bmc_ip: "{{ admin_network.primary_oim_bmc_ip }}" -{% endif %} + primary_oim_bmc_ip: "{{ admin_network.primary_oim_bmc_ip | default('') }}" dynamic_range: "{{ admin_network.dynamic_range | default('') }}" dns: {{ admin_network.dns | default([]) }} ntp_servers: {{ admin_network.ntp_servers | default([]) }} diff --git a/upgrade/roles/import_input_parameters/templates/omnia_config.j2 b/upgrade/roles/import_input_parameters/templates/omnia_config.j2 index aec7a05ab7..eff82ee1c5 100644 --- a/upgrade/roles/import_input_parameters/templates/omnia_config.j2 +++ b/upgrade/roles/import_input_parameters/templates/omnia_config.j2 @@ -47,6 +47,7 @@ slurm_cluster: - cluster_name: {{ _cluster.cluster_name | default('') }} nfs_storage_name: {{ _cluster.nfs_storage_name | default('') }} {% if _cluster.config_sources is defined and (_cluster.config_sources | length > 0) %} + skip_merge: {{ _cluster.skip_merge | default(true) }} config_sources: {% set _supported = ['slurm', 'cgroup', 'slurmdbd', 'gres'] %} {% for _conf_name, _conf_val in _cluster.config_sources.items() %} @@ -84,6 +85,7 @@ slurm_cluster: # slurmdbd: /path/to/custom_slurmdbd.conf # gres: /path/to/custom_gres.conf {% else %} + # skip_merge: True # config_sources: # slurm: # SlurmctldTimeout: 60 From f397f00701f5b6b0c1bdce13bd112c3b2c70ec63 Mon 
Sep 17 00:00:00 2001 From: pullan1 Date: Mon, 23 Feb 2026 16:28:31 +0530 Subject: [PATCH 60/77] ansible lint fix Signed-off-by: pullan1 --- prepare_oim/roles/deploy_containers/pulp/vars/main.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/prepare_oim/roles/deploy_containers/pulp/vars/main.yml b/prepare_oim/roles/deploy_containers/pulp/vars/main.yml index bcca679033..da17b168d3 100644 --- a/prepare_oim/roles/deploy_containers/pulp/vars/main.yml +++ b/prepare_oim/roles/deploy_containers/pulp/vars/main.yml @@ -42,9 +42,9 @@ pulp_deployed_msg: "The {{ pulp_container_name }} container has been successfull pulp_deployed_fail_msg: "The {{ pulp_container_name }} container deployment failed. Common causes: • Missing or inaccessible pulp container image - • Pulp service not starting successfully + • Pulp service not starting successfully • NFS storage not reachable or not mounted - Run utility/oim_cleanup.yml to cleanup, then re-run the playbook to deploy the {{ pulp_container_name }} + Run utility/oim_cleanup.yml to cleanup, then re-run the playbook to deploy the {{ pulp_container_name }} container successfully." 
retries_var: 8 delay_var: 30 From f0a6461a589523cda229ecd6ccbd797863812e32 Mon Sep 17 00:00:00 2001 From: mithileshreddy04 Date: Mon, 23 Feb 2026 16:53:08 +0530 Subject: [PATCH 61/77] Update validate_ssh_permissions.yml --- .../tasks/validate_ssh_permissions.yml | 24 +++++++++++++------ 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/prepare_oim/roles/prepare_oim_validation/tasks/validate_ssh_permissions.yml b/prepare_oim/roles/prepare_oim_validation/tasks/validate_ssh_permissions.yml index aa5a019b93..6c01a95f47 100644 --- a/prepare_oim/roles/prepare_oim_validation/tasks/validate_ssh_permissions.yml +++ b/prepare_oim/roles/prepare_oim_validation/tasks/validate_ssh_permissions.yml @@ -61,10 +61,11 @@ - name: Log SSH permission adjustments ansible.builtin.debug: msg: "Adjusted SSH permissions for {{ item.item.path }} to mode {{ item.item.mode }} with owner root:root" - loop: "{{ (ssh_path_fixes.results | default([]))\ - | selectattr('item', 'defined')\ - | selectattr('changed', 'defined')\ - | selectattr('changed')\ + loop: "{{ (ssh_path_fixes.results | default([])) + | selectattr('item', 'defined') + | selectattr('item.path', 'defined') + | selectattr('changed', 'defined') + | selectattr('changed') | list }}" loop_control: label: "{{ item.item.path | default('unknown path') }}" @@ -73,13 +74,20 @@ ansible.builtin.debug: msg: "Ensured SSH directory {{ ssh_dir_result.path | default('/root/.ssh') }} mode 0700 owner root:root" when: ssh_dir_result.changed | default(false) + changed_when: false - name: Validate SSH permission state ansible.builtin.assert: that: - not (item.stat.exists | default(false)) or (item.stat.pw_name == 'root' and item.stat.gr_name == 'root') - - not (item.stat.exists | default(false)) or (item.stat.mode | string)[-4:] == item_mode_expected[item.item.path] - fail_msg: "SSH path {{ item.item.path }} has invalid ownership or mode. Expected root:root with mode {{ item_mode_expected[item.item.path] }}. 
Fix manually or rerun prepare_oim." + - >- + not (item.stat.exists | default(false)) or + (item.item.path == '/root/.ssh/known_hosts' and (item.stat.mode | string)[-4:] in ['0600', '0644']) or + (item.item.path != '/root/.ssh/known_hosts' and (item.stat.mode | string)[-4:] == item_mode_expected[item.item.path]) + fail_msg: >- + SSH path {{ item.item.path }} has invalid ownership or mode. + Expected root:root with mode {{ item_mode_expected[item.item.path] }}{% if item.item.path == '/root/.ssh/known_hosts' %} (or 0600/0644 for known_hosts){% endif %}. + Fix manually or rerun prepare_oim. vars: item_mode_expected: "{{ dict(ssh_critical_paths | map(attribute='path') | zip(ssh_critical_paths | map(attribute='mode'))) }}" loop: "{{ ssh_path_stats.results }}" @@ -90,4 +98,6 @@ rescue: - name: Fail upgrade due to SSH permission issues ansible.builtin.fail: - msg: "SSH permission validation failed: {{ ansible_failed_result.msg | default('Unknown error') }}. Correct SSH file permissions/ownership and rerun prepare_oim." + msg: >- + SSH permission validation failed: {{ ansible_failed_result.msg | default('Unknown error') }}. + Correct SSH file permissions/ownership and rerun prepare_oim. 
From d7f2cef080f952a7eae3d4da2f72907ce7b7d094 Mon Sep 17 00:00:00 2001 From: mithileshreddy04 Date: Mon, 23 Feb 2026 17:04:46 +0530 Subject: [PATCH 62/77] updated validate_ssh_permissions.yml --- .../tasks/validate_ssh_permissions.yml | 13 ++++--------- .../roles/prepare_oim_validation/vars/main.yml | 12 ++++++++++++ 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/prepare_oim/roles/prepare_oim_validation/tasks/validate_ssh_permissions.yml b/prepare_oim/roles/prepare_oim_validation/tasks/validate_ssh_permissions.yml index 6c01a95f47..6d01c94a5e 100644 --- a/prepare_oim/roles/prepare_oim_validation/tasks/validate_ssh_permissions.yml +++ b/prepare_oim/roles/prepare_oim_validation/tasks/validate_ssh_permissions.yml @@ -60,7 +60,7 @@ - name: Log SSH permission adjustments ansible.builtin.debug: - msg: "Adjusted SSH permissions for {{ item.item.path }} to mode {{ item.item.mode }} with owner root:root" + msg: "{{ ssh_file_log_msg }}" loop: "{{ (ssh_path_fixes.results | default([])) | selectattr('item', 'defined') | selectattr('item.path', 'defined') @@ -72,7 +72,7 @@ - name: Log SSH directory adjustments ansible.builtin.debug: - msg: "Ensured SSH directory {{ ssh_dir_result.path | default('/root/.ssh') }} mode 0700 owner root:root" + msg: "{{ ssh_dir_log_msg }}" when: ssh_dir_result.changed | default(false) changed_when: false @@ -84,10 +84,7 @@ not (item.stat.exists | default(false)) or (item.item.path == '/root/.ssh/known_hosts' and (item.stat.mode | string)[-4:] in ['0600', '0644']) or (item.item.path != '/root/.ssh/known_hosts' and (item.stat.mode | string)[-4:] == item_mode_expected[item.item.path]) - fail_msg: >- - SSH path {{ item.item.path }} has invalid ownership or mode. - Expected root:root with mode {{ item_mode_expected[item.item.path] }}{% if item.item.path == '/root/.ssh/known_hosts' %} (or 0600/0644 for known_hosts){% endif %}. - Fix manually or rerun prepare_oim. 
+ fail_msg: "{{ ssh_permission_fail_msg }}" vars: item_mode_expected: "{{ dict(ssh_critical_paths | map(attribute='path') | zip(ssh_critical_paths | map(attribute='mode'))) }}" loop: "{{ ssh_path_stats.results }}" @@ -98,6 +95,4 @@ rescue: - name: Fail upgrade due to SSH permission issues ansible.builtin.fail: - msg: >- - SSH permission validation failed: {{ ansible_failed_result.msg | default('Unknown error') }}. - Correct SSH file permissions/ownership and rerun prepare_oim. + msg: "{{ ssh_validation_fail_msg }}" diff --git a/prepare_oim/roles/prepare_oim_validation/vars/main.yml b/prepare_oim/roles/prepare_oim_validation/vars/main.yml index 79bd5f5b4d..5eda60a210 100644 --- a/prepare_oim/roles/prepare_oim_validation/vars/main.yml +++ b/prepare_oim/roles/prepare_oim_validation/vars/main.yml @@ -84,3 +84,15 @@ functional_groups_config_syntax_fail_msg: "Failed. Syntax errors present in func telemetry_config_file: "telemetry_config.yml" fail_msg_telemetry_config_file: "telemetry_config.yml file doesn't exist in the input folder." telemetry_config_syntax_fail_msg: "Failed. Syntax errors present in telemetry_config.yml. Fix errors and re-run playbook again. Common syntax Errors:" + +# Usage: validate_ssh_permissions.yml +ssh_dir_log_msg: "Ensured SSH directory {{ ssh_dir_result.path | default('/root/.ssh') }} mode 0700 owner root:root" +ssh_file_log_msg: "Adjusted SSH permissions for {{ item.item.path }} to mode {{ item.item.mode }} with owner root:root" +ssh_permission_fail_msg: >- + SSH path {{ item.item.path }} has invalid ownership or mode. + Expected root:root with mode {{ item_mode_expected[item.item.path] }} + {% if item.item.path == '/root/.ssh/known_hosts' %} (or 0600/0644 for known_hosts){% endif %}. + Fix manually or rerun prepare_oim. +ssh_validation_fail_msg: >- + SSH permission validation failed: {{ ansible_failed_result.msg | default('Unknown error') }}. + Correct SSH file permissions/ownership and rerun prepare_oim. 
From b606a13e91de44a56d2fb86c9c1aef19ca814322 Mon Sep 17 00:00:00 2001 From: mithileshreddy04 Date: Mon, 23 Feb 2026 17:17:04 +0530 Subject: [PATCH 63/77] Update validate_ssh_permissions.yml --- .../prepare_oim_validation/tasks/validate_ssh_permissions.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/prepare_oim/roles/prepare_oim_validation/tasks/validate_ssh_permissions.yml b/prepare_oim/roles/prepare_oim_validation/tasks/validate_ssh_permissions.yml index 6d01c94a5e..a3581b39e1 100644 --- a/prepare_oim/roles/prepare_oim_validation/tasks/validate_ssh_permissions.yml +++ b/prepare_oim/roles/prepare_oim_validation/tasks/validate_ssh_permissions.yml @@ -73,7 +73,6 @@ - name: Log SSH directory adjustments ansible.builtin.debug: msg: "{{ ssh_dir_log_msg }}" - when: ssh_dir_result.changed | default(false) changed_when: false - name: Validate SSH permission state From 1d6e7e31cf18205ea4927e0f39bf7b29402e85e6 Mon Sep 17 00:00:00 2001 From: Nagachandan-P Date: Mon, 23 Feb 2026 12:48:35 +0000 Subject: [PATCH 64/77] ARM nodes gpu detection --- .../slurm_config/tasks/read_node_idrac.yml | 93 +++++++++++++++++++ 1 file changed, 93 insertions(+) diff --git a/discovery/roles/slurm_config/tasks/read_node_idrac.yml b/discovery/roles/slurm_config/tasks/read_node_idrac.yml index 8424f69603..a713863438 100644 --- a/discovery/roles/slurm_config/tasks/read_node_idrac.yml +++ b/discovery/roles/slurm_config/tasks/read_node_idrac.yml @@ -41,6 +41,99 @@ | selectattr('Manufacturer', 'defined') | selectattr('Manufacturer', 'search', '(?i)nvidia') | list }}" # TODO: other GPUs also +- name: Fallback - Read PCIe Devices for GPU detection (when no GPUs found via Processors) + ansible.builtin.uri: + url: "https://{{ bmc_ip_map[item] }}/redfish/v1/Chassis/System.Embedded.1/PCIeDevices" + user: "{{ bmc_username }}" + password: "{{ bmc_password }}" + method: GET + force_basic_auth: true + validate_certs: false + return_content: true + body_format: json + timeout: 60 + headers: + Accept: 
"application/json" + Content-Type: "application/json" + OData-Version: "4.0" + status_code: + - 200 + register: pcie_devices + failed_when: false + when: gpus | length == 0 + +- name: Debug - Show PCIe devices structure + ansible.builtin.debug: + var: pcie_devices.json.Members + when: gpus | length == 0 and pcie_devices.json.Members is defined + +- name: Fallback - Extract PCIe device URLs + ansible.builtin.set_fact: + pcie_device_urls: "{{ pcie_devices.json.Members | default([]) | json_query('[*].\"@odata.id\"') }}" + when: gpus | length == 0 + +- name: Fallback - Get PCIe Device details for GPU detection + ansible.builtin.uri: + url: "https://{{ bmc_ip_map[item.0] }}{{ item.1 }}" + user: "{{ bmc_username }}" + password: "{{ bmc_password }}" + method: GET + force_basic_auth: true + validate_certs: false + return_content: true + body_format: json + timeout: 60 + headers: + Accept: "application/json" + Content-Type: "application/json" + OData-Version: "4.0" + status_code: + - 200 + register: pcie_device_details + with_nested: + - ["{{ item }}"] + - "{{ pcie_device_urls | default([]) }}" + loop_control: + label: "{{ item.1 }}" + failed_when: false + when: gpus | length == 0 and pcie_device_urls is defined and pcie_device_urls | length > 0 + +- name: Fallback - Detect GPUs from PCIe devices + ansible.builtin.set_fact: + fallback_gpus: "{{ pcie_device_details.results | default([]) + | selectattr('json', 'defined') + | map(attribute='json') + | selectattr('ClassCode', 'defined') + | selectattr('VendorId', 'defined') + | selectattr('ClassCode', 'equalto', '0x0300') | list }}" + when: gpus | length == 0 + +- name: Fallback - Detect GPUs from PCIe devices (additional criteria) + ansible.builtin.set_fact: + fallback_gpus_additional: "{{ pcie_device_details.results | default([]) + | selectattr('json', 'defined') + | map(attribute='json') + | selectattr('ClassCode', 'defined') + | selectattr('VendorId', 'defined') + | selectattr('ClassCode', 'equalto', '0x0302') | list }}" + 
when: gpus | length == 0 and fallback_gpus | default([]) | length == 0 + +- name: Fallback - Detect GPUs from Manufacturer/Name (NVIDIA only) + ansible.builtin.set_fact: + fallback_gpus_manufacturer: "{{ pcie_device_details.results | default([]) + | selectattr('json', 'defined') + | map(attribute='json') + | selectattr('Manufacturer', 'defined') + | selectattr('Name', 'defined') + | selectattr('Manufacturer', 'search', '(?i)NVIDIA') + | selectattr('Name', 'search', '(?i)GPU|RTX|TESLA|A100|H100|L40|GB') | list }}" + when: gpus | length == 0 and fallback_gpus | default([]) | length == 0 and fallback_gpus_additional | default([]) | length == 0 + +- name: Fallback - Update GPUs list if PCIe detection found GPUs + ansible.builtin.set_fact: + gpus: "{{ (fallback_gpus | default([])) or (fallback_gpus_additional | default([])) or (fallback_gpus_manufacturer | default([])) }}" + when: gpus | length == 0 + - name: Read Memory NodeParams ansible.builtin.uri: From 513443ebb62348834fcd35ba71a4a0d1bc9c27ca Mon Sep 17 00:00:00 2001 From: mithileshreddy04 Date: Mon, 23 Feb 2026 18:50:55 +0530 Subject: [PATCH 65/77] Update omnia.sh --- omnia.sh | 13 +-- .../prepare_oim_validation/tasks/main.yml | 3 - .../tasks/validate_ssh_permissions.yml | 97 ------------------- .../prepare_oim_validation/vars/main.yml | 12 --- 4 files changed, 5 insertions(+), 120 deletions(-) delete mode 100644 prepare_oim/roles/prepare_oim_validation/tasks/validate_ssh_permissions.yml diff --git a/omnia.sh b/omnia.sh index a9d35defd6..9de277a56d 100755 --- a/omnia.sh +++ b/omnia.sh @@ -1156,7 +1156,7 @@ validate_nfs_server() { fi } -refresh_known_hosts() { +init_ssh_config() { local ssh_port=2222 mkdir -p "$HOME/.ssh" @@ -1165,10 +1165,6 @@ refresh_known_hosts() { ssh-keyscan -p "$ssh_port" localhost 2>/dev/null | grep -v "^#" >> "$HOME/.ssh/known_hosts" || true } -init_ssh_config() { - refresh_known_hosts -} - remove_container_omnia_sh() { podman exec -u root omnia_core bash -c 'if [ -f /omnia/omnia.sh ]; then 
rm -f /omnia/omnia.sh; fi' >/dev/null 2>&1 || true } @@ -1235,14 +1231,15 @@ install_omnia_core() { exit 1 fi fi - + local omnia_core_tag="2.1" local omnia_core_registry="" # Check if local omnia_core image exists using validate function - if validate_container_image "" "$omnia_core_tag" "install"; then - echo -e "${GREEN}✓ Omnia core image (omnia_core:${omnia_core_tag}) found locally.${NC}" + if ! validate_container_image "" "$omnia_core_tag" "install"; then + exit 1 fi + echo -e "${GREEN}✓ Omnia core image (omnia_core:${omnia_core_tag}) found locally.${NC}" # Check if any other containers with 'omnia' in their name are running other_containers=$(podman ps -a --format '{{.Names}}' | grep -E 'omnia' | grep -v 'omnia_core') diff --git a/prepare_oim/roles/prepare_oim_validation/tasks/main.yml b/prepare_oim/roles/prepare_oim_validation/tasks/main.yml index 7da0078e44..5a252a6114 100644 --- a/prepare_oim/roles/prepare_oim_validation/tasks/main.yml +++ b/prepare_oim/roles/prepare_oim_validation/tasks/main.yml @@ -13,9 +13,6 @@ # limitations under the License. --- -- name: Validate SSH permissions and ownership - ansible.builtin.include_tasks: validate_ssh_permissions.yml - - name: Validate passwordless ssh host ansible.builtin.include_tasks: validate_passwordless_ssh_oim.yml diff --git a/prepare_oim/roles/prepare_oim_validation/tasks/validate_ssh_permissions.yml b/prepare_oim/roles/prepare_oim_validation/tasks/validate_ssh_permissions.yml deleted file mode 100644 index a3581b39e1..0000000000 --- a/prepare_oim/roles/prepare_oim_validation/tasks/validate_ssh_permissions.yml +++ /dev/null @@ -1,97 +0,0 @@ -# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. ---- - -- name: Ensure SSH critical paths have safe ownership and permissions - block: - - name: Define SSH critical paths - ansible.builtin.set_fact: - ssh_critical_paths: - - { path: "/root/.ssh", state: "directory", mode: "0700" } - - { path: "/root/.ssh/authorized_keys", state: "file", mode: "0600" } - - { path: "/root/.ssh/id_rsa", state: "file", mode: "0600" } - - { path: "/root/.ssh/id_rsa.pub", state: "file", mode: "0644" } - - { path: "/root/.ssh/known_hosts", state: "file", mode: "0644" } - - { path: "/root/.ssh/config", state: "file", mode: "0600" } - - - name: Ensure SSH directory exists with secure mode - ansible.builtin.file: - path: "/root/.ssh" - state: directory - mode: "0700" - owner: root - group: root - register: ssh_dir_result - - - name: Stat SSH critical paths - ansible.builtin.stat: - path: "{{ item.path }}" - get_checksum: false - register: ssh_path_stats - loop: "{{ ssh_critical_paths }}" - loop_control: - label: "{{ item.path }}" - - - name: Enforce SSH ownership and permissions for existing files - ansible.builtin.file: - path: "{{ item.item.path }}" - state: "{{ item.item.state }}" - mode: "{{ item.item.mode }}" - owner: root - group: root - loop: "{{ ssh_path_stats.results }}" - loop_control: - label: "{{ item.item.path }}" - when: - - item.stat.exists | default(false) - - item.item.state == 'file' - register: ssh_path_fixes - - - name: Log SSH permission adjustments - ansible.builtin.debug: - msg: "{{ ssh_file_log_msg }}" - loop: "{{ (ssh_path_fixes.results | default([])) - | selectattr('item', 'defined') - | 
selectattr('item.path', 'defined') - | selectattr('changed', 'defined') - | selectattr('changed') - | list }}" - loop_control: - label: "{{ item.item.path | default('unknown path') }}" - - - name: Log SSH directory adjustments - ansible.builtin.debug: - msg: "{{ ssh_dir_log_msg }}" - changed_when: false - - - name: Validate SSH permission state - ansible.builtin.assert: - that: - - not (item.stat.exists | default(false)) or (item.stat.pw_name == 'root' and item.stat.gr_name == 'root') - - >- - not (item.stat.exists | default(false)) or - (item.item.path == '/root/.ssh/known_hosts' and (item.stat.mode | string)[-4:] in ['0600', '0644']) or - (item.item.path != '/root/.ssh/known_hosts' and (item.stat.mode | string)[-4:] == item_mode_expected[item.item.path]) - fail_msg: "{{ ssh_permission_fail_msg }}" - vars: - item_mode_expected: "{{ dict(ssh_critical_paths | map(attribute='path') | zip(ssh_critical_paths | map(attribute='mode'))) }}" - loop: "{{ ssh_path_stats.results }}" - loop_control: - label: "{{ item.item.path }}" - when: item.stat.exists | default(false) - - rescue: - - name: Fail upgrade due to SSH permission issues - ansible.builtin.fail: - msg: "{{ ssh_validation_fail_msg }}" diff --git a/prepare_oim/roles/prepare_oim_validation/vars/main.yml b/prepare_oim/roles/prepare_oim_validation/vars/main.yml index 5eda60a210..79bd5f5b4d 100644 --- a/prepare_oim/roles/prepare_oim_validation/vars/main.yml +++ b/prepare_oim/roles/prepare_oim_validation/vars/main.yml @@ -84,15 +84,3 @@ functional_groups_config_syntax_fail_msg: "Failed. Syntax errors present in func telemetry_config_file: "telemetry_config.yml" fail_msg_telemetry_config_file: "telemetry_config.yml file doesn't exist in the input folder." telemetry_config_syntax_fail_msg: "Failed. Syntax errors present in telemetry_config.yml. Fix errors and re-run playbook again. 
Common syntax Errors:" - -# Usage: validate_ssh_permissions.yml -ssh_dir_log_msg: "Ensured SSH directory {{ ssh_dir_result.path | default('/root/.ssh') }} mode 0700 owner root:root" -ssh_file_log_msg: "Adjusted SSH permissions for {{ item.item.path }} to mode {{ item.item.mode }} with owner root:root" -ssh_permission_fail_msg: >- - SSH path {{ item.item.path }} has invalid ownership or mode. - Expected root:root with mode {{ item_mode_expected[item.item.path] }} - {% if item.item.path == '/root/.ssh/known_hosts' %} (or 0600/0644 for known_hosts){% endif %}. - Fix manually or rerun prepare_oim. -ssh_validation_fail_msg: >- - SSH permission validation failed: {{ ansible_failed_result.msg | default('Unknown error') }}. - Correct SSH file permissions/ownership and rerun prepare_oim. From f2498e45ca1cf7c8be34d33bcfb127e65622d7c9 Mon Sep 17 00:00:00 2001 From: mithileshreddy04 Date: Mon, 23 Feb 2026 19:09:29 +0530 Subject: [PATCH 66/77] SSH pemission and access issue fix for upgrade after prepare_oim (#4019) --- omnia.sh | 41 ++++++++++--------- .../templates/network_spec.j2 | 4 +- .../templates/omnia_config.j2 | 2 + 3 files changed, 25 insertions(+), 22 deletions(-) diff --git a/omnia.sh b/omnia.sh index 530c168e7d..9de277a56d 100755 --- a/omnia.sh +++ b/omnia.sh @@ -381,6 +381,8 @@ setup_omnia_core() { # Post container setup configuration post_setup_config + remove_container_omnia_sh + # Start the container start_container_session } @@ -1102,8 +1104,6 @@ EOF firewall-cmd --permanent --zone=public --add-port=2222/tcp firewall-cmd --reload } - -# This function sets up the configuration for the Omnia core. # post_setup_config is a function that sets up the configuration for the Omnia core. # It creates the necessary directories and files, copies input files from the Omnia container, # and creates the oim_metadata.yml file. @@ -1117,7 +1117,6 @@ post_setup_config() { mkdir -p "$OMNIA_INPUT_DIR/" # Create the default.yml file if it does not exist. 
- # This file contains the name of the project. if [ ! -f "$OMNIA_INPUT_DIR/default.yml" ]; then echo -e "${BLUE} Creating default.yml file.${NC}" { @@ -1140,33 +1139,34 @@ post_setup_config() { } validate_nfs_server() { - - # Validate NFS server permission if [ "$share_option" = "NFS" ]; then - # Create a temporary file inside $omnia_path - temp_file="$omnia_path/temp_file" + local temp_file="$omnia_path/temp_file" touch "$temp_file" - # Check if the file can be chown to root if chown root:root "$temp_file"; then - rm "$temp_file" + rm -f "$temp_file" else echo "Error: Unable to chown file to root in $omnia_path. NFS server permission validation failed. Please ensure no_root_squash option is enabled in the NFS export configuration." exit 1 fi + if [ "`ls -ld $omnia_path/omnia/ssh_config/.ssh/id_rsa | awk '{print $3 ":" $4}'`" != "root:root" ]; then echo "Error: The $omnia_path/omnia/ssh_config/.ssh/id_rsa file should be owned by root:root. NFS server permission validation failed. Please verify the NFS export configuration." 
exit 1 fi fi - } init_ssh_config() { + local ssh_port=2222 + mkdir -p "$HOME/.ssh" - touch $HOME/.ssh/known_hosts - # Add entry to /root/.ssh/known_hosts file to prevent errors caused by Known host - ssh-keygen -R "[localhost]:2222" >/dev/null 2>&1 # Remove existing entry if it exists - ssh-keyscan -p 2222 localhost 2>/dev/null | grep -v "^#" >> $HOME/.ssh/known_hosts # Scan and add the new key + touch "$HOME/.ssh/known_hosts" + ssh-keygen -R "[localhost]:$ssh_port" >/dev/null 2>&1 || true + ssh-keyscan -p "$ssh_port" localhost 2>/dev/null | grep -v "^#" >> "$HOME/.ssh/known_hosts" || true +} + +remove_container_omnia_sh() { + podman exec -u root omnia_core bash -c 'if [ -f /omnia/omnia.sh ]; then rm -f /omnia/omnia.sh; fi' >/dev/null 2>&1 || true } start_container_session() { @@ -1213,8 +1213,8 @@ show_help() { echo "Usage: $0 [--install | --uninstall | --upgrade | --rollback | --version | --help]" echo " -i, --install Install and start the Omnia core container" echo " -u, --uninstall Uninstall the Omnia core container and clean up configuration" - echo " --upgrade Upgrade the Omnia core container to newer version - echo " --rollback Rollback the Omnia core container to previous version + echo " --upgrade Upgrade the Omnia core container to newer version" + echo " --rollback Rollback the Omnia core container to previous version" echo " -v, --version Display Omnia version information" echo " -h, --help More information about usage" } @@ -1231,14 +1231,15 @@ install_omnia_core() { exit 1 fi fi - + local omnia_core_tag="2.1" local omnia_core_registry="" # Check if local omnia_core image exists using validate function - if validate_container_image "" "$omnia_core_tag" "install"; then - echo -e "${GREEN}✓ Omnia core image (omnia_core:${omnia_core_tag}) found locally.${NC}" + if ! 
validate_container_image "" "$omnia_core_tag" "install"; then + exit 1 fi + echo -e "${GREEN}✓ Omnia core image (omnia_core:${omnia_core_tag}) found locally.${NC}" # Check if any other containers with 'omnia' in their name are running other_containers=$(podman ps -a --format '{{.Names}}' | grep -E 'omnia' | grep -v 'omnia_core') @@ -1906,6 +1907,7 @@ upgrade_omnia_core() { show_post_upgrade_instructions "$TARGET_OMNIA_VERSION" # Initialize SSH config and start container session init_ssh_config + remove_container_omnia_sh start_container_session exit 0 } @@ -2324,6 +2326,7 @@ rollback_omnia_core() { # Initialize SSH config and start container session init_ssh_config + remove_container_omnia_sh start_container_session } diff --git a/upgrade/roles/import_input_parameters/templates/network_spec.j2 b/upgrade/roles/import_input_parameters/templates/network_spec.j2 index d9e41ba469..564c057db4 100644 --- a/upgrade/roles/import_input_parameters/templates/network_spec.j2 +++ b/upgrade/roles/import_input_parameters/templates/network_spec.j2 @@ -43,9 +43,7 @@ Networks: oim_nic_name: "{{ admin_network.oim_nic_name | default('') }}" netmask_bits: "{{ admin_network.netmask_bits | default('24') }}" primary_oim_admin_ip: "{{ admin_network.primary_oim_admin_ip | default('') }}" -{% if (admin_network.primary_oim_bmc_ip is defined) and ((admin_network.primary_oim_bmc_ip | string | trim) != '') %} - primary_oim_bmc_ip: "{{ admin_network.primary_oim_bmc_ip }}" -{% endif %} + primary_oim_bmc_ip: "{{ admin_network.primary_oim_bmc_ip | default('') }}" dynamic_range: "{{ admin_network.dynamic_range | default('') }}" dns: {{ admin_network.dns | default([]) }} ntp_servers: {{ admin_network.ntp_servers | default([]) }} diff --git a/upgrade/roles/import_input_parameters/templates/omnia_config.j2 b/upgrade/roles/import_input_parameters/templates/omnia_config.j2 index aec7a05ab7..eff82ee1c5 100644 --- a/upgrade/roles/import_input_parameters/templates/omnia_config.j2 +++ 
b/upgrade/roles/import_input_parameters/templates/omnia_config.j2 @@ -47,6 +47,7 @@ slurm_cluster: - cluster_name: {{ _cluster.cluster_name | default('') }} nfs_storage_name: {{ _cluster.nfs_storage_name | default('') }} {% if _cluster.config_sources is defined and (_cluster.config_sources | length > 0) %} + skip_merge: {{ _cluster.skip_merge | default(true) }} config_sources: {% set _supported = ['slurm', 'cgroup', 'slurmdbd', 'gres'] %} {% for _conf_name, _conf_val in _cluster.config_sources.items() %} @@ -84,6 +85,7 @@ slurm_cluster: # slurmdbd: /path/to/custom_slurmdbd.conf # gres: /path/to/custom_gres.conf {% else %} + # skip_merge: True # config_sources: # slurm: # SlurmctldTimeout: 60 From 7710bd435a8fae845778fbddf295143286e22777 Mon Sep 17 00:00:00 2001 From: Katakam-Rakesh Date: Mon, 23 Feb 2026 19:21:05 +0530 Subject: [PATCH 67/77] updating pulp_cleanup and timeout for tarball and iso Signed-off-by: pullan1 --- .../library/module_utils/local_repo/config.py | 9 ++- common/library/modules/pulp_cleanup.py | 55 ++++++++++++------- 2 files changed, 42 insertions(+), 22 deletions(-) diff --git a/common/library/module_utils/local_repo/config.py b/common/library/module_utils/local_repo/config.py index 3e812a6e47..d8e5593778 100644 --- a/common/library/module_utils/local_repo/config.py +++ b/common/library/module_utils/local_repo/config.py @@ -78,6 +78,11 @@ "aarch64": ["dnf", "info", "--quiet", "--forcearch=aarch64"] } +# ---------------------------- +# Cleanup File Types +# Used by pulp_cleanup.py +# ---------------------------- +CLEANUP_FILE_TYPES = ["iso", "manifest", "pip_module", "tarball", "git", "ansible_galaxy_collection"] # ---------------------------- # Used by download_common.py # ---------------------------- @@ -116,9 +121,9 @@ CLI_FILE_PATH = "/root/.config/pulp/cli.toml" POST_TIMEOUT = 3600 # seconds -TAR_POLL_VAL = 25 # minutes +TAR_POLL_VAL = 45 # minutes FILE_POLL_VAL = 1 # minutes -ISO_POLL_VAL = 15 # minutes +ISO_POLL_VAL = 45 # minutes 
FILE_URI = "/pulp/api/v3/content/file/files/" PULP_SSL_CA_CERT = "/etc/pki/ca-trust/source/anchors/pulp_webserver.crt" # ---------------------------- diff --git a/common/library/modules/pulp_cleanup.py b/common/library/modules/pulp_cleanup.py index a3c155ebdb..a97f6d28d2 100644 --- a/common/library/modules/pulp_cleanup.py +++ b/common/library/modules/pulp_cleanup.py @@ -36,6 +36,7 @@ from ansible.module_utils.local_repo.config import ( CLEANUP_BASE_PATH_DEFAULT, CLEANUP_STATUS_FILE_PATH_DEFAULT, + CLEANUP_FILE_TYPES, pulp_rpm_commands, pulp_container_commands, pulp_file_commands, @@ -173,26 +174,40 @@ def convert_to_pulp_container_name(image_name: str) -> str: # TYPE DETECTION # ============================================================================= -def detect_file_type(name: str) -> str: - """Detect artifact type from name.""" - # Pip module: contains == (e.g., cffi==1.17.1) - if '==' in name: - return "pip_module" - # Ansible Galaxy collection: contains . but no / or == (e.g., community.general, ansible.posix) - if '.' in name and '/' not in name and '==' not in name and any( - x in name.lower() for x in ['ansible', 'community', 'galaxy'] - ): - return "ansible_galaxy_collection" - if name.startswith('ansible_galaxy_collection'): - return "ansible_galaxy_collection" - if any(x in name.lower() for x in ['chart', 'tar', 'tgz', 'helm', 'bundle']): - return "tarball" - if any(x in name.lower() for x in ['git', 'repo', 'source', 'scm']): - return "git" - if any(x in name.lower() for x in ['manifest', 'calico', 'yml', 'yaml']): - return "manifest" - return "file" - +def detect_file_type(name: str, base_path: str = "/opt/omnia/offline_repo/cluster") -> str: + """Detect artifact type by searching for the package name in the filesystem. + + Searches in base_path////{type_folder}/name + and returns the folder type where the package is found. 
+ + Storage structure: + - iso/ : ISO files, run files (e.g., cuda-run) + - manifest/ : Kubernetes manifests (e.g., calico-v3.30.3, metallb-native-v0.15.2) + - pip_module/ : Python pip packages (e.g., PyMySQL==1.1.2, kubernetes==33.1.0) + - tarball/ : Tarballs, helm charts (e.g., helm-v3.19.0-amd64, nvhpc_2025_2511_Linux_x86_64_cuda_13.0) + - git/ : Git repositories + - ansible_galaxy_collection/ : Ansible Galaxy collections + + Args: + name: Package name from JSON (e.g., "calico-v3.30.3", "helm-v3.19.0-amd64") + base_path: Base path to search (default: /opt/omnia/offline_repo/cluster) + + Returns: + str: Type based on folder where package is found, or fallback to name-based detection + """ + + # Search for the package name in the filesystem + # Pattern: base_path/*/*/*/{type_folder}/name + for file_type in CLEANUP_FILE_TYPES: + pattern = f"{base_path}/*/*/*/{file_type}/{name}" + matches = glob.glob(pattern) + if matches: + # Extract the parent folder name and return it + parent_folder = os.path.basename(os.path.dirname(matches[0])) + return parent_folder + + # If not found in filesystem, return None + return None # ============================================================================= # EXISTENCE CHECKS From b0ca27f9b3893f68380903d913fa74d926daddeb Mon Sep 17 00:00:00 2001 From: Nethra mg Date: Mon, 23 Feb 2026 23:01:41 +0530 Subject: [PATCH 68/77] Input validation fix for duplicate admin IP in pxe mapping file --- .../validation_flows/provision_validation.py | 47 +++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/common/library/module_utils/input_validation/validation_flows/provision_validation.py b/common/library/module_utils/input_validation/validation_flows/provision_validation.py index cc6b4d8e76..4ba1515129 100644 --- a/common/library/module_utils/input_validation/validation_flows/provision_validation.py +++ b/common/library/module_utils/input_validation/validation_flows/provision_validation.py @@ -225,6 +225,52 @@ def 
validate_duplicate_service_tags_in_mapping_file(pxe_mapping_file_path): raise ValueError(f"Duplicate SERVICE_TAG found in PXE mapping file: {'; '.join(duplicates)}") +def validate_duplicate_admin_ips_in_mapping_file(pxe_mapping_file_path): + """Validates that ADMIN_IP values in the mapping file are unique.""" + if not pxe_mapping_file_path or not os.path.isfile(pxe_mapping_file_path): + raise ValueError(f"PXE mapping file not found: {pxe_mapping_file_path}") + + with open(pxe_mapping_file_path, "r", encoding="utf-8") as fh: + raw_lines = fh.readlines() + + non_comment_lines = [ln for ln in raw_lines if ln.strip()] + reader = csv.DictReader(non_comment_lines) + + fieldname_map = {fn.strip().upper(): fn for fn in reader.fieldnames} + admin_ip_col = fieldname_map.get("ADMIN_IP") + hostname_col = fieldname_map.get("HOSTNAME") + + if not admin_ip_col: + raise ValueError("ADMIN_IP column not found in PXE mapping file") + + seen_admin_ips = {} + duplicates = [] + + for row_idx, row in enumerate(reader, start=2): + admin_ip = row.get(admin_ip_col, "").strip() if row.get(admin_ip_col) else "" + hostname = "" + if hostname_col: + hostname = row.get(hostname_col, "").strip() if row.get(hostname_col) else "" + + if not admin_ip: + continue + + if admin_ip in seen_admin_ips: + first_row = seen_admin_ips[admin_ip]["row"] + first_host = seen_admin_ips[admin_ip]["hostname"] + dup_host = hostname or "" + first_host_disp = first_host or "" + duplicates.append( + f"'{admin_ip}' at CSV rows {first_row} ({first_host_disp}) and {row_idx} ({dup_host})" + ) + continue + + seen_admin_ips[admin_ip] = {"row": row_idx, "hostname": hostname} + + if duplicates: + raise ValueError(f"Duplicate ADMIN_IP found in PXE mapping file: {'; '.join(duplicates)}") + + def validate_group_parent_service_tag_consistency_in_mapping_file(pxe_mapping_file_path): """Validates that GROUP_NAME has a consistent PARENT_SERVICE_TAG across the mapping file.""" if not pxe_mapping_file_path or not 
os.path.isfile(pxe_mapping_file_path): @@ -740,6 +786,7 @@ def validate_provision_config( validate_functional_groups_in_mapping_file(pxe_mapping_file_path) validate_duplicate_service_tags_in_mapping_file(pxe_mapping_file_path) validate_duplicate_hostnames_in_mapping_file(pxe_mapping_file_path) + validate_duplicate_admin_ips_in_mapping_file(pxe_mapping_file_path) validate_group_parent_service_tag_consistency_in_mapping_file(pxe_mapping_file_path) validate_functional_groups_separation(pxe_mapping_file_path) validate_parent_service_tag_hierarchy(pxe_mapping_file_path) From c1fc2315cee81e065abd9a70b7c21517bbf379a8 Mon Sep 17 00:00:00 2001 From: Jagadeesh N V Date: Tue, 24 Feb 2026 03:05:53 +0530 Subject: [PATCH 69/77] Handled delete scenarios along with /etc/hosts --- .../slurm_config/tasks/build_slurm_conf.yml | 12 +-- discovery/roles/slurm_config/tasks/confs.yml | 20 ++-- .../tasks/drain_and_remove_node.yml | 96 ++++++++++--------- .../tasks/extract_path_overrides.yml | 2 +- .../slurm_config/tasks/handle_extra_confs.yml | 4 +- .../slurm_config/tasks/update_hosts_munge.yml | 48 ++++++++-- discovery/roles/slurm_config/vars/main.yml | 1 + utils/roles/idrac_pxe_boot/vars/main.yml | 2 +- 8 files changed, 111 insertions(+), 74 deletions(-) diff --git a/discovery/roles/slurm_config/tasks/build_slurm_conf.yml b/discovery/roles/slurm_config/tasks/build_slurm_conf.yml index 40b6137172..84bb493442 100644 --- a/discovery/roles/slurm_config/tasks/build_slurm_conf.yml +++ b/discovery/roles/slurm_config/tasks/build_slurm_conf.yml @@ -23,7 +23,7 @@ | combine({'slurm': (apply_config['slurm'] | combine({'NodeName': (apply_config['slurm'].NodeName | default([])) + (node_params | default([]))}))}) }}" when: node_params is defined and node_params - no_log: true + no_log: "{{ _no_log }}" - name: Append login nodes to NodeName list ansible.builtin.set_fact: @@ -32,7 +32,7 @@ | combine({'NodeName': (apply_config['slurm'].NodeName | default([])) + [{'NodeName': item}]}))}) }}" loop: "{{ 
login_list }}" when: login_list is defined and login_list - no_log: true + no_log: "{{ _no_log }}" - name: Append compiler login nodes to NodeName list ansible.builtin.set_fact: @@ -41,7 +41,7 @@ | combine({'NodeName': (apply_config['slurm'].NodeName | default([])) + [{'NodeName': item}]}))}) }}" loop: "{{ compiler_login_list }}" when: compiler_login_list is defined and compiler_login_list - no_log: true + no_log: "{{ _no_log }}" - name: Append Partition ansible.builtin.set_fact: @@ -49,16 +49,16 @@ | combine({'slurm': (apply_config['slurm'] | combine({'PartitionName': (apply_config['slurm'].PartitionName | default([])) + [partition_params]}))}) }}" when: node_params is defined and node_params - no_log: true + no_log: "{{ _no_log }}" - name: Add gpu parameters to slurm conf ansible.builtin.set_fact: apply_config: "{{ apply_config | default({}) | combine({'slurm': (apply_config['slurm'] | combine(gpu_slurm_conf))}) }}" when: gpu_params is defined and gpu_params - no_log: true + no_log: "{{ _no_log }}" - name: Add dbd parameters to slurm conf ansible.builtin.set_fact: apply_config: "{{ apply_config | default({}) | combine({'slurm': (apply_config['slurm'] | combine(dbd_slurm_conf))}) }}" when: dbd_list is defined and dbd_list - no_log: true + no_log: "{{ _no_log }}" diff --git a/discovery/roles/slurm_config/tasks/confs.yml b/discovery/roles/slurm_config/tasks/confs.yml index 1e5a4e507e..d2069497eb 100644 --- a/discovery/roles/slurm_config/tasks/confs.yml +++ b/discovery/roles/slurm_config/tasks/confs.yml @@ -15,7 +15,7 @@ - name: Slurm dict ops ansible.builtin.set_fact: apply_config: "{{ __default_config }}" - no_log: true + no_log: "{{ _no_log }}" - name: Remove keys from conf_files if they have string values in configs_input (when skip_merge is true) ansible.builtin.set_fact: @@ -34,7 +34,7 @@ | combine({'slurmdbd': (apply_config['slurmdbd'] | combine({'DbdHost': ctld_list[0], 'StorageHost': ctld_list[0]}))}) }}" when: ctld_list - no_log: true + no_log: "{{ _no_log 
}}" - name: Check .conf files existence ansible.builtin.stat: @@ -51,7 +51,7 @@ delegate_to: localhost loop: "{{ configs_input | default({}) | dict2items }}" register: parsed_configs_input_results - no_log: true + no_log: "{{ _no_log }}" when: - configs_input is defined - configs_input @@ -62,7 +62,7 @@ ansible.builtin.set_fact: parsed_configs_input: "{{ parsed_configs_input | default({}) | combine({item.item.key: item.conf_dict}) }}" loop: "{{ parsed_configs_input_results.results }}" - no_log: true + no_log: "{{ _no_log }}" when: - parsed_configs_input_results is defined - not item.skipped | default(false) @@ -71,7 +71,7 @@ ansible.builtin.set_fact: parsed_configs_input: "{{ parsed_configs_input | default({}) | combine({item.key: item.value}) }}" loop: "{{ configs_input | default({}) | dict2items }}" - no_log: true + no_log: "{{ _no_log }}" when: - configs_input is defined - configs_input @@ -94,7 +94,7 @@ loop_control: loop_var: existing_conf_set register: prepared_conf_lists - no_log: true + no_log: "{{ _no_log }}" # All the updates to the confs follow after this point before merge - name: Prepend ClusterName and SlurmctldHost to slurm conf sources @@ -102,14 +102,14 @@ conf_merge_dict: "{{ conf_merge_dict | combine({'slurm': [{'ClusterName': cluster_name, 'AccountingStorageHost': dbd_list[0], 'SlurmctldHost': ctld_list}] + conf_merge_dict['slurm']}) }}" when: "'slurm' in conf_merge_dict" - no_log: true + no_log: "{{ _no_log }}" - name: Slurm dbd - DbdHost and StorageHost ansible.builtin.set_fact: conf_merge_dict: "{{ conf_merge_dict | combine({'slurmdbd': [{'DbdHost': ctld_list[0], 'StorageHost': ctld_list[0]}] + conf_merge_dict['slurmdbd']}) }}" when: "'slurmdbd' in conf_merge_dict" - no_log: true + no_log: "{{ _no_log }}" - name: Merge the confs slurm_conf: @@ -118,7 +118,7 @@ conf_name: "{{ item.key }}" loop: "{{ conf_merge_dict | dict2items }}" register: merged_conf - no_log: true + no_log: "{{ _no_log }}" - name: Update slurm_conf_dict with merged 
configuration for cloud_init read. # TODO: Remove cloud init dependency ansible.builtin.set_fact: @@ -182,7 +182,7 @@ remote_src: "{{ copy_from_oim }}" loop: "{{ merged_conf.results }}" register: ctld_conf_files - no_log: true + no_log: "{{ _no_log }}" when: - item.ini_lines diff --git a/discovery/roles/slurm_config/tasks/drain_and_remove_node.yml b/discovery/roles/slurm_config/tasks/drain_and_remove_node.yml index da1c41d3fe..1c60299ed2 100644 --- a/discovery/roles/slurm_config/tasks/drain_and_remove_node.yml +++ b/discovery/roles/slurm_config/tasks/drain_and_remove_node.yml @@ -46,64 +46,72 @@ ansible.builtin.debug: msg: "Node {{ node_to_remove }} currently has {{ current_jobs.stdout }} running job(s)" - - name: Drain the node to prevent new job assignments - ansible.builtin.command: > - scontrol update NodeName={{ node_to_remove }} - State=DRAIN - Reason="Scheduled removal - waiting for jobs to complete" - changed_when: true - delegate_to: "{{ ctld }}" + - name: Prompt for user input when jobs are running + ansible.builtin.pause: + prompt: | + ================================================================================ + WARNING: ACTIVE JOBS DETECTED ON NODE {{ node_to_remove }} + ================================================================================ - - name: Wait for all jobs to complete on the node - ansible.builtin.shell: - cmd: | - set -o pipefail - squeue -w {{ node_to_remove }} -h | wc -l - register: job_count_check - until: job_count_check.stdout | int == 0 - retries: "{{ (node_drain_timeout / node_drain_delay) | int }}" - delay: "{{ node_drain_delay }}" - changed_when: false - delegate_to: "{{ ctld }}" - when: current_jobs.stdout | int > 0 + Current Status: + - Node: {{ node_to_remove }} + - Running Jobs: {{ current_jobs.stdout }} + - Node State: Will be set to DOWN and removed from cluster + - Impact: All running jobs on this node will be terminated - - name: Confirm jobs completed - ansible.builtin.debug: - msg: "All jobs on {{ 
node_to_remove }} have completed" - when: current_jobs.stdout | int > 0 + To view job details, run: + squeue -w {{ node_to_remove }} - - name: Log node removal - ansible.builtin.debug: - msg: "Node {{ node_to_remove }} has been drained, jobs completed, and set to DOWN state" + Available Options: + 1. ABORT AND CANCEL MANUALLY (Recommended) + - Press Ctrl+C, then 'A' to abort this playbook + - Manually cancel jobs: scancel -w {{ node_to_remove }} + - Or wait for jobs to complete naturally + - Then re-run this playbook - rescue: - - name: Log node removal failure - ansible.builtin.debug: - msg: "Failed to drain node {{ node_to_remove }}" + 2. FORCE REMOVAL (Destructive) + - Press Enter to proceed with immediate node removal + - All {{ current_jobs.stdout }} job(s) will be forcefully terminated + - Users will lose any unsaved work + - Job data may be incomplete or corrupted - - name: Remove slurm node with running job after timeout - ansible.builtin.pause: - prompt: | - Node {{ node_to_remove }} has been DRAINED to prevent new job assignments. - Jobs are still running on {{ node_to_remove }} after wait of {{ node_drain_timeout }} seconds. - Options: - 1. Press Ctrl+C then 'A' to abort - 2. 
Press Enter to force removal (jobs will be killed) - when: not force_scancel_node + ================================================================================ + Your choice (Ctrl+C then 'A' to abort, or Enter to force remove): + when: + - current_jobs.stdout | int > 0 + - not force_scancel_node - - name: Force cancel jobs if timeout reached - ansible.builtin.command: scancel -f -w {{ node_to_remove }} + - name: Force cancel jobs on the node to be removed from cluster + ansible.builtin.command: scancel -f -w {{ node_to_remove }} # Safe does not fail if no jobs are running changed_when: true - failed_when: false + register: scancel_result + failed_when: scancel_result.rc != 0 delegate_to: "{{ ctld }}" - always: - name: Set node to DOWN state ansible.builtin.command: > scontrol update NodeName={{ node_to_remove }} State=DOWN - Reason="Node removed from cluster" + Reason="Node removed from cluster via OMNIA discovery.yml" changed_when: true failed_when: false delegate_to: "{{ ctld }}" when: node_exists_check.rc == 0 + + - name: Stop the slurmd service on node + ansible.builtin.service: + name: slurmd + state: stopped + delegate_to: "{{ node_to_remove }}" + ignore_unreachable: true + failed_when: false + + - name: Delete the dir from NFS + ansible.builtin.file: + path: "{{ slurm_config_path }}/{{ node_to_remove }}" + state: absent + rescue: + - name: Failure to remove node + ansible.builtin.fail: + msg: "Node {{ node_to_remove }} failed to be removed from slurm cluster, + as task {{ ansible_failed_task.name }} failed." 
diff --git a/discovery/roles/slurm_config/tasks/extract_path_overrides.yml b/discovery/roles/slurm_config/tasks/extract_path_overrides.yml index 0efcf18962..9e4ae518a2 100644 --- a/discovery/roles/slurm_config/tasks/extract_path_overrides.yml +++ b/discovery/roles/slurm_config/tasks/extract_path_overrides.yml @@ -24,7 +24,7 @@ ansible.builtin.set_fact: slurmdbd_merged_dict: "{{ (merged_conf.results | selectattr('item.key', 'equalto', 'slurmdbd') | first).conf_dict }}" when: "'slurmdbd' in conf_merge_dict" - no_log: true + no_log: "{{ _no_log }}" - name: Extract cgroup.conf merged dict ansible.builtin.set_fact: diff --git a/discovery/roles/slurm_config/tasks/handle_extra_confs.yml b/discovery/roles/slurm_config/tasks/handle_extra_confs.yml index 544822ec28..d7a0b4f382 100644 --- a/discovery/roles/slurm_config/tasks/handle_extra_confs.yml +++ b/discovery/roles/slurm_config/tasks/handle_extra_confs.yml @@ -19,7 +19,7 @@ conf_name: "{{ extra_conf }}" register: ex_conf delegate_to: localhost - no_log: true + no_log: "{{ _no_log }}" when: - "'.' not in extra_conf" @@ -31,7 +31,7 @@ owner: "{{ slurm_user }}" group: "{{ slurm_user_group }}" remote_src: "{{ copy_from_oim }}" - no_log: true + no_log: "{{ _no_log }}" when: - "'.' not in extra_conf" - ex_conf is success diff --git a/discovery/roles/slurm_config/tasks/update_hosts_munge.yml b/discovery/roles/slurm_config/tasks/update_hosts_munge.yml index 64c36dbeaf..147f1b484c 100644 --- a/discovery/roles/slurm_config/tasks/update_hosts_munge.yml +++ b/discovery/roles/slurm_config/tasks/update_hosts_munge.yml @@ -12,18 +12,45 @@ # See the License for the specific language governing permissions and # limitations under the License. 
--- -- name: Update /etc/hosts with controller hostname and IP - ansible.builtin.lineinfile: - path: /etc/hosts - regexp: '^{{ host_entry.value }}\s+{{ host_entry.key }}' - line: "{{ host_entry.value }} {{ host_entry.key }}" - state: present - loop: "{{ ip_name_map | dict2items | list }}" - loop_control: - loop_var: host_entry +- name: Edit /etc/hosts file till DNS ignore_unreachable: true - failed_when: false delegate_to: "{{ slurmhost_ip }}" + block: + - name: Remove deleted nodes if any hostname exists in /etc/hosts + ansible.builtin.lineinfile: + path: "/etc/hosts" + regexp: '(\b{{ node_to_remove }}\b)' + state: absent + loop: "{{ nodes_in_normal_not_in_cmpt }}" + loop_control: + loop_var: node_to_remove + when: + - nodes_in_normal_not_in_cmpt is defined + - nodes_in_normal_not_in_cmpt | length > 0 + + - name: Remove existing /etc/hosts entries containing the IP or hostname + ansible.builtin.lineinfile: + path: "/etc/hosts" + regexp: '(\b{{ host_entry.value }}\b|\b{{ host_entry.key }}\b)' + state: absent + loop: "{{ ip_name_map | dict2items | list }}" + loop_control: + loop_var: host_entry + + - name: Add correct /etc/hosts entry for controller hostname and IP + ansible.builtin.lineinfile: + path: "/etc/hosts" + line: "{{ host_entry.value }} {{ host_entry.key }}" + state: present + mode: '0644' + create: true + loop: "{{ ip_name_map | dict2items | list }}" + loop_control: + loop_var: host_entry + rescue: + - name: Print error if editing /etc/hosts fails + ansible.builtin.debug: + msg: "Failed to edit /etc/hosts file on {{ slurmhost_ip }}" - name: Get munge changes ansible.builtin.set_fact: @@ -37,6 +64,7 @@ - munge_key_changed[name_ip_map[slurmhost_ip]]['changed'] | default(false) - restart_slurm_services delegate_to: "{{ slurmhost_ip }}" + no_log: "{{ _no_log }}" ignore_unreachable: true block: - name: Update munge key permissions diff --git a/discovery/roles/slurm_config/vars/main.yml b/discovery/roles/slurm_config/vars/main.yml index d708eb0777..cc57a984da 
100644 --- a/discovery/roles/slurm_config/vars/main.yml +++ b/discovery/roles/slurm_config/vars/main.yml @@ -121,6 +121,7 @@ slurm_db_cnf_mode: "0600" node_drain_timeout: 900 node_drain_delay: 30 force_scancel_node: false +_no_log: true dbd_slurm_conf: AccountingStoragePort: "{{ slurm_dbd_port }}" AccountingStorageType: accounting_storage/slurmdbd diff --git a/utils/roles/idrac_pxe_boot/vars/main.yml b/utils/roles/idrac_pxe_boot/vars/main.yml index bebd2b4a42..53de8aa0e9 100644 --- a/utils/roles/idrac_pxe_boot/vars/main.yml +++ b/utils/roles/idrac_pxe_boot/vars/main.yml @@ -16,7 +16,7 @@ restart_host: true # Change to true for forceful reboot. by default graceful will happen -force_restart: false +force_restart: true # Set boot source override mode. Valid values are once, continuous, or disabled boot_source_override_enabled: continuous From a6722a966e462791b51d77c584377eaee084e23c Mon Sep 17 00:00:00 2001 From: Jagadeesh N V Date: Tue, 24 Feb 2026 12:45:30 +0530 Subject: [PATCH 70/77] Delete node removal of service and NFS data --- .../slurm_config/tasks/check_ctld_running.yml | 10 ---------- .../tasks/drain_and_remove_node.yml | 8 ++++---- .../roles/slurm_config/tasks/remove_node.yml | 20 +++++++++++++++++++ 3 files changed, 24 insertions(+), 14 deletions(-) diff --git a/discovery/roles/slurm_config/tasks/check_ctld_running.yml b/discovery/roles/slurm_config/tasks/check_ctld_running.yml index ce27d3c362..92ba39376e 100644 --- a/discovery/roles/slurm_config/tasks/check_ctld_running.yml +++ b/discovery/roles/slurm_config/tasks/check_ctld_running.yml @@ -22,16 +22,6 @@ register: ssh_check ignore_errors: true -- name: Drain and remove nodes if any - ansible.builtin.include_tasks: drain_and_remove_node.yml - loop: "{{ nodes_in_normal_not_in_cmpt }}" - loop_control: - loop_var: node_to_remove - when: - - ssh_check is success - - nodes_in_normal_not_in_cmpt is defined - - nodes_in_normal_not_in_cmpt | length > 0 - - name: Enter slurm controller when pingable when: - 
ssh_check is success diff --git a/discovery/roles/slurm_config/tasks/drain_and_remove_node.yml b/discovery/roles/slurm_config/tasks/drain_and_remove_node.yml index 1c60299ed2..cf62b156aa 100644 --- a/discovery/roles/slurm_config/tasks/drain_and_remove_node.yml +++ b/discovery/roles/slurm_config/tasks/drain_and_remove_node.yml @@ -18,7 +18,7 @@ failed_when: false ignore_unreachable: true changed_when: false - delegate_to: "{{ ctld }}" + delegate_to: "{{ ctld_list[0] }}" - name: Skip if node does not exist ansible.builtin.debug: @@ -40,7 +40,7 @@ squeue -w {{ node_to_remove }} -h | wc -l register: current_jobs changed_when: false - delegate_to: "{{ ctld }}" + delegate_to: "{{ ctld_list[0] }}" - name: Display job information ansible.builtin.debug: @@ -86,7 +86,7 @@ changed_when: true register: scancel_result failed_when: scancel_result.rc != 0 - delegate_to: "{{ ctld }}" + delegate_to: "{{ ctld_list[0] }}" - name: Set node to DOWN state ansible.builtin.command: > @@ -95,7 +95,7 @@ Reason="Node removed from cluster via OMNIA discovery.yml" changed_when: true failed_when: false - delegate_to: "{{ ctld }}" + delegate_to: "{{ ctld_list[0] }}" when: node_exists_check.rc == 0 - name: Stop the slurmd service on node diff --git a/discovery/roles/slurm_config/tasks/remove_node.yml b/discovery/roles/slurm_config/tasks/remove_node.yml index ba93bb086a..eecf6d4f1b 100644 --- a/discovery/roles/slurm_config/tasks/remove_node.yml +++ b/discovery/roles/slurm_config/tasks/remove_node.yml @@ -12,6 +12,26 @@ # See the License for the specific language governing permissions and # limitations under the License. 
--- +- name: Check if controller is reachable via SSH + ansible.builtin.wait_for: + host: "{{ ctld_list[0] }}" + port: 22 # TODO: make it configurable + timeout: 10 + state: started + delegate_to: localhost + register: ssh_check + ignore_errors: true + +- name: Drain and remove nodes if any + ansible.builtin.include_tasks: drain_and_remove_node.yml + loop: "{{ nodes_in_normal_not_in_cmpt }}" + loop_control: + loop_var: node_to_remove + when: + - ssh_check is success + - nodes_in_normal_not_in_cmpt is defined + - nodes_in_normal_not_in_cmpt | length > 0 + - name: Remove nodes from NodeName list that are not in cmpt_list ansible.builtin.set_fact: filtered_nodenames: "{{ slurm_conf_dict.NodeName | rejectattr('NodeName', 'in', nodes_in_normal_not_in_cmpt) | list }}" From 8e198f40ad2aca07d96080ba9255695239a6a6eb Mon Sep 17 00:00:00 2001 From: "balajikumaran.cs" Date: Tue, 24 Feb 2026 13:05:52 +0530 Subject: [PATCH 71/77] Fix admin dynamic_range subnet validation and remove flawed netmask check (#4010) --- .../common_utils/validation_utils.py | 59 ++++++++----------- .../validation_flows/provision_validation.py | 25 ++++---- 2 files changed, 34 insertions(+), 50 deletions(-) diff --git a/common/library/module_utils/input_validation/common_utils/validation_utils.py b/common/library/module_utils/input_validation/common_utils/validation_utils.py index 21a54cabe5..7f5693e9c9 100644 --- a/common/library/module_utils/input_validation/common_utils/validation_utils.py +++ b/common/library/module_utils/input_validation/common_utils/validation_utils.py @@ -527,6 +527,30 @@ def validate_netmask_bits(bits): except (ValueError, TypeError): return False +def is_range_within_subnet(ip_range, reference_ip, netmask_bits): + """ + Validates that the given IP range falls within the subnet + derived from reference_ip and netmask_bits. + + Args: + ip_range (str): IP range in "start_ip-end_ip" format. + reference_ip (str): A reference IP in the subnet (e.g., primary_oim_admin_ip). 
+ netmask_bits (str or int): The CIDR prefix length (e.g., "24"). + + Returns: + bool: True if both start and end IPs are within the subnet, False otherwise. + """ + try: + network = ipaddress.IPv4Network(f"{reference_ip}/{netmask_bits}", strict=False) + parts = ip_range.split("-") + if len(parts) != 2: + return False + start_ip = ipaddress.IPv4Address(parts[0].strip()) + end_ip = ipaddress.IPv4Address(parts[1].strip()) + return start_ip in network and end_ip in network + except (ValueError, TypeError): + return False + def check_bmc_static_range_overlap(static_range, static_range_group_mapping) -> list: """ Checks if the given static BMC range overlaps with any of the ranges in other groups. @@ -625,41 +649,6 @@ def check_port_ranges(port_ranges) -> bool: return True -def is_range_within_netmask(ip_range, netmask_bits): - """ - Check if a given IP range falls within the valid IP address range for a given netmask. - - Args: - ip_range (str): The IP range in format "start_ip-end_ip" - (e.g., "192.168.1.10-192.168.1.50"). - netmask_bits (int or str): The netmask bits (e.g., 20 for /20). - - Returns: - bool: True if the IP range is valid for the given netmask, False otherwise. 
- """ - try: - # Parse the IP range - start_ip, end_ip = ip_range.split('-') - start_ip_obj = ipaddress.ip_address(start_ip) - end_ip_obj = ipaddress.ip_address(end_ip) - - # Ensure start_ip <= end_ip - if start_ip_obj > end_ip_obj: - return False - - # Create network from start_ip with the given netmask - network = ipaddress.ip_network(f"{start_ip}/{netmask_bits}", strict=False) - - # Get first and last usable addresses (excluding network and broadcast) - first_usable = network.network_address + 1 - last_usable = network.broadcast_address - 1 - - # Check if both start and end IPs are within the usable range - return (first_usable <= start_ip_obj <= last_usable and - first_usable <= end_ip_obj <= last_usable) - except (ValueError, TypeError): - return False - def is_ip_within_range(ip_range, ip): """ Check if a given IP falls within a specified IP range. diff --git a/common/library/module_utils/input_validation/validation_flows/provision_validation.py b/common/library/module_utils/input_validation/validation_flows/provision_validation.py index cc6b4d8e76..63c8c25387 100644 --- a/common/library/module_utils/input_validation/validation_flows/provision_validation.py +++ b/common/library/module_utils/input_validation/validation_flows/provision_validation.py @@ -905,6 +905,16 @@ def _validate_admin_network(network): ) ) + # Ensure dynamic_range is inside the admin subnet (primary_oim_admin_ip/netmask_bits) + if not validation_utils.is_range_within_subnet(admin_net["dynamic_range"], primary_oim_admin_ip, netmask): + errors.append( + create_error_msg( + "admin_network.dynamic_range", + admin_net["dynamic_range"], + en_us_validation_msg.RANGE_NETMASK_BOUNDARY_FAIL_MSG, + ) + ) + # Admin and BMC IP should not be the same errors.extend(validate_admin_bmc_ip_not_same(primary_oim_admin_ip, primary_oim_bmc_ip)) @@ -1034,20 +1044,5 @@ def _validate_ip_ranges(dynamic_range, network_type, netmask_bits): ) ) - # Validate that IP ranges are within the netmask boundaries - if 
netmask_bits: - # Check dynamic range - if (validation_utils.validate_ipv4_range(dynamic_range) and - not validation_utils.is_range_within_netmask( - dynamic_range, netmask_bits - )): - errors.append( - create_error_msg( - f"{network_type}.dynamic_range", - dynamic_range, - en_us_validation_msg.RANGE_NETMASK_BOUNDARY_FAIL_MSG, - ) - ) - return errors From 6a5d4f0ba6ca729f89b3ef9a266754ce44ca5315 Mon Sep 17 00:00:00 2001 From: Vrinda_Marwah Date: Tue, 24 Feb 2026 09:32:58 +0000 Subject: [PATCH 72/77] fix input validation for high_availability_config.yml Signed-off-by: Vrinda_Marwah --- .../common_utils/en_us_validation_msg.py | 2 ++ .../validation_flows/high_availability_validation.py | 10 ++++++++++ 2 files changed, 12 insertions(+) diff --git a/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py b/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py index e72c474513..a8027529b9 100644 --- a/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py +++ b/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py @@ -384,6 +384,8 @@ def server_spec_network_key_fail_msg(nic_device): "roles_config.yml") FEILD_MUST_BE_EMPTY = "feild must be empty." DUPLICATE_VIRTUAL_IP = "is already used. Please give unique virtual ip address" +VIRTUAL_IP_SAME_AS_PRIMARY_OIM_ADMIN_IP = ("virtual_ip_address provided in high_availability_config.yml must not be the same as primary_oim_admin_ip in network_spec.yml. " + "Please provide a different virtual IP address.") INVALID_PASSIVE_NODE_SERVICE_TAG = "active node and passive node service tag cannot be same." GROUP_NOT_FOUND = "is not defined in the roles_config.yml. Please define the group in roles_config.yml" ROLE_NODE_FOUND = "is not defined in roles_config.yml. 
Please define the role in roles_config.yml" diff --git a/common/library/module_utils/input_validation/validation_flows/high_availability_validation.py b/common/library/module_utils/input_validation/validation_flows/high_availability_validation.py index 4b67f09789..d7d2415b2b 100644 --- a/common/library/module_utils/input_validation/validation_flows/high_availability_validation.py +++ b/common/library/module_utils/input_validation/validation_flows/high_availability_validation.py @@ -310,6 +310,16 @@ def validate_vip_address( - None: The function does not return any value, it only appends error messages to the errors list. """ + + if vip_address == oim_admin_ip: + errors.append( + create_error_msg( + f"{config_type} virtual_ip_address", + vip_address, + en_us_validation_msg.VIRTUAL_IP_SAME_AS_PRIMARY_OIM_ADMIN_IP, + ) + ) + # virtual_ip_address is mutually exclusive with admin dynamic ranges vip_within_dynamic_range = validation_utils.is_ip_within_range( admin_network["dynamic_range"], vip_address From 663db575afa003886307229cf139fee8b7ce19e8 Mon Sep 17 00:00:00 2001 From: Vrinda_Marwah Date: Tue, 24 Feb 2026 09:39:09 +0000 Subject: [PATCH 73/77] updating copyrights Signed-off-by: Vrinda_Marwah --- .../input_validation/common_utils/en_us_validation_msg.py | 2 +- .../validation_flows/high_availability_validation.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py b/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py index a8027529b9..15a8537ac5 100644 --- a/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py +++ b/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py @@ -1,4 +1,4 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/common/library/module_utils/input_validation/validation_flows/high_availability_validation.py b/common/library/module_utils/input_validation/validation_flows/high_availability_validation.py index d7d2415b2b..5e222d04b5 100644 --- a/common/library/module_utils/input_validation/validation_flows/high_availability_validation.py +++ b/common/library/module_utils/input_validation/validation_flows/high_availability_validation.py @@ -1,4 +1,4 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From 4e2b39196df305724aa1baf926bb85f7f7e8e7cd Mon Sep 17 00:00:00 2001 From: mithileshreddy04 Date: Tue, 24 Feb 2026 15:32:08 +0530 Subject: [PATCH 74/77] Updating Reprovision guidance, update success msg and rollback flow --- omnia.sh | 46 ++++++++------------ upgrade/roles/upgrade_cluster/tasks/main.yml | 40 +++++++++++++---- upgrade/roles/upgrade_cluster/vars/main.yml | 4 ++ 3 files changed, 55 insertions(+), 35 deletions(-) diff --git a/omnia.sh b/omnia.sh index 9de277a56d..e380de2745 100755 --- a/omnia.sh +++ b/omnia.sh @@ -255,19 +255,17 @@ show_post_upgrade_instructions() { echo -e "${YELLOW} IMPORTANT POST-UPGRADE STEP${NC}" echo -e "${YELLOW}================================================================================${NC}" echo "" - echo -e "${GREEN}✓ Omnia core container has been successfully upgraded${NC}" - echo -e "${GREEN}✓ Version updated to: $upgraded_version${NC}" - echo "" echo -e "${BLUE}NEXT REQUIRED ACTION:${NC}" echo -e "${YELLOW}You must now run the upgrade playbook inside the omnia_core container:${NC}" echo "" - echo -e "${GREEN}podman exec -it omnia_core ansible-playbook 
/omnia/upgrade/upgrade_omnia.yml${NC}" + echo -e "${GREEN}ansible-playbook /omnia/upgrade/upgrade_omnia.yml${NC}" echo "" echo -e "${BLUE}This playbook will:${NC}" - echo -e "• Update input files" - echo -e "• Update internal configurations" + echo -e "• Update input files based on the previous version inputs" + echo -e "• Provide further steps to follow" + echo -e "• Provide user guidance for provisioning nodes" echo "" - echo -e "${YELLOW}Note: Run this command after the container is fully healthy and stable${NC}" + echo -e "${YELLOW}Note: Run the above command after the container is fully healthy and stable${NC}" echo -e "${YELLOW}================================================================================${NC}" echo "" } @@ -1167,6 +1165,7 @@ init_ssh_config() { remove_container_omnia_sh() { podman exec -u root omnia_core bash -c 'if [ -f /omnia/omnia.sh ]; then rm -f /omnia/omnia.sh; fi' >/dev/null 2>&1 || true + podman exec -u root omnia_core bash -c 'if [ -d /omnia/input ]; then rm -rf /omnia/input; fi' >/dev/null 2>&1 || true } start_container_session() { @@ -1904,6 +1903,16 @@ upgrade_omnia_core() { # Seed inputs and defaults after upgrade post_setup_config + echo "" + echo -e "${GREEN}================================================================================${NC}" + echo -e "${GREEN} UPGRADE COMPLETED SUCCESSFULLY${NC}" + echo -e "${GREEN}================================================================================${NC}" + echo "" + echo -e "${GREEN}✓ Omnia core has been upgraded to version $TARGET_OMNIA_VERSION${NC}" + echo -e "${GREEN}✓ Container is running and healthy${NC}" + echo -e "${GREEN}✓ Configuration backed up to: $backup_base${NC}" + echo "" + show_post_upgrade_instructions "$TARGET_OMNIA_VERSION" # Initialize SSH config and start container session init_ssh_config @@ -2184,26 +2193,9 @@ rollback_omnia_core() { exit 1 fi - echo "" - echo "Available backups for version $selected_version:" - for i in "${!backup_dirs[@]}"; do - local 
backup_path="${backup_dirs[$i]}" - local backup_date=$(podman exec -u root omnia_core stat -c '%y' "$backup_path" 2>/dev/null | cut -d' ' -f1,2 | cut -d'.' -f1) - echo " $((i+1)). Backup created: $backup_date" - done - - # Prompt for backup selection - echo "" - echo -n "Select backup to restore from (1-${#backup_dirs[@]}): " - read -r backup_selection - - # Validate backup selection - if ! [[ "$backup_selection" =~ ^[0-9]+$ ]] || [ "$backup_selection" -lt 1 ] || [ "$backup_selection" -gt ${#backup_dirs[@]} ]; then - echo -e "${RED}ERROR: Invalid backup selection.${NC}" - exit 1 - fi - - local selected_backup="${backup_dirs[$((backup_selection-1))]}" + # Auto-select the most recent backup (first in sorted list) + local selected_backup="${backup_dirs[0]}" + echo "Auto-selecting backup: $selected_backup" # Validate selected backup exists if ! podman exec -u root omnia_core test -d "$selected_backup" 2>/dev/null; then diff --git a/upgrade/roles/upgrade_cluster/tasks/main.yml b/upgrade/roles/upgrade_cluster/tasks/main.yml index 90b25611b5..ce91c4c598 100644 --- a/upgrade/roles/upgrade_cluster/tasks/main.yml +++ b/upgrade/roles/upgrade_cluster/tasks/main.yml @@ -12,56 +12,80 @@ # See the License for the specific language governing permissions and # limitations under the License. 
--- +- name: Gather NFS share paths from storage_config.yml + ansible.builtin.set_fact: + nfs_slurm_server_share_path: "{{ nfs_params | selectattr('nfs_name', 'equalto', 'nfs_slurm') | map(attribute='server_share_path') | first | default('not specified') }}" + nfs_k8s_server_share_path: "{{ nfs_params | selectattr('nfs_name', 'equalto', 'nfs_k8s') | map(attribute='server_share_path') | first | default('not specified') }}" - name: Display cluster reprovision guidance ansible.builtin.pause: prompt: "{{ '\x1b[32m' }}=================================================== CLUSTER REPROVISION REQUIRED - =========================================================== + ========================================================== Cluster reprovisioning is required after upgrade to enable new features. Review and update new 2.1 input fields present at /opt/omnia/input/project_default/ directory before reprovisioning: + 1. local_repo_config.yml - Set additional_repos_x86_64 (list of extra repo URLs or file paths for x86_64) - Set additional_repos_aarch64 (list of extra repo URLs or file paths for aarch64) + 2. network_spec.yml (ib_network section) - Define InfiniBand fabric settings (subnet manager/BMC, IP ranges, VLAN if applicable) - Ensure host IB interfaces map to the IB network entries + 3. 
omnia_config.yml (slurm_cluster.config_source) - Use the new structure: config_source: { type: , location: } - Populate location to point to your Slurm config bundle (local path or remote URL) - Do NFS cleanup (if NFS share is used for k8s/slurm) + - New variable: skip_merge (set to true to skip merging configs during upgrade when using external bundles) + + + Optional: NFS cleanup (only if you are reprovisioning the cluster) + + If you choose to reprovision the cluster and your setup uses an NFS share for Kubernetes and/or Slurm, you may optionally perform an NFS cleanup beforehand: + + Detected NFS share paths from storage_config.yml: - - Clean stale mounts and ensure the NFS share is accessible before reprovision + - Slurm (nfs_slurm) server_share_path: {{ nfs_slurm_server_share_path }} - - Remove any leftover cluster state on the NFS share that could conflict with fresh deployment + - Kubernetes (nfs_k8s) server_share_path: {{ nfs_k8s_server_share_path }} - Run the following playbooks in sequence from the Omnia root directory to reprovision the cluster: + Clean stale mounts and confirm the NFS share is reachable and accessible. + + Remove any leftover cluster state on the NFS share that could conflict with a fresh deployment. + + + Optional: Reprovision playbooks (run in order from the Omnia root directory) 1. ansible-playbook local_repo/local_repo.yml 2. ansible-playbook build_image_x86_64/build_image_x86_64.yml - 3. Only if the user is using aarch64 nodes, run the below playbook after build_image_x86_64: + 3. Only if using aarch64 nodes (run after x86_64 image build): - ansible-playbook build_image_aarch64/build_image_aarch64.yml + -> ansible-playbook build_image_aarch64/build_image_aarch64.yml 4. ansible-playbook discovery/discovery.yml - Please follow the omnia documentation for steps in more detail. + + For detailed steps and prerequisites, follow the official Omnia documentation. 
+ + + ================================================================== + ======================================================================== {{ '\x1b[0m' }}" seconds: 1 diff --git a/upgrade/roles/upgrade_cluster/vars/main.yml b/upgrade/roles/upgrade_cluster/vars/main.yml index f4c5b1b7cb..fc50eacddb 100644 --- a/upgrade/roles/upgrade_cluster/vars/main.yml +++ b/upgrade/roles/upgrade_cluster/vars/main.yml @@ -12,3 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. --- +storage_config_path: "/opt/omnia/input/project_default/storage_config.yml" +storage_content: "{{ lookup('file', storage_config_path, errors='ignore') | default('') }}" +storage_yaml: "{{ storage_content | length > 0 | ternary(storage_content | from_yaml, {}) }}" +nfs_params: "{{ storage_yaml.nfs_client_params | default([]) }}" From 986f21349b940c8f41b6e833c567a3f4c86b2edb Mon Sep 17 00:00:00 2001 From: mithileshreddy04 Date: Tue, 24 Feb 2026 15:41:15 +0530 Subject: [PATCH 75/77] Update main.yml --- upgrade/roles/upgrade_cluster/tasks/main.yml | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/upgrade/roles/upgrade_cluster/tasks/main.yml b/upgrade/roles/upgrade_cluster/tasks/main.yml index ce91c4c598..e1b5ec2a29 100644 --- a/upgrade/roles/upgrade_cluster/tasks/main.yml +++ b/upgrade/roles/upgrade_cluster/tasks/main.yml @@ -14,8 +14,12 @@ --- - name: Gather NFS share paths from storage_config.yml ansible.builtin.set_fact: - nfs_slurm_server_share_path: "{{ nfs_params | selectattr('nfs_name', 'equalto', 'nfs_slurm') | map(attribute='server_share_path') | first | default('not specified') }}" - nfs_k8s_server_share_path: "{{ nfs_params | selectattr('nfs_name', 'equalto', 'nfs_k8s') | map(attribute='server_share_path') | first | default('not specified') }}" + nfs_slurm_server_share_path: >- + {{ nfs_params | selectattr('nfs_name', 'equalto', 'nfs_slurm') + | map(attribute='server_share_path') | first | default('not 
specified') }} + nfs_k8s_server_share_path: >- + {{ nfs_params | selectattr('nfs_name', 'equalto', 'nfs_k8s') + | map(attribute='server_share_path') | first | default('not specified') }} - name: Display cluster reprovision guidance From d16bd01d8ddc7eccdb11ee74e208059c3b75d14f Mon Sep 17 00:00:00 2001 From: mithileshreddy04 Date: Tue, 24 Feb 2026 15:45:29 +0530 Subject: [PATCH 76/77] Update main.yml --- upgrade/roles/upgrade_cluster/tasks/main.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/upgrade/roles/upgrade_cluster/tasks/main.yml b/upgrade/roles/upgrade_cluster/tasks/main.yml index e1b5ec2a29..ada4408f2e 100644 --- a/upgrade/roles/upgrade_cluster/tasks/main.yml +++ b/upgrade/roles/upgrade_cluster/tasks/main.yml @@ -58,7 +58,8 @@ Optional: NFS cleanup (only if you are reprovisioning the cluster) - If you choose to reprovision the cluster and your setup uses an NFS share for Kubernetes and/or Slurm, you may optionally perform an NFS cleanup beforehand: + If you choose to reprovision the cluster and your setup uses an NFS share for Kubernetes and/or Slurm, you may optionally perform an NFS + cleanup beforehand: Detected NFS share paths from storage_config.yml: From 53c9023f27958f132c86b02d75dc3234c5aa13f4 Mon Sep 17 00:00:00 2001 From: sakshi-singla-1735 Date: Wed, 25 Feb 2026 12:23:27 +0530 Subject: [PATCH 77/77] Additional_Packages defect fix (#4042) * additional packages defect * reverse check * parent key check * removing the additional_package group --- .../input_validation/common_utils/config.py | 11 +++++ .../validation_flows/common_validation.py | 44 +++++++++++++++++-- .../validation_flows/local_repo_validation.py | 29 ++++++++++++ .../rhel/10.0/additional_packages.json | 15 ------- 4 files changed, 80 insertions(+), 19 deletions(-) diff --git a/common/library/module_utils/input_validation/common_utils/config.py b/common/library/module_utils/input_validation/common_utils/config.py index 0f369f3950..58cae556c4 100644 --- 
a/common/library/module_utils/input_validation/common_utils/config.py +++ b/common/library/module_utils/input_validation/common_utils/config.py @@ -33,6 +33,17 @@ OMNIA_ENTITLEMENT_PATH = '/opt/omnia/rhel_repo_certs/*.pem' OMNIA_REDHAT_REPO = '/opt/omnia/rhel_repo_certs/redhat.repo' +# Supported functional groups for additional_packages per architecture +ADDITIONAL_PACKAGES_SUPPORTED_SUBGROUPS = { + "x86_64": [ + "slurm_control_node", "slurm_node", "login_node", "login_compiler_node", + "service_kube_control_plane", "service_kube_control_plane_first", "service_kube_node" + ], + "aarch64": [ + "slurm_control_node", "slurm_node", "login_node", "login_compiler_node" + ] +} + # dict to hold the file names. If any file's name changes just change it here. files = { "local_repo_config": "local_repo_config.yml", diff --git a/common/library/module_utils/input_validation/validation_flows/common_validation.py b/common/library/module_utils/input_validation/validation_flows/common_validation.py index 36f55130d4..dcf812c929 100644 --- a/common/library/module_utils/input_validation/validation_flows/common_validation.py +++ b/common/library/module_utils/input_validation/validation_flows/common_validation.py @@ -246,14 +246,14 @@ def validate_software_config( ) ) + supported_subgroups = config.ADDITIONAL_PACKAGES_SUPPORTED_SUBGROUPS + for software_pkg in data['softwares']: software = software_pkg['name'] arch_list = software_pkg.get('arch') - json_paths = [] for arch in arch_list: - json_paths.append(get_json_file_path( - software, cluster_os_type, cluster_os_version, input_file_path, arch)) - for json_path in json_paths: + json_path = get_json_file_path( + software, cluster_os_type, cluster_os_version, input_file_path, arch) # Check if json_path is None or if the JSON syntax is invalid if not json_path: errors.append( @@ -266,7 +266,43 @@ def validate_software_config( try: subgroup_softwares = subgroup_dict.get(software, None) json_data = load_json(json_path) + # For 
additional_packages, validate subgroup keys in the JSON + if software == "additional_packages": + if "additional_packages" not in json_data: + errors.append( + create_error_msg( + software + '/' + arch, + json_path, + f"Required key 'additional_packages' is missing from the JSON file." + ) + ) + arch_supported = supported_subgroups.get(arch, []) + user_subgroups = [p.get('name') for p in data.get(software, [])] + for json_key in json_data: + if json_key == "additional_packages": + continue + if json_key not in arch_supported: + errors.append( + create_error_msg( + software + '/' + arch, + json_path, + f"Subgroup '{json_key}' is not supported for architecture {arch}." + ) + ) + elif json_key not in user_subgroups: + errors.append( + create_error_msg( + software + '/' + arch, + json_path, + f"Subgroup '{json_key}' is present in JSON but not listed under additional_packages in software_config.json." + ) + ) for subgroup_software in subgroup_softwares: + # For additional_packages, skip subgroups that are + # not supported for this arch + if software == "additional_packages": + if subgroup_software not in supported_subgroups.get(arch, []): + continue _, fail_data = validation_utils.validate_softwaresubgroup_entries( subgroup_software, json_path, json_data, validation_results, failures ) diff --git a/common/library/module_utils/input_validation/validation_flows/local_repo_validation.py b/common/library/module_utils/input_validation/validation_flows/local_repo_validation.py index 88e02845d2..447bd33c8d 100644 --- a/common/library/module_utils/input_validation/validation_flows/local_repo_validation.py +++ b/common/library/module_utils/input_validation/validation_flows/local_repo_validation.py @@ -208,6 +208,8 @@ def validate_local_repo_config(input_file_path, data, ) os_ver_path = f"/{software_config_json['cluster_os_type']}/{software_config_json['cluster_os_version']}/" + supported_subgroups = config.ADDITIONAL_PACKAGES_SUPPORTED_SUBGROUPS + for software in 
software_config_json["softwares"]: sw = software["name"] arch_list = software.get("arch") @@ -221,10 +223,37 @@ def validate_local_repo_config(input_file_path, data, else: curr_json = load_json(json_path) pkg_list = curr_json[sw]['cluster'] + # For additional_packages, validate subgroup keys in the JSON + if sw == "additional_packages": + if "additional_packages" not in curr_json: + errors.append( + create_error_msg(sw + '/' + arch, + json_path, + f"Required key 'additional_packages' is missing from the JSON file.")) + arch_supported = supported_subgroups.get(arch, []) + user_subgroups = [p.get('name') for p in software_config_json.get(sw, [])] + for json_key in curr_json: + if json_key == "additional_packages": + continue + if json_key not in arch_supported: + errors.append( + create_error_msg(sw + '/' + arch, + json_path, + f"Subgroup '{json_key}' is not supported for architecture {arch}.")) + elif json_key not in user_subgroups: + errors.append( + create_error_msg(sw + '/' + arch, + json_path, + f"Subgroup '{json_key}' is present in JSON but not listed under additional_packages in software_config.json.")) if sw in software_config_json: for sub_pkg in software_config_json[sw]: sub_sw = sub_pkg.get('name') if sub_sw not in curr_json: + # For additional_packages, skip subgroups that + # are not supported for this arch + if sw == "additional_packages": + if sub_sw not in supported_subgroups.get(arch, []): + continue errors.append( create_error_msg(sw + '/' + arch, json_path, diff --git a/input/config/aarch64/rhel/10.0/additional_packages.json b/input/config/aarch64/rhel/10.0/additional_packages.json index 0d6d9a0452..b01c3f78b5 100644 --- a/input/config/aarch64/rhel/10.0/additional_packages.json +++ b/input/config/aarch64/rhel/10.0/additional_packages.json @@ -4,21 +4,6 @@ ] }, - "service_kube_control_plane_first": { - "cluster": [ - - ] - }, - "service_kube_control_plane": { - "cluster": [ - - ] - }, - "service_kube_node": { - "cluster": [ - - ] - }, 
"slurm_control_node": { "cluster": [