From ca14a61c68fd71392a872b994c2ad00a88214518 Mon Sep 17 00:00:00 2001 From: SOWJANYAJAGADISH123 Date: Wed, 11 Feb 2026 17:46:22 +0530 Subject: [PATCH 01/77] Update omnia.sh --- omnia.sh | 331 +++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 320 insertions(+), 11 deletions(-) diff --git a/omnia.sh b/omnia.sh index 235cc1dbc1..9c46a04dc9 100755 --- a/omnia.sh +++ b/omnia.sh @@ -979,10 +979,11 @@ start_container_session() { } show_help() { - echo "Usage: $0 [--install | --uninstall | --upgrade | --version | --help]" + echo "Usage: $0 [--install | --uninstall | --upgrade | --rollback | --version | --help]" echo " -i, --install Install and start the Omnia core container" echo " -u, --uninstall Uninstall the Omnia core container and clean up configuration" - echo " --upgrade Upgrade the Omnia core container from image tag 1.0 to 1.1" + echo " --upgrade Upgrade the Omnia core container to newer version" + echo " --rollback Rollback the Omnia core container to previous version" echo " -v, --version Display Omnia version information" echo " -h, --help More information about usage" } @@ -1248,15 +1249,6 @@ phase1_validate() { return 1 fi - if ! echo "$current_image" | grep -qE '(:|@)1\.0(\b|$)'; then - echo "[ERROR] [ORCHESTRATOR] Container version mismatch: expected 1.0, got: $current_image" - return 1 - fi - - echo "[INFO] [ORCHESTRATOR] Container version validated: 1.0 (Omnia 2.0.0.0)" - - - if ! podman inspect "omnia_core:1.1" >/dev/null 2>&1; then echo "[ERROR] [ORCHESTRATOR] Target image missing locally: omnia_core:1.1" echo "[ERROR] [ORCHESTRATOR] Omnia does not pull from Docker Hub. Build/load the image locally and retry." @@ -1372,6 +1364,9 @@ phase4_container_swap() { if [ ! 
-f "$quadlet_file" ]; then echo "[ERROR] [ORCHESTRATOR] Phase 4.3 failed: Quadlet file not found: $quadlet_file" + echo "[ERROR] [ORCHESTRATOR] Upgrade failed: Quadlet configuration file missing" + echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore 1.0 container..." + rollback_omnia_core return 1 fi @@ -1385,27 +1380,42 @@ phase4_container_swap() { if podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then echo "[ERROR] [ORCHESTRATOR] Failed to stop omnia_core container" + echo "[ERROR] [ORCHESTRATOR] Upgrade failed: Could not stop 1.0 container" + echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore 1.0 container..." + rollback_omnia_core return 1 fi echo "[INFO] [ORCHESTRATOR] Starting omnia_core 1.1 Quadlet unit" if ! podman inspect "omnia_core:1.1" >/dev/null 2>&1; then echo "[ERROR] [ORCHESTRATOR] Target image missing locally: omnia_core:1.1" + echo "[ERROR] [ORCHESTRATOR] Upgrade failed: 1.1 image not available" + echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore 1.0 container..." + rollback_omnia_core return 1 fi if ! sed -i 's/^Image=omnia_core:.*/Image=omnia_core:1.1/' "$quadlet_file"; then echo "[ERROR] [ORCHESTRATOR] Phase 4.3 failed: Failed to update Image to 1.1 in quadlet file" + echo "[ERROR] [ORCHESTRATOR] Upgrade failed: Could not update container image tag" + echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore 1.0 container..." + rollback_omnia_core return 1 fi systemctl daemon-reload || { echo "[ERROR] [ORCHESTRATOR] Phase 4.3 failed: systemctl daemon-reload failed" + echo "[ERROR] [ORCHESTRATOR] Upgrade failed: System daemon reload failed" + echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore 1.0 container..." 
+ rollback_omnia_core return 1 } systemctl start omnia_core.service || { echo "[ERROR] [ORCHESTRATOR] Phase 4.3 failed: Failed to start omnia_core.service" + echo "[ERROR] [ORCHESTRATOR] Upgrade failed: Could not start 1.1 container" + echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore 1.0 container..." + rollback_omnia_core return 1 } @@ -1419,6 +1429,9 @@ phase4_container_swap() { if ! podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then echo "[ERROR] [ORCHESTRATOR] Phase 4.4 failed: Container failed health check after swap" + echo "[ERROR] [ORCHESTRATOR] Upgrade failed: 1.1 container failed health check" + echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore 1.0 container..." + rollback_omnia_core return 1 fi @@ -1436,6 +1449,9 @@ phase4_container_swap() { fi "; then echo "[ERROR] [ORCHESTRATOR] Phase 4.5 failed: Failed to update metadata version" + echo "[ERROR] [ORCHESTRATOR] Upgrade failed: Could not update version metadata" + echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore 1.0 container..." + rollback_omnia_core return 1 fi @@ -1490,6 +1506,296 @@ upgrade_omnia_core() { exit 0 } +# Validate backup directory structure and files +validate_backup_directory() { + local backup_path="$1" + + echo "[INFO] [ROLLBACK] Validating backup directory: $backup_path" + + # Check if backup directory exists + if ! podman exec -u root omnia_core test -d "$backup_path"; then + echo "[ERROR] [ROLLBACK] Backup directory does not exist: $backup_path" + return 1 + fi + + # Check for required subdirectories + for subdir in input metadata configs; do + if ! podman exec -u root omnia_core test -d "$backup_path/$subdir"; then + echo "[ERROR] [ROLLBACK] Missing required subdirectory: $backup_path/$subdir" + return 1 + fi + done + + # Check for required files + if ! 
podman exec -u root omnia_core test -f "$backup_path/metadata/oim_metadata.yml"; then + echo "[ERROR] [ROLLBACK] Missing metadata file: $backup_path/metadata/oim_metadata.yml" + return 1 + fi + + if ! podman exec -u root omnia_core test -f "$backup_path/configs/omnia_core.container"; then + echo "[ERROR] [ROLLBACK] Missing container config: $backup_path/configs/omnia_core.container" + return 1 + fi + + # Verify metadata contains version information + if ! podman exec -u root omnia_core grep -q "^omnia_version:" "$backup_path/metadata/oim_metadata.yml"; then + echo "[ERROR] [ROLLBACK] Metadata file does not contain version information" + return 1 + fi + + echo "[INFO] [ROLLBACK] Backup validation successful" + return 0 +} + +# Stop container gracefully with timeout +stop_container_gracefully() { + local container_name="$1" + local timeout="${2:-30}" + + echo "[INFO] [ROLLBACK] Stopping $container_name container gracefully..." + + # Try graceful stop first + if podman stop -t "$timeout" "$container_name" >/dev/null 2>&1; then + echo "[INFO] [ROLLBACK] Container stopped gracefully" + return 0 + fi + + # Check if container is still running + if podman ps --format '{{.Names}}' | grep -qw "$container_name"; then + echo "[WARN] [ROLLBACK] Graceful stop failed, force stopping container..." + if podman stop "$container_name" >/dev/null 2>&1; then + echo "[INFO] [ROLLBACK] Container force stopped" + return 0 + else + echo "[ERROR] [ROLLBACK] Failed to stop container" + return 1 + fi + fi + + return 0 +} + +# Restore files from backup +restore_from_backup() { + local backup_path="$1" + + echo "[INFO] [ROLLBACK] Restoring from backup: $backup_path" + + # Restore input files + if ! podman exec -u root omnia_core bash -c " + set -e + rm -rf /opt/omnia/input + cp -a '$backup_path/input' /opt/omnia/ + "; then + echo "[ERROR] [ROLLBACK] Failed to restore input files" + return 1 + fi + + # Restore metadata + if ! 
podman exec -u root omnia_core cp -a "$backup_path/metadata/oim_metadata.yml" /opt/omnia/.data/; then + echo "[ERROR] [ROLLBACK] Failed to restore metadata" + return 1 + fi + + # Restore container config on host + if ! podman cp "omnia_core:$backup_path/configs/omnia_core.container" /etc/containers/systemd/; then + echo "[ERROR] [ROLLBACK] Failed to restore container config" + return 1 + fi + + echo "[INFO] [ROLLBACK] Files restored successfully" + return 0 +} + +# Main rollback function +rollback_omnia_core() { + echo -e "${GREEN}================================================================================${NC}" + echo -e "${GREEN} OMNIA CORE ROLLBACK${NC}" + echo -e "${GREEN}================================================================================${NC}" + echo "" + + # Audit log start + local rollback_start=$(date -Iseconds) + echo "[AUDIT] Rollback operation started at: $rollback_start" + + # Check if omnia_core container is running + if ! podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then + echo -e "${RED}ERROR: Omnia core container is not running.${NC}" + exit 1 + fi + + # Get current version + if ! podman exec -u root omnia_core test -f "/opt/omnia/.data/oim_metadata.yml"; then + echo -e "${RED}ERROR: Metadata file not found: /opt/omnia/.data/oim_metadata.yml${NC}" + exit 1 + fi + + local current_version=$(podman exec -u root omnia_core grep '^omnia_version:' /opt/omnia/.data/oim_metadata.yml 2>/dev/null | cut -d':' -f2 | tr -d ' \t\n\r') + if [ "$current_version" != "2.1.0.0" ]; then + echo -e "${RED}ERROR: Cannot rollback from version $current_version. Rollback is only supported from version 2.1.0.0.${NC}" + exit 1 + fi + + # List available backups + echo "[INFO] [ROLLBACK] Scanning for available backups..." 
+ local backup_dirs=() + while IFS= read -r line; do + backup_dirs+=("$line") + done < <(podman exec -u root omnia_core find /opt/omnia/backups/upgrade -maxdepth 1 -type d -name "version_*" 2>/dev/null | sort -r) + + if [ ${#backup_dirs[@]} -eq 0 ]; then + echo -e "${RED}ERROR: No backup directories found.${NC}" + exit 1 + fi + + echo "" + echo "Available backup versions:" + for i in "${!backup_dirs[@]}"; do + local version=$(basename "${backup_dirs[$i]}" | sed 's/version_//') + local backup_date=$(podman exec -u root omnia_core stat -c '%y' "${backup_dirs[$i]}" 2>/dev/null | cut -d' ' -f1,2 | cut -d'.' -f1) + echo " $((i+1)). Version $version (created: $backup_date)" + done + + # Prompt for backup selection + echo "" + echo -n "Select backup to restore from (1-${#backup_dirs[@]}): " + read -r selection + + # Validate selection + if ! [[ "$selection" =~ ^[0-9]+$ ]] || [ "$selection" -lt 1 ] || [ "$selection" -gt ${#backup_dirs[@]} ]; then + echo -e "${RED}ERROR: Invalid selection.${NC}" + exit 1 + fi + + local selected_backup="${backup_dirs[$((selection-1))]}" + local backup_version=$(basename "$selected_backup" | sed 's/version_//') + + echo "" + echo "Selected backup: Version $backup_version" + echo -n "Are you sure you want to rollback to version $backup_version? [y/N]: " + read -r confirm + + if [[ ! "$confirm" =~ ^[yY] ]]; then + echo "Rollback cancelled by user." + exit 0 + fi + + # Validate selected backup - only check if directory exists without podman exec + if ! podman exec -u root omnia_core test -d "$selected_backup" 2>/dev/null; then + # Try to check on host if container check fails + # Get shared path from metadata to check on host + local shared_path=$(podman exec -u root omnia_core grep '^oim_shared_path:' /opt/omnia/.data/oim_metadata.yml 2>/dev/null | cut -d':' -f2- | tr -d ' \t\n\r') + local host_backup_path="${selected_backup#/opt/omnia}" + if [ -z "$shared_path" ] || [ ! 
-d "$shared_path$host_backup_path" ]; then + echo -e "${RED}ERROR: Backup directory does not exist: $selected_backup${NC}" + exit 1 + fi + fi + + echo "" + echo "[INFO] [ROLLBACK] Starting rollback process..." + + # Step 1: Stop 1.1 container gracefully + echo "" + echo "[INFO] [ROLLBACK] Step 1: Stopping Omnia core 1.1 container..." + if ! stop_container_gracefully "omnia_core" 30; then + echo -e "${RED}ERROR: Failed to stop container.${NC}" + exit 1 + fi + + # Step 2: Check for 1.0 image + echo "" + echo "[INFO] [ROLLBACK] Step 2: Checking for Omnia core 1.0 image..." + if ! podman inspect omnia_core:1.0 >/dev/null 2>&1; then + echo -e "${YELLOW}WARNING: Omnia core 1.0 image not found locally.${NC}" + echo -e "${YELLOW}Attempting to tag image...${NC}" + + # Try to tag latest as 1.0 if available + if podman inspect omnia_core:latest >/dev/null 2>&1; then + podman tag omnia_core:latest omnia_core:1.0 + else + echo -e "${RED}ERROR: Omnia core 1.0 image not available. Please load the image first.${NC}" + exit 1 + fi + fi + + # Step 3: Start 1.0 container + echo "" + echo "[INFO] [ROLLBACK] Step 3: Starting Omnia core 1.0 container..." + systemctl daemon-reload + if ! systemctl start omnia_core.service; then + echo -e "${RED}ERROR: Failed to start container service.${NC}" + exit 1 + fi + + # Step 4: Wait for container to be healthy + echo "" + echo "[INFO] [ROLLBACK] Step 4: Waiting for container to be healthy..." + local health_timeout=60 + local health_count=0 + + while [ $health_count -lt $health_timeout ]; do + if podman ps --format '{{.Names}} {{.Status}}' | grep -E "omnia_core.*Up" | grep -q "healthy\|Up"; then + echo "[INFO] [ROLLBACK] Container is healthy" + break + fi + sleep 1 + health_count=$((health_count + 1)) + echo -n "." 
+ done + + if [ $health_count -ge $health_timeout ]; then + echo "" + echo -e "${RED}ERROR: Container failed to become healthy within 60 seconds.${NC}" + exit 1 + fi + + # Step 5: Validate backup directory structure + echo "" + echo "[INFO] [ROLLBACK] Step 5: Validating backup directory structure..." + if ! validate_backup_directory "$selected_backup"; then + echo -e "${RED}ERROR: Backup validation failed.${NC}" + exit 1 + fi + + # Step 6: Restore files from backup + echo "" + echo "[INFO] [ROLLBACK] Step 6: Restoring files from backup..." + if ! restore_from_backup "$selected_backup"; then + echo -e "${RED}ERROR: Failed to restore from backup.${NC}" + exit 1 + fi + + # Step 7: Verify container version + echo "" + echo "[INFO] [ROLLBACK] Step 7: Verifying container version..." + local verify_version=$(podman exec -u root omnia_core grep '^omnia_version:' /opt/omnia/.data/oim_metadata.yml 2>/dev/null | cut -d':' -f2 | tr -d ' \t\n\r') + + if [ "$verify_version" != "$backup_version" ]; then + echo -e "${RED}ERROR: Version verification failed. 
Expected: $backup_version, Found: $verify_version${NC}" + exit 1 + fi + + # Audit log end + local rollback_end=$(date -Iseconds) + echo "[AUDIT] Rollback operation completed at: $rollback_end" + echo "[AUDIT] Rolled back from version $current_version to $backup_version" + + echo "" + echo -e "${GREEN}================================================================================${NC}" + echo -e "${GREEN} ROLLBACK COMPLETED SUCCESSFULLY${NC}" + echo -e "${GREEN}================================================================================${NC}" + echo "" + echo -e "${GREEN}✓ Omnia core has been rolled back to version $backup_version${NC}" + echo -e "${GREEN}✓ Container is running and healthy${NC}" + echo -e "${GREEN}✓ Configuration restored from backup${NC}" + echo "" + + # Initialize SSH config and start container session + init_ssh_config + start_container_session +} + # Main function to check if omnia_core container is already running. # If yes, ask the user if they want to enter the container or reinstall. # If no, set it up. 
@@ -1504,6 +1810,9 @@ main() { --upgrade) upgrade_omnia_core ;; + --rollback) + rollback_omnia_core + ;; --version|-v) display_version ;; From 46c63c095c51a3f2df5097a3b9739e61e7b8b6ad Mon Sep 17 00:00:00 2001 From: pullan1 Date: Wed, 11 Feb 2026 18:06:48 +0530 Subject: [PATCH 02/77] cleanup of files under offline_repo dir during pulp cleanup Signed-off-by: pullan1 --- common/library/modules/pulp_cleanup.py | 104 ++++++++++++++++++++++--- local_repo/pulp_cleanup.yml | 2 + 2 files changed, 95 insertions(+), 11 deletions(-) diff --git a/common/library/modules/pulp_cleanup.py b/common/library/modules/pulp_cleanup.py index 00ed27d0dd..f3da3e2004 100644 --- a/common/library/modules/pulp_cleanup.py +++ b/common/library/modules/pulp_cleanup.py @@ -27,6 +27,7 @@ import csv import glob import json +import shutil import subprocess from typing import Dict, List, Any, Tuple @@ -399,7 +400,7 @@ def delete_file_from_pulp(name: str, repo_name: str, content_href: str, logger) return False, f"Pulp deletion error: {str(e)}" -def cleanup_pip_module(name: str, base_path: str, logger) -> Dict[str, Any]: +def cleanup_pip_module(name: str, base_path: str, repo_store_path: str, logger) -> Dict[str, Any]: """Cleanup a pip module from Pulp Python repository. 
Pip modules are stored as: pip_module== @@ -408,6 +409,7 @@ def cleanup_pip_module(name: str, base_path: str, logger) -> Dict[str, Any]: result = {"name": name, "type": "pip_module", "status": "Failed", "message": ""} messages = [] pulp_deleted = False + content_removed = False try: # Pulp Python repo name format: pip_module @@ -467,11 +469,17 @@ def cleanup_pip_module(name: str, base_path: str, logger) -> Dict[str, Any]: messages.append("Status files updated") mark_software_partial(affected, base_path, logger, 'pip_module') - if pulp_deleted: + # Clean up uploaded content from filesystem + fs_result = cleanup_content_directory(name, 'pip_module', repo_store_path, logger) + if fs_result["status"] == "Success": + content_removed = True + messages.append(fs_result["message"]) + + if pulp_deleted or content_removed: result["status"] = "Success" result["message"] = "; ".join(messages) if messages else "Cleaned up" else: - result["message"] = f"pip_module '{name}' not found in Pulp" + result["message"] = f"pip_module '{name}' not found in Pulp or filesystem" except Exception as e: result["message"] = f"Error: {str(e)}" @@ -493,7 +501,7 @@ def get_pulp_file_repo_name(name: str, file_type: str) -> str: return name -def cleanup_file_repository(name: str, file_type: str, base_path: str, logger) -> Dict[str, Any]: +def cleanup_file_repository(name: str, file_type: str, base_path: str, repo_store_path: str, logger) -> Dict[str, Any]: """Cleanup artifact from Pulp File repository. 
Handles: tarball, git, manifest, ansible_galaxy_collection @@ -503,6 +511,7 @@ def cleanup_file_repository(name: str, file_type: str, base_path: str, logger) - messages = [] pulp_deleted = False status_removed = False + content_removed = False try: # Get the expected Pulp repository name @@ -559,12 +568,18 @@ def cleanup_file_repository(name: str, file_type: str, base_path: str, logger) - messages.append("Status files updated") mark_software_partial(affected, base_path, logger, file_type) + # Clean up uploaded content from filesystem + fs_result = cleanup_content_directory(name, file_type, repo_store_path, logger) + if fs_result["status"] == "Success": + content_removed = True + messages.append(fs_result["message"]) + # Determine overall result - if pulp_deleted or status_removed: + if pulp_deleted or status_removed or content_removed: result["status"] = "Success" result["message"] = "; ".join(messages) if messages else "Cleaned up" else: - result["message"] = f"{file_type} '{name}' not found in Pulp or status files" + result["message"] = f"{file_type} '{name}' not found in Pulp, status files, or filesystem" except Exception as e: result["message"] = f"Error: {str(e)}" @@ -572,7 +587,7 @@ def cleanup_file_repository(name: str, file_type: str, base_path: str, logger) - return result -def cleanup_file(name: str, base_path: str, logger) -> Dict[str, Any]: +def cleanup_file(name: str, base_path: str, repo_store_path: str, logger) -> Dict[str, Any]: """Cleanup a file artifact. 
Routes to appropriate handler: @@ -583,10 +598,75 @@ def cleanup_file(name: str, base_path: str, logger) -> Dict[str, Any]: # Handle pip modules separately - they use Python repositories if file_type == "pip_module": - return cleanup_pip_module(name, base_path, logger) + return cleanup_pip_module(name, base_path, repo_store_path, logger) # All other file types use Pulp File repository - return cleanup_file_repository(name, file_type, base_path, logger) + return cleanup_file_repository(name, file_type, base_path, repo_store_path, logger) + + +# ============================================================================= +# FILESYSTEM CONTENT CLEANUP +# ============================================================================= + +def cleanup_content_directory(content_name: str, content_type: str, repo_store_path: str, logger) -> Dict[str, Any]: + """Remove uploaded content directory from the filesystem. + + Builds the content path the same way as download_common.py: + /offline_repo/cluster//rhel/// + + This mirrors how remove_from_status_files iterates over ARCH_SUFFIXES to + clean status.csv entries. 
+ + Args: + content_name: Name of the content item (e.g., 'helm-v3.19.0-amd64') + content_type: Directory category (tarball, git, pip_module, manifest, + ansible_galaxy_collection, rpm_file) + repo_store_path: Root store path (e.g., '/opt/omnia') + logger: Logger instance + + Returns: + Dict with name, type, status, and message keys + """ + result = {"name": content_name, "type": f"filesystem_{content_type}", + "status": "Failed", "message": ""} + removed_dirs = [] + + cluster_path = os.path.join(repo_store_path, "offline_repo", "cluster") + if not os.path.exists(cluster_path): + result["message"] = f"Content store path not found: {cluster_path}" + logger.warning(result["message"]) + return result + + try: + for arch in ARCH_SUFFIXES: + # Walk version directories (e.g., rhel/10.0) + arch_path = os.path.join(cluster_path, arch) + if not os.path.isdir(arch_path): + continue + + for version_dir in glob.glob(f"{arch_path}/rhel/*/"): + content_dir = os.path.join(version_dir, content_type, content_name) + if os.path.exists(content_dir): + logger.info(f"Removing content directory: {content_dir}") + if os.path.isdir(content_dir): + shutil.rmtree(content_dir) + else: + os.remove(content_dir) + removed_dirs.append(content_dir) + + if removed_dirs: + result["status"] = "Success" + result["message"] = f"Removed content: {', '.join(removed_dirs)}" + else: + result["message"] = (f"No filesystem content found for " + f"'{content_name}' under {content_type}") + logger.info(result["message"]) + + except Exception as e: + result["message"] = f"Filesystem cleanup error: {str(e)}" + logger.error(f"Failed to cleanup content {content_name}: {e}") + + return result # ============================================================================= @@ -868,7 +948,8 @@ def run_module(): cleanup_repos=dict(type='list', elements='str', default=[]), cleanup_containers=dict(type='list', elements='str', default=[]), cleanup_files=dict(type='list', elements='str', default=[]), - 
base_path=dict(type='str', default=CLEANUP_BASE_PATH_DEFAULT) + base_path=dict(type='str', default=CLEANUP_BASE_PATH_DEFAULT), + repo_store_path=dict(type='str', default='/opt/omnia') ), supports_check_mode=True ) @@ -877,6 +958,7 @@ def run_module(): cleanup_containers = module.params['cleanup_containers'] cleanup_files = module.params['cleanup_files'] base_path = module.params['base_path'] + repo_store_path = module.params['repo_store_path'] # Setup logger - setup_standard_logger expects a directory, creates standard.log inside log_dir = os.path.join(base_path, "cleanup") @@ -915,7 +997,7 @@ def run_module(): # Process files for file in cleanup_files: - result = cleanup_file(file, base_path, logger) + result = cleanup_file(file, base_path, repo_store_path, logger) all_results.append(result) logger.info(f"File {file}: {result['status']} - {result['message']}") diff --git a/local_repo/pulp_cleanup.yml b/local_repo/pulp_cleanup.yml index 5d409bbc1f..93e379833b 100644 --- a/local_repo/pulp_cleanup.yml +++ b/local_repo/pulp_cleanup.yml @@ -77,6 +77,8 @@ cleanup_repos: "{{ repo_list | default([]) }}" cleanup_containers: "{{ container_list | default([]) }}" cleanup_files: "{{ file_list | default([]) }}" + base_path: "{{ base_path | default('/opt/omnia/log/local_repo') }}" + repo_store_path: "{{ repo_store_path | default('/opt/omnia') }}" register: cleanup_result post_tasks: From 7ef0c3153135cfdd1d82b59f09ceb9bcc30da584 Mon Sep 17 00:00:00 2001 From: Katakam-Rakesh Date: Thu, 12 Feb 2026 11:44:15 +0530 Subject: [PATCH 03/77] removing doca-ofed from nfs share Signed-off-by: Katakam-Rakesh --- .../templates/doca-ofed/doca-install.sh.j2 | 3 --- discovery/roles/k8s_config/vars/main.yml | 13 ++----------- discovery/roles/slurm_config/vars/main.yml | 12 ++---------- 3 files changed, 4 insertions(+), 24 deletions(-) diff --git a/discovery/roles/configure_ochami/templates/doca-ofed/doca-install.sh.j2 b/discovery/roles/configure_ochami/templates/doca-ofed/doca-install.sh.j2 index 
111abcb3a1..db8a7cb9cc 100644 --- a/discovery/roles/configure_ochami/templates/doca-ofed/doca-install.sh.j2 +++ b/discovery/roles/configure_ochami/templates/doca-ofed/doca-install.sh.j2 @@ -44,9 +44,6 @@ else dnf install -y kernel-headers-$(uname -r) fi -echo "Bootstrap doca-ofed package..." -rpm -i "/var/lib/packages/${arch}/doca-ofed/doca-host-3.2.1-044000_25.10_rhel10.${arch}.rpm" - echo "Installing doca-ofed..." if rpm -q doca-ofed >/dev/null 2>&1; then echo "doca-ofed package is already installed." diff --git a/discovery/roles/k8s_config/vars/main.yml b/discovery/roles/k8s_config/vars/main.yml index 433b8e9f76..a80fb9b257 100644 --- a/discovery/roles/k8s_config/vars/main.yml +++ b/discovery/roles/k8s_config/vars/main.yml @@ -78,19 +78,10 @@ packages_base_dir_aarch64: "{{ k8s_client_mount_path }}/packages/aarch64" offline_repo_basepath_x86_64: "{{ oim_shared_path }}/omnia/offline_repo/cluster/x86_64/rhel/10.0/iso" offline_repo_basepath_aarch64: "{{ oim_shared_path }}/omnia/offline_repo/cluster/aarch64/rhel/10.0/iso" packages_layout_x86_64: - - doca-ofed - cuda packages_layout_aarch64: - - doca-ofed - cuda print_copy_msg: "Copying {{ item.name }} from {{ item.source_path }} to {{ item.dest_path }}" -offline_path_x86_64: - - name: doca-ofed - source_path: "{{ offline_repo_basepath_x86_64 }}/doca-ofed" - dest_path: "{{ packages_base_dir_x86_64 }}/doca-ofed" -offline_path_aarch64: - - name: doca-ofed - source_path: "{{ offline_repo_basepath_aarch64 }}/doca-ofed" - dest_path: "{{ packages_base_dir_aarch64 }}/doca-ofed" - +offline_path_x86_64: [] +offline_path_aarch64: [] ssh_private_key_path: /root/.ssh/oim_rsa diff --git a/discovery/roles/slurm_config/vars/main.yml b/discovery/roles/slurm_config/vars/main.yml index 43ee995e5a..3616b55068 100644 --- a/discovery/roles/slurm_config/vars/main.yml +++ b/discovery/roles/slurm_config/vars/main.yml @@ -141,19 +141,11 @@ packages_base_dir_aarch64: "{{ slurm_config_path }}/packages/aarch64" offline_repo_basepath_x86_64: "{{ 
oim_shared_path }}/omnia/offline_repo/cluster/x86_64/rhel/10.0/iso" offline_repo_basepath_aarch64: "{{ oim_shared_path }}/omnia/offline_repo/cluster/aarch64/rhel/10.0/iso" packages_layout_x86_64: - - doca-ofed - cuda packages_layout_aarch64: - - doca-ofed - cuda print_copy_msg: "Copying {{ item.name }} from {{ item.source_path }} to {{ item.dest_path }}" -offline_path_x86_64: - - name: doca-ofed - source_path: "{{ offline_repo_basepath_x86_64 }}/doca-ofed" - dest_path: "{{ packages_base_dir_x86_64 }}/doca-ofed" -offline_path_aarch64: - - name: doca-ofed - source_path: "{{ offline_repo_basepath_aarch64 }}/doca-ofed" - dest_path: "{{ packages_base_dir_aarch64 }}/doca-ofed" +offline_path_x86_64: [] +offline_path_aarch64: [] ssh_private_key_path: /root/.ssh/oim_rsa From b4f064ee0d7feed5bf0b3bd6233e992a5bd133e1 Mon Sep 17 00:00:00 2001 From: mithileshreddy04 Date: Thu, 12 Feb 2026 15:52:26 +0530 Subject: [PATCH 04/77] Upgrade of input credential files to 2.1 --- .../tasks/display_warnings.yml | 53 ++++++ .../import_input_parameters/tasks/main.yml | 12 ++ .../restore_omnia_config_credentials.yml | 171 ++++++++++++++++++ .../restore_user_registry_credential.yml | 130 +++++++++++++ .../tasks/set_backup_location.yml | 33 ++++ .../templates/omnia_config_credentials.yml.j2 | 48 +++++ .../import_input_parameters/vars/main.yml | 66 ++++++- 7 files changed, 512 insertions(+), 1 deletion(-) create mode 100644 upgrade/roles/import_input_parameters/tasks/display_warnings.yml create mode 100644 upgrade/roles/import_input_parameters/tasks/restore_omnia_config_credentials.yml create mode 100644 upgrade/roles/import_input_parameters/tasks/restore_user_registry_credential.yml create mode 100644 upgrade/roles/import_input_parameters/tasks/set_backup_location.yml create mode 100644 upgrade/roles/import_input_parameters/templates/omnia_config_credentials.yml.j2 diff --git a/upgrade/roles/import_input_parameters/tasks/display_warnings.yml 
b/upgrade/roles/import_input_parameters/tasks/display_warnings.yml new file mode 100644 index 0000000000..ac1eb69998 --- /dev/null +++ b/upgrade/roles/import_input_parameters/tasks/display_warnings.yml @@ -0,0 +1,53 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +- name: Display collected warnings + ansible.builtin.debug: + msg: | + ================================= + UPGRADE WARNINGS SUMMARY + ================================= + + {% if upgrade_warnings | length > 0 %} + {{ upgrade_warnings | length }} warning{{ 's' if upgrade_warnings | length > 1 else '' }} detected. + You will now be shown the detailed list. + {% else %} + No warnings detected. Upgrade completed successfully! + {% endif %} + when: upgrade_warnings is defined + + +- name: Pause for user to review warnings + ansible.builtin.pause: + prompt: | + ╔════════════════════════════════════════════╗ + ║ ⚠️ UPGRADE WARNINGS REVIEW ⚠️ ║ + ╚════════════════════════════════════════════╝ + + {% if upgrade_warnings | length > 0 %} + {{ upgrade_warnings | length }} warning{{ 's' if upgrade_warnings | length > 1 else '' }} detected: + + {% for warning in upgrade_warnings %} + {{ loop.index }}. {{ warning }} + {% endfor %} + + Please review these warnings carefully. + Press ENTER to continue or CTRL+C to abort. + {% else %} + No warnings detected. Upgrade completed successfully! + + Press ENTER to continue... 
+ {% endif %} + when: upgrade_warnings is defined diff --git a/upgrade/roles/import_input_parameters/tasks/main.yml b/upgrade/roles/import_input_parameters/tasks/main.yml index ff77cf2c0e..2aacba7451 100644 --- a/upgrade/roles/import_input_parameters/tasks/main.yml +++ b/upgrade/roles/import_input_parameters/tasks/main.yml @@ -13,6 +13,9 @@ # limitations under the License. --- +- name: Set backup location based on oim_metadata.yml + ansible.builtin.include_tasks: set_backup_location.yml + - name: Validate backup location for upgrade input processing ansible.builtin.include_tasks: precheck_backup_location.yml @@ -39,3 +42,12 @@ - name: Restore input files from backup ansible.builtin.include_tasks: restore_input_files.yml + +- name: Restore user_registry_credential.yml from backup + ansible.builtin.include_tasks: restore_user_registry_credential.yml + +- name: Restore omnia_config_credentials.yml from backup + ansible.builtin.include_tasks: restore_omnia_config_credentials.yml + +- name: Display upgrade warnings summary + ansible.builtin.include_tasks: display_warnings.yml diff --git a/upgrade/roles/import_input_parameters/tasks/restore_omnia_config_credentials.yml b/upgrade/roles/import_input_parameters/tasks/restore_omnia_config_credentials.yml new file mode 100644 index 0000000000..0abafee26b --- /dev/null +++ b/upgrade/roles/import_input_parameters/tasks/restore_omnia_config_credentials.yml @@ -0,0 +1,171 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +--- + +- name: Check if backup omnia_config_credentials.yml exists + ansible.builtin.stat: + path: "{{ backup_location }}/omnia_config_credentials.yml" + register: backup_omnia_config_credentials_stat + +- name: Check if backup omnia_config_credentials_key exists + ansible.builtin.stat: + path: "{{ backup_location }}/.omnia_config_credentials_key" + register: backup_omnia_config_credentials_key_stat + +- name: Add warning for missing omnia_config_credentials.yml to list + ansible.builtin.set_fact: + upgrade_warnings: >- + {{ upgrade_warnings + [msg_omnia_config_credentials_missing] }} + when: + - not backup_omnia_config_credentials_stat.stat.exists + - "'WARNING: omnia_config_credentials.yml not found in backup at' not in (upgrade_warnings | join(' '))" + +- name: Process omnia_config_credentials.yml when present in backup + block: + - name: Check if backup file is encrypted + ansible.builtin.command: + cmd: cat "{{ backup_location }}/omnia_config_credentials.yml" + register: backup_omnia_config_credentials_content + changed_when: false + failed_when: false + no_log: true + + - name: "Case 1: Key present and file encrypted - Process and update" + block: + - name: Copy encrypted omnia_config_credentials.yml from backup to temp location + ansible.builtin.copy: + src: "{{ backup_location }}/omnia_config_credentials.yml" + dest: "{{ input_project_dir }}/omnia_config_credentials.yml.tmp" + mode: '0600' + remote_src: true + + - name: Copy omnia_config_credentials_key from backup + ansible.builtin.copy: + src: "{{ backup_location }}/.omnia_config_credentials_key" + dest: "{{ input_project_dir }}/.omnia_config_credentials_key" + mode: '0600' + remote_src: true + + - name: Decrypt omnia_config_credentials.yml using the key + ansible.builtin.shell: + cmd: | + ansible-vault decrypt "{{ input_project_dir }}/omnia_config_credentials.yml.tmp" \ + --vault-password-file "{{ 
input_project_dir }}/.omnia_config_credentials_key" \ + --output "{{ input_project_dir }}/omnia_config_credentials.yml.decrypted" + args: + executable: /bin/bash + no_log: true + register: vault_decrypt_result + failed_when: vault_decrypt_result.rc != 0 + + - name: Read decrypted content + ansible.builtin.slurp: + src: "{{ input_project_dir }}/omnia_config_credentials.yml.decrypted" + register: decrypted_content + no_log: true + + - name: Parse YAML content and extract credentials + ansible.builtin.set_fact: + credentials_dict: >- + {{ decrypted_content.content | b64decode | from_yaml }} + no_log: true + + rescue: + - name: Fail with decryption error message + ansible.builtin.fail: + msg: "{{ msg_omnia_config_decrypt_error }}" + + - name: "Case 1.1: Apply template and encrypt" + block: + - name: Set template variables from credentials + ansible.builtin.set_fact: + provision_password: "{{ credentials_dict.provision_password | default('') }}" + bmc_username: "{{ credentials_dict.bmc_username | default('') }}" + bmc_password: "{{ credentials_dict.bmc_password | default('') }}" + minio_s3_password: "{{ credentials_dict.minio_s3_password | default('') }}" + pulp_password: "{{ credentials_dict.pulp_password | default('') }}" + docker_username: "{{ credentials_dict.docker_username | default('') }}" + docker_password: "{{ credentials_dict.docker_password | default('') }}" + slurm_db_password: "{{ credentials_dict.slurm_db_password | default('') }}" + openldap_db_username: "{{ credentials_dict.openldap_db_username | default('') }}" + openldap_db_password: "{{ credentials_dict.openldap_db_password | default('') }}" + mysqldb_user: "{{ credentials_dict.mysqldb_user | default('') }}" + mysqldb_password: "{{ credentials_dict.mysqldb_password | default('') }}" + mysqldb_root_password: "{{ credentials_dict.mysqldb_root_password | default('') }}" + csi_username: "{{ credentials_dict.csi_username | default('') }}" + csi_password: "{{ credentials_dict.csi_password | default('') }}" 
+ ldms_sampler_password: "{{ credentials_dict.ldms_sampler_password | default('') }}" + no_log: true + + - name: Write updated content using template + ansible.builtin.template: + src: omnia_config_credentials.yml.j2 + dest: "{{ input_project_dir }}/omnia_config_credentials.yml.decrypted" + mode: '0600' + no_log: true + + - name: Encrypt updated file using the same key + ansible.builtin.shell: + cmd: | + ansible-vault encrypt "{{ input_project_dir }}/omnia_config_credentials.yml.decrypted" \ + --vault-password-file "{{ input_project_dir }}/.omnia_config_credentials_key" \ + --output "{{ input_project_dir }}/omnia_config_credentials.yml" + args: + executable: /bin/bash + no_log: true + register: vault_encrypt_result + failed_when: vault_encrypt_result.rc != 0 + + - name: Clean up temporary files + ansible.builtin.file: + path: "{{ item }}" + state: absent + loop: + - "{{ input_project_dir }}/omnia_config_credentials.yml.tmp" + - "{{ input_project_dir }}/omnia_config_credentials.yml.decrypted" + + - name: Display success message + ansible.builtin.debug: + msg: "{{ msg_omnia_config_credentials_success }}" + + rescue: + - name: Fail with template/encryption error message + ansible.builtin.fail: + msg: "{{ msg_omnia_config_template_error }}\n{{ msg_omnia_config_encrypt_error }}" + when: >- + backup_omnia_config_credentials_key_stat.stat.exists and + backup_omnia_config_credentials_content.stdout is defined and + '$ANSIBLE_VAULT;' in backup_omnia_config_credentials_content.stdout + + - name: "Case 2: Both key and file missing - Add info warning" + ansible.builtin.set_fact: + upgrade_warnings: >- + {{ upgrade_warnings + [msg_omnia_config_credentials_info_missing] }} + when: >- + not backup_omnia_config_credentials_key_stat.stat.exists and + (backup_omnia_config_credentials_content.stdout is not defined or + '$ANSIBLE_VAULT;' not in backup_omnia_config_credentials_content.stdout) and + "'INFO: Both omnia_config_credentials.yml and .omnia_config_credentials_key' not in 
(upgrade_warnings | join(' '))" + + - name: "Case 3: Error - Mismatched state" + ansible.builtin.fail: + msg: "{{ msg_omnia_config_credentials_error }}" + when: >- + (not backup_omnia_config_credentials_key_stat.stat.exists and + backup_omnia_config_credentials_content.stdout is defined and + '$ANSIBLE_VAULT;' in backup_omnia_config_credentials_content.stdout) or + (backup_omnia_config_credentials_key_stat.stat.exists and + backup_omnia_config_credentials_content.stdout is defined and + '$ANSIBLE_VAULT;' not in backup_omnia_config_credentials_content.stdout) + when: backup_omnia_config_credentials_stat.stat.exists diff --git a/upgrade/roles/import_input_parameters/tasks/restore_user_registry_credential.yml b/upgrade/roles/import_input_parameters/tasks/restore_user_registry_credential.yml new file mode 100644 index 0000000000..de337310b8 --- /dev/null +++ b/upgrade/roles/import_input_parameters/tasks/restore_user_registry_credential.yml @@ -0,0 +1,130 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- + +- name: Check if backup user_registry_credential.yml exists + ansible.builtin.stat: + path: "{{ backup_location }}/user_registry_credential.yml" + register: backup_user_registry_credential_stat + +- name: Check if user_registry_credential.yml exists in current directory + ansible.builtin.stat: + path: "{{ input_project_dir }}/user_registry_credential.yml" + register: user_registry_credential_stat + +- name: Check if backup local_repo_credentials_key exists + ansible.builtin.stat: + path: "{{ backup_location }}/.local_repo_credentials_key" + register: backup_local_repo_credentials_key_stat + +- name: Add warning for missing user_registry_credential.yml to list + ansible.builtin.set_fact: + upgrade_warnings: >- + {{ upgrade_warnings + [ + "WARNING: user_registry_credential.yml not found in backup at " + + backup_location + "/user_registry_credential.yml. " + + "This might be due to complete Omnia execution not being completed. " + + "Skipping restoration of this file." + ] }} + when: + - not backup_user_registry_credential_stat.stat.exists + - "'WARNING: user_registry_credential.yml not found in backup at' not in (upgrade_warnings | join(' '))" + +- name: Process user_registry_credential.yml when present in backup + block: + - name: Check if backup file is encrypted + ansible.builtin.command: + cmd: cat "{{ backup_location }}/user_registry_credential.yml" + register: backup_user_registry_content + changed_when: false + failed_when: false + no_log: true + + - name: "Case 1: Key present and file encrypted - Copy both" + block: + - name: Decrypt user_registry_credential.yml using the key + ansible.builtin.shell: + cmd: | + ansible-vault decrypt "{{ input_project_dir }}/user_registry_credential.yml.tmp" \ + --vault-password-file "{{ input_project_dir }}/.local_repo_credentials_key" \ + --output "{{ input_project_dir }}/user_registry_credential.yml.decrypted" + args: + executable: /bin/bash + no_log: true + register: vault_decrypt_result + failed_when: 
vault_decrypt_result.rc != 0 + + - name: Copy encrypted user_registry_credential.yml from backup + ansible.builtin.copy: + src: "{{ backup_location }}/user_registry_credential.yml" + dest: "{{ input_project_dir }}/user_registry_credential.yml" + mode: '0600' + remote_src: true + + - name: Copy local_repo_credentials_key from backup + ansible.builtin.copy: + src: "{{ backup_location }}/.local_repo_credentials_key" + dest: "{{ input_project_dir }}/.local_repo_credentials_key" + mode: '0600' + remote_src: true + + - name: Display success message for encrypted file restoration + ansible.builtin.debug: + msg: | + user_registry_credential.yml restored from backup. + Backup: {{ backup_location }}/user_registry_credential.yml + Target: {{ input_project_dir }}/user_registry_credential.yml + Status: Encrypted (key file also restored) + rescue: + - name: Fail with decryption error message + ansible.builtin.fail: + msg: "{{ msg_user_registry_decrypt_error }}" + when: >- + backup_local_repo_credentials_key_stat.stat.exists and + backup_user_registry_content.stdout is defined and + '$ANSIBLE_VAULT;' in backup_user_registry_content.stdout + + - name: "Case 2: Both key and file missing - Add info warning" + ansible.builtin.set_fact: + upgrade_warnings: >- + {{ upgrade_warnings + [ + "INFO: Both user_registry_credential.yml and .local_repo_credentials_key " + + "are not present in backup. This is expected if registry credentials " + + "were not configured in the source installation." 
+ ] }} + when: >- + not backup_local_repo_credentials_key_stat.stat.exists and + (backup_user_registry_content.stdout is not defined or + '$ANSIBLE_VAULT;' not in backup_user_registry_content.stdout) and + "'INFO: Both user_registry_credential.yml and .local_repo_credentials_key' not in (upgrade_warnings | join(' '))" + + - name: "Case 3: Error - Mismatched state" + ansible.builtin.fail: + msg: | + ERROR: Inconsistent state detected for user_registry_credential.yml: + {% if not backup_local_repo_credentials_key_stat.stat.exists and backup_user_registry_content.stdout is defined and '$ANSIBLE_VAULT;' in backup_user_registry_content.stdout %} + - File is encrypted but key file (.local_repo_credentials_key) is missing + {% elif backup_local_repo_credentials_key_stat.stat.exists and backup_user_registry_content.stdout is defined and '$ANSIBLE_VAULT;' not in backup_user_registry_content.stdout %} + - Key file exists but file is not encrypted + {% endif %} + Please check the backup integrity and ensure both files are present + in consistent states. + when: >- + (not backup_local_repo_credentials_key_stat.stat.exists and + backup_user_registry_content.stdout is defined and + '$ANSIBLE_VAULT;' in backup_user_registry_content.stdout) or + (backup_local_repo_credentials_key_stat.stat.exists and + backup_user_registry_content.stdout is defined and + '$ANSIBLE_VAULT;' not in backup_user_registry_content.stdout) + when: backup_user_registry_credential_stat.stat.exists diff --git a/upgrade/roles/import_input_parameters/tasks/set_backup_location.yml b/upgrade/roles/import_input_parameters/tasks/set_backup_location.yml new file mode 100644 index 0000000000..4f6a96e83f --- /dev/null +++ b/upgrade/roles/import_input_parameters/tasks/set_backup_location.yml @@ -0,0 +1,33 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +- name: Read oim_metadata.yml to get upgrade_backup_dir + ansible.builtin.slurp: + src: /opt/omnia/.data/oim_metadata.yml + register: oim_metadata_slurp + +- name: Parse oim_metadata.yml + ansible.builtin.set_fact: + oim_metadata: "{{ oim_metadata_slurp.content | b64decode | from_yaml }}" + +- name: Set backup_location from metadata + ansible.builtin.set_fact: + backup_location: "{{ oim_metadata.upgrade_backup_dir }}/input/project_default" + when: oim_metadata.upgrade_backup_dir is defined + +- name: Fail if upgrade_backup_dir is not defined in metadata + ansible.builtin.fail: + msg: "{{ msg_upgrade_backup_dir_missing }}" + when: oim_metadata.upgrade_backup_dir is not defined diff --git a/upgrade/roles/import_input_parameters/templates/omnia_config_credentials.yml.j2 b/upgrade/roles/import_input_parameters/templates/omnia_config_credentials.yml.j2 new file mode 100644 index 0000000000..4b3b63d8c7 --- /dev/null +++ b/upgrade/roles/import_input_parameters/templates/omnia_config_credentials.yml.j2 @@ -0,0 +1,48 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +--- + +# Provision credentials +provision_password: "{{ provision_password | default('') }}" +bmc_username: "{{ bmc_username | default('') }}" +bmc_password: "{{ bmc_password | default('') }}" + +# Prepare_oim credentials +minio_s3_password: "{{ minio_s3_password | default('') }}" +pulp_password: "{{ pulp_password | default('') }}" +docker_username: "{{ docker_username | default('') }}" +docker_password: "{{ docker_password | default('') }}" + +# Omnia credentials +slurm_db_password: "{{ slurm_db_password | default('') }}" + +# Security credentials +openldap_db_username: "{{ openldap_db_username | default('') }}" +openldap_db_password: "{{ openldap_db_password | default('') }}" + +# iDrac Telemetry credentials +mysqldb_user: "{{ mysqldb_user | default('') }}" +mysqldb_password: "{{ mysqldb_password | default('') }}" +mysqldb_root_password: "{{ mysqldb_root_password | default('') }}" + +# csi powerscale credentials +csi_username: "{{ csi_username | default('') }}" +csi_password: "{{ csi_password | default('') }}" + +# LDMS sampler +ldms_sampler_password: "{{ ldms_sampler_password | default('') }}" + +# postgres credentials +postgres_user: "{{ postgres_user | default('') }}" +postgres_password: "{{ postgres_password | default('') }}" diff --git a/upgrade/roles/import_input_parameters/vars/main.yml b/upgrade/roles/import_input_parameters/vars/main.yml index c27f111cde..5eee4a2f50 100644 --- a/upgrade/roles/import_input_parameters/vars/main.yml +++ b/upgrade/roles/import_input_parameters/vars/main.yml @@ -13,18 +13,82 @@ # limitations under the License. 
--- -backup_location: /opt/omnia/backups/upgrade/input/project_default +# backup_location will be set from oim_metadata.yml upgrade_backup_dir +# Format: /opt/omnia/backups/upgrade/version_2.0.0.0/input/project_default +backup_location: "" backup_dir_mode: '0755' default_file_mode: '0644' +# List to collect warnings during execution +upgrade_warnings: [] + # Precheck backup location messages msg_backup_location_missing: "backup_location must be provided" +msg_upgrade_backup_dir_missing: "upgrade_backup_dir not found in /opt/omnia/.data/oim_metadata.yml" # Restore input files messages msg_restore_item_name_missing: "restore_item must define 'name'" msg_validation_failed: "Validation failed for {{ restore_item.name }}" msg_backup_file_missing: "Backup file missing: {{ restore_item.name }}" +msg_user_registry_credential_missing: |- + \033[93mWARNING: user_registry_credential.yml not found in backup at {{ backup_location }}/user_registry_credential.yml\033[0m + This might be due to complete Omnia execution not being completed. + Skipping restoration of this file. + +# Omnia config credentials messages +msg_omnia_config_credentials_missing: |- + WARNING: omnia_config_credentials.yml not found in backup at {{ backup_location }}/omnia_config_credentials.yml. + This might be due to complete Omnia execution not being completed. + Skipping restoration of this file. + +msg_omnia_config_credentials_info_missing: |- + INFO: Both omnia_config_credentials.yml and .omnia_config_credentials_key + are not present in backup. This is expected if credentials + were not configured in the source installation. + +msg_omnia_config_credentials_success: |- + omnia_config_credentials.yml restored and updated from backup. 
+ Backup: {{ backup_location }}/omnia_config_credentials.yml + Target: {{ input_project_dir }}/omnia_config_credentials.yml + Status: Updated with postgres credentials and re-encrypted (key file also restored) + +msg_omnia_config_credentials_error: |- + ERROR: Inconsistent state detected for omnia_config_credentials.yml: + {% if not backup_omnia_config_credentials_key_stat.stat.exists and backup_omnia_config_credentials_content.stdout is defined and '$ANSIBLE_VAULT;' in backup_omnia_config_credentials_content.stdout %} + - File is encrypted but key file (.omnia_config_credentials_key) is missing + {% elif backup_omnia_config_credentials_key_stat.stat.exists and backup_omnia_config_credentials_content.stdout is defined and '$ANSIBLE_VAULT;' not in backup_omnia_config_credentials_content.stdout %} + - Key file exists but file is not encrypted + {% endif %} + Please check the backup integrity and ensure both files are present + in consistent states. + +# Rescue warning messages +msg_user_registry_decrypt_error: |- + ERROR: Failed to decrypt user_registry_credential.yml. + The backup key file may be corrupted or incompatible. + Please check the backup integrity and ensure the key file + matches the encrypted file. + +msg_omnia_config_decrypt_error: |- + ERROR: Failed to decrypt omnia_config_credentials.yml. + The backup key file may be corrupted or incompatible. + Please check the backup integrity and ensure the key file + matches the encrypted file. + +msg_omnia_config_template_error: |- + ERROR: Failed to generate updated omnia_config_credentials.yml. + Template processing may have failed due to invalid data format. + Please check the backup file format and ensure it contains valid YAML. + +msg_omnia_config_encrypt_error: |- + ERROR: Failed to encrypt updated omnia_config_credentials.yml. + The key file may be corrupted or there may be permission issues. + Please check the key file integrity and file permissions. + +msg_decryption_failed: "Decryption failed. 
Check warnings for details." +msg_template_failed: "Template processing failed. Check warnings for details." +msg_encryption_failed: "Encryption failed. Check warnings for details." # Network spec transformation messages msg_backup_network_spec_missing: "Backup network_spec.yml missing" From d3b9c749b5096eaa4ca708def872e51ad38e1ed4 Mon Sep 17 00:00:00 2001 From: pullan1 Date: Thu, 12 Feb 2026 16:16:44 +0530 Subject: [PATCH 05/77] Added new package type rpm_repo Signed-off-by: pullan1 --- .../input_validation/common_utils/config.py | 1 + .../library/module_utils/local_repo/config.py | 2 +- .../local_repo/parse_and_download.py | 183 ++++++++++++------ .../module_utils/local_repo/software_utils.py | 6 +- common/library/modules/parallel_tasks.py | 163 ++++++++++------ common/library/modules/pulp_cleanup.py | 177 +++++++++++------ local_repo/pulp_cleanup.yml | 13 +- 7 files changed, 354 insertions(+), 191 deletions(-) diff --git a/common/library/module_utils/input_validation/common_utils/config.py b/common/library/module_utils/input_validation/common_utils/config.py index e6e8a09042..0f369f3950 100644 --- a/common/library/module_utils/input_validation/common_utils/config.py +++ b/common/library/module_utils/input_validation/common_utils/config.py @@ -147,6 +147,7 @@ "rpm": ["package", "repo_name"], "rpm_list": ["package_list", "repo_name"], "rpm_file": ["package", "url"], + "rpm_repo": ["package", "repo_name"], "ansible_galaxy_collection": ["package", "version"], "git": ["package", "version", "url"], "image": ["package", ["tag", "digest"]], # Special: one of tag or digest diff --git a/common/library/module_utils/local_repo/config.py b/common/library/module_utils/local_repo/config.py index 0518e2bb01..cfc3b20c9d 100644 --- a/common/library/module_utils/local_repo/config.py +++ b/common/library/module_utils/local_repo/config.py @@ -51,7 +51,7 @@ # Used by software_utils.py # ---------------------------- PACKAGE_TYPES = ['rpm', 'deb', 'tarball', 'image', 'manifest', 'git', 
- 'pip_module', 'deb', 'shell', 'ansible_galaxy_collection', 'iso', 'rpm_list', 'rpm_file'] + 'pip_module', 'deb', 'shell', 'ansible_galaxy_collection', 'iso', 'rpm_list', 'rpm_file', 'rpm_repo'] CSV_COLUMNS = {"column1": "name", "column2": "status"} SOFTWARE_CONFIG_SUBDIR = "config" RPM_LABEL_TEMPLATE = "RPMs for {key}" diff --git a/common/library/module_utils/local_repo/parse_and_download.py b/common/library/module_utils/local_repo/parse_and_download.py index 367f9561f5..72efd4566b 100644 --- a/common/library/module_utils/local_repo/parse_and_download.py +++ b/common/library/module_utils/local_repo/parse_and_download.py @@ -1,4 +1,4 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,12 +12,19 @@ # See the License for the specific language governing permissions and # limitations under the License. # pylint: disable=import-error,no-name-in-module +""" +Utility functions for parsing and downloading artifacts. + +This module provides common functions for command execution, status file management, +and repository operations used across the local repo management system. 
+""" + import os import subprocess import json import re from multiprocessing import Lock -from ansible.module_utils.local_repo.standard_logger import setup_standard_logger +from ansible.module_utils.local_repo.config import ARCH_SUFFIXES, STATUS_CSV_HEADER def mask_sensitive_data(cmd_string): @@ -57,35 +64,87 @@ def execute_command(cmd_string, logger, type_json=False): stderr=subprocess.PIPE, shell=True, ) - - status["returncode"] = cmd.returncode - status["stdout"] = cmd.stdout.strip() if cmd.stdout else None - status["stderr"] = cmd.stderr.strip() if cmd.stderr else None - - if cmd.returncode != 0: - logger.error(f"Command failed with return code {cmd.returncode}") - logger.error(f"Error: {status['stderr']}") - return False - - if type_json and status["stdout"]: - try: - status["stdout"] = json.loads(status["stdout"]) - except json.JSONDecodeError as error: - logger.error(f"Failed to parse JSON output: {error}") - return False - - return status - - except Exception as error: - logger.error(f"Error executing command: {error}") + logger.info(f"Command succeeded: {cmd_string}") + return True + except subprocess.CalledProcessError as e: + logger.error(f"Command failed: {cmd_string} - {e}") + return False + except subprocess.TimeoutExpired as e: + logger.error(f"Command timed out: {cmd_string} - {e}") + return False + except OSError as e: + logger.error(f"OS error during command: {cmd_string} - {e}") return False finally: logger.info("#" * 30 + f" {execute_command.__name__} end " + "#" * 30) +def get_arch_from_status_path(status_file_path): + """Extract architecture from status file path. 
+ + Args: + status_file_path: Path like '/opt/omnia/log/local_repo/x86_64/software_name/status.csv' + + Returns: + str: Architecture ('x86_64' or 'aarch64') or None if not found + """ + for arch in ARCH_SUFFIXES: + if f"/{arch}/" in status_file_path: + return arch + return None + +def _prefix_repo_name_with_arch(repo_name: str, status_file_path: str, logger) -> str: + """Add architecture prefix to repo_name if not already present. + + Args: + repo_name: Repository name to prefix + status_file_path: Path to extract architecture from + logger: Logger instance + + Returns: + str: Repository name with architecture prefix + """ + if not repo_name: + return repo_name + + arch = get_arch_from_status_path(status_file_path) + if arch and not any(repo_name.startswith(f"{prefix}_") for prefix in ARCH_SUFFIXES): + prefixed_name = f"{arch}_{repo_name}" + logger.info(f"Auto-prefixed repo_name with architecture: {prefixed_name}") + return prefixed_name + return repo_name + + +def _update_existing_line(line: str, package_name: str, package_type: str, status: str, repo_name: str, status_file_path: str) -> str: + """Update an existing line in status file. 
+ + Args: + line: Existing line content + package_name: Package name to match + package_type: Package type + status: New status + repo_name: Repository name + status_file_path: Path for architecture extraction + + Returns: + str: Updated line content + """ + parts = line.strip().split(',') + if len(parts) >= 4: + final_repo_name = _prefix_repo_name_with_arch(repo_name, status_file_path, None) + parts[2] = final_repo_name if final_repo_name else '' + parts[3] = status + return ','.join(parts) + '\n' + + # Handle short lines + final_repo_name = _prefix_repo_name_with_arch(repo_name, status_file_path, None) + return f"{package_name},{package_type},{final_repo_name if final_repo_name else ''},{status}\n" + + def write_status_to_file(status_file_path, package_name, package_type, status, logger, file_lock: Lock, repo_name=None): """ - Writes or updates the status of a package in the status file, using a lock to ensure safe access across processes. + Writes or updates the status of a package in the status file. 
+ Args: status_file_path: Path to the status file package_name: Name of the package @@ -97,44 +156,56 @@ def write_status_to_file(status_file_path, package_name, package_type, status, l """ logger.info("#" * 30 + f" {write_status_to_file.__name__} start " + "#" * 30) + # Auto-prefix repo_name with architecture if needed + repo_name = _prefix_repo_name_with_arch(repo_name, status_file_path, logger) + try: with file_lock: # Ensure only one process can write at a time if os.path.exists(status_file_path): - with open(status_file_path, "r") as f: - lines = f.readlines() - - updated = False - with open(status_file_path, "w") as f: - # Write header (new files always have repo_name column) - if lines: - f.write(lines[0]) # Keep existing header - - # Write data lines - for line in lines[1:]: # Skip header - if line.startswith(f"{package_name},"): - # f.write(f"{package_name},{package_type},{status}\n") - # Update existing line with repo_name (order: name,type,repo_name,status) - parts = line.strip().split(',') - if len(parts) >= 4: - parts[2] = repo_name if repo_name else '' - parts[3] = status - f.write(','.join(parts) + '\n') - else: - f.write(f"{package_name},{package_type},{repo_name if repo_name else ''},{status}\n") - updated = True - else: - f.write(line) - - if not updated: - f.write(f"{package_name},{package_type},{repo_name if repo_name else ''},{status}\n") + _update_existing_file(status_file_path, package_name, package_type, status, repo_name) else: - with open(status_file_path, "w") as f: - f.write(STATUS_CSV_HEADER) - f.write(f"{package_name},{package_type},{repo_name if repo_name else ''},{status}\n") + _create_new_file(status_file_path, package_name, package_type, status, repo_name) logger.info(f"Status written to {status_file_path} for {package_name}.") - except Exception as e: + except OSError as e: logger.error(f"Failed to write to status file: {status_file_path}. Error: {str(e)}") - raise RuntimeError(f"Failed to write to status file: {status_file_path}. 
Error: {str(e)}") + raise RuntimeError( + f"Failed to write to status file: {status_file_path}. Error: {str(e)}" + ) from e finally: logger.info("#" * 30 + f" {write_status_to_file.__name__} end " + "#" * 30) + + +def _update_existing_file(status_file_path, package_name, package_type, status, repo_name): + """Update existing status file with new package status.""" + with open(status_file_path, "r", encoding='utf-8') as f: + lines = f.readlines() + + updated = False + with open(status_file_path, "w", encoding='utf-8') as f: + # Write header + if lines: + f.write(lines[0]) + + # Write data lines + for line in lines[1:]: # Skip header + if line.startswith(f"{package_name},"): + updated_line = _update_existing_line( + line, package_name, package_type, status, repo_name, status_file_path + ) + f.write(updated_line) + updated = True + else: + f.write(line) + + if not updated: + final_repo_name = _prefix_repo_name_with_arch(repo_name, status_file_path, None) + f.write(f"{package_name},{package_type},{final_repo_name if final_repo_name else ''},{status}\n") + + +def _create_new_file(status_file_path, package_name, package_type, status, repo_name): + """Create new status file with package status.""" + with open(status_file_path, "w", encoding='utf-8') as f: + f.write(STATUS_CSV_HEADER) + final_repo_name = _prefix_repo_name_with_arch(repo_name, status_file_path, None) + f.write(f"{package_name},{package_type},{final_repo_name if final_repo_name else ''},{status}\n") diff --git a/common/library/module_utils/local_repo/software_utils.py b/common/library/module_utils/local_repo/software_utils.py index a915f25f8b..3e06ddc7cd 100644 --- a/common/library/module_utils/local_repo/software_utils.py +++ b/common/library/module_utils/local_repo/software_utils.py @@ -179,7 +179,7 @@ def transform_package_dict(data, arch_val,logger): repo_mapping = {} for item in items: - if item.get("type") == "rpm": + if item.get("type") in ("rpm", "rpm_repo"): rpm_packages.append(item["package"]) # 
Preserve repo_name if available if "repo_name" in item: @@ -832,7 +832,7 @@ def remove_duplicates_from_trans(trans): if group == "default_packages": # Handle nested rpm_list case for pkg in items: - if pkg.get("type") == "rpm" and "rpm_list" in pkg: + if pkg.get("type") in ("rpm", "rpm_repo") and "rpm_list" in pkg: pkg["rpm_list"] = list(dict.fromkeys(pkg["rpm_list"])) continue @@ -856,7 +856,7 @@ def remove_duplicates_from_trans(trans): elif type_ == "git": key = (item.get("url"), item.get("version")) - elif type_ == "rpm" and "rpm_list" in item: + elif type_ in ("rpm", "rpm_repo") and "rpm_list" in item: item["rpm_list"] = list(dict.fromkeys(item["rpm_list"])) key = item.get("package") diff --git a/common/library/modules/parallel_tasks.py b/common/library/modules/parallel_tasks.py index 5951a525b2..17c14cf51f 100644 --- a/common/library/modules/parallel_tasks.py +++ b/common/library/modules/parallel_tasks.py @@ -34,7 +34,9 @@ from ansible.module_utils.local_repo.download_image import process_image from ansible.module_utils.local_repo.download_rpm import process_rpm from ansible.module_utils.local_repo.standard_logger import setup_standard_logger -from ansible.module_utils.local_repo.common_functions import generate_vault_key, process_file, is_encrypted +from ansible.module_utils.local_repo.common_functions import ( + generate_vault_key, process_file, is_encrypted +) from ansible.module_utils.local_repo.software_utils import ( load_json, set_version_variables, @@ -125,7 +127,10 @@ def update_status_csv(csv_dir, software, overall_status,slogger): slogger.info(f"Successfully updated status CSV at {status_file}") -def determine_function(task, repo_store_path, csv_file_path, user_data, version_variables, arc, user_registries, docker_username, docker_password): +def determine_function( + task, repo_store_path, csv_file_path, user_data, version_variables, arc, + user_registries, docker_username, docker_password +): """ Determines the appropriate function and its 
arguments to process a given task. @@ -160,27 +165,55 @@ def determine_function(task, repo_store_path, csv_file_path, user_data, version_ task_type = task.get("type") if task_type == "manifest": - return process_manifest, [task, repo_store_path, status_file, cluster_os_type, cluster_os_version, arc] + return process_manifest, [ + task, repo_store_path, status_file, cluster_os_type, + cluster_os_version, arc + ] if task_type == "git": - return process_git, [task, repo_store_path, status_file, cluster_os_type, cluster_os_version, arc] + return process_git, [ + task, repo_store_path, status_file, cluster_os_type, + cluster_os_version, arc + ] if task_type == "tarball": - return process_tarball, [task, repo_store_path, status_file, version_variables, cluster_os_type, cluster_os_version, arc] + return process_tarball, [ + task, repo_store_path, status_file, version_variables, + cluster_os_type, cluster_os_version, arc + ] if task_type == "shell": - return process_shell, [task, repo_store_path, status_file, cluster_os_type, cluster_os_version, arc] + return process_shell, [ + task, repo_store_path, status_file, cluster_os_type, + cluster_os_version, arc + ] if task_type == "ansible_galaxy_collection": - return process_ansible_galaxy_collection, [task, repo_store_path, status_file, cluster_os_type, cluster_os_version, arc] + return process_ansible_galaxy_collection, [ + task, repo_store_path, status_file, cluster_os_type, + cluster_os_version, arc + ] if task_type == "iso": - return process_iso, [task, repo_store_path, status_file, - cluster_os_type, cluster_os_version, version_variables, arc] + return process_iso, [ + task, repo_store_path, status_file, cluster_os_type, + cluster_os_version, version_variables, arc + ] if task_type == "pip_module": - return process_pip, [task, repo_store_path, status_file, cluster_os_type, cluster_os_version, arc] + return process_pip, [ + task, repo_store_path, status_file, cluster_os_type, + cluster_os_version, arc + ] if task_type == 
"image": - return process_image, [task, status_file, version_variables, user_registries, docker_username, docker_password] + return process_image, [ + task, status_file, version_variables, user_registries, + docker_username, docker_password + ] if task_type == "rpm_file": - return process_rpm_file, [task, repo_store_path, status_file, cluster_os_type, cluster_os_version, arc] - if task_type == "rpm": - return process_rpm, [task, repo_store_path, status_file, - cluster_os_type, cluster_os_version, repo_config_value, arc] + return process_rpm_file, [ + task, repo_store_path, status_file, cluster_os_type, + cluster_os_version, arc + ] + if task_type in ("rpm", "rpm_repo"): + return process_rpm, [ + task, repo_store_path, status_file, cluster_os_type, + cluster_os_version, repo_config_value, arc + ] raise ValueError(f"Unknown task type: {task_type}") except Exception as e: @@ -272,57 +305,43 @@ def main(): Args: tasks (list): A list of tasks (dictionaries) that need to be processed in parallel. nthreads (int): The number of worker processes to run in parallel. - timeout (int): The maximum time allowed for all tasks to execute. If `None`, no timeout is enforced. + timeout (int): The maximum time allowed for all tasks to execute. + If `None`, no timeout is enforced. log_dir (str): The directory where log files for the worker processes will be saved. log_file (str): The path to the log file for the overall task execution. slog_file (str): The path to the log file for the standard logger. csv_file_path (str): The path to a CSV file that may be needed for processing some tasks. repo_store_path (str): The path to the repository where task-related files are stored. software (list): A list of software names. - user_json_file (str): The path to the JSON file containing use - show_softwares_status (bool): Whether to display the software status; optional, defaults to False. 
- overall_status_dict (dict): A list containing overall software status information; optional, defaults to an empty dict. - Dictionary containing software status information grouped by software names. - Each key (e.g., 'service_k8s') maps to a list of dictionaries, - where each dictionary contains: - - 'arch' (str): Architecture name, e.g., 'x86_64' or 'aarch64'. - - 'overall_status' (str): Status of the software on that architecture, e.g., 'SUCCESS'. - Example: - { - "service_k8s": [ - {"arch": "x86_64", "overall_status": "SUCCESS"}, - {"arch": "aarch64", "overall_status": "SUCCESS"} - ] - } - Defaults to an empty dict if not provided. + user_json_file (str): The path to the JSON file containing user data. + show_softwares_status (bool): Whether to display the software status; + optional, defaults to False. + overall_status_dict (dict): A dictionary containing overall software status + information; optional, defaults to an empty dict. + Dictionary containing software status information grouped by software names. + Each key (e.g., 'service_k8s') maps to a list of dictionaries, + where each dictionary contains: + - 'arch' (str): Architecture name, e.g., 'x86_64' or 'aarch64'. + - 'overall_status' (str): Status of the software on that architecture, + e.g., 'SUCCESS'. + Example: + { + "service_k8s": [ + {"arch": "x86_64", "overall_status": "SUCCESS"}, + {"arch": "aarch64", "overall_status": "SUCCESS"} + ] + } + Defaults to an empty dict if not provided. Returns: tuple: A tuple containing: - - overall_status (str): The overall status of task execution ("SUCCESS", "FAILED", "PARTIAL", "TIMEOUT"). - - task_results_data (list): A list of dictionaries, each containing the result of an individual task. + - overall_status (str): The overall status of task execution + ("SUCCESS", "FAILED", "PARTIAL", "TIMEOUT"). + - task_results_data (list): A list of dictionaries, each containing + the result of an individual task. Raises: Exception: If an error occurs during execution. 
""" - # module_args = { - # "tasks": {"type": "list", "required": True}, - # "nthreads": {"type": "int", "required": False, "default": DEFAULT_NTHREADS}, - # "timeout": {"type": "int", "required": False, "default": DEFAULT_TIMEOUT}, - # "log_dir": {"type": "str", "required": False, "default": LOG_DIR_DEFAULT}, - # "log_file": {"type": "str", "required": False, "default": DEFAULT_LOG_FILE}, - # "slog_file": {"type": "str", "required": False, "default": DEFAULT_SLOG_FILE}, - # "csv_file_path": {"type": "str", "required": False, "default": CSV_FILE_PATH_DEFAULT}, - # "repo_store_path": {"type": "str", "required": False, "default": DEFAULT_REPO_STORE_PATH}, - # "software": {"type": "list", "elements": "str", "required": True}, - # "user_json_file": {"type": "str", "required": False, "default": USER_JSON_FILE_DEFAULT}, - # "show_softwares_status": {"type": "bool", "required": False, "default": False}, - # "overall_status_dict": {"type": "dict","required": True}, - # "local_repo_config_path": {"type": "str", "required": False, "default": LOCAL_REPO_CONFIG_PATH_DEFAULT}, - # "arch": {"type": "str", "required": False}, - # "user_reg_cred_input": {"type": "str", "required": False, "default": USER_REG_CRED_INPUT}, - # "user_reg_key_path": {"type": "str", "required": False, "default": USER_REG_KEY_PATH}, - # "omnia_credentials_yaml_path": {"type": "str", "required": False, "default": OMNIA_CREDENTIALS_YAML_PATH}, - # "omnia_credentials_vault_path": {"type": "str", "required": False, "default": OMNIA_CREDENTIALS_VAULT_PATH} - # } module_args = { "tasks": {"type": "list", "required": True}, @@ -337,10 +356,19 @@ def main(): "user_json_file": {"type": "str", "required": False, "default": USER_JSON_FILE_DEFAULT}, "show_softwares_status": {"type": "bool", "required": False, "default": False}, "overall_status_dict": {"type": "dict","required": True}, - "local_repo_config_path": {"type": "str", "required": False, "default": LOCAL_REPO_CONFIG_PATH_DEFAULT}, + 
"local_repo_config_path": { + "type": "str", "required": False, + "default": LOCAL_REPO_CONFIG_PATH_DEFAULT + }, "arch": {"type": "str", "required": False}, - "omnia_credentials_yaml_path": {"type": "str", "required": False, "default": OMNIA_CREDENTIALS_YAML_PATH}, - "omnia_credentials_vault_path": {"type": "str", "required": False, "default": OMNIA_CREDENTIALS_VAULT_PATH} + "omnia_credentials_yaml_path": { + "type": "str", "required": False, + "default": OMNIA_CREDENTIALS_YAML_PATH + }, + "omnia_credentials_vault_path": { + "type": "str", "required": False, + "default": OMNIA_CREDENTIALS_VAULT_PATH + } } module = AnsibleModule(argument_spec=module_args, supports_check_mode=True) tasks = module.params["tasks"] @@ -386,24 +414,29 @@ def main(): cluster_os_type = user_data['cluster_os_type'] cluster_os_version = user_data['cluster_os_version'] - subgroup_dict, software_names = get_subgroup_dict(user_data,slogger) - version_variables = set_version_variables(user_data, software_names, cluster_os_version,slogger) + subgroup_dict, software_names = get_subgroup_dict(user_data, slogger) + version_variables = set_version_variables( + user_data, software_names, cluster_os_version, slogger + ) slogger.info(f"Cluster OS: {cluster_os_type}") slogger.info(f"Version Variables: {version_variables}") # gen_result = {} # if not os.path.isfile(user_reg_key_path): # gen_result = generate_vault_key(user_reg_key_path) # if gen_result is None: - # module.fail_json(msg=f"Unable to generate local_repo key at path: {user_reg_key_path}") + # module.fail_json( + # msg=f"Unable to generate local_repo key at path: {user_reg_key_path}" + # ) overall_status, task_results = execute_parallel( tasks, determine_function, nthreads, repo_store_path, csv_file_path, - log_dir, user_data, version_variables, arc, slogger, local_repo_config_path, - omnia_credentials_yaml_path, omnia_credentials_vault_path, timeout + log_dir, user_data, version_variables, arc, slogger, + local_repo_config_path, 
omnia_credentials_yaml_path, + omnia_credentials_vault_path, timeout ) # if not is_encrypted(user_reg_cred_input): - # process_file(user_reg_cred_input,user_reg_key_path,'encrypt') + # process_file(user_reg_cred_input, user_reg_key_path, 'encrypt') end_time = datetime.now() formatted_end_time = end_time.strftime("%I:%M:%S %p") @@ -442,7 +475,9 @@ def main(): except Exception as e: - result["table_output"] = table_output if "table_output" in locals() else "No table generated." + result["table_output"] = ( + table_output if "table_output" in locals() else "No table generated." + ) slogger.error(f"Execution failed: {str(e)}") module.fail_json(msg=f"Error during execution: {str(e)}", **result) diff --git a/common/library/modules/pulp_cleanup.py b/common/library/modules/pulp_cleanup.py index f3da3e2004..a3c155ebdb 100644 --- a/common/library/modules/pulp_cleanup.py +++ b/common/library/modules/pulp_cleanup.py @@ -137,7 +137,10 @@ def validate_container_format(image_name: str) -> Tuple[bool, str]: # Must contain at least one '/' to indicate registry/image format if '/' not in image_name: - return False, f"Invalid format '{image_name}'. Must include registry (e.g., registry.k8s.io/pause, docker.io/library/busybox)" + return False, ( + f"Invalid format '{image_name}'. Must include registry " + "(e.g., registry.k8s.io/pause, docker.io/library/busybox)" + ) # Must have a registry part (contains '.' or is a known registry) parts = image_name.split('/') @@ -145,7 +148,10 @@ def validate_container_format(image_name: str) -> Tuple[bool, str]: # Check if registry looks valid (contains dot or is localhost) if '.' not in registry and registry != 'localhost' and ':' not in registry: - return False, f"Invalid registry '{registry}' in '{image_name}'. Registry must be a domain (e.g., docker.io, registry.k8s.io)" + return False, ( + f"Invalid registry '{registry}' in '{image_name}'. 
" + "Registry must be a domain (e.g., docker.io, registry.k8s.io)" + ) return True, "" @@ -173,7 +179,9 @@ def detect_file_type(name: str) -> str: if '==' in name: return "pip_module" # Ansible Galaxy collection: contains . but no / or == (e.g., community.general, ansible.posix) - if '.' in name and '/' not in name and '==' not in name and any(x in name.lower() for x in ['ansible', 'community', 'galaxy']): + if '.' in name and '/' not in name and '==' not in name and any( + x in name.lower() for x in ['ansible', 'community', 'galaxy'] + ): return "ansible_galaxy_collection" if name.startswith('ansible_galaxy_collection'): return "ansible_galaxy_collection" @@ -296,7 +304,9 @@ def cleanup_container(user_input: str, base_path: str, logger) -> Dict[str, Any] # Check existence if not container_exists(pulp_name, logger): - result["message"] = f"Container not found in Pulp (looked for: {pulp_name})" + result["message"] = ( + f"Container not found in Pulp (looked for: {pulp_name})" + ) return result try: @@ -368,7 +378,8 @@ def delete_file_from_pulp(name: str, repo_name: str, content_href: str, logger) # 1. 
Remove content from repository if content_href: remove_result = run_cmd( - f"pulp file repository content remove --repository {repo_name} --href {content_href}", + f"pulp file repository content remove --repository {repo_name} " + f"--href {content_href}", logger ) if remove_result["rc"] == 0: @@ -376,7 +387,8 @@ def delete_file_from_pulp(name: str, repo_name: str, content_href: str, logger) else: # Try alternative: modify repository to remove content run_cmd( - f"pulp file repository content modify --repository {repo_name} --remove-content '[{{\"pulp_href\": \"{content_href}\"}}]'", + f"pulp file repository content modify --repository {repo_name} " + f"--remove-content '[{{\"pulp_href\": \"{content_href}\"}}]'", logger ) @@ -444,7 +456,9 @@ def cleanup_pip_module(name: str, base_path: str, repo_store_path: str, logger) messages.append("Orphan cleanup completed") else: # Try listing repos to find partial match - repo_list = run_cmd(pulp_python_commands["list_repositories"], logger) + repo_list = run_cmd( + pulp_python_commands["list_repositories"], logger + ) if repo_list["rc"] == 0: repos = safe_json_parse(repo_list["stdout"]) for repo in repos: @@ -533,7 +547,9 @@ def cleanup_file_repository(name: str, file_type: str, base_path: str, repo_stor messages.append("Repository deleted") else: # Try listing repos to find partial match - repo_list = run_cmd(pulp_file_commands["list_repositories"], logger) + repo_list = run_cmd( + pulp_file_commands["list_repositories"], logger + ) if repo_list["rc"] == 0: repos = safe_json_parse(repo_list["stdout"]) for repo in repos: @@ -569,7 +585,9 @@ def cleanup_file_repository(name: str, file_type: str, base_path: str, repo_stor mark_software_partial(affected, base_path, logger, file_type) # Clean up uploaded content from filesystem - fs_result = cleanup_content_directory(name, file_type, repo_store_path, logger) + fs_result = cleanup_content_directory( + name, file_type, repo_store_path, logger + ) if fs_result["status"] == 
"Success": content_removed = True messages.append(fs_result["message"]) @@ -673,67 +691,82 @@ def cleanup_content_directory(content_name: str, content_type: str, repo_store_p # STATUS FILE UPDATES # ============================================================================= -def remove_rpms_from_repository(repo_name: str, base_path: str, logger) -> List[str]: +def remove_rpms_from_repository(repo_name: str, base_path: str, logger) -> Dict[str, List[str]]: """Remove RPMs that belong to a specific repository from status files. - + Uses the repo_name column in status.csv to accurately identify RPMs from the repository. - + Now that all repo_names include architecture prefixes, the logic is simplified. + Args: - repo_name: Repository name (e.g., 'x86_64_appstream') + repo_name: Repository name (e.g., 'x86_64_appstream', 'aarch64_epel') base_path: Base path for status files logger: Logger instance - + Returns: - List of software names that were affected + Dict mapping architecture to list of affected software names """ - affected_software = [] + affected_software = {} logger.info(f"Removing RPMs from status.csv for repository: {repo_name}") - try: - for arch in ARCH_SUFFIXES: - for status_file in glob.glob(f"{base_path}/{arch}/*/status.csv"): - rows = [] - removed = False - has_repo_column = False - # Check if file has repo_name column - with open(status_file, 'r', encoding='utf-8') as f: - header = f.readline().strip().lower() - has_repo_column = "repo_name" in header + # Extract architecture from repo_name (all repo_names should now have arch prefixes) + target_arch = None + for arch in ARCH_SUFFIXES: + if repo_name.startswith(f"{arch}_"): + target_arch = arch + break + + if not target_arch: + logger.error(f"Repository name {repo_name} does not have architecture prefix") + return {} + + logger.info(f"Processing architecture: {target_arch}") + affected_software[target_arch] = [] + + try: + for status_file in glob.glob(f"{base_path}/{target_arch}/*/status.csv"): + 
rows = [] + removed = False + has_repo_column = False - with open(status_file, 'r', encoding='utf-8') as f: - reader = csv.DictReader(f) - fieldnames = reader.fieldnames - for row in reader: - name = row.get('name', '') - row_type = row.get('type', '') - rpm_repo = row.get('repo_name', '') - - logger.info(f"Processing row: {row}") - # For RPMs, check if they belong to the deleted repository - if row_type == 'rpm' or row_type == 'rpm_file': - if has_repo_column and rpm_repo == repo_name: - removed = True - logger.info(f"Removing RPM '{name}' from {status_file} (repo {repo_name} deleted)") - else: - rows.append(row) + # Check if file has repo_name column + with open(status_file, 'r', encoding='utf-8') as f: + header = f.readline().strip().lower() + has_repo_column = "repo_name" in header + + with open(status_file, 'r', encoding='utf-8') as f: + reader = csv.DictReader(f) + fieldnames = reader.fieldnames + for row in reader: + name = row.get('name', '') + row_type = row.get('type', '') + rpm_repo = row.get('repo_name', '') + + logger.info(f"Processing row: {row}") + # For RPMs, check if they belong to the deleted repository + if row_type in ('rpm', 'rpm_repo', 'rpm_file'): + if has_repo_column and rpm_repo == repo_name: + removed = True + logger.info(f"Removing RPM '{name}' from {status_file} (repo {repo_name} deleted)") else: rows.append(row) + else: + rows.append(row) - if removed and fieldnames: - with open(status_file, 'w', newline='', encoding='utf-8') as f: - writer = csv.DictWriter(f, fieldnames=fieldnames) - writer.writeheader() - writer.writerows(rows) + if removed and fieldnames: + with open(status_file, 'w', newline='', encoding='utf-8') as f: + writer = csv.DictWriter(f, fieldnames=fieldnames) + writer.writeheader() + writer.writerows(rows) - # Track affected software - software_name = os.path.basename(os.path.dirname(status_file)) - if software_name not in affected_software: - affected_software.append(software_name) + # Track affected software + 
software_name = os.path.basename(os.path.dirname(status_file)) + if software_name not in affected_software[target_arch]: + affected_software[target_arch].append(software_name) return affected_software except Exception as e: logger.error(f"Failed to remove RPMs from repository {repo_name}: {e}") - return [] + return {} def remove_from_status_files(artifact_name: str, artifact_type: str, base_path: str, logger) -> Dict[str, List[str]]: """Remove artifact from status.csv files and return affected software names by architecture. @@ -798,10 +831,10 @@ def remove_from_status_files(artifact_name: str, artifact_type: str, base_path: def mark_software_partial(affected_software, base_path: str, logger, artifact_type: str = None): """Mark software entries as partial in software.csv. - + Args: - affected_software: Either a List[str] of software names (from remove_rpms_from_repository) - or a Dict[str, List[str]] mapping arch to software names (from remove_from_status_files) + affected_software: Either a List[str] of software names (legacy support) + or a Dict[str, List[str]] mapping arch to software names base_path: Base path for software.csv logger: Logger instance artifact_type: Type of artifact being removed (for logging purposes) @@ -811,8 +844,11 @@ def mark_software_partial(affected_software, base_path: str, logger, artifact_ty logger.info("No affected software to mark as partial") return - # Normalize input: if a flat list is passed, apply to all architectures + # Normalize input: convert to arch_software_map if needed if isinstance(affected_software, list): + # Legacy list input - this should not happen with new remove_rpms_from_repository + # but we keep it for backward compatibility + logger.warning("Received list input to mark_software_partial, applying to all architectures (legacy behavior)") arch_software_map = {arch: affected_software for arch in ARCH_SUFFIXES} else: arch_software_map = affected_software @@ -869,7 +905,7 @@ def software_has_rpms(software_name: 
str, arch: str, base_path: str, logger) -> with open(status_file, 'r', encoding='utf-8') as f: reader = csv.DictReader(f) for row in reader: - if row.get('type', '').lower() == 'rpm': + if row.get('type', '').lower() in ('rpm', 'rpm_repo'): return True return False except OSError as e: @@ -892,7 +928,9 @@ def mark_all_software_partial(base_path: str, logger): try: for arch in ARCH_SUFFIXES: software_file = f"{base_path}/{arch}/software.csv" - logger.info(f"Processing software file: {software_file}") + logger.info( + f"Processing software file: {software_file}" + ) if not os.path.exists(software_file): logger.info(f"Software file not found: {software_file}") @@ -948,8 +986,12 @@ def run_module(): cleanup_repos=dict(type='list', elements='str', default=[]), cleanup_containers=dict(type='list', elements='str', default=[]), cleanup_files=dict(type='list', elements='str', default=[]), - base_path=dict(type='str', default=CLEANUP_BASE_PATH_DEFAULT), - repo_store_path=dict(type='str', default='/opt/omnia') + base_path=dict( + type='str', default=CLEANUP_BASE_PATH_DEFAULT + ), + repo_store_path=dict( + type='str', default='/opt/omnia' + ) ), supports_check_mode=True ) @@ -966,16 +1008,25 @@ def run_module(): logger = setup_standard_logger(log_dir) # Handle 'all' keyword for repositories only - cleanup_all_repos = cleanup_repos and len(cleanup_repos) == 1 and cleanup_repos[0].lower() == 'all' + cleanup_all_repos = ( + cleanup_repos and len(cleanup_repos) == 1 and + cleanup_repos[0].lower() == 'all' + ) #if cleanup_repos and len(cleanup_repos) == 1 and cleanup_repos[0].lower() == 'all': if cleanup_all_repos: logger.info("cleanup_repos='all' - fetching all repositories from Pulp") cleanup_repos = get_all_repositories(logger) if not cleanup_repos: - module.fail_json(msg="Failed to retrieve repository list from Pulp. Please check if Pulp services are running.") + module.fail_json( + msg="Failed to retrieve repository list from Pulp. 
" + "Please check if Pulp services are running." + ) logger.info(f"Found {len(cleanup_repos)} repositories to cleanup: {cleanup_repos}") - logger.info(f"Starting cleanup - repos: {cleanup_repos}, containers: {cleanup_containers}, files: {cleanup_files}") + logger.info( + f"Starting cleanup - repos: {cleanup_repos}, " + f"containers: {cleanup_containers}, files: {cleanup_files}" + ) all_results = [] diff --git a/local_repo/pulp_cleanup.yml b/local_repo/pulp_cleanup.yml index 93e379833b..6f54e5f45f 100644 --- a/local_repo/pulp_cleanup.yml +++ b/local_repo/pulp_cleanup.yml @@ -15,10 +15,15 @@ # Pulp Cleanup Playbook - Clean Architecture # # Usage: -# ansible-playbook pulp_cleanup_v2.yml -e '{"cleanup_repos": ["epel", "baseos"]}' -# ansible-playbook pulp_cleanup_v2.yml -e '{"cleanup_containers": ["nginx", "redis"]}' -# ansible-playbook pulp_cleanup_v2.yml -e '{"cleanup_files": ["git", "chart-0.48.0"]}' -# ansible-playbook pulp_cleanup_v2.yml -e '{"cleanup_repos": ["epel"], "cleanup_containers": ["nginx"]}' -e force=true +# # Repository cleanup (include architecture prefix) +# ansible-playbook pulp_cleanup.yml -e "cleanup_repos=x86_64_epel,aarch64_epel" +# ansible-playbook pulp_cleanup.yml -e "cleanup_repos=x86_64_appstream" +# ansible-playbook pulp_cleanup.yml -e "cleanup_containers=nginx,redis" +# ansible-playbook pulp_cleanup.yml -e "cleanup_files=git,chart-0.48.0" +# ansible-playbook pulp_cleanup.yml -e "cleanup_repos=x86_64_epel -e cleanup_containers=nginx -e force=true" +# +# # Examples: x86_64_epel, aarch64_epel, x86_64_appstream, aarch64_baseos +# # Note: Use architecture prefix (x86_64_ or aarch64_) for repository names - name: Pulp Cleanup hosts: localhost From 2898ff029a86ea9c326bea156f2162d9548e1d86 Mon Sep 17 00:00:00 2001 From: pullan1 Date: Thu, 12 Feb 2026 17:36:48 +0530 Subject: [PATCH 06/77] input config changes Signed-off-by: pullan1 --- input/config/aarch64/rhel/10.0/slurm_custom.json | 5 +---- input/config/x86_64/rhel/10.0/slurm_custom.json | 5 
+---- input/local_repo_config.yml | 4 +++- 3 files changed, 5 insertions(+), 9 deletions(-) diff --git a/input/config/aarch64/rhel/10.0/slurm_custom.json b/input/config/aarch64/rhel/10.0/slurm_custom.json index 2483775495..2bdfda0ab9 100644 --- a/input/config/aarch64/rhel/10.0/slurm_custom.json +++ b/input/config/aarch64/rhel/10.0/slurm_custom.json @@ -9,10 +9,7 @@ {"package": "pmix-devel", "type": "rpm", "repo_name": "aarch64_appstream"}, {"package": "nvcr.io/nvidia/hpc-benchmarks", "tag": "25.09", "type": "image"}, {"package": "apptainer", "type": "rpm", "repo_name": "epel" }, - {"package": "doca-ofed", - "type": "iso", - "url": "https://www.mellanox.com/downloads/DOCA/DOCA_v3.2.1/host/doca-host-3.2.1-044000_25.10_rhel10.aarch64.rpm" - } + {"package": "doca-ofed", "type": "rpm_repo", "repo_name": "doca" } ] }, "slurm_control_node": { diff --git a/input/config/x86_64/rhel/10.0/slurm_custom.json b/input/config/x86_64/rhel/10.0/slurm_custom.json index 9531239fd2..8781885cca 100644 --- a/input/config/x86_64/rhel/10.0/slurm_custom.json +++ b/input/config/x86_64/rhel/10.0/slurm_custom.json @@ -7,10 +7,7 @@ {"package": "pmix", "type": "rpm", "repo_name": "x86_64_appstream"}, {"package": "nvcr.io/nvidia/hpc-benchmarks", "tag": "25.09", "type": "image"}, {"package": "apptainer", "type": "rpm", "repo_name": "epel" }, - {"package": "doca-ofed", - "type": "iso", - "url": "https://www.mellanox.com/downloads/DOCA/DOCA_v3.2.1/host/doca-host-3.2.1-044000_25.10_rhel10.x86_64.rpm" - } + {"package": "doca-ofed", "type": "rpm_repo", "repo_name": "doca" } ] }, "slurm_control_node": { diff --git a/input/local_repo_config.yml b/input/local_repo_config.yml index 2f318f1deb..8428e6d94c 100644 --- a/input/local_repo_config.yml +++ b/input/local_repo_config.yml @@ -138,10 +138,12 @@ omnia_repo_url_rhel_x86_64: - { url: "https://download.docker.com/linux/centos/10/x86_64/stable/", gpgkey: "https://download.docker.com/linux/centos/gpg", name: "docker-ce"} - { url: 
"https://dl.fedoraproject.org/pub/epel/10/Everything/x86_64/", gpgkey: "https://dl.fedoraproject.org/pub/epel/RPM-GPG-KEY-EPEL-10", name: "epel"} - { url: "https://pkgs.k8s.io/core:/stable:/v1.34/rpm/", gpgkey: "https://pkgs.k8s.io/core:/stable:/v1.34/rpm/repodata/repomd.xml.key", name: "kubernetes"} - - { url: "https://download.opensuse.org/repositories/isv:/cri-o:/stable:/v1.34/rpm/", gpgkey: "https://download.opensuse.org/repositories/isv:/cri-o:/stable:/v1.34/rpm/repodata/repomd.xml.key'", name: "cri-o"} + - { url: "https://download.opensuse.org/repositories/isv:/cri-o:/stable:/v1.34/rpm/", gpgkey: "https://download.opensuse.org/repositories/isv:/cri-o:/stable:/v1.34/rpm/repodata/repomd.xml.key", name: "cri-o"} + - { url: "https://linux.mellanox.com/public/repo/doca/3.2.1/rhel10/x86_64/", gpgkey: "https://linux.mellanox.com/public/repo/doca/3.2.1/rhel10/x86_64/repodata/repomd.xml.key", name: "doca"} omnia_repo_url_rhel_aarch64: - { url: "https://download.docker.com/linux/centos/10/aarch64/stable/", gpgkey: "https://download.docker.com/linux/centos/gpg", name: "docker-ce"} - { url: "https://dl.fedoraproject.org/pub/epel/10/Everything/aarch64/", gpgkey: "https://dl.fedoraproject.org/pub/epel/RPM-GPG-KEY-EPEL-10", name: "epel"} + - { url: "https://linux.mellanox.com/public/repo/doca/3.2.1/rhel10/arm64-sbsa/", gpgkey: "https://linux.mellanox.com/public/repo/doca/3.2.1/rhel10/arm64-sbsa/repodata/repomd.xml.key", name: "doca"} # Example: # additional_repos_x86_64: # - { url: "https://rpm.grafana.com/", gpgkey: "", name: "grafana" } From 680aef3efb7c0249d2d88447e9f0d7f83541a80f Mon Sep 17 00:00:00 2001 From: mithileshreddy04 Date: Thu, 12 Feb 2026 17:44:57 +0530 Subject: [PATCH 07/77] Fixed ansible lint issues --- .../tasks/display_warnings.yml | 18 ++++------ .../restore_omnia_config_credentials.yml | 23 ++++++++----- .../restore_user_registry_credential.yml | 33 ++++++++++--------- .../import_input_parameters/vars/main.yml | 10 +++--- 4 files changed, 44 
insertions(+), 40 deletions(-) diff --git a/upgrade/roles/import_input_parameters/tasks/display_warnings.yml b/upgrade/roles/import_input_parameters/tasks/display_warnings.yml index ac1eb69998..2cc6dfed26 100644 --- a/upgrade/roles/import_input_parameters/tasks/display_warnings.yml +++ b/upgrade/roles/import_input_parameters/tasks/display_warnings.yml @@ -20,13 +20,11 @@ UPGRADE WARNINGS SUMMARY ================================= - {% if upgrade_warnings | length > 0 %} {{ upgrade_warnings | length }} warning{{ 's' if upgrade_warnings | length > 1 else '' }} detected. You will now be shown the detailed list. - {% else %} - No warnings detected. Upgrade completed successfully! - {% endif %} - when: upgrade_warnings is defined + when: + - upgrade_warnings is defined + - upgrade_warnings | length > 0 - name: Pause for user to review warnings @@ -36,7 +34,6 @@ ║ ⚠️ UPGRADE WARNINGS REVIEW ⚠️ ║ ╚════════════════════════════════════════════╝ - {% if upgrade_warnings | length > 0 %} {{ upgrade_warnings | length }} warning{{ 's' if upgrade_warnings | length > 1 else '' }} detected: {% for warning in upgrade_warnings %} @@ -45,9 +42,6 @@ Please review these warnings carefully. Press ENTER to continue or CTRL+C to abort. - {% else %} - No warnings detected. Upgrade completed successfully! - - Press ENTER to continue... 
- {% endif %} - when: upgrade_warnings is defined + when: + - upgrade_warnings is defined + - upgrade_warnings | length > 0 diff --git a/upgrade/roles/import_input_parameters/tasks/restore_omnia_config_credentials.yml b/upgrade/roles/import_input_parameters/tasks/restore_omnia_config_credentials.yml index 0abafee26b..71e8fb7db2 100644 --- a/upgrade/roles/import_input_parameters/tasks/restore_omnia_config_credentials.yml +++ b/upgrade/roles/import_input_parameters/tasks/restore_omnia_config_credentials.yml @@ -31,16 +31,21 @@ - not backup_omnia_config_credentials_stat.stat.exists - "'WARNING: omnia_config_credentials.yml not found in backup at' not in (upgrade_warnings | join(' '))" +- name: Check if backup file is encrypted + ansible.builtin.command: + cmd: cat "{{ backup_location }}/omnia_config_credentials.yml" + register: backup_omnia_config_credentials_content + changed_when: false + failed_when: false + no_log: true + when: backup_omnia_config_credentials_stat.stat.exists + - name: Process omnia_config_credentials.yml when present in backup + when: >- + backup_omnia_config_credentials_key_stat.stat.exists and + backup_omnia_config_credentials_content.stdout is defined and + '$ANSIBLE_VAULT;' in backup_omnia_config_credentials_content.stdout block: - - name: Check if backup file is encrypted - ansible.builtin.command: - cmd: cat "{{ backup_location }}/omnia_config_credentials.yml" - register: backup_omnia_config_credentials_content - changed_when: false - failed_when: false - no_log: true - - name: "Case 1: Key present and file encrypted - Process and update" block: - name: Copy encrypted omnia_config_credentials.yml from backup to temp location @@ -68,6 +73,7 @@ no_log: true register: vault_decrypt_result failed_when: vault_decrypt_result.rc != 0 + changed_when: false - name: Read decrypted content ansible.builtin.slurp: @@ -126,6 +132,7 @@ no_log: true register: vault_encrypt_result failed_when: vault_encrypt_result.rc != 0 + changed_when: false - name: Clean 
up temporary files ansible.builtin.file: diff --git a/upgrade/roles/import_input_parameters/tasks/restore_user_registry_credential.yml b/upgrade/roles/import_input_parameters/tasks/restore_user_registry_credential.yml index de337310b8..fe02a3d750 100644 --- a/upgrade/roles/import_input_parameters/tasks/restore_user_registry_credential.yml +++ b/upgrade/roles/import_input_parameters/tasks/restore_user_registry_credential.yml @@ -31,25 +31,26 @@ - name: Add warning for missing user_registry_credential.yml to list ansible.builtin.set_fact: upgrade_warnings: >- - {{ upgrade_warnings + [ - "WARNING: user_registry_credential.yml not found in backup at " + - backup_location + "/user_registry_credential.yml. " + - "This might be due to complete Omnia execution not being completed. " + - "Skipping restoration of this file." - ] }} - when: + {{ upgrade_warnings + [msg_user_registry_credential_missing] }} + when: - not backup_user_registry_credential_stat.stat.exists - "'WARNING: user_registry_credential.yml not found in backup at' not in (upgrade_warnings | join(' '))" +- name: Check if backup file is encrypted + ansible.builtin.command: + cmd: cat "{{ backup_location }}/user_registry_credential.yml" + register: backup_user_registry_content + changed_when: false + failed_when: false + no_log: true + when: backup_user_registry_credential_stat.stat.exists + - name: Process user_registry_credential.yml when present in backup + when: >- + backup_local_repo_credentials_key_stat.stat.exists and + backup_user_registry_content.stdout is defined and + '$ANSIBLE_VAULT;' in backup_user_registry_content.stdout block: - - name: Check if backup file is encrypted - ansible.builtin.command: - cmd: cat "{{ backup_location }}/user_registry_credential.yml" - register: backup_user_registry_content - changed_when: false - failed_when: false - no_log: true - name: "Case 1: Key present and file encrypted - Copy both" block: @@ -64,6 +65,7 @@ no_log: true register: vault_decrypt_result failed_when: 
vault_decrypt_result.rc != 0 + changed_when: false - name: Copy encrypted user_registry_credential.yml from backup ansible.builtin.copy: @@ -118,8 +120,7 @@ {% elif backup_local_repo_credentials_key_stat.stat.exists and backup_user_registry_content.stdout is defined and '$ANSIBLE_VAULT;' not in backup_user_registry_content.stdout %} - Key file exists but file is not encrypted {% endif %} - Please check the backup integrity and ensure both files are present - in consistent states. + Please check the backup integrity and ensure both files are present in consistent states. when: >- (not backup_local_repo_credentials_key_stat.stat.exists and backup_user_registry_content.stdout is defined and diff --git a/upgrade/roles/import_input_parameters/vars/main.yml b/upgrade/roles/import_input_parameters/vars/main.yml index 5eee4a2f50..9808da58bc 100644 --- a/upgrade/roles/import_input_parameters/vars/main.yml +++ b/upgrade/roles/import_input_parameters/vars/main.yml @@ -31,14 +31,16 @@ msg_upgrade_backup_dir_missing: "upgrade_backup_dir not found in /opt/omnia/.dat msg_restore_item_name_missing: "restore_item must define 'name'" msg_validation_failed: "Validation failed for {{ restore_item.name }}" msg_backup_file_missing: "Backup file missing: {{ restore_item.name }}" -msg_user_registry_credential_missing: |- - \033[93mWARNING: user_registry_credential.yml not found in backup at {{ backup_location }}/user_registry_credential.yml\033[0m +msg_user_registry_credential_missing: |- + WARNING: user_registry_credential.yml not found in backup at + {{ backup_location }}/user_registry_credential.yml This might be due to complete Omnia execution not being completed. Skipping restoration of this file. # Omnia config credentials messages -msg_omnia_config_credentials_missing: |- - WARNING: omnia_config_credentials.yml not found in backup at {{ backup_location }}/omnia_config_credentials.yml. 
+msg_omnia_config_credentials_missing: |- + WARNING: omnia_config_credentials.yml not found in backup at + {{ backup_location }}/omnia_config_credentials.yml. This might be due to complete Omnia execution not being completed. Skipping restoration of this file. From ad7a5c08a6cf917814aefea6bef04145ad485534 Mon Sep 17 00:00:00 2001 From: mithileshreddy04 Date: Thu, 12 Feb 2026 18:05:14 +0530 Subject: [PATCH 08/77] fixed lint issues --- .../restore_omnia_config_credentials.yml | 34 +++++++-------- .../restore_user_registry_credential.yml | 43 +++++++++++-------- .../import_input_parameters/vars/main.yml | 2 +- 3 files changed, 42 insertions(+), 37 deletions(-) diff --git a/upgrade/roles/import_input_parameters/tasks/restore_omnia_config_credentials.yml b/upgrade/roles/import_input_parameters/tasks/restore_omnia_config_credentials.yml index 71e8fb7db2..a129603dcc 100644 --- a/upgrade/roles/import_input_parameters/tasks/restore_omnia_config_credentials.yml +++ b/upgrade/roles/import_input_parameters/tasks/restore_omnia_config_credentials.yml @@ -27,7 +27,7 @@ ansible.builtin.set_fact: upgrade_warnings: >- {{ upgrade_warnings + [msg_omnia_config_credentials_missing] }} - when: + when: - not backup_omnia_config_credentials_stat.stat.exists - "'WARNING: omnia_config_credentials.yml not found in backup at' not in (upgrade_warnings | join(' '))" @@ -93,6 +93,10 @@ msg: "{{ msg_omnia_config_decrypt_error }}" - name: "Case 1.1: Apply template and encrypt" + when: > + backup_omnia_config_credentials_key_stat.stat.exists and + backup_omnia_config_credentials_content.stdout is defined and + '$ANSIBLE_VAULT;' in backup_omnia_config_credentials_content.stdout block: - name: Set template variables from credentials ansible.builtin.set_fact: @@ -150,29 +154,25 @@ - name: Fail with template/encryption error message ansible.builtin.fail: msg: "{{ msg_omnia_config_template_error }}\n{{ msg_omnia_config_encrypt_error }}" - when: >- - backup_omnia_config_credentials_key_stat.stat.exists 
and - backup_omnia_config_credentials_content.stdout is defined and - '$ANSIBLE_VAULT;' in backup_omnia_config_credentials_content.stdout - name: "Case 2: Both key and file missing - Add info warning" - ansible.builtin.set_fact: - upgrade_warnings: >- - {{ upgrade_warnings + [msg_omnia_config_credentials_info_missing] }} - when: >- + when: > not backup_omnia_config_credentials_key_stat.stat.exists and - (backup_omnia_config_credentials_content.stdout is not defined or + (backup_omnia_config_credentials_content.stdout is not defined or '$ANSIBLE_VAULT;' not in backup_omnia_config_credentials_content.stdout) and "'INFO: Both omnia_config_credentials.yml and .omnia_config_credentials_key' not in (upgrade_warnings | join(' '))" + ansible.builtin.set_fact: + upgrade_warnings: > + {{ upgrade_warnings + [msg_omnia_config_credentials_info_missing] }} - name: "Case 3: Error - Mismatched state" - ansible.builtin.fail: - msg: "{{ msg_omnia_config_credentials_error }}" - when: >- - (not backup_omnia_config_credentials_key_stat.stat.exists and - backup_omnia_config_credentials_content.stdout is defined and + when: > + (not backup_omnia_config_credentials_key_stat.stat.exists and + backup_omnia_config_credentials_content.stdout is defined and '$ANSIBLE_VAULT;' in backup_omnia_config_credentials_content.stdout) or - (backup_omnia_config_credentials_key_stat.stat.exists and - backup_omnia_config_credentials_content.stdout is defined and + (backup_omnia_config_credentials_key_stat.stat.exists and + backup_omnia_config_credentials_content.stdout is defined and '$ANSIBLE_VAULT;' not in backup_omnia_config_credentials_content.stdout) + ansible.builtin.fail: + msg: "{{ msg_omnia_config_credentials_error }}" when: backup_omnia_config_credentials_stat.stat.exists diff --git a/upgrade/roles/import_input_parameters/tasks/restore_user_registry_credential.yml b/upgrade/roles/import_input_parameters/tasks/restore_user_registry_credential.yml index fe02a3d750..69a6a391a2 100644 --- 
a/upgrade/roles/import_input_parameters/tasks/restore_user_registry_credential.yml +++ b/upgrade/roles/import_input_parameters/tasks/restore_user_registry_credential.yml @@ -53,6 +53,10 @@ block: - name: "Case 1: Key present and file encrypted - Copy both" + when: > + backup_local_repo_credentials_key_stat.stat.exists and + backup_user_registry_content.stdout is defined and + '$ANSIBLE_VAULT;' in backup_user_registry_content.stdout block: - name: Decrypt user_registry_credential.yml using the key ansible.builtin.shell: @@ -92,12 +96,13 @@ - name: Fail with decryption error message ansible.builtin.fail: msg: "{{ msg_user_registry_decrypt_error }}" - when: >- - backup_local_repo_credentials_key_stat.stat.exists and - backup_user_registry_content.stdout is defined and - '$ANSIBLE_VAULT;' in backup_user_registry_content.stdout - name: "Case 2: Both key and file missing - Add info warning" + when: >- + not backup_local_repo_credentials_key_stat.stat.exists and + (backup_user_registry_content.stdout is not defined or + '$ANSIBLE_VAULT;' not in backup_user_registry_content.stdout) and + "'INFO: Both user_registry_credential.yml and .local_repo_credentials_key' not in (upgrade_warnings | join(' '))" ansible.builtin.set_fact: upgrade_warnings: >- {{ upgrade_warnings + [ @@ -105,27 +110,27 @@ "are not present in backup. This is expected if registry credentials " + "were not configured in the source installation." 
] }} - when: >- - not backup_local_repo_credentials_key_stat.stat.exists and - (backup_user_registry_content.stdout is not defined or - '$ANSIBLE_VAULT;' not in backup_user_registry_content.stdout) and - "'INFO: Both user_registry_credential.yml and .local_repo_credentials_key' not in (upgrade_warnings | join(' '))" - name: "Case 3: Error - Mismatched state" + when: >- + (not backup_local_repo_credentials_key_stat.stat.exists and + backup_user_registry_content.stdout is defined and + '$ANSIBLE_VAULT;' in backup_user_registry_content.stdout) or + (backup_local_repo_credentials_key_stat.stat.exists and + backup_user_registry_content.stdout is defined and + '$ANSIBLE_VAULT;' not in backup_user_registry_content.stdout) ansible.builtin.fail: msg: | ERROR: Inconsistent state detected for user_registry_credential.yml: - {% if not backup_local_repo_credentials_key_stat.stat.exists and backup_user_registry_content.stdout is defined and '$ANSIBLE_VAULT;' in backup_user_registry_content.stdout %} + {% if not backup_local_repo_credentials_key_stat.stat.exists and + backup_user_registry_content.stdout is defined and + '$ANSIBLE_VAULT;' in backup_user_registry_content.stdout %} - File is encrypted but key file (.local_repo_credentials_key) is missing - {% elif backup_local_repo_credentials_key_stat.stat.exists and backup_user_registry_content.stdout is defined and '$ANSIBLE_VAULT;' not in backup_user_registry_content.stdout %} + {% elif backup_local_repo_credentials_key_stat.stat.exists and + backup_user_registry_content.stdout is defined and + '$ANSIBLE_VAULT;' not in backup_user_registry_content.stdout %} - Key file exists but file is not encrypted {% endif %} - Please check the backup integrity and ensure both files are present in consistent states. 
- when: >- - (not backup_local_repo_credentials_key_stat.stat.exists and - backup_user_registry_content.stdout is defined and - '$ANSIBLE_VAULT;' in backup_user_registry_content.stdout) or - (backup_local_repo_credentials_key_stat.stat.exists and - backup_user_registry_content.stdout is defined and - '$ANSIBLE_VAULT;' not in backup_user_registry_content.stdout) + Please check the backup integrity and ensure both files are present + in consistent states. when: backup_user_registry_credential_stat.stat.exists diff --git a/upgrade/roles/import_input_parameters/vars/main.yml b/upgrade/roles/import_input_parameters/vars/main.yml index 9808da58bc..3bdf596641 100644 --- a/upgrade/roles/import_input_parameters/vars/main.yml +++ b/upgrade/roles/import_input_parameters/vars/main.yml @@ -32,7 +32,7 @@ msg_restore_item_name_missing: "restore_item must define 'name'" msg_validation_failed: "Validation failed for {{ restore_item.name }}" msg_backup_file_missing: "Backup file missing: {{ restore_item.name }}" msg_user_registry_credential_missing: |- - WARNING: user_registry_credential.yml not found in backup at + WARNING: user_registry_credential.yml not found in backup at {{ backup_location }}/user_registry_credential.yml This might be due to complete Omnia execution not being completed. Skipping restoration of this file. 
From 31c5600391bad02cd31c9c2d3ad167100371f5d2 Mon Sep 17 00:00:00 2001 From: mithileshreddy04 Date: Thu, 12 Feb 2026 18:13:29 +0530 Subject: [PATCH 09/77] Fixed ansible lint issues --- .../restore_omnia_config_credentials.yml | 2 +- .../restore_user_registry_credential.yml | 2 +- .../import_input_parameters/vars/main.yml | 46 ++++++++++--------- 3 files changed, 27 insertions(+), 23 deletions(-) diff --git a/upgrade/roles/import_input_parameters/tasks/restore_omnia_config_credentials.yml b/upgrade/roles/import_input_parameters/tasks/restore_omnia_config_credentials.yml index a129603dcc..e04964e461 100644 --- a/upgrade/roles/import_input_parameters/tasks/restore_omnia_config_credentials.yml +++ b/upgrade/roles/import_input_parameters/tasks/restore_omnia_config_credentials.yml @@ -175,4 +175,4 @@ '$ANSIBLE_VAULT;' not in backup_omnia_config_credentials_content.stdout) ansible.builtin.fail: msg: "{{ msg_omnia_config_credentials_error }}" - when: backup_omnia_config_credentials_stat.stat.exists + diff --git a/upgrade/roles/import_input_parameters/tasks/restore_user_registry_credential.yml b/upgrade/roles/import_input_parameters/tasks/restore_user_registry_credential.yml index 69a6a391a2..47b62fedb1 100644 --- a/upgrade/roles/import_input_parameters/tasks/restore_user_registry_credential.yml +++ b/upgrade/roles/import_input_parameters/tasks/restore_user_registry_credential.yml @@ -133,4 +133,4 @@ {% endif %} Please check the backup integrity and ensure both files are present in consistent states. - when: backup_user_registry_credential_stat.stat.exists + diff --git a/upgrade/roles/import_input_parameters/vars/main.yml b/upgrade/roles/import_input_parameters/vars/main.yml index 3bdf596641..2bd20f0076 100644 --- a/upgrade/roles/import_input_parameters/vars/main.yml +++ b/upgrade/roles/import_input_parameters/vars/main.yml @@ -44,48 +44,52 @@ msg_omnia_config_credentials_missing: |- This might be due to complete Omnia execution not being completed. 
Skipping restoration of this file. -msg_omnia_config_credentials_info_missing: |- - INFO: Both omnia_config_credentials.yml and .omnia_config_credentials_key - are not present in backup. This is expected if credentials +msg_omnia_config_credentials_info_missing: |- + INFO: Both omnia_config_credentials.yml and .omnia_config_credentials_key + are not present in backup. This is expected if credentials were not configured in the source installation. -msg_omnia_config_credentials_success: |- +msg_omnia_config_credentials_success: |- omnia_config_credentials.yml restored and updated from backup. Backup: {{ backup_location }}/omnia_config_credentials.yml Target: {{ input_project_dir }}/omnia_config_credentials.yml Status: Updated with postgres credentials and re-encrypted (key file also restored) -msg_omnia_config_credentials_error: |- +msg_omnia_config_credentials_error: |- ERROR: Inconsistent state detected for omnia_config_credentials.yml: - {% if not backup_omnia_config_credentials_key_stat.stat.exists and backup_omnia_config_credentials_content.stdout is defined and '$ANSIBLE_VAULT;' in backup_omnia_config_credentials_content.stdout %} + {% if not backup_omnia_config_credentials_key_stat.stat.exists and + backup_omnia_config_credentials_content.stdout is defined and + '$ANSIBLE_VAULT;' in backup_omnia_config_credentials_content.stdout %} - File is encrypted but key file (.omnia_config_credentials_key) is missing - {% elif backup_omnia_config_credentials_key_stat.stat.exists and backup_omnia_config_credentials_content.stdout is defined and '$ANSIBLE_VAULT;' not in backup_omnia_config_credentials_content.stdout %} + {% elif backup_omnia_config_credentials_key_stat.stat.exists and + backup_omnia_config_credentials_content.stdout is defined and + '$ANSIBLE_VAULT;' not in backup_omnia_config_credentials_content.stdout %} - Key file exists but file is not encrypted {% endif %} Please check the backup integrity and ensure both files are present in consistent states. 
# Rescue warning messages -msg_user_registry_decrypt_error: |- - ERROR: Failed to decrypt user_registry_credential.yml. - The backup key file may be corrupted or incompatible. - Please check the backup integrity and ensure the key file +msg_user_registry_decrypt_error: |- + ERROR: Failed to decrypt user_registry_credential.yml. + The backup key file may be corrupted or incompatible. + Please check the backup integrity and ensure the key file matches the encrypted file. -msg_omnia_config_decrypt_error: |- - ERROR: Failed to decrypt omnia_config_credentials.yml. - The backup key file may be corrupted or incompatible. - Please check the backup integrity and ensure the key file +msg_omnia_config_decrypt_error: |- + ERROR: Failed to decrypt omnia_config_credentials.yml. + The backup key file may be corrupted or incompatible. + Please check the backup integrity and ensure the key file matches the encrypted file. -msg_omnia_config_template_error: |- - ERROR: Failed to generate updated omnia_config_credentials.yml. - Template processing may have failed due to invalid data format. +msg_omnia_config_template_error: |- + ERROR: Failed to generate updated omnia_config_credentials.yml. + Template processing may have failed due to invalid data format. Please check the backup file format and ensure it contains valid YAML. -msg_omnia_config_encrypt_error: |- - ERROR: Failed to encrypt updated omnia_config_credentials.yml. - The key file may be corrupted or there may be permission issues. +msg_omnia_config_encrypt_error: |- + ERROR: Failed to encrypt updated omnia_config_credentials.yml. + The key file may be corrupted or there may be permission issues. Please check the key file integrity and file permissions. msg_decryption_failed: "Decryption failed. Check warnings for details." 
From da5423411cb969b8ddfd41856c195c4e8e443ac1 Mon Sep 17 00:00:00 2001 From: mithileshreddy04 Date: Thu, 12 Feb 2026 18:21:52 +0530 Subject: [PATCH 10/77] fixed ansible lint issues --- .../tasks/restore_omnia_config_credentials.yml | 1 - .../tasks/restore_user_registry_credential.yml | 1 - 2 files changed, 2 deletions(-) diff --git a/upgrade/roles/import_input_parameters/tasks/restore_omnia_config_credentials.yml b/upgrade/roles/import_input_parameters/tasks/restore_omnia_config_credentials.yml index e04964e461..6a20f371f8 100644 --- a/upgrade/roles/import_input_parameters/tasks/restore_omnia_config_credentials.yml +++ b/upgrade/roles/import_input_parameters/tasks/restore_omnia_config_credentials.yml @@ -175,4 +175,3 @@ '$ANSIBLE_VAULT;' not in backup_omnia_config_credentials_content.stdout) ansible.builtin.fail: msg: "{{ msg_omnia_config_credentials_error }}" - diff --git a/upgrade/roles/import_input_parameters/tasks/restore_user_registry_credential.yml b/upgrade/roles/import_input_parameters/tasks/restore_user_registry_credential.yml index 47b62fedb1..158b029ed3 100644 --- a/upgrade/roles/import_input_parameters/tasks/restore_user_registry_credential.yml +++ b/upgrade/roles/import_input_parameters/tasks/restore_user_registry_credential.yml @@ -133,4 +133,3 @@ {% endif %} Please check the backup integrity and ensure both files are present in consistent states. 
- From cdaa98d829d7e32ee0a13955145a96c6b67f25db Mon Sep 17 00:00:00 2001 From: "balajikumaran.cs" Date: Thu, 12 Feb 2026 19:05:57 +0530 Subject: [PATCH 11/77] offline build-image and discovery updates (#3956) * Use Pulp-hosted builder images for x86_64 builds * added x86_64 image-builder image * Update default_packages.json Signed-off-by: balajikumaran.cs * Refine image build prereqs and regctl handling * Update omnia_metadata_file path to use variable Signed-off-by: balajikumaran.cs * Airgap: move telemetry/NFS prep offline and package installs to prepare_oim * added nolog true * Update prepare_oim_completion.yml Signed-off-by: balajikumaran.cs * Update aarch64_prereq.yml Signed-off-by: balajikumaran.cs * Update main.yml Signed-off-by: balajikumaran.cs * Update main.yml Signed-off-by: balajikumaran.cs * Update main.yml Signed-off-by: balajikumaran.cs * Replace command with podman_image module for image tasks Signed-off-by: balajikumaran.cs * Replace Podman command with Ansible module Signed-off-by: balajikumaran.cs * Align podman image pull with retries and tagging for x86_64 and aarch64 * Fix podman tagging for x86_64 and aarch64 images --------- Signed-off-by: balajikumaran.cs --- .../roles/image_creation/vars/main.yml | 5 +- .../roles/prepare_arm_node/tasks/main.yml | 58 ++++++++------ .../roles/prepare_arm_node/vars/main.yml | 10 ++- build_image_x86_64/build_image_x86_64.yml | 4 +- .../image_creation/tasks/build_image_tag.yml | 28 ------- .../tasks/prepare_pulp_image.yml | 79 +++++++++++++++++++ .../roles/image_creation/vars/main.yml | 10 ++- .../roles/nfs_client/tasks/nfs_client.yml | 5 -- discovery/roles/nfs_client/vars/main.yml | 7 -- discovery/roles/telemetry/tasks/main.yml | 4 + .../telemetry/tasks/telemetry_prereq.yml | 27 ++++--- .../tasks/update_ldms_agg_config.yml | 5 -- discovery/roles/telemetry/vars/main.yml | 14 ++-- .../x86_64/rhel/10.0/default_packages.json | 3 +- prepare_oim/prepare_oim.yml | 10 +++ .../common/tasks/aarch64_prereq.yml | 26 
++++++ .../deploy_containers/common/tasks/main.yml | 2 +- .../common/tasks/package_installation.yml | 29 +++++++ .../common/tasks/prepare_oim_completion.yml | 20 ++++- .../deploy_containers/common/vars/main.yml | 28 ++++++- 20 files changed, 272 insertions(+), 102 deletions(-) delete mode 100644 build_image_x86_64/roles/image_creation/tasks/build_image_tag.yml create mode 100644 build_image_x86_64/roles/image_creation/tasks/prepare_pulp_image.yml create mode 100644 prepare_oim/roles/deploy_containers/common/tasks/aarch64_prereq.yml create mode 100644 prepare_oim/roles/deploy_containers/common/tasks/package_installation.yml diff --git a/build_image_aarch64/roles/image_creation/vars/main.yml b/build_image_aarch64/roles/image_creation/vars/main.yml index 67d11422ef..984f2497d8 100644 --- a/build_image_aarch64/roles/image_creation/vars/main.yml +++ b/build_image_aarch64/roles/image_creation/vars/main.yml @@ -1,4 +1,4 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -17,6 +17,7 @@ input_project_dir: "{{ hostvars['localhost']['input_project_dir'] }}" omnia_metadata_file: "/opt/omnia/.data/oim_metadata.yml" dir_permissions_644: "0644" dir_permissions_755: "0755" +aarch64_local_tag: "aarch64-image-builder/ochami" openchami_dir: "/opt/omnia/openchami" openchami_clone_path: /opt/omnia/openchami/deployment-recipes job_retry: "120" @@ -32,7 +33,7 @@ ochami_compute_mounts: - -v {{ openchami_work_dir }}/images/rhel-{{ item.key }}-{{ rhel_tag }}.yaml:/home/builder/config.yaml:z ochami_aarch64_image: - --entrypoint /bin/bash - - localhost/arm-image/ochami + - "localhost/{{ aarch64_local_tag }}" ochami_base_command: - -c 'update-ca-trust extract && image-build --config /home/builder/config.yaml --log-level DEBUG' diff --git a/build_image_aarch64/roles/prepare_arm_node/tasks/main.yml b/build_image_aarch64/roles/prepare_arm_node/tasks/main.yml index 1801448611..4a9d150850 100644 --- a/build_image_aarch64/roles/prepare_arm_node/tasks/main.yml +++ b/build_image_aarch64/roles/prepare_arm_node/tasks/main.yml @@ -167,32 +167,42 @@ - name: Build full Podman image path ansible.builtin.set_fact: - pulp_aarch_image: "{{ hostvars['localhost']['oim_pxe_ip'] }}:2225/dellhpcomniaaisolution/image-build-aarch64:1.1" - -- name: Pull aarch64 image using Podman - ansible.builtin.command: - cmd: "podman pull {{ pulp_aarch_image }}" - register: podman_pull_result - ignore_errors: true - changed_when: false + pulp_aarch_image: "{{ hostvars['localhost']['oim_pxe_ip'] }}:2225/{{ pulp_aarch64_image_name }}" + +- name: Pull and tag aarch64 image + block: + - name: Pull aarch64 image using Podman + containers.podman.podman_image: + name: "{{ pulp_aarch_image }}" + state: present + register: podman_pull_result + retries: "{{ pull_image_retries }}" + delay: "{{ pull_image_delay }}" + until: podman_pull_result is not failed + changed_when: false + + - name: Tag pulled image + containers.podman.podman_tag: + image: "{{ pulp_aarch_image }}" + target_names: + - "{{ 
aarch64_local_tag }}" + changed_when: false + + rescue: + - name: Fail if Podman pull failed + ansible.builtin.fail: + msg: "Failed to pull image {{ pulp_aarch_image }}" + +- name: Check if regctl binary exists + ansible.builtin.stat: + path: "{{ ochami_aarch_64_dir }}/regctl" + register: regctl_stat + delegate_to: localhost -- name: Fail if Podman pull failed +- name: Fail if regctl binary not found ansible.builtin.fail: - msg: "{{ aarch64_image_fail_msg }}" - when: podman_pull_result.rc != 0 - -- name: Tag pulled image - ansible.builtin.command: - cmd: "podman tag {{ pulp_aarch_image }} arm-image/ochami" - when: podman_pull_result.rc == 0 - changed_when: false - -- name: Download regctl binary to NFS shared path - ansible.builtin.get_url: - url: "{{ aarch64_regctl_url }}" - dest: "{{ ochami_aarch_64_dir }}/regctl" - mode: "{{ hostvars['localhost']['dir_permissions_755'] }}" - delegate_to: localhost + msg: "{{ regctl_not_found_msg }}" + when: not regctl_stat.stat.exists - name: Copy regctl binary to /usr/local/bin on target host ansible.builtin.copy: diff --git a/build_image_aarch64/roles/prepare_arm_node/vars/main.yml b/build_image_aarch64/roles/prepare_arm_node/vars/main.yml index d240f27de4..c0ce2868aa 100644 --- a/build_image_aarch64/roles/prepare_arm_node/vars/main.yml +++ b/build_image_aarch64/roles/prepare_arm_node/vars/main.yml @@ -1,4 +1,4 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -15,10 +15,13 @@ # input files input_project_dir: "{{ hostvars['localhost']['input_project_dir'] }}" +pulp_aarch64_image_name: "dellhpcomniaaisolution/image-build-aarch64:1.1" +aarch64_local_tag: "aarch64-image-builder/ochami" +pull_image_retries: "3" +pull_image_delay: "10" network_spec: "{{ input_project_dir }}/network_spec.yml" ochami_aarch_64_dir: "/opt/omnia/openchami/aarch64" pulp_repo_store_path: "{{ ochami_aarch_64_dir }}/pulp.repo" -aarch64_regctl_url: "https://github.com/regclient/regclient/releases/latest/download/regctl-linux-arm64" pulp_repo_file_path: "/etc/yum.repos.d/pulp.repo" pulp_webserver_cert_path: "/opt/omnia/pulp/settings/certs/pulp_webserver.crt" anchors_path: "/etc/pki/ca-trust/source/anchors/pulp_webserver.crt" @@ -39,3 +42,6 @@ aarch64_image_fail_msg: > Unable to pull the Ochami aarch64 image builder image. Make sure you have added the default package for aarch64 in the software_config.json file and ran local_repo.yml. If not, add that package and rerun local_repo.yml. +regctl_not_found_msg: > + regctl binary not found at {{ ochami_aarch_64_dir }}/regctl. + Please run prepare_oim.yml playbook to download the regctl binary. diff --git a/build_image_x86_64/build_image_x86_64.yml b/build_image_x86_64/build_image_x86_64.yml index 85ecaf93cd..676d8adbd6 100644 --- a/build_image_x86_64/build_image_x86_64.yml +++ b/build_image_x86_64/build_image_x86_64.yml @@ -1,4 +1,4 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -80,7 +80,7 @@ - name: Tag OpenCHAMI image ansible.builtin.include_role: name: image_creation - tasks_from: build_image_tag.yml + tasks_from: prepare_pulp_image.yml - name: OpenCHAMI build image for x86_64 hosts: localhost diff --git a/build_image_x86_64/roles/image_creation/tasks/build_image_tag.yml b/build_image_x86_64/roles/image_creation/tasks/build_image_tag.yml deleted file mode 100644 index 0b7a56072d..0000000000 --- a/build_image_x86_64/roles/image_creation/tasks/build_image_tag.yml +++ /dev/null @@ -1,28 +0,0 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. ---- - -- name: Pull image-build image - ansible.builtin.command: - cmd: "podman pull {{ image_build_el10 }}" - register: pull_result - retries: "{{ pull_image_retries }}" - delay: "{{ pull_image_delay }}" - until: pull_result.rc == 0 - changed_when: "'Image is up to date' not in pull_result.stdout" - -- name: Fail if image not pulled successfully - ansible.builtin.fail: - msg: "{{ pull_result.stdout }}" - when: pull_result.rc != 0 diff --git a/build_image_x86_64/roles/image_creation/tasks/prepare_pulp_image.yml b/build_image_x86_64/roles/image_creation/tasks/prepare_pulp_image.yml new file mode 100644 index 0000000000..22f336b849 --- /dev/null +++ b/build_image_x86_64/roles/image_creation/tasks/prepare_pulp_image.yml @@ -0,0 +1,79 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +# Load network specification +- name: Load network spec file + ansible.builtin.include_vars: + file: "{{ network_spec }}" + register: include_network_spec + no_log: true + +- name: Fail if network spec cannot be loaded + ansible.builtin.fail: + msg: "{{ network_spec_syntax_fail_msg }} Error: {{ include_network_spec.message }}" + when: include_network_spec is failed + +# Parse network spec data +- name: Parse network spec + ansible.builtin.set_fact: + network_data: "{{ network_data | default({}) | combine({item.key: item.value}) }}" + with_dict: "{{ Networks }}" + +# Set PXE IP fact +- name: Set PXE IP fact + ansible.builtin.set_fact: + oim_pxe_ip: "{{ network_data.admin_network.primary_oim_admin_ip }}" + cacheable: true + +# Copy pulp certificate and update CA trust +- name: Copy pulp webserver certificate to anchors + ansible.builtin.copy: + src: "{{ pulp_webserver_cert_path }}" + dest: "{{ anchors_path }}" + mode: "{{ dir_permissions_644 }}" + become: true + +- name: Update CA trust + ansible.builtin.command: update-ca-trust + register: update_ca + changed_when: false + +- name: Build full Podman image path for x86_64 + ansible.builtin.set_fact: + pulp_x86_image: "{{ oim_pxe_ip }}:2225/{{ pulp_x86_64_image_name }}" + +- name: Pull and tag x86_64 image + block: + - name: Pull x86_64 image using Podman + containers.podman.podman_image: + name: "{{ pulp_x86_image }}" + state: present + register: pull_result + 
retries: "{{ pull_image_retries }}" + delay: "{{ pull_image_delay }}" + until: pull_result is not failed + changed_when: false + + - name: Tag pulled image for x86_64 build + containers.podman.podman_tag: + image: "{{ pulp_x86_image }}" + target_names: + - "{{ x86_64_local_tag }}" + changed_when: false + + rescue: + - name: Fail if Podman pull failed + ansible.builtin.fail: + msg: "Failed to pull image {{ pulp_x86_image }}." diff --git a/build_image_x86_64/roles/image_creation/vars/main.yml b/build_image_x86_64/roles/image_creation/vars/main.yml index a05a39d37d..60dcf0bc6f 100644 --- a/build_image_x86_64/roles/image_creation/vars/main.yml +++ b/build_image_x86_64/roles/image_creation/vars/main.yml @@ -12,7 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. --- -image_build_el10: "docker.io/dellhpcomniaaisolution/image-build-el10:1.0" +pulp_x86_64_image_name: "dellhpcomniaaisolution/image-build-el10:1.0" +x86_64_local_tag: "x86_64-image-builder/ochami" pull_image_retries: "3" pull_image_delay: "10" input_project_dir: "{{ hostvars['localhost']['input_project_dir'] }}" @@ -23,6 +24,9 @@ openchami_dir: "/opt/omnia/openchami" openchami_clone_path: /opt/omnia/openchami/deployment-recipes job_retry: "120" job_delay: "30" +network_spec: "{{ input_project_dir }}/network_spec.yml" +pulp_webserver_cert_path: "/opt/omnia/pulp/settings/certs/pulp_webserver.crt" +anchors_path: "/etc/pki/ca-trust/source/anchors/pulp_webserver.crt" openchami_work_dir: "{{ oim_shared_path }}/omnia/openchami/workdir" ochami_mounts: - --user 0 --privileged @@ -35,7 +39,7 @@ ochami_compute_mounts: ochami_x86_64_image: - --entrypoint /bin/bash - - docker.io/dellhpcomniaaisolution/image-build-el10:1.0 + - "localhost/{{ x86_64_local_tag }}" ochami_base_command: - -c 'update-ca-trust extract && image-build --config /home/builder/config.yaml --log-level DEBUG' @@ -54,3 +58,5 @@ compute_image_failure_msg: | # build_compute_image.yml 
openchami_compute_image_vars_template: "{{ role_path }}/templates/compute_images_templates.j2" openchami_compute_image_vars_path: "/opt/omnia/openchami/compute_images_template.yaml" + +network_spec_syntax_fail_msg: "Failed to load network_spec.yml due to syntax error" diff --git a/discovery/roles/nfs_client/tasks/nfs_client.yml b/discovery/roles/nfs_client/tasks/nfs_client.yml index 079933c26b..ca8a3c7660 100644 --- a/discovery/roles/nfs_client/tasks/nfs_client.yml +++ b/discovery/roles/nfs_client/tasks/nfs_client.yml @@ -32,11 +32,6 @@ nfs_server_ip: "{{ hostvars['127.0.0.1']['admin_nic_ip'] }}" when: item.server_ip == "localhost" -- name: Package installation for NFS - ansible.builtin.package: - name: "{{ nfs_packages[ansible_os_family] }}" - state: present - - name: Mount facts items to dict ansible.builtin.set_fact: nfs_src: "{{ nfs_server_ip }}:{{ item.server_share_path }}" diff --git a/discovery/roles/nfs_client/vars/main.yml b/discovery/roles/nfs_client/vars/main.yml index b5e01fd82a..a3c20c054c 100644 --- a/discovery/roles/nfs_client/vars/main.yml +++ b/discovery/roles/nfs_client/vars/main.yml @@ -20,13 +20,6 @@ software_config_file: "{{ hostvars['localhost']['input_project_dir'] }}/software # Usage: nfs_client.yml mounted_dir_perm: "0755" default_client_mount_options: "nosuid,rw,sync,hard,intr" -nfs_packages: - RedHat: - - nfs-utils - - nfs4-acl-tools - Debian: - - nfs-common - - nfs4-acl-tools slurm_nfs_fail_msg: "Failed to mount NFS share. Please check if the NFS server is reachable or NFS is configured properly." 
omnia_config_vars: "{{ hostvars['localhost']['input_project_dir'] }}/omnia_config.yml" diff --git a/discovery/roles/telemetry/tasks/main.yml b/discovery/roles/telemetry/tasks/main.yml index c5a3dbefba..825c3988d7 100644 --- a/discovery/roles/telemetry/tasks/main.yml +++ b/discovery/roles/telemetry/tasks/main.yml @@ -28,6 +28,10 @@ when: - hostvars['localhost']['idrac_telemetry_support'] or hostvars['localhost']['ldms_support'] block: + - name: Set NFS info fact + ansible.builtin.set_fact: + oim_shared_path: "{{ hostvars['localhost']['oim_shared_path'] }}" + - name: Service cluster prerequisite ansible.builtin.include_tasks: telemetry_prereq.yml diff --git a/discovery/roles/telemetry/tasks/telemetry_prereq.yml b/discovery/roles/telemetry/tasks/telemetry_prereq.yml index d720c57822..7eb45a89ab 100644 --- a/discovery/roles/telemetry/tasks/telemetry_prereq.yml +++ b/discovery/roles/telemetry/tasks/telemetry_prereq.yml @@ -47,23 +47,24 @@ state: directory mode: "{{ hostvars['localhost']['dir_permissions_755'] }}" -- name: Git clone for iDRAC Telemetry script +- name: Ensure iDRAC Telemetry scripting destination exists + ansible.builtin.file: + path: "{{ idrac_telemetry_scripting_git_clone_path }}" + state: directory + mode: "{{ hostvars['localhost']['dir_permissions_755'] }}" + +- name: Copy iDRAC Telemetry Scripting to NFS share block: - - name: Checkout iDRAC Telemetry GitHub repo - ansible.builtin.git: - repo: "{{ idrac_telemetry_scripting_repo }}" + - name: Copy pre-cloned iDRAC Telemetry Scripting directory + ansible.builtin.copy: + src: "{{ idrac_telemetry_scripting_src_path }}/" dest: "{{ idrac_telemetry_scripting_git_clone_path }}" - version: "{{ idrac_telemetry_scripting_stable_commit }}" - update: false - register: clone_idrac_script - until: clone_idrac_script is succeeded - retries: "{{ max_retries }}" - delay: "{{ delay_count }}" + remote_src: true + mode: preserve rescue: - - name: Fail if iDRAC telemetry Git clone fails + - name: Fail if iDRAC telemetry 
copy fails ansible.builtin.fail: - msg: "{{ idrac_script_git_clone_error_msg.splitlines() | join(' ') }}" - when: clone_idrac_script is failed + msg: "{{ idrac_telemetry_scripting_copy_fail_msg.splitlines() | join(' ') }}" - name: Set kafka_support to true ansible.builtin.set_fact: diff --git a/discovery/roles/telemetry/tasks/update_ldms_agg_config.yml b/discovery/roles/telemetry/tasks/update_ldms_agg_config.yml index db4d4b1d3f..ee6c0c7d75 100644 --- a/discovery/roles/telemetry/tasks/update_ldms_agg_config.yml +++ b/discovery/roles/telemetry/tasks/update_ldms_agg_config.yml @@ -13,11 +13,6 @@ # limitations under the License. --- -- name: Install make - ansible.builtin.package: - name: make - state: present - - name: Verify values.yaml exists ansible.builtin.stat: path: "{{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry/ldms/nersc-ldms-aggr/values.yaml" diff --git a/discovery/roles/telemetry/vars/main.yml b/discovery/roles/telemetry/vars/main.yml index 473fd74e19..5c5838ce29 100644 --- a/discovery/roles/telemetry/vars/main.yml +++ b/discovery/roles/telemetry/vars/main.yml @@ -1,4 +1,4 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -32,14 +32,12 @@ telemetry_namespace: "telemetry" idrac_telemetry_k8s_name: idrac-telemetry # iDRAC Telemetry scripting repository -idrac_telemetry_scripting_repo: "https://github.com/dell/iDRAC-Telemetry-Scripting.git" -idrac_telemetry_scripting_stable_commit: "f6999f5" +idrac_telemetry_scripting_src_path: "{{ oim_shared_path }}/omnia/telemetry/iDRAC-Telemetry-Scripting" idrac_telemetry_scripting_git_clone_path: "{{ service_cluster_idrac_telemetry_dir_path }}/iDRAC-Telemetry-Scripting" -idrac_script_git_clone_error_msg: | - Failed to clone iDRAC Telemetry GitHub repository from {{ idrac_telemetry_scripting_repo }} - to {{ idrac_telemetry_scripting_git_clone_path }} directory in NFS share. -max_retries: 10 -delay_count: 5 +idrac_telemetry_scripting_copy_fail_msg: | + Failed to copy iDRAC Telemetry Scripting from {{ idrac_telemetry_scripting_src_path }} + to {{ idrac_telemetry_scripting_git_clone_path }}. Please ensure prepare_oim.yml has been + executed successfully before running discovery. 
# Pre-built container images for iDRAC telemetry components # These default to your published images but can be overridden via telemetry_images diff --git a/input/config/x86_64/rhel/10.0/default_packages.json b/input/config/x86_64/rhel/10.0/default_packages.json index 813f9ad993..6002894568 100644 --- a/input/config/x86_64/rhel/10.0/default_packages.json +++ b/input/config/x86_64/rhel/10.0/default_packages.json @@ -34,7 +34,8 @@ {"package": "wget", "type": "rpm", "repo_name": "x86_64_appstream"}, {"package": "cloud-init", "type": "rpm", "repo_name": "x86_64_appstream"}, {"package": "glibc-langpack-en", "type": "rpm", "repo_name": "x86_64_baseos"}, - {"package": "gedit", "type": "rpm", "repo_name": "epel"} + {"package": "gedit", "type": "rpm", "repo_name": "epel"}, + {"package": "docker.io/dellhpcomniaaisolution/image-build-el10", "tag": "1.0", "type": "image" } ] } } diff --git a/prepare_oim/prepare_oim.yml b/prepare_oim/prepare_oim.yml index a78d21e8d9..50c48fd3e5 100644 --- a/prepare_oim/prepare_oim.yml +++ b/prepare_oim/prepare_oim.yml @@ -63,6 +63,11 @@ name: deploy_containers/common tasks_from: add_known_hosts.yml + - name: Download aarch64 prerequisites # noqa:role-name[path] + ansible.builtin.include_role: + name: deploy_containers/common + tasks_from: aarch64_prereq.yml + - name: OpenLDAP Pre_req generate ssha password hosts: localhost connection: local @@ -156,6 +161,11 @@ name: deploy_containers/common tasks_from: omnia_service.yml + - name: Install required packages # noqa:role-name[path] + ansible.builtin.include_role: + name: deploy_containers/common + tasks_from: package_installation.yml + - name: Prepare oim completion hosts: localhost connection: local diff --git a/prepare_oim/roles/deploy_containers/common/tasks/aarch64_prereq.yml b/prepare_oim/roles/deploy_containers/common/tasks/aarch64_prereq.yml new file mode 100644 index 0000000000..f5eae768bb --- /dev/null +++ b/prepare_oim/roles/deploy_containers/common/tasks/aarch64_prereq.yml @@ -0,0 +1,26 
@@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +- name: Create openchami aarch64 directory if not exists + ansible.builtin.file: + path: "{{ ochami_aarch64_dir }}" + state: directory + mode: "{{ dir_permissions_755 }}" + +- name: Download regctl binary (aarch64) + ansible.builtin.get_url: + url: "{{ regctl_aarch64_url }}" + dest: "{{ ochami_aarch64_dir }}/regctl" + mode: "{{ dir_permissions_755 }}" diff --git a/prepare_oim/roles/deploy_containers/common/tasks/main.yml b/prepare_oim/roles/deploy_containers/common/tasks/main.yml index 78c28e98ba..00287c628c 100644 --- a/prepare_oim/roles/deploy_containers/common/tasks/main.yml +++ b/prepare_oim/roles/deploy_containers/common/tasks/main.yml @@ -1,4 +1,4 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/prepare_oim/roles/deploy_containers/common/tasks/package_installation.yml b/prepare_oim/roles/deploy_containers/common/tasks/package_installation.yml new file mode 100644 index 0000000000..1d84877307 --- /dev/null +++ b/prepare_oim/roles/deploy_containers/common/tasks/package_installation.yml @@ -0,0 +1,29 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +- name: Install required packages + block: + - name: Install required packages + ansible.builtin.package: + name: "{{ item }}" + state: present + loop: "{{ oim_packages }}" + register: oim_pkg_result + rescue: + - name: Fail if required package installation fails + ansible.builtin.fail: + msg: >- + {{ prepare_oim_pkg_fail_msg.splitlines() | join(' ') }} + Failed package(s): {{ oim_pkg_result.results | selectattr('failed', 'defined') | selectattr('failed') | map(attribute='item') | list | join(', ') }} + Error: {{ (oim_pkg_result.results | selectattr('failed', 'defined') | selectattr('failed') | map(attribute='msg') | list | first) | default('') }} diff --git a/prepare_oim/roles/deploy_containers/common/tasks/prepare_oim_completion.yml b/prepare_oim/roles/deploy_containers/common/tasks/prepare_oim_completion.yml index 7c86cfaf6b..52e4009219 100644 --- a/prepare_oim/roles/deploy_containers/common/tasks/prepare_oim_completion.yml +++ b/prepare_oim/roles/deploy_containers/common/tasks/prepare_oim_completion.yml @@ -1,4 +1,4 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -32,6 +32,24 @@ mode: "{{ file_permissions }}" when: not bmc_group_data_status.stat.exists +- name: Clone iDRAC Telemetry Scripting repository + block: + - name: Checkout iDRAC Telemetry GitHub repo + ansible.builtin.git: + repo: "{{ idrac_telemetry_scripting_repo }}" + dest: "{{ idrac_telemetry_scripting_clone_dest }}" + version: "{{ idrac_telemetry_scripting_stable_commit }}" + update: false + register: clone_idrac_script + until: clone_idrac_script is succeeded + retries: "{{ max_retries }}" + delay: "{{ delay_count }}" + rescue: + - name: Fail if iDRAC telemetry Git clone fails + ansible.builtin.fail: + msg: "{{ idrac_script_git_clone_fail_msg.splitlines() | join(' ') }}" + when: clone_idrac_script is failed + - name: Prepare oim completion ansible.builtin.debug: msg: "{{ prepare_oim_completion_msg.splitlines() | join(' ') }}" diff --git a/prepare_oim/roles/deploy_containers/common/vars/main.yml b/prepare_oim/roles/deploy_containers/common/vars/main.yml index 30bb7b8125..855e7350b1 100644 --- a/prepare_oim/roles/deploy_containers/common/vars/main.yml +++ b/prepare_oim/roles/deploy_containers/common/vars/main.yml @@ -1,4 +1,4 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -28,12 +28,34 @@ internal_nfs_services: ntp_firewall_service: ntp +# Packages required on OIM +oim_packages: + - nfs-utils + - nfs4-acl-tools + - git + - make +prepare_oim_pkg_fail_msg: | + Failed to install required packages. Please ensure the repository is + configured on OIM and rerun the playbook. 
+ # Usage: prepare_oim_completion.yml telemetry_dir: "/opt/omnia/telemetry" dir_permissions_755: "0755" bmc_group_data_filename: "{{ telemetry_dir }}/bmc_group_data.csv" bmc_group_data_template: "bmc_group_data.j2" file_permissions: "0644" +idrac_telemetry_scripting_repo: "https://github.com/dell/iDRAC-Telemetry-Scripting.git" +idrac_telemetry_scripting_stable_commit: "f6999f5" +idrac_telemetry_scripting_clone_dest: "{{ telemetry_dir }}/iDRAC-Telemetry-Scripting" +max_retries: 10 +delay_count: 5 +git_install_timeout: 300 +git_install_fail_msg: | + Failed to install git. Please ensure the OS repository is configured on OIM. + Configure the repository and rerun the playbook. +idrac_script_git_clone_fail_msg: | + Failed to clone iDRAC Telemetry GitHub repository from {{ idrac_telemetry_scripting_repo }} + to {{ idrac_telemetry_scripting_clone_dest }}. Please check network connectivity and rerun the playbook. prepare_oim_completion_msg: | The playbook prepare_oim.yml has completed successfully. To create the offline repositories and registry for the cluster nodes, please execute the playbook local_repo/local_repo.yml as the next step. @@ -58,3 +80,7 @@ network_services: # Usage: configure_chrony.yml chrony_conf_path: "/etc/chrony.conf" chrony_no_sources_msg: "No chrony sources are reachable. Please give a valid NTP server configuration in network_spec.yml and re-run prepare_oim playbook." 
+ +# Usage: aarch64_prereq.yml +ochami_aarch64_dir: "/opt/omnia/openchami/aarch64" +regctl_aarch64_url: "https://github.com/regclient/regclient/releases/latest/download/regctl-linux-arm64" From 01dece90e8c421745419a1b81a46df85a3fa15eb Mon Sep 17 00:00:00 2001 From: Jagadeesh N V Date: Thu, 12 Feb 2026 19:24:06 +0530 Subject: [PATCH 12/77] Added flow if any munge key update, will be useful if munge key changes --- .../slurm_config/tasks/check_ctld_running.yml | 19 +---- discovery/roles/slurm_config/tasks/confs.yml | 2 +- .../slurm_config/tasks/create_slurm_dir.yml | 19 ++++- .../tasks/read_slurm_hostnames.yml | 1 + .../slurm_config/tasks/update_hosts_munge.yml | 84 +++++++++++++++++++ discovery/roles/slurm_config/vars/main.yml | 2 +- 6 files changed, 106 insertions(+), 21 deletions(-) create mode 100644 discovery/roles/slurm_config/tasks/update_hosts_munge.yml diff --git a/discovery/roles/slurm_config/tasks/check_ctld_running.yml b/discovery/roles/slurm_config/tasks/check_ctld_running.yml index 0c7626f3dd..5f2d41a904 100644 --- a/discovery/roles/slurm_config/tasks/check_ctld_running.yml +++ b/discovery/roles/slurm_config/tasks/check_ctld_running.yml @@ -61,22 +61,11 @@ ansible.builtin.set_fact: reachable_hosts: "{{ ip_map_ssh_check.results | rejectattr('failed', 'true') | map(attribute='host') | list }}" - - name: Update /etc/hosts with controller hostname and IP - ansible.builtin.lineinfile: - path: /etc/hosts - regexp: '^{{ host_entry.value }}\s+{{ host_entry.key }}' - line: "{{ host_entry.value }} {{ host_entry.key }}" - state: present - loop: "{{ reachable_hosts | product(ip_name_map | dict2items) | list }}" + - name: Update basics on reachable_hosts + ansible.builtin.include_tasks: update_hosts_munge.yml + loop: "{{ reachable_hosts }}" loop_control: - loop_var: host_combo - vars: - target_host: "{{ host_combo[0] }}" - host_entry: "{{ host_combo[1] }}" - delegate_to: "{{ target_host }}" - when: reachable_hosts | length > 0 - ignore_unreachable: true - 
failed_when: false + loop_var: slurmhost_ip - name: Trigger the scontrol reconfigure ansible.builtin.command: scontrol reconfigure diff --git a/discovery/roles/slurm_config/tasks/confs.yml b/discovery/roles/slurm_config/tasks/confs.yml index 12236d6ed8..799d4cd757 100644 --- a/discovery/roles/slurm_config/tasks/confs.yml +++ b/discovery/roles/slurm_config/tasks/confs.yml @@ -172,7 +172,7 @@ ansible.builtin.copy: content: "{{ item.ini_lines | join('\n') }}\n" dest: "{{ slurm_config_path }}/{{ ctld_list[0] }}/etc/slurm/{{ item.item.key }}.conf" - mode: "{{ conf_file_mode }}" + mode: "0640" owner: "{{ slurm_user }}" group: "{{ slurm_user_group }}" remote_src: "{{ copy_from_oim }}" diff --git a/discovery/roles/slurm_config/tasks/create_slurm_dir.yml b/discovery/roles/slurm_config/tasks/create_slurm_dir.yml index 81a08adfca..45e37ac243 100644 --- a/discovery/roles/slurm_config/tasks/create_slurm_dir.yml +++ b/discovery/roles/slurm_config/tasks/create_slurm_dir.yml @@ -84,11 +84,21 @@ share_prefix: "{{ slurm_config_path }}" when: conf_in_nfs -- name: Clear the share directory +- name: Clear Slurm-related files and directories ansible.builtin.file: - path: "{{ slurm_config_path }}" + path: "{{ slurm_config_path }}/{{ slurm_item }}" state: absent - when: clear_slurm_files + loop: "{{ (ctld_list | default([]) + + cmpt_list | default([]) + + login_list | default([]) + + compiler_login_list | default([]) + + dbd_list | default([]) + + ['munge.key']) | flatten }}" + loop_control: + loop_var: slurm_item + failed_when: false + when: + - clear_slurm_files - name: Create the slurm directory in share ansible.builtin.file: @@ -151,8 +161,9 @@ ansible.builtin.copy: src: "{{ slurm_config_path }}/munge.key" dest: "{{ slurm_config_path }}/{{ item }}/etc/munge/munge.key" - mode: "{{ common_mode }}" + mode: "0600" remote_src: true + register: munge_key_copy loop: "{{ (ctld_list | default([])) + (cmpt_list | default([])) + (compiler_login_list | default([])) + diff --git 
a/discovery/roles/slurm_config/tasks/read_slurm_hostnames.yml b/discovery/roles/slurm_config/tasks/read_slurm_hostnames.yml index df19821983..0f7b3a16b2 100644 --- a/discovery/roles/slurm_config/tasks/read_slurm_hostnames.yml +++ b/discovery/roles/slurm_config/tasks/read_slurm_hostnames.yml @@ -46,6 +46,7 @@ - name: Get bmc_ip ansible.builtin.set_fact: bmc_ip_map: "{{ node_yaml.nodes | items2dict(key_name='name', value_name='bmc_ip') }}" + name_ip_map: "{{ dict(ip_name_map.values() | zip(ip_name_map.keys())) }}" - name: Assign slurm lists ansible.builtin.set_fact: diff --git a/discovery/roles/slurm_config/tasks/update_hosts_munge.yml b/discovery/roles/slurm_config/tasks/update_hosts_munge.yml new file mode 100644 index 0000000000..ecaaad2beb --- /dev/null +++ b/discovery/roles/slurm_config/tasks/update_hosts_munge.yml @@ -0,0 +1,84 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +- name: Update /etc/hosts with controller hostname and IP + ansible.builtin.lineinfile: + path: /etc/hosts + regexp: '^{{ host_entry.value }}\s+{{ host_entry.key }}' + line: "{{ host_entry.value }} {{ host_entry.key }}" + state: present + loop: "{{ ip_name_map | dict2items | list }}" + loop_control: + loop_var: host_entry + ignore_unreachable: true + failed_when: false + delegate_to: "{{ slurmhost_ip }}" + +- name: Get munge changes + ansible.builtin.set_fact: + munge_key_changed: "{{ munge_key_copy.results | default([]) | rekey_on_member('item') }}" + when: munge_key_copy is defined + +- name: Block when munge key changed + when: + - munge_key_changed is defined + - munge_key_changed[name_ip_map[slurmhost_ip]]['changed'] | default(false) + - restart_slurm_services + delegate_to: "{{ slurmhost_ip }}" + ignore_errors: true + ignore_unreachable: true + block: + - name: Update munge key permissions + ansible.builtin.file: + path: /etc/munge/munge.key + owner: munge + group: munge + mode: '0600' + register: munge_key_permissions_result + + - name: Restart munge service if key changed + ansible.builtin.service: + name: munge + state: restarted + register: munge_restart_result + when: + - munge_key_permissions_result is defined + - munge_key_permissions_result is success + + - name: Restart slurmctld if munge restarted + ansible.builtin.service: + name: slurmctld + state: restarted + when: + - name_ip_map[slurmhost_ip] in ctld_list + - munge_restart_result is defined + - munge_restart_result is success + + - name: Restart slurmd if munge restarted + ansible.builtin.service: + name: slurmd + state: restarted + when: + - name_ip_map[slurmhost_ip] in (cmpt_list + login_list + compiler_login_list) + - munge_restart_result is defined + - munge_restart_result is success + + - name: Restart slurmdbd if munge restarted + ansible.builtin.service: + name: slurmdbd + state: restarted + when: + - name_ip_map[slurmhost_ip] in dbd_list + - munge_restart_result is defined + - 
munge_restart_result is success diff --git a/discovery/roles/slurm_config/vars/main.yml b/discovery/roles/slurm_config/vars/main.yml index 43ee995e5a..93aa0d2786 100644 --- a/discovery/roles/slurm_config/vars/main.yml +++ b/discovery/roles/slurm_config/vars/main.yml @@ -79,7 +79,7 @@ cluster_name: cluster # TODO: direct load vars omnia_config.yml slurm_uid: 6001 slurm_user: slurm slurm_user_group: slurm -restart_slurm_services: "{{ hostvars['localhost']['restart_slurm_services'] }}" +restart_slurm_services: "{{ hostvars['localhost']['restart_slurm_services'] | default(true) }}" slurm_db_username: "{{ hostvars['localhost']['slurm_db_username'] | default('dbuser') }}" slurm_db_password: "{{ hostvars['localhost']['slurm_db_password'] }}" slurm_db_host: "{{ hostvars['localhost']['slurm_db_host'] | default(false) }}" From 19a000cb663e94ed23a2e15c866c67b2bf4b7d26 Mon Sep 17 00:00:00 2001 From: Jagadeesh N V Date: Thu, 12 Feb 2026 19:44:38 +0530 Subject: [PATCH 13/77] lint issue fix --- discovery/roles/slurm_config/tasks/update_hosts_munge.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/discovery/roles/slurm_config/tasks/update_hosts_munge.yml b/discovery/roles/slurm_config/tasks/update_hosts_munge.yml index ecaaad2beb..a326fa820d 100644 --- a/discovery/roles/slurm_config/tasks/update_hosts_munge.yml +++ b/discovery/roles/slurm_config/tasks/update_hosts_munge.yml @@ -36,7 +36,6 @@ - munge_key_changed[name_ip_map[slurmhost_ip]]['changed'] | default(false) - restart_slurm_services delegate_to: "{{ slurmhost_ip }}" - ignore_errors: true ignore_unreachable: true block: - name: Update munge key permissions @@ -82,3 +81,7 @@ - name_ip_map[slurmhost_ip] in dbd_list - munge_restart_result is defined - munge_restart_result is success + rescue: + - name: Handle munge restart failure + ansible.builtin.debug: + msg: "Failed task {{ ansible_failed_task.name }} on {{ slurmhost_ip }}" From 471d4e781435703aa2dba6d55e41139ca9a8ede7 Mon Sep 17 00:00:00 2001 From: 
Katakam Rakesh Naga Sai <125246792+Katakam-Rakesh@users.noreply.github.com> Date: Thu, 12 Feb 2026 20:12:46 +0530 Subject: [PATCH 14/77] Update main.yml for copyright Signed-off-by: Katakam Rakesh Naga Sai <125246792+Katakam-Rakesh@users.noreply.github.com> --- discovery/roles/k8s_config/vars/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/discovery/roles/k8s_config/vars/main.yml b/discovery/roles/k8s_config/vars/main.yml index a80fb9b257..601cc07097 100644 --- a/discovery/roles/k8s_config/vars/main.yml +++ b/discovery/roles/k8s_config/vars/main.yml @@ -1,4 +1,4 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From 94a244fe9534c5feb3d950116c19e8f9b701aee9 Mon Sep 17 00:00:00 2001 From: mithileshreddy04 Date: Thu, 12 Feb 2026 21:55:11 +0530 Subject: [PATCH 15/77] centralize oim_metadata.yml path and remove static backup_location variable --- .../import_input_parameters/tasks/set_backup_location.yml | 2 +- upgrade/roles/import_input_parameters/vars/main.yml | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/upgrade/roles/import_input_parameters/tasks/set_backup_location.yml b/upgrade/roles/import_input_parameters/tasks/set_backup_location.yml index 4f6a96e83f..94156606e5 100644 --- a/upgrade/roles/import_input_parameters/tasks/set_backup_location.yml +++ b/upgrade/roles/import_input_parameters/tasks/set_backup_location.yml @@ -15,7 +15,7 @@ - name: Read oim_metadata.yml to get upgrade_backup_dir ansible.builtin.slurp: - src: /opt/omnia/.data/oim_metadata.yml + src: "{{ oim_metadata_path }}" register: oim_metadata_slurp - name: Parse oim_metadata.yml diff --git a/upgrade/roles/import_input_parameters/vars/main.yml b/upgrade/roles/import_input_parameters/vars/main.yml index 2bd20f0076..ebaa33e492 
100644 --- a/upgrade/roles/import_input_parameters/vars/main.yml +++ b/upgrade/roles/import_input_parameters/vars/main.yml @@ -15,7 +15,10 @@ # backup_location will be set from oim_metadata.yml upgrade_backup_dir # Format: /opt/omnia/backups/upgrade/version_2.0.0.0/input/project_default -backup_location: "" +# Set dynamically from metadata, no static variable needed + +# Path to oim_metadata.yml +oim_metadata_path: "/opt/omnia/.data/oim_metadata.yml" backup_dir_mode: '0755' default_file_mode: '0644' From b64916bd08990d83d4f5cf0cd6895604c20f7d14 Mon Sep 17 00:00:00 2001 From: SOWJANYAJAGADISH123 Date: Fri, 13 Feb 2026 10:02:03 +0530 Subject: [PATCH 16/77] Update omnia.sh --- omnia.sh | 77 ++++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 56 insertions(+), 21 deletions(-) diff --git a/omnia.sh b/omnia.sh index 9c46a04dc9..81e2094ccc 100755 --- a/omnia.sh +++ b/omnia.sh @@ -766,7 +766,7 @@ Description=${container_name^} Container [Container] ContainerName=${container_name} HostName=${container_name} -Image=${container_name}:1.1 +Image=${container_name}:2.1 Network=host # Capabilities @@ -1001,16 +1001,16 @@ install_omnia_core() { fi fi - local omnia_core_tag="1.1" + local omnia_core_tag="2.1" local omnia_core_registry="" - # Check if local omnia_core:1.1 exists + # Check if local omnia_core:2.1 exists if podman inspect omnia_core:${omnia_core_tag} >/dev/null 2>&1; then echo -e "${GREEN}✓ Omnia core image (omnia_core:${omnia_core_tag}) found locally.${NC}" # Check if latest exists for backward compatibility elif podman inspect omnia_core:latest >/dev/null 2>&1; then echo -e "${GREEN}✓ Omnia core image (omnia_core:latest) found locally.${NC}" - # Tag it as 1.1 for consistency + # Tag it as 2.1 for consistency podman tag omnia_core:latest omnia_core:${omnia_core_tag} else echo -e "${RED}ERROR: Omnia core image (omnia_core:${omnia_core_tag}) not found locally.${NC}" @@ -1018,11 +1018,11 @@ install_omnia_core() { echo "" echo -e "${YELLOW}One way to 
build the image locally:${NC}" echo -e "1. Clone the Omnia Artifactory repository:" - echo -e " git clone https://github.com/dell/omnia-artifactory -b omnia-container" + echo -e " git clone https://github.com/dell/omnia-artifactory -b omnia-container-" echo -e "2. Navigate to the repository directory:" echo -e " cd omnia-artifactory" echo -e "3. Build the core image locally (loads into local Podman by default):" - echo -e " ./build_images.sh core omnia_branch=" + echo -e " ./build_images.sh core core_tag=2.1 omnia_branch=" echo "" echo -e "${YELLOW}Then re-run:${NC}" echo -e " ./omnia.sh --install" @@ -1200,6 +1200,7 @@ phase1_validate() { if ! podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then echo "[ERROR] [ORCHESTRATOR] Prerequisite failed: omnia_core container is not running" + display_cleanup_instructions return 1 fi @@ -1249,9 +1250,19 @@ phase1_validate() { return 1 fi - if ! podman inspect "omnia_core:1.1" >/dev/null 2>&1; then - echo "[ERROR] [ORCHESTRATOR] Target image missing locally: omnia_core:1.1" - echo "[ERROR] [ORCHESTRATOR] Omnia does not pull from Docker Hub. Build/load the image locally and retry." + if ! podman inspect "omnia_core:2.1" >/dev/null 2>&1; then + echo "[ERROR] [ORCHESTRATOR] Target image missing locally: omnia_core:2.1" + echo "" + echo -e "${YELLOW}Omnia does not pull images from Docker Hub. Build/load the image locally and retry.${NC}" + echo "" + echo -e "${YELLOW}To build the core image locally:${NC}" + echo -e "1. Clone the Omnia Artifactory repository:" + echo -e " git clone https://github.com/dell/omnia-artifactory -b omnia-container-" + echo -e "2. Navigate to the repository directory:" + echo -e " cd omnia-artifactory" + echo -e "3. 
Build the core image locally (loads into local Podman by default):" + echo -e " ./build_images.sh core core_tag=2.1 omnia_branch=" + echo "" return 1 fi @@ -1267,7 +1278,7 @@ phase2_approval() { echo "OMNIA UPGRADE SUMMARY" echo "============================================" echo "Current Container Tag: 1.0" - echo "Target Container Tag: 1.1" + echo "Target Container Tag: 2.1" echo "Current Omnia Release: 2.0.0.0" echo "Target Omnia Release: 2.1.0.0" echo "New Features:" @@ -1386,17 +1397,17 @@ phase4_container_swap() { return 1 fi - echo "[INFO] [ORCHESTRATOR] Starting omnia_core 1.1 Quadlet unit" - if ! podman inspect "omnia_core:1.1" >/dev/null 2>&1; then - echo "[ERROR] [ORCHESTRATOR] Target image missing locally: omnia_core:1.1" - echo "[ERROR] [ORCHESTRATOR] Upgrade failed: 1.1 image not available" + echo "[INFO] [ORCHESTRATOR] Starting omnia_core 2.1 Quadlet unit" + if ! podman inspect "omnia_core:2.1" >/dev/null 2>&1; then + echo "[ERROR] [ORCHESTRATOR] Target image missing locally: omnia_core:2.1" + echo "[ERROR] [ORCHESTRATOR] Upgrade failed: 2.1 image not available" echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore 1.0 container..." rollback_omnia_core return 1 fi - if ! sed -i 's/^Image=omnia_core:.*/Image=omnia_core:1.1/' "$quadlet_file"; then - echo "[ERROR] [ORCHESTRATOR] Phase 4.3 failed: Failed to update Image to 1.1 in quadlet file" + if ! sed -i 's/^Image=omnia_core:.*/Image=omnia_core:2.1/' "$quadlet_file"; then + echo "[ERROR] [ORCHESTRATOR] Phase 4.3 failed: Failed to update Image to 2.1 in quadlet file" echo "[ERROR] [ORCHESTRATOR] Upgrade failed: Could not update container image tag" echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore 1.0 container..." 
rollback_omnia_core @@ -1413,13 +1424,13 @@ phase4_container_swap() { systemctl start omnia_core.service || { echo "[ERROR] [ORCHESTRATOR] Phase 4.3 failed: Failed to start omnia_core.service" - echo "[ERROR] [ORCHESTRATOR] Upgrade failed: Could not start 1.1 container" + echo "[ERROR] [ORCHESTRATOR] Upgrade failed: Could not start 2.1 container" echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore 1.0 container..." rollback_omnia_core return 1 } - echo "[INFO] [ORCHESTRATOR] Waiting for omnia_core 1.1 health check (60s)" + echo "[INFO] [ORCHESTRATOR] Waiting for omnia_core 2.1 health check (60s)" for i in $(seq 1 60); do if podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then break @@ -1429,7 +1440,7 @@ phase4_container_swap() { if ! podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then echo "[ERROR] [ORCHESTRATOR] Phase 4.4 failed: Container failed health check after swap" - echo "[ERROR] [ORCHESTRATOR] Upgrade failed: 1.1 container failed health check" + echo "[ERROR] [ORCHESTRATOR] Upgrade failed: 2.1 container failed health check" echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore 1.0 container..." rollback_omnia_core return 1 @@ -1607,6 +1618,23 @@ restore_from_backup() { return 0 } +# Display cleanup instructions for failed upgrade/rollback +display_cleanup_instructions() { + echo "" + echo -e "${RED}================================================================================${NC}" + echo -e "${RED} ROLLBACK FAILED${NC}" + echo -e "${RED}================================================================================${NC}" + echo "" + echo -e "${YELLOW}Rollback failed. Manual cleanup is required to restore a clean state before retrying.${NC}" + echo "" + echo -e "${YELLOW}Run the following on the OIM host:${NC}" + echo -e "${YELLOW}1. Clean Omnia shared path: rm -rf ${NC}" + echo -e "${YELLOW}2. Stop Omnia core system service: systemctl stop omnia_core${NC}" + echo -e "${YELLOW}3. 
Remove the Omnia core container: podman rm -f omnia_core${NC}" + echo -e "${YELLOW}4. Perform a fresh Omnia core install: ./omnia.sh --install${NC}" + echo "" +} + # Main rollback function rollback_omnia_core() { echo -e "${GREEN}================================================================================${NC}" @@ -1695,11 +1723,12 @@ rollback_omnia_core() { echo "" echo "[INFO] [ROLLBACK] Starting rollback process..." - # Step 1: Stop 1.1 container gracefully + # Step 1: Stop 2.1 container gracefully echo "" - echo "[INFO] [ROLLBACK] Step 1: Stopping Omnia core 1.1 container..." + echo "[INFO] [ROLLBACK] Step 1: Stopping Omnia core 2.1 container..." if ! stop_container_gracefully "omnia_core" 30; then echo -e "${RED}ERROR: Failed to stop container.${NC}" + display_cleanup_instructions exit 1 fi @@ -1715,6 +1744,7 @@ rollback_omnia_core() { podman tag omnia_core:latest omnia_core:1.0 else echo -e "${RED}ERROR: Omnia core 1.0 image not available. Please load the image first.${NC}" + display_cleanup_instructions exit 1 fi fi @@ -1725,6 +1755,7 @@ rollback_omnia_core() { systemctl daemon-reload if ! systemctl start omnia_core.service; then echo -e "${RED}ERROR: Failed to start container service.${NC}" + display_cleanup_instructions exit 1 fi @@ -1747,6 +1778,7 @@ rollback_omnia_core() { if [ $health_count -ge $health_timeout ]; then echo "" echo -e "${RED}ERROR: Container failed to become healthy within 60 seconds.${NC}" + display_cleanup_instructions exit 1 fi @@ -1755,6 +1787,7 @@ rollback_omnia_core() { echo "[INFO] [ROLLBACK] Step 5: Validating backup directory structure..." if ! validate_backup_directory "$selected_backup"; then echo -e "${RED}ERROR: Backup validation failed.${NC}" + display_cleanup_instructions exit 1 fi @@ -1763,6 +1796,7 @@ rollback_omnia_core() { echo "[INFO] [ROLLBACK] Step 6: Restoring files from backup..." if ! 
restore_from_backup "$selected_backup"; then echo -e "${RED}ERROR: Failed to restore from backup.${NC}" + display_cleanup_instructions exit 1 fi @@ -1773,6 +1807,7 @@ rollback_omnia_core() { if [ "$verify_version" != "$backup_version" ]; then echo -e "${RED}ERROR: Version verification failed. Expected: $backup_version, Found: $verify_version${NC}" + display_cleanup_instructions exit 1 fi From a39e26f82cbe954e492e6438a745dce13e042b1f Mon Sep 17 00:00:00 2001 From: Nagachandan-P Date: Fri, 13 Feb 2026 06:38:40 +0000 Subject: [PATCH 17/77] updating /etc/hosts entries --- .../discovery_validations/tasks/update_hosts.yml | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/discovery/roles/discovery_validations/tasks/update_hosts.yml b/discovery/roles/discovery_validations/tasks/update_hosts.yml index 43e7d3fc63..85c9ecf611 100644 --- a/discovery/roles/discovery_validations/tasks/update_hosts.yml +++ b/discovery/roles/discovery_validations/tasks/update_hosts.yml @@ -13,16 +13,22 @@ # limitations under the License. 
--- -- name: Add hosts file entry for cluster +- name: Ensure 127.0.0.1 localhost entry exists ansible.builtin.shell: | set -o pipefail - grep -qxF '{{ item.value.ADMIN_IP }} {{ item.value.HOSTNAME }}' {{ hosts_file_path }} || \ - echo '{{ item.value.ADMIN_IP }} {{ item.value.HOSTNAME }}' >> {{ hosts_file_path }} + grep -qxF '127.0.0.1 localhost.localdomain localhost' {{ hosts_file_path }} || echo '127.0.0.1 localhost.localdomain localhost' >> {{ hosts_file_path }} + changed_when: true + +- name: Remove stale entries for IPs that are being updated + ansible.builtin.shell: | + set -o pipefail + grep -v '^{{ item.value.ADMIN_IP }}\s' {{ hosts_file_path }} > {{ hosts_file_path }}.tmp && cat {{ hosts_file_path }}.tmp > {{ hosts_file_path }} && rm -f {{ hosts_file_path }}.tmp changed_when: true loop: "{{ read_mapping_file.dict | dict2items }}" -- name: Ensure 127.0.0.1 localhost entry exists uniquely using echo +- name: Add hosts file entry for cluster ansible.builtin.shell: | set -o pipefail - grep -qxF '127.0.0.1 localhost.localdomain localhost' {{ hosts_file_path }} || echo '127.0.0.1 localhost.localdomain localhost' >> {{ hosts_file_path }} + echo '{{ item.value.ADMIN_IP }} {{ item.value.HOSTNAME }}' >> {{ hosts_file_path }} changed_when: true + loop: "{{ read_mapping_file.dict | dict2items }}" From 00fd2e2942b97d2610cb720ba4b647bde3d876c6 Mon Sep 17 00:00:00 2001 From: Katakam Rakesh Naga Sai <125246792+Katakam-Rakesh@users.noreply.github.com> Date: Fri, 13 Feb 2026 12:43:26 +0530 Subject: [PATCH 18/77] Update service_k8s.json Signed-off-by: Katakam Rakesh Naga Sai <125246792+Katakam-Rakesh@users.noreply.github.com> --- input/config/x86_64/rhel/10.0/service_k8s.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/input/config/x86_64/rhel/10.0/service_k8s.json b/input/config/x86_64/rhel/10.0/service_k8s.json index afc073a19f..0ef4408a7f 100644 --- a/input/config/x86_64/rhel/10.0/service_k8s.json +++ b/input/config/x86_64/rhel/10.0/service_k8s.json 
@@ -33,7 +33,7 @@ { "package": "strimzi-kafka-operator-helm-3-chart-0.48.0", "type": "tarball", "url": "https://github.com/strimzi/strimzi-kafka-operator/releases/download/0.48.0/strimzi-kafka-operator-helm-3-chart-0.48.0.tgz" }, { "package": "quay.io/strimzi/kafka-bridge", "tag": "0.33.1", "type": "image" }, { "package": "apptainer", "type": "rpm", "repo_name": "epel" }, - {"package": "doca-ofed", "type": "iso", "url": "https://www.mellanox.com/downloads/DOCA/DOCA_v3.2.1/host/doca-host-3.2.1-044000_25.10_rhel10.x86_64.rpm"} + { "package": "doca-ofed", "type": "rpm_repo", "repo_name": "doca" } ] }, "service_kube_control_plane": { From 7b98e5ecd47d1d46b51aba587d4ee6eb99feeb7e Mon Sep 17 00:00:00 2001 From: Nagachandan-P Date: Fri, 13 Feb 2026 07:19:48 +0000 Subject: [PATCH 19/77] lint issue fixed --- discovery/roles/discovery_validations/tasks/update_hosts.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/discovery/roles/discovery_validations/tasks/update_hosts.yml b/discovery/roles/discovery_validations/tasks/update_hosts.yml index 85c9ecf611..f040dd997f 100644 --- a/discovery/roles/discovery_validations/tasks/update_hosts.yml +++ b/discovery/roles/discovery_validations/tasks/update_hosts.yml @@ -22,7 +22,9 @@ - name: Remove stale entries for IPs that are being updated ansible.builtin.shell: | set -o pipefail - grep -v '^{{ item.value.ADMIN_IP }}\s' {{ hosts_file_path }} > {{ hosts_file_path }}.tmp && cat {{ hosts_file_path }}.tmp > {{ hosts_file_path }} && rm -f {{ hosts_file_path }}.tmp + grep -v '^{{ item.value.ADMIN_IP }}\s' {{ hosts_file_path }} > {{ hosts_file_path }}.tmp + cat {{ hosts_file_path }}.tmp > {{ hosts_file_path }} + rm -f {{ hosts_file_path }}.tmp changed_when: true loop: "{{ read_mapping_file.dict | dict2items }}" From 6ff5423831736dc86ea5227bd1702b553ccf81af Mon Sep 17 00:00:00 2001 From: Vrinda_Marwah Date: Fri, 13 Feb 2026 07:26:03 +0000 Subject: [PATCH 20/77] Add user registry to crio.conf Signed-off-by: Vrinda_Marwah 
--- .../tasks/fetch_additional_images.yml | 9 +++++++++ ...ervice_kube_control_plane_first_x86_64.yaml.j2 | 15 ++++++++++++--- ...roup-service_kube_control_plane_x86_64.yaml.j2 | 15 ++++++++++++--- .../ci-group-service_kube_node_x86_64.yaml.j2 | 14 +++++++++++--- discovery/roles/configure_ochami/vars/main.yml | 1 + 5 files changed, 45 insertions(+), 9 deletions(-) diff --git a/discovery/roles/configure_ochami/tasks/fetch_additional_images.yml b/discovery/roles/configure_ochami/tasks/fetch_additional_images.yml index 2fecb895e8..ca13f0c414 100644 --- a/discovery/roles/configure_ochami/tasks/fetch_additional_images.yml +++ b/discovery/roles/configure_ochami/tasks/fetch_additional_images.yml @@ -42,3 +42,12 @@ ansible.builtin.debug: var: additional_images_dict verbosity: 2 + +- name: Read local_repo_config.yml + ansible.builtin.include_vars: + file: "{{ local_repo_config_path }}" + name: local_repo_config + +- name: Set fact for user_registry + ansible.builtin.set_fact: + user_registry: "{{ local_repo_config.user_registry | default([]) }}" \ No newline at end of file diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 index b8b71bf099..b98df53d7d 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 @@ -169,6 +169,16 @@ location = "gcr.io" [[registry.mirror]] location = "{{ pulp_mirror }}" +{% if user_registry | default([]) | length > 0 %} +{% for registry in user_registry %} + + [[registry]] + prefix = "{{ registry.host }}" + location = "{{ registry.host }}" + [[registry.mirror]] + location = "{{ pulp_mirror }}" +{% endfor %} +{% endif %} - path: /tmp/kube-vip.yaml owner: root:root @@ -415,13 +425,12 
@@ - update-ca-trust extract - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh + - mkdir -p /etc/containers/registries.conf.d + - mv /tmp/crio.conf /etc/containers/registries.conf.d/crio.conf - systemctl start crio.service - systemctl enable crio.service - sudo systemctl enable --now kubelet - - mv /tmp/crio.conf /etc/containers/registries.conf.d/crio.conf - mv /tmp/generate-control-plane-join.sh {{ k8s_client_mount_path }} - - systemctl daemon-reload - - systemctl restart crio - kubeadm config images pull --kubernetes-version={{ service_k8s_version }} {% set role_name = 'service_kube_control_plane_first' %} {% include 'pull_additional_images.yaml.j2' %} diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_x86_64.yaml.j2 index f3ba7a7330..922f63f852 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_x86_64.yaml.j2 @@ -147,6 +147,16 @@ location = "gcr.io" [[registry.mirror]] location = "{{ pulp_mirror }}" +{% if user_registry | default([]) | length > 0 %} +{% for registry in user_registry %} + + [[registry]] + prefix = "{{ registry.host }}" + location = "{{ registry.host }}" + [[registry.mirror]] + location = "{{ pulp_mirror }}" +{% endfor %} +{% endif %} - path: /tmp/kube-vip.yaml owner: root:root permissions: '0644' @@ -323,12 +333,11 @@ - update-ca-trust extract - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh + - mkdir -p /etc/containers/registries.conf.d + - mv /tmp/crio.conf /etc/containers/registries.conf.d/crio.conf - systemctl start crio.service - systemctl enable 
crio.service - sudo systemctl enable --now kubelet - - mv /tmp/crio.conf /etc/containers/registries.conf.d/crio.conf - - systemctl daemon-reload - - systemctl restart crio - kubeadm config images pull --kubernetes-version={{ service_k8s_version }} {% set role_name = 'service_kube_control_plane' %} {% include 'pull_additional_images.yaml.j2' %} diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_node_x86_64.yaml.j2 index b380030ddd..df98035baa 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_node_x86_64.yaml.j2 @@ -146,7 +146,16 @@ location = "gcr.io" [[registry.mirror]] location = "{{ pulp_mirror }}" +{% if user_registry | default([]) | length > 0 %} +{% for registry in user_registry %} + [[registry]] + prefix = "{{ registry.host }}" + location = "{{ registry.host }}" + [[registry.mirror]] + location = "{{ pulp_mirror }}" +{% endfor %} +{% endif %} runcmd: - /usr/local/bin/set-ssh.sh - "systemctl enable chronyd" @@ -226,12 +235,11 @@ - update-ca-trust extract - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh + - mkdir -p /etc/containers/registries.conf.d + - mv /tmp/crio.conf /etc/containers/registries.conf.d/crio.conf - systemctl start crio.service - systemctl enable crio.service - sudo systemctl enable --now kubelet - - mv /tmp/crio.conf /etc/containers/registries.conf.d/crio.conf - - systemctl daemon-reload - - systemctl restart crio - kubeadm config images pull --kubernetes-version={{ service_k8s_version }} {% set role_name = 'service_kube_node' %} {% include 'pull_additional_images.yaml.j2' %} diff --git a/discovery/roles/configure_ochami/vars/main.yml b/discovery/roles/configure_ochami/vars/main.yml 
index 7f75daa01d..053ee15c0d 100644 --- a/discovery/roles/configure_ochami/vars/main.yml +++ b/discovery/roles/configure_ochami/vars/main.yml @@ -108,3 +108,4 @@ cuda_runfile_aarch64: "{{ hostvars['oim']['cuda_runfile_aarch64'] | default('cud # Usage: fetch_additional_images.yml input_project_dir: "{{ hostvars['localhost']['input_project_dir'] }}" software_config_file_path: "{{ input_project_dir }}/software_config.json" +local_repo_config_path: "{{ input_project_dir }}/local_repo_config.yml" From a70b838c3a7e4707d0f0235b0c350e13d598c36f Mon Sep 17 00:00:00 2001 From: Nagachandan-P Date: Fri, 13 Feb 2026 08:14:30 +0000 Subject: [PATCH 21/77] duplicated hostnames --- discovery/roles/discovery_validations/tasks/update_hosts.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/discovery/roles/discovery_validations/tasks/update_hosts.yml b/discovery/roles/discovery_validations/tasks/update_hosts.yml index f040dd997f..bd046032bc 100644 --- a/discovery/roles/discovery_validations/tasks/update_hosts.yml +++ b/discovery/roles/discovery_validations/tasks/update_hosts.yml @@ -19,10 +19,11 @@ grep -qxF '127.0.0.1 localhost.localdomain localhost' {{ hosts_file_path }} || echo '127.0.0.1 localhost.localdomain localhost' >> {{ hosts_file_path }} changed_when: true -- name: Remove stale entries for IPs that are being updated +- name: Remove stale entries for IPs and hostnames that are being updated ansible.builtin.shell: | set -o pipefail - grep -v '^{{ item.value.ADMIN_IP }}\s' {{ hosts_file_path }} > {{ hosts_file_path }}.tmp + grep -v '^{{ item.value.ADMIN_IP }}\s' {{ hosts_file_path }} | \ + grep -v '\s{{ item.value.HOSTNAME }}$' > {{ hosts_file_path }}.tmp cat {{ hosts_file_path }}.tmp > {{ hosts_file_path }} rm -f {{ hosts_file_path }}.tmp changed_when: true From aba17ded12da3c66de984e0cabb6dce24f7ca1a4 Mon Sep 17 00:00:00 2001 From: SOWJANYAJAGADISH123 Date: Fri, 13 Feb 2026 14:05:55 +0530 Subject: [PATCH 22/77] Update omnia.sh --- omnia.sh | 6 +++--- 
1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/omnia.sh b/omnia.sh index 81e2094ccc..b7a086545d 100755 --- a/omnia.sh +++ b/omnia.sh @@ -164,7 +164,7 @@ setup_omnia_core() { # It removes the container and performs the necessary cleanup steps. cleanup_omnia_core() { # Block if critical service containers exist - critical_running=$(podman ps --format '{{.Names}}' | grep -E 'pulp|registry|minio-server|postgres|step-ca|hydra|smd|opaal-idp|bss|opaal|cloud-init-server|haproxy|coresmd') + critical_running=$(podman ps --format '{{.Names}}' | grep -E '^pulp$|^omnia_auth$|^minio-server$|^registry$|^step-ca$|^postgres$|^hydra$|^opaal-idp$|^smd$|^opaal$|^bss$|^cloud-init-server$|^haproxy$|^coresmd$|^omnia_build_stream$|^omnia_postgres$') if [ -n "$critical_running" ]; then echo -e "${RED}Failed to intiatiate omnia_core container cleanup. There are other critical service containers still running:${NC}" echo "$critical_running" @@ -272,7 +272,7 @@ cleanup_config(){ # Otherwise, it prints an error message. remove_container() { # Block if critical service containers exist - critical_running=$(podman ps --format '{{.Names}}' | grep -E 'pulp|registry|minio-server|postgres|step-ca|hydra|smd|opaal-idp|bss|opaal|cloud-init-server|haproxy|coresmd') + critical_running=$(podman ps --format '{{.Names}}' | grep -E '^pulp$|^omnia_auth$|^minio-server$|^registry$|^step-ca$|^postgres$|^hydra$|^opaal-idp$|^smd$|^opaal$|^bss$|^cloud-init-server$|^haproxy$|^coresmd$|^omnia_build_stream$|^omnia_postgres$') if [ -n "$critical_running" ]; then echo -e "${RED}Failed to intiatiate omnia_core container cleanup. 
There are other critical service containers still running:${NC}" echo "$critical_running" @@ -1083,7 +1083,7 @@ install_omnia_core() { # If the user wants to reinstall, call the remove_container function, and then call the setup_omnia_core function if [ "$choice" = "2" ]; then # Block if critical service containers exist - critical_running=$(podman ps --format '{{.Names}}' | grep -E 'pulp|registry|minio-server|postgres|step-ca|hydra|smd|opaal-idp|bss|opaal|cloud-init-server|haproxy|coresmd') + critical_running=$(podman ps --format '{{.Names}}' | grep -E '^pulp$|^omnia_auth$|^minio-server$|^registry$|^step-ca$|^postgres$|^hydra$|^opaal-idp$|^smd$|^opaal$|^bss$|^cloud-init-server$|^haproxy$|^coresmd$|^omnia_build_stream$|^omnia_postgres$') if [ -n "$critical_running" ]; then echo -e "${RED}Failed to intiatiate omnia_core container cleanup. There are other critical service containers still running:${NC}" echo "$critical_running" From 7c79b599c8fd89b75cdaf2eb082d9b95449cf84a Mon Sep 17 00:00:00 2001 From: Vrinda_Marwah Date: Fri, 13 Feb 2026 08:47:06 +0000 Subject: [PATCH 23/77] resolve input validation + lint Signed-off-by: Vrinda_Marwah --- .../validation_flows/common_validation.py | 13 +++++++++++++ .../tasks/fetch_additional_images.yml | 4 ++-- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/common/library/module_utils/input_validation/validation_flows/common_validation.py b/common/library/module_utils/input_validation/validation_flows/common_validation.py index 198c527440..f577a4e9b8 100644 --- a/common/library/module_utils/input_validation/validation_flows/common_validation.py +++ b/common/library/module_utils/input_validation/validation_flows/common_validation.py @@ -233,6 +233,19 @@ def validate_software_config( ) ) + # Check for required subgroups when specific software names are present + software_requiring_subgroups = ["additional_packages", "slurm_custom", "service_k8s"] + for software_name in software_requiring_subgroups: + if software_name 
in software_names: + if software_name not in data or not data[software_name]: + errors.append( + create_error_msg( + "Validation Error: ", + software_name, + f"is present in softwares but corresponding subgroup '{software_name}' is missing or empty in software_config.json. Please refer examples directory for the correct format." + ) + ) + for software_pkg in data['softwares']: software = software_pkg['name'] arch_list = software_pkg.get('arch') diff --git a/discovery/roles/configure_ochami/tasks/fetch_additional_images.yml b/discovery/roles/configure_ochami/tasks/fetch_additional_images.yml index ca13f0c414..d4e8425749 100644 --- a/discovery/roles/configure_ochami/tasks/fetch_additional_images.yml +++ b/discovery/roles/configure_ochami/tasks/fetch_additional_images.yml @@ -43,11 +43,11 @@ var: additional_images_dict verbosity: 2 -- name: Read local_repo_config.yml +- name: Read local_repo_config.yml ansible.builtin.include_vars: file: "{{ local_repo_config_path }}" name: local_repo_config - name: Set fact for user_registry ansible.builtin.set_fact: - user_registry: "{{ local_repo_config.user_registry | default([]) }}" \ No newline at end of file + user_registry: "{{ local_repo_config.user_registry | default([]) }}" From 40f1595cd15c9f59b4c653c679a0acfaa1eb6c57 Mon Sep 17 00:00:00 2001 From: Jagadeesh N V Date: Fri, 13 Feb 2026 16:09:23 +0530 Subject: [PATCH 24/77] Removed slurmd dependency issue where ssh key changes on slurmctld, live --- ...-group-login_compiler_node_aarch64.yaml.j2 | 8 +++-- ...i-group-login_compiler_node_x86_64.yaml.j2 | 8 +++-- .../ci-group-login_node_aarch64.yaml.j2 | 7 +++- .../ci-group-login_node_x86_64.yaml.j2 | 7 +++- .../ci-group-slurm_node_aarch64.yaml.j2 | 8 +++-- .../ci-group-slurm_node_x86_64.yaml.j2 | 7 ++-- .../slurm_config/tasks/check_ctld_running.yml | 32 +++++++++++++------ discovery/roles/slurm_config/tasks/confs.yml | 2 ++ .../slurm_config/tasks/create_slurm_dir.yml | 12 +------ .../slurm_config/tasks/update_hosts_munge.yml | 
1 + .../slurm_config/templates/slurmd.service.j2 | 22 ------------- 11 files changed, 62 insertions(+), 52 deletions(-) delete mode 100644 discovery/roles/slurm_config/templates/slurmd.service.j2 diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 index dc2ddf9dcd..8918f03050 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 @@ -209,6 +209,12 @@ {{ ip_name_map[key] }} {{ key }} {% endfor %} + - path: /etc/sysconfig/slurmd + owner: root:root + permissions: '0644' + content: | + SLURMD_OPTIONS="{{ conf_server }}" + - path: /usr/local/bin/check_slurm_controller_status.sh owner: root:root permissions: '{{ file_mode_755 }}' @@ -278,12 +284,10 @@ {% if hostvars['localhost']['ldms_support'] %} - echo " Starting LDMS setup " | tee -a /var/log/ldms-cloudinit.log - - /root/ldms_sampler.sh {% endif %} - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh - - yes | cp /etc/slurm/epilog.d/slurmd.service /usr/lib/systemd/system/ - /usr/local/bin/check_slurm_controller_status.sh - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_log_dir_effective }} - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_pid_dir_effective }} diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 index 2c23b868c0..51121a2e82 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 @@ -209,6 +209,12 @@ {{ ip_name_map[key] 
}} {{ key }} {% endfor %} + - path: /etc/sysconfig/slurmd + owner: root:root + permissions: '0644' + content: | + SLURMD_OPTIONS="{{ conf_server }}" + - path: /usr/local/bin/check_slurm_controller_status.sh owner: root:root permissions: '{{ file_mode_755 }}' @@ -282,12 +288,10 @@ {% if hostvars['localhost']['ldms_support'] %} - echo " Starting LDMS setup " | tee -a /var/log/ldms-cloudinit.log - - /root/ldms_sampler.sh {% endif %} - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh - - yes | cp /etc/slurm/epilog.d/slurmd.service /usr/lib/systemd/system/ - /usr/local/bin/check_slurm_controller_status.sh - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_log_dir_effective }} - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_pid_dir_effective }} diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 index 8b3d771592..4aacc2222d 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 @@ -102,6 +102,12 @@ {{ ip_name_map[key] }} {{ key }} {% endfor %} + - path: /etc/sysconfig/slurmd + owner: root:root + permissions: '0644' + content: | + SLURMD_OPTIONS="{{ conf_server }}" + - path: /usr/local/bin/check_slurm_controller_status.sh owner: root:root permissions: '{{ file_mode_755 }}' @@ -131,7 +137,6 @@ - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh - - yes | cp /etc/slurm/epilog.d/slurmd.service /usr/lib/systemd/system/ - /usr/local/bin/check_slurm_controller_status.sh - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_log_dir_effective }} - chown -R {{ 
slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_pid_dir_effective }} diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 index 4e68ba8d81..524553bd55 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 @@ -108,6 +108,12 @@ {{ ip_name_map[key] }} {{ key }} {% endfor %} + - path: /etc/sysconfig/slurmd + owner: root:root + permissions: '0644' + content: | + SLURMD_OPTIONS="{{ conf_server }}" + - path: /usr/local/bin/check_slurm_controller_status.sh owner: root:root permissions: '{{ file_mode_755 }}' @@ -142,7 +148,6 @@ - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh - - yes | cp /etc/slurm/epilog.d/slurmd.service /usr/lib/systemd/system/ - /usr/local/bin/check_slurm_controller_status.sh - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_log_dir_effective }} - chown -R {{ slurm_user }}:{{ slurm_user }} {{ slurm_slurmd_pid_dir_effective }} diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 index 06a04a6068..dacade639b 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 @@ -277,8 +277,6 @@ echo "[INFO] ===== Starting slurmd setup (service file, directories, epilog) (aarch64) =====" - echo "[INFO] Copying slurmd.service into /usr/lib/systemd/system/" - yes | cp /etc/slurm/epilog.d/slurmd.service /usr/lib/systemd/system/ bash 
/usr/local/bin/check_slurm_controller_status.sh echo "[INFO] Setting ownership for Slurm directories" @@ -415,6 +413,12 @@ {{ ip_name_map[key] }} {{ key }} {% endfor %} + - path: /etc/sysconfig/slurmd + owner: root:root + permissions: '0644' + content: | + SLURMD_OPTIONS="{{ conf_server }}" + - path: /usr/local/bin/check_slurm_controller_status.sh owner: root:root permissions: '{{ file_mode_755 }}' diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 index c1b532908e..d21fcf9c5c 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 @@ -244,6 +244,11 @@ {% for key in ip_name_map | sort %} {{ ip_name_map[key] }} {{ key }} {% endfor %} + - path: /etc/sysconfig/slurmd + owner: root:root + permissions: '0644' + content: | + SLURMD_OPTIONS="{{ conf_server }}" - path: /usr/local/bin/configure_dirs_and_mounts.sh permissions: '{{ file_mode_755 }}' @@ -288,8 +293,6 @@ echo "[INFO] ===== Starting slurmd setup (service file, directories, epilog) =====" - echo "[INFO] Copying slurmd.service into /usr/lib/systemd/system/" - yes | cp /etc/slurm/epilog.d/slurmd.service /usr/lib/systemd/system/ bash /usr/local/bin/check_slurm_controller_status.sh echo "[INFO] Setting ownership for Slurm directories" diff --git a/discovery/roles/slurm_config/tasks/check_ctld_running.yml b/discovery/roles/slurm_config/tasks/check_ctld_running.yml index 5f2d41a904..7d908169ab 100644 --- a/discovery/roles/slurm_config/tasks/check_ctld_running.yml +++ b/discovery/roles/slurm_config/tasks/check_ctld_running.yml @@ -14,30 +14,37 @@ --- - name: Check if remote host is reachable via SSH ansible.builtin.wait_for: - host: "{{ item }}" + host: "{{ ctld }}" port: 22 # TODO: make it configurable timeout: 10 state: started delegate_to: 
localhost register: ssh_check ignore_errors: true - ignore_unreachable: true -- name: Block when ssh_check is success - when: ssh_check is success +- name: Enter slurm controller when pingable + when: + - ssh_check is success + ignore_unreachable: true block: - name: Initialize ctld_state dict ansible.builtin.set_fact: - ctld_state: "{{ ctld_state | default({}) | combine({item: false}) }}" + ctld_state: "{{ ctld_state | default({}) | combine({ctld: false}) }}" - name: Check if slurmctld is running on remote host ansible.builtin.service_facts: - delegate_to: "{{ item }}" + delegate_to: "{{ ctld }}" register: service_facts + ignore_unreachable: true + + - name: Fail if slurmctld is unreachable + ansible.builtin.fail: + msg: "Failed to connect to {{ ctld }}." + when: service_facts is unreachable - name: Update ctld_state if slurmctld is running ansible.builtin.set_fact: - ctld_state: "{{ ctld_state | combine({item: true}) }}" + ctld_state: "{{ ctld_state | combine({ctld: true}) }}" when: - service_facts is success - ansible_facts.services['slurmctld.service'] is defined @@ -72,6 +79,13 @@ changed_when: scontrol_reconfig.rc == 0 failed_when: false register: scontrol_reconfig - delegate_to: "{{ item }}" + delegate_to: "{{ ctld }}" when: - - ctld_state[item] is true + - ctld_state[ctld] is true + + rescue: + - name: Fail if slurmctld is not running on any host + ansible.builtin.debug: + msg: "Failed to 'scontrol reconfigure' on {{ ctld }}. + As task '{{ ansible_failed_task.name }}' failed. 
+ results: {{ ansible_failed_result }}" diff --git a/discovery/roles/slurm_config/tasks/confs.yml b/discovery/roles/slurm_config/tasks/confs.yml index 799d4cd757..c5f7953b0d 100644 --- a/discovery/roles/slurm_config/tasks/confs.yml +++ b/discovery/roles/slurm_config/tasks/confs.yml @@ -197,3 +197,5 @@ - ctld_list - ctld_conf_files is changed loop: "{{ ctld_list }}" + loop_control: + loop_var: ctld diff --git a/discovery/roles/slurm_config/tasks/create_slurm_dir.yml b/discovery/roles/slurm_config/tasks/create_slurm_dir.yml index 45e37ac243..e4ac760d77 100644 --- a/discovery/roles/slurm_config/tasks/create_slurm_dir.yml +++ b/discovery/roles/slurm_config/tasks/create_slurm_dir.yml @@ -194,17 +194,7 @@ group: "{{ root_group }}" mode: "{{ common_mode }}" when: cmpt_list - loop: "{{ cmpt_list | product(['logout_user.sh', 'slurmd.service']) }}" - -- name: Create logout_user.sh and slurmd.service in login and login_compiler - ansible.builtin.template: - src: "{{ item.1 }}.j2" - dest: "{{ slurm_config_path }}/{{ item.0 }}/etc/slurm/epilog.d/{{ item.1 }}" - owner: "{{ root_user }}" - group: "{{ root_group }}" - mode: "{{ conf_file_mode }}" - when: login_list or compiler_login_list - loop: "{{ (login_list + compiler_login_list) | product(['slurmd.service']) }}" + loop: "{{ cmpt_list | product(['logout_user.sh']) }}" - name: Get the slurm NFS path ansible.builtin.debug: diff --git a/discovery/roles/slurm_config/tasks/update_hosts_munge.yml b/discovery/roles/slurm_config/tasks/update_hosts_munge.yml index a326fa820d..64c36dbeaf 100644 --- a/discovery/roles/slurm_config/tasks/update_hosts_munge.yml +++ b/discovery/roles/slurm_config/tasks/update_hosts_munge.yml @@ -30,6 +30,7 @@ munge_key_changed: "{{ munge_key_copy.results | default([]) | rekey_on_member('item') }}" when: munge_key_copy is defined +# TODO: Clean unreachable handling - name: Block when munge key changed when: - munge_key_changed is defined diff --git a/discovery/roles/slurm_config/templates/slurmd.service.j2 
b/discovery/roles/slurm_config/templates/slurmd.service.j2 deleted file mode 100644 index 294d1fda75..0000000000 --- a/discovery/roles/slurm_config/templates/slurmd.service.j2 +++ /dev/null @@ -1,22 +0,0 @@ -[Unit] -Description=Slurm node daemon -After=munge.service network-online.target remote-fs.target sssd.service -Wants=network-online.target - -[Service] -Type=notify -EnvironmentFile=-/etc/sysconfig/slurmd -EnvironmentFile=-/etc/default/slurmd -RuntimeDirectory=slurm -RuntimeDirectoryMode=0755 -ExecStart=/usr/sbin/slurmd --systemd $SLURMD_OPTIONS {{ conf_server }} -ExecReload=/bin/kill -HUP $MAINPID -KillMode=process -LimitNOFILE=131072 -LimitMEMLOCK=infinity -LimitSTACK=infinity -Delegate=yes -TasksMax=infinity - -[Install] -WantedBy=multi-user.target \ No newline at end of file From 80a512650ed5146ed55b5a716fa855928d80b1cb Mon Sep 17 00:00:00 2001 From: mithileshreddy04 Date: Fri, 13 Feb 2026 17:43:19 +0530 Subject: [PATCH 25/77] Added user guidance messages in rollback_omnia.yml and upgrade_cluster.yml --- upgrade/roles/upgrade_cluster/tasks/main.yml | 49 ++++++++++++++++-- upgrade/rollback_omnia.yml | 53 ++++++++++++++++++++ 2 files changed, 99 insertions(+), 3 deletions(-) create mode 100644 upgrade/rollback_omnia.yml diff --git a/upgrade/roles/upgrade_cluster/tasks/main.yml b/upgrade/roles/upgrade_cluster/tasks/main.yml index 196366870b..92aa87e2a3 100644 --- a/upgrade/roles/upgrade_cluster/tasks/main.yml +++ b/upgrade/roles/upgrade_cluster/tasks/main.yml @@ -13,6 +13,49 @@ # limitations under the License. --- -- name: Include import input parameters - ansible.builtin.include_role: - name: import_input_parameters + +- name: Display cluster reprovision guidance + ansible.builtin.pause: + prompt: "{{ '\x1b[32m' }}=================================================== + CLUSTER REPROVISION REQUIRED + =========================================================== + + Cluster reprovisioning is required after upgrade to enable new features. 
+ + Review and update new 2.1 input fields before reprovisioning: + + 1. local_repo_config.yml + + - Set additional_repos_x86_64 (list of extra repo URLs or file paths for x86_64) + + - Set additional_repos_aarch64 (list of extra repo URLs or file paths for aarch64) + + 2. network_spec.yml (ib_network section) + + - Define InfiniBand fabric settings (subnet manager/BMC, IP ranges, VLAN if applicable) + + - Ensure host IB interfaces map to the IB network entries + + 3. omnia_config.yml (slurm_cluster.config_source) + + - Use the new structure: config_source: { type: , location: } + + - Populate location to point to your Slurm config bundle (local path or remote URL) + + 4. NFS cleanup (if NFS share is used for k8s/slurm) + + - Clean stale mounts and ensure the NFS share is accessible before reprovision + + - Remove any leftover cluster state on the NFS share that could conflict with fresh deployment + + + Run the following playbooks in sequence from the Omnia root directory to reprovision the cluster: + + 1. ansible-playbook local_repo/local_repo.yml + + 2. ansible-playbook build_image_x86_64/build_image_x86_64.yml + + 3. ansible-playbook discovery/discovery.yml + + {{ '\x1b[0m' }}" + seconds: 1 diff --git a/upgrade/rollback_omnia.yml b/upgrade/rollback_omnia.yml new file mode 100644 index 0000000000..fc33ab4a2e --- /dev/null +++ b/upgrade/rollback_omnia.yml @@ -0,0 +1,53 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +--- +- name: Rollback Omnia guidance + hosts: localhost + connection: local + gather_facts: false + vars: + oim_metadata_path: "/opt/omnia/.data/oim_metadata.yml" + tasks: + - name: Read oim_metadata.yml for backup details + ansible.builtin.slurp: + src: "{{ oim_metadata_path }}" + register: oim_metadata_slurp + ignore_errors: true + + - name: Parse oim_metadata.yml + ansible.builtin.set_fact: + oim_metadata: "{{ oim_metadata_slurp.content | b64decode | from_yaml }}" + when: oim_metadata_slurp is defined and oim_metadata_slurp.content is defined + + - name: Derive backup_version from upgrade_backup_dir + ansible.builtin.set_fact: + backup_version: "{{ (oim_metadata.upgrade_backup_dir | regex_search('version_([^/]+)', '\\1')) + | default('previous version', true) }}" + when: oim_metadata is defined and oim_metadata.upgrade_backup_dir is defined + + - name: Display rollback guidance + ansible.builtin.debug: + msg: >- + The rollback function restores the Omnia core to the last backup version + created during upgrade, including configs and container state. + + To return to the previous Omnia version + {{("(version " ~ backup_version[0] ~ ")") if backup_version is defined and backup_version }} + captured in the backup, + run the rollback from the OIM host: + 1) If you are inside the Omnia core container, exit the container shell. 
+ 2) On the OIM host prompt, execute: ./omnia.sh --rollback + + - name: End play + ansible.builtin.meta: end_play From 3f516a3dd38d4923dd318e9600fb110f457700cf Mon Sep 17 00:00:00 2001 From: pullan1 Date: Fri, 13 Feb 2026 20:32:27 +0530 Subject: [PATCH 26/77] Fix for local repo is failing as cuda run package download issue Signed-off-by: pullan1 --- .../local_repo/parse_and_download.py | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/common/library/module_utils/local_repo/parse_and_download.py b/common/library/module_utils/local_repo/parse_and_download.py index 72efd4566b..c8b8278eef 100644 --- a/common/library/module_utils/local_repo/parse_and_download.py +++ b/common/library/module_utils/local_repo/parse_and_download.py @@ -64,6 +64,26 @@ def execute_command(cmd_string, logger, type_json=False): stderr=subprocess.PIPE, shell=True, ) + status["returncode"] = cmd.returncode + status["stdout"] = cmd.stdout.strip() if cmd.stdout else None + status["stderr"] = cmd.stderr.strip() if cmd.stderr else None + + if cmd.returncode != 0: + logger.error(f"Command failed with return code {cmd.returncode}") + logger.error(f"Error: {status['stderr']}") + return False + + if type_json: + if not status["stdout"]: + logger.error("Command succeeded but returned empty output when JSON was expected") + return False + try: + status["stdout"] = json.loads(status["stdout"]) + except json.JSONDecodeError as error: + logger.error(f"Failed to parse JSON output: {error}") + logger.error(f"Raw output was: {status['stdout']}") + return False + logger.info(f"Command succeeded: {cmd_string}") return True except subprocess.CalledProcessError as e: From d138e3a75271e9653c4827899f0bdade8f00cb1e Mon Sep 17 00:00:00 2001 From: mithileshreddy04 Date: Mon, 16 Feb 2026 12:19:33 +0530 Subject: [PATCH 27/77] Modification of Rollback guidance message --- upgrade/roles/upgrade_cluster/tasks/main.yml | 8 +++---- upgrade/rollback_omnia.yml | 25 ++++++++++---------- 2 files changed, 17 
insertions(+), 16 deletions(-) diff --git a/upgrade/roles/upgrade_cluster/tasks/main.yml b/upgrade/roles/upgrade_cluster/tasks/main.yml index 92aa87e2a3..1b70dc9561 100644 --- a/upgrade/roles/upgrade_cluster/tasks/main.yml +++ b/upgrade/roles/upgrade_cluster/tasks/main.yml @@ -22,7 +22,7 @@ Cluster reprovisioning is required after upgrade to enable new features. - Review and update new 2.1 input fields before reprovisioning: + Review and update new 2.1 input fields present at /opt/omnia/input/project_default/ directory before reprovisioning: 1. local_repo_config.yml @@ -42,11 +42,11 @@ - Populate location to point to your Slurm config bundle (local path or remote URL) - 4. NFS cleanup (if NFS share is used for k8s/slurm) + Do NFS cleanup (if NFS share is used for k8s/slurm) - - Clean stale mounts and ensure the NFS share is accessible before reprovision + - Clean stale mounts and ensure the NFS share is accessible before reprovision - - Remove any leftover cluster state on the NFS share that could conflict with fresh deployment + - Remove any leftover cluster state on the NFS share that could conflict with fresh deployment Run the following playbooks in sequence from the Omnia root directory to reprovision the cluster: diff --git a/upgrade/rollback_omnia.yml b/upgrade/rollback_omnia.yml index fc33ab4a2e..c0d5080c22 100644 --- a/upgrade/rollback_omnia.yml +++ b/upgrade/rollback_omnia.yml @@ -36,18 +36,19 @@ | default('previous version', true) }}" when: oim_metadata is defined and oim_metadata.upgrade_backup_dir is defined - - name: Display rollback guidance + - name: Display rollback guidance (green) ansible.builtin.debug: - msg: >- - The rollback function restores the Omnia core to the last backup version - created during upgrade, including configs and container state. 
- - To return to the previous Omnia version - {{("(version " ~ backup_version[0] ~ ")") if backup_version is defined and backup_version }} - captured in the backup, - run the rollback from the OIM host: - 1) If you are inside the Omnia core container, exit the container shell. - 2) On the OIM host prompt, execute: ./omnia.sh --rollback - + msg: + - "=================================" + - " OMNIA ROLLBACK" + - "=================================" + - "" + - "[Rollback Actions]" + - "1. Purpose: restore Omnia core to the last backup version (includes configs and container state)." + - "2. Target version: {{ backup_version | default('previous version from the backup location') }}." + - "3. How to run:" + - " - Exit the Omnia core container shell if you are inside it." + - " - From the OIM host prompt, execute: ./omnia.sh --rollback" + - "4. Note: ensure the backup location is accessible on the OIM host before running rollback." - name: End play ansible.builtin.meta: end_play From 3d5fa5b3f06c7dd41fbf8bf88c976eb25a0e348b Mon Sep 17 00:00:00 2001 From: mithileshreddy04 Date: Mon, 16 Feb 2026 13:10:44 +0530 Subject: [PATCH 28/77] Update main.yml --- upgrade/roles/upgrade_cluster/tasks/main.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/upgrade/roles/upgrade_cluster/tasks/main.yml b/upgrade/roles/upgrade_cluster/tasks/main.yml index 1b70dc9561..6165997a47 100644 --- a/upgrade/roles/upgrade_cluster/tasks/main.yml +++ b/upgrade/roles/upgrade_cluster/tasks/main.yml @@ -54,8 +54,15 @@ 1. ansible-playbook local_repo/local_repo.yml 2. ansible-playbook build_image_x86_64/build_image_x86_64.yml + + If the user is using aarch64 nodes, run the below playbook after build_image_x86_64: + + ansible-playbook build_image_aarch64/build_image_aarch64.yml 3. ansible-playbook discovery/discovery.yml + + Please follow the omnia documentation for steps in more detail. 
+ {{ '\x1b[0m' }}" seconds: 1 From 53a1d1c62f303c4615e4d34dc2fe02a013e7269a Mon Sep 17 00:00:00 2001 From: mithileshreddy04 Date: Mon, 16 Feb 2026 13:14:47 +0530 Subject: [PATCH 29/77] Update main.yml --- upgrade/roles/upgrade_cluster/tasks/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/upgrade/roles/upgrade_cluster/tasks/main.yml b/upgrade/roles/upgrade_cluster/tasks/main.yml index 6165997a47..751be68d73 100644 --- a/upgrade/roles/upgrade_cluster/tasks/main.yml +++ b/upgrade/roles/upgrade_cluster/tasks/main.yml @@ -61,7 +61,7 @@ 3. ansible-playbook discovery/discovery.yml - + Please follow the omnia documentation for steps in more detail. {{ '\x1b[0m' }}" From f370a252b786df319a1a8feeb4a7cec08a0511db Mon Sep 17 00:00:00 2001 From: mithileshreddy04 Date: Mon, 16 Feb 2026 13:19:58 +0530 Subject: [PATCH 30/77] Update main.yml --- upgrade/roles/upgrade_cluster/tasks/main.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/upgrade/roles/upgrade_cluster/tasks/main.yml b/upgrade/roles/upgrade_cluster/tasks/main.yml index 751be68d73..a45be3f885 100644 --- a/upgrade/roles/upgrade_cluster/tasks/main.yml +++ b/upgrade/roles/upgrade_cluster/tasks/main.yml @@ -56,12 +56,12 @@ 2. ansible-playbook build_image_x86_64/build_image_x86_64.yml If the user is using aarch64 nodes, run the below playbook after build_image_x86_64: - + ansible-playbook build_image_aarch64/build_image_aarch64.yml 3. ansible-playbook discovery/discovery.yml - + Please follow the omnia documentation for steps in more detail. 
{{ '\x1b[0m' }}" From de653020056aed145a14421592f2bdf676ed5cb8 Mon Sep 17 00:00:00 2001 From: mithileshreddy04 Date: Mon, 16 Feb 2026 13:23:25 +0530 Subject: [PATCH 31/77] Update main.yml --- upgrade/roles/upgrade_cluster/tasks/main.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/upgrade/roles/upgrade_cluster/tasks/main.yml b/upgrade/roles/upgrade_cluster/tasks/main.yml index a45be3f885..a64df8feff 100644 --- a/upgrade/roles/upgrade_cluster/tasks/main.yml +++ b/upgrade/roles/upgrade_cluster/tasks/main.yml @@ -54,14 +54,12 @@ 1. ansible-playbook local_repo/local_repo.yml 2. ansible-playbook build_image_x86_64/build_image_x86_64.yml - If the user is using aarch64 nodes, run the below playbook after build_image_x86_64: ansible-playbook build_image_aarch64/build_image_aarch64.yml 3. ansible-playbook discovery/discovery.yml - Please follow the omnia documentation for steps in more detail. {{ '\x1b[0m' }}" From 37358c9cc5e25e25cb8e86ce974d85a7e318f615 Mon Sep 17 00:00:00 2001 From: mithileshreddy04 Date: Mon, 16 Feb 2026 14:17:55 +0530 Subject: [PATCH 32/77] Update main.yml --- upgrade/roles/upgrade_cluster/tasks/main.yml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/upgrade/roles/upgrade_cluster/tasks/main.yml b/upgrade/roles/upgrade_cluster/tasks/main.yml index a64df8feff..76c90b21bd 100644 --- a/upgrade/roles/upgrade_cluster/tasks/main.yml +++ b/upgrade/roles/upgrade_cluster/tasks/main.yml @@ -54,11 +54,12 @@ 1. ansible-playbook local_repo/local_repo.yml 2. ansible-playbook build_image_x86_64/build_image_x86_64.yml - If the user is using aarch64 nodes, run the below playbook after build_image_x86_64: - ansible-playbook build_image_aarch64/build_image_aarch64.yml + 3. Only if the user is using aarch64 nodes, run the below playbook after build_image_x86_64: - 3. ansible-playbook discovery/discovery.yml + - ansible-playbook build_image_aarch64/build_image_aarch64.yml + + 4. 
ansible-playbook discovery/discovery.yml Please follow the omnia documentation for steps in more detail. From 1be86a2a250f6d9169fb46f30a1a0bd7ec338267 Mon Sep 17 00:00:00 2001 From: mithileshreddy04 Date: Mon, 16 Feb 2026 14:21:58 +0530 Subject: [PATCH 33/77] Update main.yml --- upgrade/roles/upgrade_cluster/tasks/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/upgrade/roles/upgrade_cluster/tasks/main.yml b/upgrade/roles/upgrade_cluster/tasks/main.yml index 76c90b21bd..90b25611b5 100644 --- a/upgrade/roles/upgrade_cluster/tasks/main.yml +++ b/upgrade/roles/upgrade_cluster/tasks/main.yml @@ -57,7 +57,7 @@ 3. Only if the user is using aarch64 nodes, run the below playbook after build_image_x86_64: - - ansible-playbook build_image_aarch64/build_image_aarch64.yml + ansible-playbook build_image_aarch64/build_image_aarch64.yml 4. ansible-playbook discovery/discovery.yml From f531576a0a3ff35bb969225716f15b73c1329ce7 Mon Sep 17 00:00:00 2001 From: mithileshreddy04 Date: Mon, 16 Feb 2026 14:27:10 +0530 Subject: [PATCH 34/77] Addition of user guidance messages for cluster reprovisioning and rollback after upgrade to 2.1 (#3978) * Added user guidance messages in rollback_omnia.yml and upgrade_cluster.yml * Modification of Rollback guidance message * Update main.yml * Update main.yml * Update main.yml * Update main.yml * Update main.yml * Update main.yml --- upgrade/roles/upgrade_cluster/tasks/main.yml | 55 ++++++++++++++++++-- upgrade/rollback_omnia.yml | 54 +++++++++++++++++++ 2 files changed, 106 insertions(+), 3 deletions(-) create mode 100644 upgrade/rollback_omnia.yml diff --git a/upgrade/roles/upgrade_cluster/tasks/main.yml b/upgrade/roles/upgrade_cluster/tasks/main.yml index 196366870b..90b25611b5 100644 --- a/upgrade/roles/upgrade_cluster/tasks/main.yml +++ b/upgrade/roles/upgrade_cluster/tasks/main.yml @@ -13,6 +13,55 @@ # limitations under the License. 
--- -- name: Include import input parameters - ansible.builtin.include_role: - name: import_input_parameters + +- name: Display cluster reprovision guidance + ansible.builtin.pause: + prompt: "{{ '\x1b[32m' }}=================================================== + CLUSTER REPROVISION REQUIRED + =========================================================== + + Cluster reprovisioning is required after upgrade to enable new features. + + Review and update new 2.1 input fields present at /opt/omnia/input/project_default/ directory before reprovisioning: + + 1. local_repo_config.yml + + - Set additional_repos_x86_64 (list of extra repo URLs or file paths for x86_64) + + - Set additional_repos_aarch64 (list of extra repo URLs or file paths for aarch64) + + 2. network_spec.yml (ib_network section) + + - Define InfiniBand fabric settings (subnet manager/BMC, IP ranges, VLAN if applicable) + + - Ensure host IB interfaces map to the IB network entries + + 3. omnia_config.yml (slurm_cluster.config_source) + + - Use the new structure: config_source: { type: , location: } + + - Populate location to point to your Slurm config bundle (local path or remote URL) + + Do NFS cleanup (if NFS share is used for k8s/slurm) + + - Clean stale mounts and ensure the NFS share is accessible before reprovision + + - Remove any leftover cluster state on the NFS share that could conflict with fresh deployment + + + Run the following playbooks in sequence from the Omnia root directory to reprovision the cluster: + + 1. ansible-playbook local_repo/local_repo.yml + + 2. ansible-playbook build_image_x86_64/build_image_x86_64.yml + + 3. Only if the user is using aarch64 nodes, run the below playbook after build_image_x86_64: + + ansible-playbook build_image_aarch64/build_image_aarch64.yml + + 4. ansible-playbook discovery/discovery.yml + + Please follow the omnia documentation for steps in more detail. 
+ + {{ '\x1b[0m' }}" + seconds: 1 diff --git a/upgrade/rollback_omnia.yml b/upgrade/rollback_omnia.yml new file mode 100644 index 0000000000..c0d5080c22 --- /dev/null +++ b/upgrade/rollback_omnia.yml @@ -0,0 +1,54 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +- name: Rollback Omnia guidance + hosts: localhost + connection: local + gather_facts: false + vars: + oim_metadata_path: "/opt/omnia/.data/oim_metadata.yml" + tasks: + - name: Read oim_metadata.yml for backup details + ansible.builtin.slurp: + src: "{{ oim_metadata_path }}" + register: oim_metadata_slurp + ignore_errors: true + + - name: Parse oim_metadata.yml + ansible.builtin.set_fact: + oim_metadata: "{{ oim_metadata_slurp.content | b64decode | from_yaml }}" + when: oim_metadata_slurp is defined and oim_metadata_slurp.content is defined + + - name: Derive backup_version from upgrade_backup_dir + ansible.builtin.set_fact: + backup_version: "{{ (oim_metadata.upgrade_backup_dir | regex_search('version_([^/]+)', '\\1')) + | default('previous version', true) }}" + when: oim_metadata is defined and oim_metadata.upgrade_backup_dir is defined + + - name: Display rollback guidance (green) + ansible.builtin.debug: + msg: + - "=================================" + - " OMNIA ROLLBACK" + - "=================================" + - "" + - "[Rollback Actions]" + - "1. 
Purpose: restore Omnia core to the last backup version (includes configs and container state)." + - "2. Target version: {{ backup_version | default('previous version from the backup location') }}." + - "3. How to run:" + - " - Exit the Omnia core container shell if you are inside it." + - " - From the OIM host prompt, execute: ./omnia.sh --rollback" + - "4. Note: ensure the backup location is accessible on the OIM host before running rollback." + - name: End play + ansible.builtin.meta: end_play From 8066a19d5542f2acaaf042e8dd5ccb92cdbb9b32 Mon Sep 17 00:00:00 2001 From: Vrinda_Marwah Date: Mon, 16 Feb 2026 12:04:27 +0000 Subject: [PATCH 35/77] fix status return in execute command Signed-off-by: Vrinda_Marwah --- common/library/module_utils/local_repo/parse_and_download.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/library/module_utils/local_repo/parse_and_download.py b/common/library/module_utils/local_repo/parse_and_download.py index c8b8278eef..15bed1efb3 100644 --- a/common/library/module_utils/local_repo/parse_and_download.py +++ b/common/library/module_utils/local_repo/parse_and_download.py @@ -85,7 +85,7 @@ def execute_command(cmd_string, logger, type_json=False): return False logger.info(f"Command succeeded: {cmd_string}") - return True + return status except subprocess.CalledProcessError as e: logger.error(f"Command failed: {cmd_string} - {e}") return False From f0928443075d08a01973bb8b6f3921d9b16c0ea4 Mon Sep 17 00:00:00 2001 From: Nethravathi M G <146437298+nethramg@users.noreply.github.com> Date: Mon, 16 Feb 2026 23:12:44 +0530 Subject: [PATCH 36/77] Initial iDRAC Telemetry Node addition and deletion changes (#3972) * Initial set of changes for iDRAC Telemetry add and remove node * Ansible link and pylint fixes * Ansible lint fixes * Updated Copyrights to 2026 * Addressed the comments --- .../modules/delete_idracips_from_mysqldb.py | 251 ++++++++++++++++++ .../modules/disable_idrac_telemetry.py | 184 +++++++++++++ 
.../initiate_telemetry_service_cluster.yml | 5 +- .../tasks/remove_deleted_nodes.yml | 101 +++++++ .../templates/telemetry_report.j2 | 18 ++ telemetry/roles/idrac_telemetry/vars/main.yml | 24 +- 6 files changed, 581 insertions(+), 2 deletions(-) create mode 100644 common/library/modules/delete_idracips_from_mysqldb.py create mode 100644 common/library/modules/disable_idrac_telemetry.py create mode 100644 telemetry/roles/idrac_telemetry/tasks/remove_deleted_nodes.yml diff --git a/common/library/modules/delete_idracips_from_mysqldb.py b/common/library/modules/delete_idracips_from_mysqldb.py new file mode 100644 index 0000000000..cd81b943e2 --- /dev/null +++ b/common/library/modules/delete_idracips_from_mysqldb.py @@ -0,0 +1,251 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#!/usr/bin/python +"""Module to delete iDRAC IPs from MySQL database. +This module connects to a Kubernetes pod running MySQL and deletes iDRAC IPs +that are not present in bmc_data.csv. 
It handles retries and delays for robustness.""" + +import time +from ansible.module_utils.basic import AnsibleModule +from kubernetes import client, config +from kubernetes.stream import stream +from kubernetes.config.config_exception import ConfigException + + +def load_kube_context(): + """Load Kubernetes configuration for accessing the cluster.""" + try: + config.load_kube_config() + except ConfigException: + config.load_incluster_config() + + +def run_mysql_query_in_pod(namespace, pod, container, mysql_user, mysql_password, query): + """Run a MySQL query in the specified pod. + + Args: + namespace: Kubernetes namespace + pod: Pod name + container: Container name + mysql_user: MySQL username + mysql_password: MySQL password + query: MySQL query to execute + + Returns: + dict: Result containing return code and output + """ + core_v1 = client.CoreV1Api() + mysql_command = [ + "mysql", + "-u", mysql_user, + "-N", "-B", + f"-p{mysql_password}", + "-e", query + ] + + try: + ws = stream( + core_v1.connect_get_namespaced_pod_exec, + name=pod, + namespace=namespace, + container=container, + command=mysql_command, + stderr=True, + stdin=False, + stdout=True, + tty=False, + _preload_content=False + ) + + stdout = "" + stderr = "" + + while ws.is_open(): + ws.update(timeout=1) + if ws.peek_stdout(): + stdout += ws.read_stdout() + if ws.peek_stderr(): + stderr += ws.read_stderr() + ws.close() + + rc = ws.returncode + + if rc != 0: + return { + "rc": rc, + "result": stderr.strip() if stderr else "Unknown error" + } + + query_result = [ + line.strip() for line in stdout.strip().splitlines() + if line.strip() and not line.strip().startswith("mysql:") + ] + + return { + "rc": rc, + "result": query_result + } + + except (ConfigException, OSError) as e: + return { + "rc": 1, + "result": str(e) + } + + +def delete_idrac_from_mysql( + namespace, + pod, + container, + mysqldb_name, + mysql_user, + mysql_password, + ip_to_delete, + retries=3, + delay=3 +): + """Delete a single iDRAC 
IP from MySQL database. + + Args: + namespace: Kubernetes namespace + pod: Pod name + container: Container name + mysqldb_name: MySQL database name + mysql_user: MySQL username + mysql_password: MySQL password + ip_to_delete: IP address to delete + retries: Number of retry attempts + delay: Delay between retries in seconds + + Returns: + dict: Result containing success status and message + """ + query = ( + f"DELETE FROM {mysqldb_name}.services " + f"WHERE ip = '{ip_to_delete}';" + ) + + for attempt in range(retries): + result = run_mysql_query_in_pod( + namespace=namespace, + pod=pod, + container=container, + mysql_user=mysql_user, + mysql_password=mysql_password, + query=query + ) + + if result.get("rc") == 0: + return { + "success": True, + "ip": ip_to_delete, + "msg": f"Successfully deleted iDRAC IP {ip_to_delete} from MySQL." + } + + if attempt < retries - 1: + time.sleep(delay) + + return { + "success": False, + "ip": ip_to_delete, + "msg": f"Failed to delete iDRAC IP {ip_to_delete} after {retries} attempts: {result.get('result')}" + } + + +def main(): + """Main function to execute the module logic.""" + module_args = { + "telemetry_namespace": {"type": "str", "required": True}, + "idrac_podnames": {"type": "list", "required": True}, + "mysqldb_k8s_name": {"type": "str", "required": True}, + "mysqldb_name": {"type": "str", "required": True}, + "mysqldb_user": {"type": "str", "required": True, "no_log": True}, + "mysqldb_password": {"type": "str", "required": True, "no_log": True}, + "ips_to_delete": {"type": "list", "required": True}, + "pod_to_db_idrac_ips": {"type": "dict", "required": True}, + "db_retries": {"type": "int", "default": 3}, + "db_delay": {"type": "int", "default": 3}, + } + + module = AnsibleModule(argument_spec=module_args, supports_check_mode=True) + + telemetry_namespace = module.params["telemetry_namespace"] + idrac_podnames = module.params["idrac_podnames"] + mysqldb_k8s_name = module.params["mysqldb_k8s_name"] + mysqldb_name = 
module.params["mysqldb_name"] + mysqldb_user = module.params["mysqldb_user"] + mysqldb_password = module.params["mysqldb_password"] + ips_to_delete = module.params["ips_to_delete"] + pod_to_db_idrac_ips = module.params["pod_to_db_idrac_ips"] + db_retries = module.params["db_retries"] + db_delay = module.params["db_delay"] + + load_kube_context() + + deleted_ips = [] + failed_ips = [] + changed = False + + try: + for pod in idrac_podnames: + pod_ips = pod_to_db_idrac_ips.get(pod, []) + ips_to_delete_from_pod = list(set(pod_ips) & set(ips_to_delete)) + + if not ips_to_delete_from_pod: + module.warn(f"No IPs to delete from pod {pod}. Skipping.") + continue + + module.warn(f"Deleting IPs from pod {pod}: {ips_to_delete_from_pod}") + + for ip in ips_to_delete_from_pod: + result = delete_idrac_from_mysql( + namespace=telemetry_namespace, + pod=pod, + container=mysqldb_k8s_name, + mysqldb_name=mysqldb_name, + mysql_user=mysqldb_user, + mysql_password=mysqldb_password, + ip_to_delete=ip, + retries=db_retries, + delay=db_delay + ) + + if result.get("success"): + deleted_ips.append(ip) + changed = True + else: + failed_ips.append({ + "pod": pod, + "ip": ip, + "msg": result.get("msg", "Unknown error") + }) + + module.exit_json( + changed=changed, + deleted_ips=deleted_ips, + failed_ips=failed_ips, + msg=f"Deleted {len(deleted_ips)} iDRAC IPs from MySQL database." + ) + + except (OSError, ValueError) as e: + module.fail_json( + msg=f"An error occurred while deleting iDRAC IPs from MySQL: {str(e)}", + deleted_ips=deleted_ips, + failed_ips=failed_ips + ) + + +if __name__ == "__main__": + main() diff --git a/common/library/modules/disable_idrac_telemetry.py b/common/library/modules/disable_idrac_telemetry.py new file mode 100644 index 0000000000..cb7b885e1e --- /dev/null +++ b/common/library/modules/disable_idrac_telemetry.py @@ -0,0 +1,184 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#!/usr/bin/python +"""Module to disable telemetry on iDRAC nodes via Redfish API. +This module connects to iDRAC nodes and disables telemetry collection +by sending PATCH requests to the Redfish API endpoint.""" + +import requests +import urllib3 +from ansible.module_utils.basic import AnsibleModule + +urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) + + +def disable_telemetry_on_idrac(idrac_ip, username, password, timeout=30): + """ + Disable telemetry on a single iDRAC node using Redfish API. 
+ + Args: + idrac_ip: IP address of the iDRAC + username: iDRAC username + password: iDRAC password + timeout: Request timeout in seconds + + Returns: + dict: Result containing success status and message + """ + url = ( + f"https://{idrac_ip}/redfish/v1/Managers/" + f"iDRAC.Embedded.1/Attributes" + ) + + # Try different telemetry property names in order of preference + telemetry_properties = [ + "Telemetry.1.EnableTelemetry", + "TelemetryService.1.EnableTelemetry", + "Telemetry.2.EnableTelemetry", + "Redfish.1.TelemetryServiceEnabled" + ] + + headers = { + "Content-Type": "application/json" + } + + for property_name in telemetry_properties: + payload = { + "Attributes": { + property_name: "Disabled" + } + } + + try: + response = requests.patch( + url, + json=payload, + headers=headers, + auth=(username, password), + verify=False, + timeout=timeout + ) + + if response.status_code in [200, 202, 204]: + return { + "success": True, + "ip": idrac_ip, + "status_code": response.status_code, + "msg": f"Successfully disabled telemetry on iDRAC {idrac_ip} using {property_name}" + } + elif response.status_code == 400: + # Property not supported, try next one + continue + else: + return { + "success": False, + "ip": idrac_ip, + "status_code": response.status_code, + "msg": ( + f"Failed to disable telemetry on iDRAC {idrac_ip}. 
" + f"Status: {response.status_code}, Response: {response.text}" + ) + } + + except requests.exceptions.Timeout: + return { + "success": False, + "ip": idrac_ip, + "msg": f"Timeout while connecting to iDRAC {idrac_ip}" + } + + except requests.exceptions.ConnectionError: + return { + "success": False, + "ip": idrac_ip, + "msg": f"Connection error while connecting to iDRAC {idrac_ip}" + } + + except (requests.exceptions.RequestException, OSError) as e: + return { + "success": False, + "ip": idrac_ip, + "msg": f"Error disabling telemetry on iDRAC {idrac_ip}: {str(e)}" + } + + # All properties failed + return { + "success": False, + "ip": idrac_ip, + "msg": ( + f"Failed to disable telemetry on iDRAC {idrac_ip}. " + f"None of the supported telemetry properties were found: {', '.join(telemetry_properties)}" + ) + } + + +def main(): + """Main function to execute the module logic.""" + module_args = { + "idrac_ips": {"type": "list", "required": True, "elements": "str"}, + "username": {"type": "str", "required": True, "no_log": True}, + "password": {"type": "str", "required": True, "no_log": True}, + "timeout": {"type": "int", "default": 30}, + } + + module = AnsibleModule( + argument_spec=module_args, + supports_check_mode=True + ) + + idrac_ips = module.params["idrac_ips"] + username = module.params["username"] + password = module.params["password"] + timeout = module.params["timeout"] + + disabled_ips = [] + failed_ips = [] + changed = False + + try: + for idrac_ip in idrac_ips: + result = disable_telemetry_on_idrac( + idrac_ip=idrac_ip, + username=username, + password=password, + timeout=timeout + ) + + if result.get("success"): + disabled_ips.append(idrac_ip) + changed = True + else: + failed_ips.append({ + "ip": idrac_ip, + "msg": result.get("msg", "Unknown error") + }) + + module.exit_json( + changed=changed, + disabled_ips=disabled_ips, + failed_ips=failed_ips, + msg=f"Disabled telemetry on {len(disabled_ips)} iDRAC nodes." 
+ ) + + except (requests.exceptions.RequestException, OSError) as e: + module.fail_json( + msg=f"An error occurred while disabling telemetry: {str(e)}", + disabled_ips=disabled_ips, + failed_ips=failed_ips + ) + + +if __name__ == "__main__": + main() diff --git a/telemetry/roles/idrac_telemetry/tasks/initiate_telemetry_service_cluster.yml b/telemetry/roles/idrac_telemetry/tasks/initiate_telemetry_service_cluster.yml index 8615897205..7078a2f056 100644 --- a/telemetry/roles/idrac_telemetry/tasks/initiate_telemetry_service_cluster.yml +++ b/telemetry/roles/idrac_telemetry/tasks/initiate_telemetry_service_cluster.yml @@ -1,4 +1,4 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -87,6 +87,9 @@ ansible.builtin.debug: msg: "Filtered BMC IPs: {{ filtered_bmc_ip_list }}" +- name: Remove deleted nodes from telemetry (nodes not in bmc_data.csv) + ansible.builtin.include_tasks: remove_deleted_nodes.yml + - name: Convert filtered_bmc_ip_list to a dictionary with bmc_ip ansible.builtin.set_fact: filtered_bmc_ip_dict_list: "{{ filtered_bmc_ip_list | map('community.general.dict_kv', 'bmc_ip') | list }}" diff --git a/telemetry/roles/idrac_telemetry/tasks/remove_deleted_nodes.yml b/telemetry/roles/idrac_telemetry/tasks/remove_deleted_nodes.yml new file mode 100644 index 0000000000..4c82abf9e1 --- /dev/null +++ b/telemetry/roles/idrac_telemetry/tasks/remove_deleted_nodes.yml @@ -0,0 +1,101 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +- name: Identify iDRAC IPs to remove (present in DB but not in bmc_data.csv) + ansible.builtin.set_fact: + ips_to_remove: "{{ db_idrac_ip_list | difference(bmc_ips) }}" + +- name: Show iDRAC IPs to be removed + ansible.builtin.debug: + msg: "iDRAC IPs to be removed: {{ ips_to_remove }}" + when: ips_to_remove | length > 0 + +- name: Skip removal if no IPs to remove + ansible.builtin.debug: + msg: "{{ no_idracips_to_remove_msg }}" + when: ips_to_remove | length == 0 + +- name: Disable telemetry on iDRAC nodes before removal + when: ips_to_remove | length > 0 + block: + - name: Disable telemetry service on iDRAC nodes + disable_idrac_telemetry: + idrac_ips: "{{ ips_to_remove }}" + username: "{{ hostvars['localhost']['bmc_username'] }}" + password: "{{ hostvars['localhost']['bmc_password'] }}" + timeout: "{{ redfish_timeout }}" + register: disable_telemetry_result + ignore_errors: true + + - name: Show successfully disabled telemetry IPs + ansible.builtin.debug: + msg: "Successfully disabled telemetry on: {{ disable_telemetry_result.disabled_ips | default([]) }}" + when: + - disable_telemetry_result.disabled_ips is defined + - disable_telemetry_result.disabled_ips | length > 0 + + - name: Show failed to disable telemetry IPs + ansible.builtin.debug: + msg: "Failed to disable telemetry on: {{ disable_telemetry_result.failed_ips | default([]) }}" + when: + - disable_telemetry_result.failed_ips is defined + - disable_telemetry_result.failed_ips | length > 0 + +- name: Remove iDRAC IPs from MySQL database + when: ips_to_remove | length > 0 + block: + - 
name: Delete iDRAC IPs from mysqldb + delete_idracips_from_mysqldb: + telemetry_namespace: "{{ telemetry_namespace }}" + idrac_podnames: "{{ idrac_podname_idracips.idrac_podname_ips.keys() | list }}" + mysqldb_k8s_name: "{{ mysqldb_k8s_name }}" + mysqldb_name: "{{ mysqldb_name }}" + mysqldb_user: "{{ hostvars['localhost']['mysqldb_user'] }}" + mysqldb_password: "{{ hostvars['localhost']['mysqldb_password'] }}" + ips_to_delete: "{{ ips_to_remove }}" + pod_to_db_idrac_ips: "{{ existing_pod_to_db_idrac_ips }}" + db_retries: "{{ db_retries }}" + db_delay: "{{ db_delay }}" + register: delete_idrac_result + rescue: + - name: Failed to delete iDRAC IPs from mysqldb + ansible.builtin.fail: + msg: "{{ mysqldb_delete_fail_msg }}" + +- name: Show deleted iDRAC IPs + ansible.builtin.debug: + msg: "Successfully deleted iDRAC IPs from mysqldb: {{ delete_idrac_result.deleted_ips | default([]) }}" + when: + - ips_to_remove | length > 0 + - delete_idrac_result.deleted_ips is defined + - delete_idrac_result.deleted_ips | length > 0 + +- name: Show failed to delete iDRAC IPs + ansible.builtin.debug: + msg: "Failed to delete iDRAC IPs from mysqldb: {{ delete_idrac_result.failed_ips | default([]) }}" + when: + - ips_to_remove | length > 0 + - delete_idrac_result.failed_ips is defined + - delete_idrac_result.failed_ips | length > 0 + +- name: Update telemetry report variables with deletion info + ansible.builtin.set_fact: + deleted_idrac_count: "{{ delete_idrac_result.deleted_ips | default([]) | length }}" + deleted_idrac_ips: "{{ delete_idrac_result.deleted_ips | default([]) }}" + failed_delete_count: "{{ delete_idrac_result.failed_ips | default([]) | length }}" + failed_delete_ips: "{{ delete_idrac_result.failed_ips | default([]) }}" + disabled_telemetry_count: "{{ disable_telemetry_result.disabled_ips | default([]) | length }}" + disabled_telemetry_ips: "{{ disable_telemetry_result.disabled_ips | default([]) }}" + when: ips_to_remove | length > 0 diff --git 
a/telemetry/roles/idrac_telemetry/templates/telemetry_report.j2 b/telemetry/roles/idrac_telemetry/templates/telemetry_report.j2 index 4d8554cab3..06bf230980 100644 --- a/telemetry/roles/idrac_telemetry/templates/telemetry_report.j2 +++ b/telemetry/roles/idrac_telemetry/templates/telemetry_report.j2 @@ -14,5 +14,23 @@ Telemetry not supported IPs List: - {{ item }} {% endfor %} +{% if deleted_idrac_count is defined and deleted_idrac_count | int > 0 %} +----- Node Deletion Report ----- + +Total IP count removed from telemetry (not in bmc_data.csv): {{ deleted_idrac_count | int }} +Removed IPs List: +{% for item in deleted_idrac_ips %} + - {{ item }} +{% endfor %} + +{% if disabled_telemetry_count is defined and disabled_telemetry_count | int > 0 %} +IPs with telemetry disabled via Redfish: {{ disabled_telemetry_count | int }} +Disabled telemetry IPs List: +{% for item in disabled_telemetry_ips %} + - {{ item }} +{% endfor %} +{% endif %} +{% endif %} + ===== Telemetry Report End ===== diff --git a/telemetry/roles/idrac_telemetry/vars/main.yml b/telemetry/roles/idrac_telemetry/vars/main.yml index d2696f4ac8..7fe6730789 100644 --- a/telemetry/roles/idrac_telemetry/vars/main.yml +++ b/telemetry/roles/idrac_telemetry/vars/main.yml @@ -1,4 +1,4 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -67,6 +67,13 @@ idrac_telemetry_statefulset_restart_failure_msg: | Failed to restart the {{ idrac_telemetry_k8s_name }} StatefulSet. Please check the logs using the command kubectl logs -n {{ telemetry_namespace }} {{ idrac_telemetry_k8s_name }}- and try again. +# Usage: remove_deleted_nodes.yml +redfish_timeout: 30 +mysqldb_delete_fail_msg: | + Failed to delete iDRAC IPs from the mysql database. 
+ This could be due to the tables in the mysqldb not being accessible at the moment. Please try running the playbook again after some time. +no_idracips_to_remove_msg: "No iDRAC IPs to remove. All DB entries are present in bmc_data.csv." + # Usage: create_telemetry_report.yml telemetry_report_path: "/opt/omnia/telemetry/idrac_telemetry_report.yml" telemetry_report_template: "telemetry_report.j2" @@ -75,6 +82,9 @@ telemetry_report: | IP count with Telemetry not supported: {{ failed_idrac_count | int + invalid_idrac_count | int }} IP count with Telemetry activated in current execution: {{ telemetry_idrac_count | int }} + {% if deleted_idrac_count is defined %} + IP count removed from telemetry (not in bmc_data.csv): {{ deleted_idrac_count | int }} + {% endif %} {% if (failed_idrac_count | int + invalid_idrac_count | int) > 0 %} Potential reasons for telemetry not being initiated include Redfish connectivity problems, timeout issues, @@ -105,3 +115,15 @@ telemetry_report: | - {{ item }} {% endfor %} {% endif %} + {% if deleted_idrac_ips is defined and deleted_idrac_ips | length > 0 %} + IPs removed from telemetry database (not present in bmc_data.csv): + {% for item in deleted_idrac_ips %} + - {{ item }} + {% endfor %} + {% endif %} + {% if disabled_telemetry_ips is defined and disabled_telemetry_ips | length > 0 %} + IPs with telemetry disabled via Redfish: + {% for item in disabled_telemetry_ips %} + - {{ item }} + {% endfor %} + {% endif %} From 128cac669d133c7c6eb1f52b37b1d201e1a3810a Mon Sep 17 00:00:00 2001 From: SOWJANYAJAGADISH123 Date: Tue, 17 Feb 2026 08:37:15 +0530 Subject: [PATCH 37/77] support multiple Omnia versions (2.1.0.0, 2.1.0.1) using a single core container tag (2.1) (#3983) --- omnia.sh | 782 +++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 622 insertions(+), 160 deletions(-) diff --git a/omnia.sh b/omnia.sh index b7a086545d..3b320b0bf6 100755 --- a/omnia.sh +++ b/omnia.sh @@ -52,11 +52,226 @@ is_local_ip() { fi } +# 
Version configuration variables +OMNIA_CORE_CONTAINER_TAG="2.1" # Default container tag +OMNIA_VERSION="" # Will be read from metadata +TARGET_OMNIA_VERSION="" # Target version for upgrade +TARGET_CONTAINER_TAG="" # Target container tag for upgrade + +# Centralized version list (in chronological order) +ALL_OMNIA_VERSIONS=("2.0.0.0" "2.1.0.0") + # Container-side paths (used inside podman exec commands) CONTAINER_INPUT_DIR="/opt/omnia/input" CONTAINER_BACKUPS_DIR="/opt/omnia/backups" CONTAINER_METADATA_FILE="/opt/omnia/.data/oim_metadata.yml" +# Function to get available upgrade versions (higher than current) +get_available_upgrade_versions() { + local current_version="$1" + local available_versions=() + local version_descriptions=() + + # Find versions higher than current + local found_current=false + for version in "${ALL_OMNIA_VERSIONS[@]}"; do + if [ "$version" = "$current_version" ]; then + found_current=true + continue + fi + + if [ "$found_current" = true ]; then + available_versions+=("$version") + + # Generate description based on upgrade type + local current_tag=$(get_container_tag_from_version "$current_version") + local target_tag=$(get_container_tag_from_version "$version") + + if [ "$current_tag" = "$target_tag" ]; then + version_descriptions+=("Patch upgrade to $version (container restart only)") + else + version_descriptions+=("Major upgrade to $version (container swap required)") + fi + fi + done + + # Return arrays + printf '%s\n' "${available_versions[@]}" + printf '%s\n' "${version_descriptions[@]}" +} + +# Function to get available rollback versions (lower than current) +get_available_rollback_versions() { + local current_version="$1" + local available_versions=() + + # Find versions lower than current + for version in "${ALL_OMNIA_VERSIONS[@]}"; do + if [ "$version" = "$current_version" ]; then + break + fi + available_versions+=("$version") + done + + # Return array (reverse order for rollback - newest first) + local reversed_versions=() + for 
((i=${#available_versions[@]}-1; i>=0; i--)); do + reversed_versions+=("${available_versions[$i]}") + done + + printf '%s\n' "${reversed_versions[@]}" +} + +# Function to perform same-tag rollback (container restart only) +rollback_same_tag() { + local target_version="$1" + local current_version="$2" + + echo "[INFO] [ROLLBACK] Phase: Same-Tag Rollback" + echo "[INFO] [ROLLBACK] Rolling back to $target_version within same container tag" + + # Verify container is running + if ! podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then + echo "[ERROR] [ROLLBACK] Container is not running for same-tag rollback" + return 1 + fi + + echo "[INFO] [ROLLBACK] Updating metadata to version $target_version" + + # Update version metadata + if ! podman exec -u root omnia_core bash -c " + set -e + if [ ! -f '$CONTAINER_METADATA_FILE' ]; then + echo '[ERROR] Metadata file not found inside container: $CONTAINER_METADATA_FILE' >&2 + exit 1 + fi + if grep -q '^omnia_version:' '$CONTAINER_METADATA_FILE'; then + sed -i 's/^omnia_version:.*/omnia_version: $target_version/' '$CONTAINER_METADATA_FILE' + else + echo 'omnia_version: $target_version' >> '$CONTAINER_METADATA_FILE' + fi + "; then + echo "[ERROR] [ROLLBACK] Failed to update metadata version" + echo "[ERROR] [ROLLBACK] Rollback failed: Could not update version metadata" + return 1 + fi + + echo "[INFO] [ROLLBACK] Restarting container to apply changes..." + + # Restart container to apply changes + if ! 
systemctl restart omnia_core.service; then + echo "[ERROR] [ROLLBACK] Failed to restart container service" + echo "[ERROR] [ROLLBACK] Rollback failed: Container restart failed" + return 1 + fi + + # Wait for container to be healthy after restart + echo "[INFO] [ROLLBACK] Waiting for container health check after restart (30s)" + local health_timeout=30 + local health_count=0 + + while [ $health_count -lt $health_timeout ]; do + if podman ps --format '{{.Names}} {{.Status}}' | grep -E "omnia_core.*Up" | grep -q "healthy\|Up"; then + echo "[INFO] [ROLLBACK] Container is healthy after restart" + break + fi + sleep 1 + health_count=$((health_count + 1)) + echo -n "." + done + + if [ $health_count -ge $health_timeout ]; then + echo "" + echo "[ERROR] [ROLLBACK] Container failed to become healthy within 30 seconds after restart" + echo "[ERROR] [ROLLBACK] Rollback failed: Container health check failed" + return 1 + fi + + # Verify version update + local updated_version=$(get_current_omnia_version) + if [ "$updated_version" != "$target_version" ]; then + echo "[ERROR] [ROLLBACK] Version update verification failed" + echo "[ERROR] [ROLLBACK] Expected: $target_version, Found: $updated_version" + return 1 + fi + + echo "[INFO] [ROLLBACK] Same-tag rollback completed successfully" + echo "[INFO] [ROLLBACK] Version rolled back to: $target_version" + return 0 +} + +# Function to validate container image availability and show build instructions +validate_container_image() { + local target_version="$1" + local target_container_tag="$2" + local operation="${3:-upgrade}" + + echo -e "${BLUE}Validating target container image: omnia_core:$target_container_tag${NC}" + if ! podman inspect "omnia_core:$target_container_tag" >/dev/null 2>&1; then + echo -e "${RED}ERROR: Target image missing locally: omnia_core:$target_container_tag${NC}" + echo -e "${YELLOW}Omnia does not pull images from Docker Hub. Build/load the image locally and retry.${NC}" + echo -e "1. 
Clone the Omnia Artifactory repository:" + echo -e " git clone https://github.com/dell/omnia-artifactory -b omnia-container-$target_version" + echo -e "2. Navigate to the repository directory:" + echo -e " cd omnia-artifactory" + echo -e "3. Build the core image locally (loads into local Podman by default):" + echo -e " ./build_images.sh core core_tag=$target_container_tag omnia_branch=$target_version" + echo -e "Then re-run:" + echo -e " ./omnia.sh --$operation" + return 1 + fi + + echo -e "${GREEN}✓ Target image available locally: omnia_core:$target_container_tag${NC}" + return 0 +} + +# Function to get container tag from omnia version +get_container_tag_from_version() { + local version="$1" + case "$version" in + 2.0.*) + echo "1.0" + ;; + *) + echo "$(echo "$version" | awk -F. '{print $1"."$2}')" + ;; + esac +} + +# Function to read current omnia version from metadata +get_current_omnia_version() { + if podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then + podman exec omnia_core cat /opt/omnia/.data/oim_metadata.yml 2>/dev/null | grep "omnia_version:" | awk '{print $2}' | tr -d '"' + else + echo "" + fi +} + +show_post_upgrade_instructions() { + local upgraded_version="$1" + + echo "" + echo -e "${YELLOW}================================================================================${NC}" + echo -e "${YELLOW} IMPORTANT POST-UPGRADE STEP${NC}" + echo -e "${YELLOW}================================================================================${NC}" + echo "" + echo -e "${GREEN}✓ Omnia core container has been successfully upgraded${NC}" + echo -e "${GREEN}✓ Version updated to: $upgraded_version${NC}" + echo "" + echo -e "${BLUE}NEXT REQUIRED ACTION:${NC}" + echo -e "${YELLOW}You must now run the upgrade playbook inside the omnia_core container:${NC}" + echo "" + echo -e "${GREEN}podman exec -it omnia_core ansible-playbook /omnia/upgrade/upgrade_omnia.yml${NC}" + echo "" + echo -e "${BLUE}This playbook will:${NC}" + echo -e "• Update input files" + echo 
-e "• Update internal configurations" + echo "" + echo -e "${YELLOW}Note: Run this command after the container is fully healthy and stable${NC}" + echo -e "${YELLOW}================================================================================${NC}" + echo "" +} + # Host-side paths (initialized dynamically after omnia_path is set) OMNIA_INPUT_DIR="" OMNIA_METADATA_DIR="" @@ -1004,29 +1219,9 @@ install_omnia_core() { local omnia_core_tag="2.1" local omnia_core_registry="" - # Check if local omnia_core:2.1 exists - if podman inspect omnia_core:${omnia_core_tag} >/dev/null 2>&1; then + # Check if local omnia_core image exists using validate function + if validate_container_image "" "$omnia_core_tag" "install"; then echo -e "${GREEN}✓ Omnia core image (omnia_core:${omnia_core_tag}) found locally.${NC}" - # Check if latest exists for backward compatibility - elif podman inspect omnia_core:latest >/dev/null 2>&1; then - echo -e "${GREEN}✓ Omnia core image (omnia_core:latest) found locally.${NC}" - # Tag it as 2.1 for consistency - podman tag omnia_core:latest omnia_core:${omnia_core_tag} - else - echo -e "${RED}ERROR: Omnia core image (omnia_core:${omnia_core_tag}) not found locally.${NC}" - echo -e "${YELLOW}Omnia no longer pulls images from Docker Hub. Build/load the image locally and retry.${NC}" - echo "" - echo -e "${YELLOW}One way to build the image locally:${NC}" - echo -e "1. Clone the Omnia Artifactory repository:" - echo -e " git clone https://github.com/dell/omnia-artifactory -b omnia-container-" - echo -e "2. Navigate to the repository directory:" - echo -e " cd omnia-artifactory" - echo -e "3. 
Build the core image locally (loads into local Podman by default):" - echo -e " ./build_images.sh core core_tag=2.1 omnia_branch=" - echo "" - echo -e "${YELLOW}Then re-run:${NC}" - echo -e " ./omnia.sh --install" - exit 1 fi # Check if any other containers with 'omnia' in their name are running @@ -1148,9 +1343,6 @@ install_omnia_core() { # If core container is not present else - - # Start the container setup - echo -e "${GREEN}Starting Omnia core container setup.${NC}" setup_omnia_core fi } @@ -1216,16 +1408,6 @@ phase1_validate() { return 1 fi - if [ "$previous_omnia_version" = "2.1.0.0" ]; then - echo "[ERROR] [ORCHESTRATOR] Upgrade already performed. Current Omnia version is 2.1.0.0. No further upgrade required." - return 1 - fi - - if [ "$previous_omnia_version" != "2.0.0.0" ]; then - echo "[ERROR] [ORCHESTRATOR] Previous Omnia version mismatch: expected 2.0.0.0, got: $previous_omnia_version" - return 1 - fi - shared_path=$(echo "$core_config" | grep "^oim_shared_path:" | cut -d':' -f2- | tr -d ' \t\n\r') if [ -z "$shared_path" ]; then echo "[ERROR] [ORCHESTRATOR] oim_shared_path not found in oim_metadata.yml" @@ -1244,28 +1426,6 @@ phase1_validate() { return 1 fi - current_image=$(podman inspect omnia_core --format '{{.ImageName}}' 2>/dev/null) - if [ -z "$current_image" ]; then - echo "[ERROR] [ORCHESTRATOR] Unable to inspect omnia_core container image" - return 1 - fi - - if ! podman inspect "omnia_core:2.1" >/dev/null 2>&1; then - echo "[ERROR] [ORCHESTRATOR] Target image missing locally: omnia_core:2.1" - echo "" - echo -e "${YELLOW}Omnia does not pull images from Docker Hub. Build/load the image locally and retry.${NC}" - echo "" - echo -e "${YELLOW}To build the core image locally:${NC}" - echo -e "1. Clone the Omnia Artifactory repository:" - echo -e " git clone https://github.com/dell/omnia-artifactory -b omnia-container-" - echo -e "2. Navigate to the repository directory:" - echo -e " cd omnia-artifactory" - echo -e "3. 
Build the core image locally (loads into local Podman by default):" - echo -e " ./build_images.sh core core_tag=2.1 omnia_branch=" - echo "" - return 1 - fi - echo "[INFO] [ORCHESTRATOR] Phase 1: Validation passed" return 0 } @@ -1277,13 +1437,18 @@ phase2_approval() { echo "============================================" echo "OMNIA UPGRADE SUMMARY" echo "============================================" - echo "Current Container Tag: 1.0" - echo "Target Container Tag: 2.1" - echo "Current Omnia Release: 2.0.0.0" - echo "Target Omnia Release: 2.1.0.0" - echo "New Features:" - echo " - Add and remove node for slurm cluster" - echo " - Additional Package Installation" + echo "Current Container Tag: $OMNIA_CORE_CONTAINER_TAG" + echo "Target Container Tag: $TARGET_CONTAINER_TAG" + echo "Current Omnia Release: $OMNIA_VERSION" + echo "Target Omnia Release: $TARGET_OMNIA_VERSION" + + # Show upgrade type + if [ "$OMNIA_CORE_CONTAINER_TAG" = "$TARGET_CONTAINER_TAG" ]; then + echo "Upgrade Type: Same-tag upgrade (container restart)" + else + echo "Upgrade Type: Cross-tag upgrade (container swap)" + fi + echo "============================================" current_omnia_version=$(podman exec -u root omnia_core /bin/bash -c "grep '^omnia_version:' '$CONTAINER_METADATA_FILE' | cut -d':' -f2 | tr -d ' \t\n\r'" 2>/dev/null) @@ -1367,6 +1532,85 @@ phase3_backup_creation() { return 0 } +phase4_same_tag_upgrade() { + local target_version="$1" + + echo "[INFO] [ORCHESTRATOR] Phase 4: Same-Tag Upgrade" + echo "[INFO] [ORCHESTRATOR] Upgrading to $target_version within same container tag" + + # Verify container is running + if ! podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then + echo "[ERROR] [ORCHESTRATOR] Container is not running for same-tag upgrade" + return 1 + fi + + echo "[INFO] [ORCHESTRATOR] Updating metadata to version $target_version" + + # Update version metadata + if ! podman exec -u root omnia_core bash -c " + set -e + if [ ! 
-f '$CONTAINER_METADATA_FILE' ]; then + echo '[ERROR] Metadata file not found inside container: $CONTAINER_METADATA_FILE' >&2 + exit 1 + fi + if grep -q '^omnia_version:' '$CONTAINER_METADATA_FILE'; then + sed -i 's/^omnia_version:.*/omnia_version: $target_version/' '$CONTAINER_METADATA_FILE' + else + echo 'omnia_version: $target_version' >> '$CONTAINER_METADATA_FILE' + fi + "; then + echo "[ERROR] [ORCHESTRATOR] Failed to update metadata version" + echo "[ERROR] [ORCHESTRATOR] Upgrade failed: Could not update version metadata" + return 1 + fi + + echo "[INFO] [ORCHESTRATOR] Restarting container to apply changes..." + + # Restart container to apply changes + if ! systemctl restart omnia_core.service; then + echo "[ERROR] [ORCHESTRATOR] Failed to restart container service" + echo "[ERROR] [ORCHESTRATOR] Upgrade failed: Container restart failed" + return 1 + fi + + # Wait for container to be healthy after restart + echo "[INFO] [ORCHESTRATOR] Waiting for container health check after restart (30s)" + local health_timeout=30 + local health_count=0 + + while [ $health_count -lt $health_timeout ]; do + if podman ps --format '{{.Names}} {{.Status}}' | grep -E "omnia_core.*Up" | grep -q "healthy\|Up"; then + echo "[INFO] [ORCHESTRATOR] Container is healthy after restart" + break + fi + sleep 1 + health_count=$((health_count + 1)) + echo -n "." 
+ done + + if [ $health_count -ge $health_timeout ]; then + echo "" + echo "[ERROR] [ORCHESTRATOR] Container failed to become healthy within 30 seconds after restart" + echo "[ERROR] [ORCHESTRATOR] Upgrade failed: Container health check failed" + return 1 + fi + + # Verify version update + local updated_version=$(get_current_omnia_version) + if [ "$updated_version" != "$target_version" ]; then + echo "[ERROR] [ORCHESTRATOR] Version update verification failed" + echo "[ERROR] [ORCHESTRATOR] Expected: $target_version, Found: $updated_version" + return 1 + fi + + echo "[INFO] [ORCHESTRATOR] Same-tag upgrade completed successfully" + echo "[INFO] [ORCHESTRATOR] Version updated to: $target_version" + + show_post_upgrade_instructions "$target_version" + + return 0 +} + phase4_container_swap() { local quadlet_file="/etc/containers/systemd/omnia_core.container" local i @@ -1376,12 +1620,12 @@ phase4_container_swap() { if [ ! -f "$quadlet_file" ]; then echo "[ERROR] [ORCHESTRATOR] Phase 4.3 failed: Quadlet file not found: $quadlet_file" echo "[ERROR] [ORCHESTRATOR] Upgrade failed: Quadlet configuration file missing" - echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore 1.0 container..." + echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore container..." rollback_omnia_core return 1 fi - echo "[INFO] [ORCHESTRATOR] Stopping omnia_core 1.0 container" + echo "[INFO] [ORCHESTRATOR] Stopping omnia_core $OMNIA_CORE_CONTAINER_TAG container" systemctl stop omnia_core.service >/dev/null 2>&1 || true if podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then @@ -1391,25 +1635,25 @@ phase4_container_swap() { if podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then echo "[ERROR] [ORCHESTRATOR] Failed to stop omnia_core container" - echo "[ERROR] [ORCHESTRATOR] Upgrade failed: Could not stop 1.0 container" - echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore 1.0 container..." 
+ echo "[ERROR] [ORCHESTRATOR] Upgrade failed: Could not stop $OMNIA_CORE_CONTAINER_TAG container" + echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore container..." rollback_omnia_core return 1 fi - echo "[INFO] [ORCHESTRATOR] Starting omnia_core 2.1 Quadlet unit" - if ! podman inspect "omnia_core:2.1" >/dev/null 2>&1; then - echo "[ERROR] [ORCHESTRATOR] Target image missing locally: omnia_core:2.1" - echo "[ERROR] [ORCHESTRATOR] Upgrade failed: 2.1 image not available" - echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore 1.0 container..." + echo "[INFO] [ORCHESTRATOR] Starting omnia_core $TARGET_CONTAINER_TAG Quadlet unit" + if ! podman inspect "omnia_core:$TARGET_CONTAINER_TAG" >/dev/null 2>&1; then + echo "[ERROR] [ORCHESTRATOR] Target image missing locally: omnia_core:$TARGET_CONTAINER_TAG" + echo "[ERROR] [ORCHESTRATOR] Upgrade failed: $TARGET_CONTAINER_TAG image not available" + echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore container..." rollback_omnia_core return 1 fi - if ! sed -i 's/^Image=omnia_core:.*/Image=omnia_core:2.1/' "$quadlet_file"; then - echo "[ERROR] [ORCHESTRATOR] Phase 4.3 failed: Failed to update Image to 2.1 in quadlet file" + if ! sed -i "s/^Image=omnia_core:.*/Image=omnia_core:$TARGET_CONTAINER_TAG/" "$quadlet_file"; then + echo "[ERROR] [ORCHESTRATOR] Phase 4.3 failed: Failed to update Image to $TARGET_CONTAINER_TAG in quadlet file" echo "[ERROR] [ORCHESTRATOR] Upgrade failed: Could not update container image tag" - echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore 1.0 container..." + echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore container..." rollback_omnia_core return 1 fi @@ -1417,20 +1661,20 @@ phase4_container_swap() { systemctl daemon-reload || { echo "[ERROR] [ORCHESTRATOR] Phase 4.3 failed: systemctl daemon-reload failed" echo "[ERROR] [ORCHESTRATOR] Upgrade failed: System daemon reload failed" - echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore 1.0 container..." 
+ echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore container..." rollback_omnia_core return 1 } systemctl start omnia_core.service || { echo "[ERROR] [ORCHESTRATOR] Phase 4.3 failed: Failed to start omnia_core.service" - echo "[ERROR] [ORCHESTRATOR] Upgrade failed: Could not start 2.1 container" - echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore 1.0 container..." + echo "[ERROR] [ORCHESTRATOR] Upgrade failed: Could not start $TARGET_CONTAINER_TAG container" + echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore container..." rollback_omnia_core return 1 } - echo "[INFO] [ORCHESTRATOR] Waiting for omnia_core 2.1 health check (60s)" + echo "[INFO] [ORCHESTRATOR] Waiting for omnia_core $TARGET_CONTAINER_TAG health check (60s)" for i in $(seq 1 60); do if podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then break @@ -1440,13 +1684,13 @@ phase4_container_swap() { if ! podman ps --format '{{.Names}}' | grep -qw "omnia_core"; then echo "[ERROR] [ORCHESTRATOR] Phase 4.4 failed: Container failed health check after swap" - echo "[ERROR] [ORCHESTRATOR] Upgrade failed: 2.1 container failed health check" - echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore 1.0 container..." + echo "[ERROR] [ORCHESTRATOR] Upgrade failed: $TARGET_CONTAINER_TAG container failed health check" + echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore container..." rollback_omnia_core return 1 fi - echo "[INFO] [ORCHESTRATOR] Updating metadata omnia_version to 2.1.0.0" + echo "[INFO] [ORCHESTRATOR] Updating metadata omnia_version to $TARGET_OMNIA_VERSION" if ! podman exec -u root omnia_core bash -c " set -e if [ ! 
-f '$CONTAINER_METADATA_FILE' ]; then @@ -1454,14 +1698,14 @@ phase4_container_swap() { exit 1 fi if grep -q '^omnia_version:' '$CONTAINER_METADATA_FILE'; then - sed -i 's/^omnia_version:.*/omnia_version: 2.1.0.0/' '$CONTAINER_METADATA_FILE' + sed -i 's/^omnia_version:.*/omnia_version: $TARGET_OMNIA_VERSION/' '$CONTAINER_METADATA_FILE' else - echo 'omnia_version: 2.1.0.0' >> '$CONTAINER_METADATA_FILE' + echo 'omnia_version: $TARGET_OMNIA_VERSION' >> '$CONTAINER_METADATA_FILE' fi "; then echo "[ERROR] [ORCHESTRATOR] Phase 4.5 failed: Failed to update metadata version" echo "[ERROR] [ORCHESTRATOR] Upgrade failed: Could not update version metadata" - echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore 1.0 container..." + echo "[ERROR] [ORCHESTRATOR] Initiating rollback to restore container..." rollback_omnia_core return 1 fi @@ -1471,21 +1715,129 @@ phase4_container_swap() { } upgrade_omnia_core() { - local lock_file="/var/lock/omnia_core_upgrade.lock" - local backup_base - - if [ -e "$lock_file" ]; then - echo -e "${RED}ERROR: Upgrade lock exists at $lock_file. 
Another upgrade may be running.${NC}" + echo -e "${BLUE}=================== Omnia Core Upgrade ====================${NC}" + echo -e "${BLUE}This script will upgrade Omnia core container.${NC}" + echo -e "${BLUE}Current version will be backed up and upgraded to target version.${NC}" + echo -e "${BLUE}=============================================================${NC}" + + # Read current version + OMNIA_VERSION=$(get_current_omnia_version) + if [ -z "$OMNIA_VERSION" ]; then + echo -e "${RED}ERROR: Could not determine current Omnia version${NC}" + echo -e "${YELLOW}Please ensure omnia_core container is running and metadata is accessible${NC}" exit 1 fi - - mkdir -p "$(dirname "$lock_file")" 2>/dev/null || true - echo "$$" > "$lock_file" || { - echo -e "${RED}ERROR: Failed to create lock file: $lock_file${NC}" + + # Get current container tag + OMNIA_CORE_CONTAINER_TAG=$(get_container_tag_from_version "$OMNIA_VERSION") + + echo -e "${GREEN}Current Omnia version: $OMNIA_VERSION${NC}" + echo -e "${GREEN}Current container tag: $OMNIA_CORE_CONTAINER_TAG${NC}" + + # Show available upgrade options + echo "" + echo "Available upgrade options:" + echo "=========================" + + # Get available upgrade versions dynamically + local upgrade_output + upgrade_output=$(get_available_upgrade_versions "$OMNIA_VERSION") + + # Parse output into versions and descriptions + local available_versions=() + local version_descriptions=() + local line_count=0 + local total_lines + + # Count total lines + total_lines=$(echo "$upgrade_output" | wc -l) + + # Split into versions and descriptions (first half = versions, second half = descriptions) + local mid_line=$((total_lines / 2)) + local line_num=0 + + while IFS= read -r line; do + line_num=$((line_num + 1)) + if [ $line_num -le $mid_line ]; then + available_versions+=("$line") + else + version_descriptions+=("$line") + fi + done <<< "$upgrade_output" + + # Check if any upgrade options are available + if [ ${#available_versions[@]} -eq 0 ]; 
then + echo -e "${GREEN}Already at latest version $OMNIA_VERSION${NC}" + echo "No upgrade options available." + exit 0 + fi + + # Display upgrade options + for i in "${!available_versions[@]}"; do + local target_version="${available_versions[$i]}" + local target_container_tag=$(get_container_tag_from_version "$target_version") + + # Check if target image exists locally + local image_status="✓ Available" + if ! podman inspect "omnia_core:$target_container_tag" >/dev/null 2>&1; then + image_status="✗ Missing (build required)" + fi + + echo "$((i+1)). Upgrade to $target_version (container tag: $target_container_tag) [$image_status]" + done + + # Prompt user to select upgrade version + echo -n "Select upgrade option (1-${#available_versions[@]}) or press Enter to cancel: " + read -r selection + + # Validate selection + if [ -z "$selection" ]; then + echo "Upgrade cancelled by user." + exit 0 + fi + + if ! [[ "$selection" =~ ^[0-9]+$ ]] || [ "$selection" -lt 1 ] || [ "$selection" -gt ${#available_versions[@]} ]; then + echo -e "${RED}ERROR: Invalid selection.${NC}" exit 1 - } + fi + + # Set target version based on user selection + TARGET_OMNIA_VERSION="${available_versions[$((selection-1))]}" + TARGET_CONTAINER_TAG=$(get_container_tag_from_version "$TARGET_OMNIA_VERSION") + + # Pre-validation: Check if target container image exists locally + if ! 
validate_container_image "$TARGET_OMNIA_VERSION" "$TARGET_CONTAINER_TAG" "upgrade"; then + exit 1 + fi + + echo -e "${GREEN}Target Omnia version: $TARGET_OMNIA_VERSION${NC}" + echo -e "${GREEN}Target container tag: $TARGET_CONTAINER_TAG${NC}" + + # Check if container tag change is needed + if [ "$OMNIA_CORE_CONTAINER_TAG" = "$TARGET_CONTAINER_TAG" ]; then + echo -e "${BLUE}Upgrade within same container tag ($TARGET_CONTAINER_TAG)${NC}" + echo -e "${BLUE}Will restart container instead of swapping${NC}" + SAME_TAG_UPGRADE=true + else + echo -e "${BLUE}Container tag change required ($OMNIA_CORE_CONTAINER_TAG -> $TARGET_CONTAINER_TAG)${NC}" + echo -e "${BLUE}Will perform full container swap${NC}" + SAME_TAG_UPGRADE=false + fi + + # Pre-validation: Check if target container image exists locally + if ! validate_container_image "$TARGET_OMNIA_VERSION" "$TARGET_CONTAINER_TAG" "upgrade"; then + exit 1 + fi + local lock_file="/tmp/omnia_upgrade.lock" + if [ -f "$lock_file" ]; then + echo -e "${RED}ERROR: Another upgrade process is already running${NC}" + echo -e "${YELLOW}If this is incorrect, remove the lock file: rm -f $lock_file${NC}" + exit 1 + fi + touch "$lock_file" trap 'rm -f "$lock_file"' EXIT + # Run upgrade phases if ! phase1_validate; then echo "[ERROR] [ORCHESTRATOR] Upgrade failed in Phase 1" exit 1 @@ -1495,7 +1847,7 @@ upgrade_omnia_core() { exit 0 fi - backup_base="$OMNIA_UPGRADE_BACKUP_PATH" + local backup_base="$OMNIA_UPGRADE_BACKUP_PATH" if [ -z "$backup_base" ]; then echo "[ERROR] [ORCHESTRATOR] Backup path is empty" exit 1 @@ -1506,13 +1858,26 @@ upgrade_omnia_core() { exit 1 fi - if ! phase4_container_swap; then - echo "[ERROR] [ORCHESTRATOR] Upgrade failed in Phase 4" - exit 1 + # Choose upgrade path based on container tag + if [ "$SAME_TAG_UPGRADE" = "true" ]; then + if ! phase4_same_tag_upgrade "$TARGET_OMNIA_VERSION"; then + echo "[ERROR] [ORCHESTRATOR] Upgrade failed in same-tag upgrade" + exit 1 + fi + else + if ! 
phase4_container_swap; then + echo "[ERROR] [ORCHESTRATOR] Upgrade failed in Phase 4" + exit 1 + fi fi echo "[INFO] [ORCHESTRATOR] Upgrade completed successfully" echo "[INFO] [ORCHESTRATOR] Backup location (inside omnia_core container): $backup_base" + + show_post_upgrade_instructions "$TARGET_OMNIA_VERSION" + + # Initialize SSH config and start container session + init_ssh_config start_container_session exit 0 } @@ -1622,16 +1987,31 @@ restore_from_backup() { display_cleanup_instructions() { echo "" echo -e "${RED}================================================================================${NC}" - echo -e "${RED} ROLLBACK FAILED${NC}" + echo -e "${RED} UPGRADE/ROLLBACK FAILED${NC}" echo -e "${RED}================================================================================${NC}" echo "" - echo -e "${YELLOW}Rollback failed. Manual cleanup is required to restore a clean state before retrying.${NC}" + echo -e "${YELLOW}Operation failed. Manual cleanup is required to restore a clean state before retrying.${NC}" + echo "" + echo -e "${BLUE}Choose the appropriate cleanup scenario:${NC}" + echo "" + echo -e "${GREEN}CASE 1: If you can log into omnia_core container:${NC}" + echo -e "${YELLOW}1. Enter omnia_core container: podman exec -it omnia_core bash${NC}" + echo -e "${YELLOW}2. Run oim cleanup: ansible-playbook /omnia/oim_cleanup.yml${NC}" + echo -e "${YELLOW}3. Run uninstall inside container: ./omnia.sh --uninstall${NC}" + echo -e "${YELLOW}4. Exit container: exit${NC}" + echo -e "${YELLOW}5. Clean shared path: rm -rf ${NC}" + echo -e "${YELLOW}6. Install required version: ./omnia.sh --install${NC}" echo "" - echo -e "${YELLOW}Run the following on the OIM host:${NC}" - echo -e "${YELLOW}1. Clean Omnia shared path: rm -rf ${NC}" - echo -e "${YELLOW}2. Stop Omnia core system service: systemctl stop omnia_core${NC}" - echo -e "${YELLOW}3. Remove the Omnia core container: podman rm -f omnia_core${NC}" - echo -e "${YELLOW}4. 
Perform a fresh Omnia core install: ./omnia.sh --install${NC}" + echo -e "${GREEN}CASE 2: If you cannot log into omnia_core container (but other containers are running):${NC}" + echo -e "${YELLOW}1. Remove all container definitions: cd /etc/containers/systemd${NC}" + echo -e "${YELLOW}2. Delete all container files: rm -rf *${NC}" + echo -e "${YELLOW}3. Reload systemd daemon: systemctl daemon-reload${NC}" + echo -e "${YELLOW}4. Stop all containers: podman stop \$(podman ps -aq)${NC}" + echo -e "${YELLOW}5. Remove all containers: podman rm -f \$(podman ps -aq)${NC}" + echo -e "${YELLOW}6. Clean shared path: rm -rf ${NC}" + echo -e "${YELLOW}7. Install required version: ./omnia.sh --install${NC}" + echo "" + echo -e "${BLUE}Note: Replace with your actual Omnia shared path.${NC}" echo "" } @@ -1652,6 +2032,27 @@ rollback_omnia_core() { exit 1 fi + # Create lock file to prevent concurrent rollbacks + local lock_file="/tmp/omnia_rollback.lock" + if [ -f "$lock_file" ]; then + local existing_pid + existing_pid=$(cat "$lock_file" 2>/dev/null | tr -d ' \t\n\r') + + if [ -n "$existing_pid" ] && kill -0 "$existing_pid" >/dev/null 2>&1; then + echo -e "${RED}ERROR: Another rollback process is already running (PID: $existing_pid)${NC}" + echo -e "${YELLOW}If this is incorrect, remove the lock file: rm -f $lock_file${NC}" + exit 1 + fi + + if [ -n "$existing_pid" ]; then + echo -e "${YELLOW}[WARN] Stale rollback lock file found (PID: $existing_pid); removing: $lock_file${NC}" + fi + rm -f "$lock_file" >/dev/null 2>&1 || true + fi + + echo "$$" > "$lock_file" + trap 'rm -f "$lock_file"' EXIT INT TERM + # Get current version if ! 
podman exec -u root omnia_core test -f "/opt/omnia/.data/oim_metadata.yml"; then echo -e "${RED}ERROR: Metadata file not found: /opt/omnia/.data/oim_metadata.yml${NC}" @@ -1659,48 +2060,56 @@ rollback_omnia_core() { fi local current_version=$(podman exec -u root omnia_core grep '^omnia_version:' /opt/omnia/.data/oim_metadata.yml 2>/dev/null | cut -d':' -f2 | tr -d ' \t\n\r') - if [ "$current_version" != "2.1.0.0" ]; then - echo -e "${RED}ERROR: Cannot rollback from version $current_version. Rollback is only supported from version 2.1.0.0.${NC}" - exit 1 - fi - # List available backups - echo "[INFO] [ROLLBACK] Scanning for available backups..." - local backup_dirs=() + # Get available rollback versions dynamically + local rollback_versions + rollback_versions=$(get_available_rollback_versions "$current_version") + + # Convert to array + local available_versions=() while IFS= read -r line; do - backup_dirs+=("$line") - done < <(podman exec -u root omnia_core find /opt/omnia/backups/upgrade -maxdepth 1 -type d -name "version_*" 2>/dev/null | sort -r) + available_versions+=("$line") + done <<< "$rollback_versions" - if [ ${#backup_dirs[@]} -eq 0 ]; then - echo -e "${RED}ERROR: No backup directories found.${NC}" + # Check if any rollback options are available + if [ ${#available_versions[@]} -eq 0 ]; then + echo -e "${RED}ERROR: No rollback versions available from $current_version.${NC}" exit 1 fi echo "" - echo "Available backup versions:" - for i in "${!backup_dirs[@]}"; do - local version=$(basename "${backup_dirs[$i]}" | sed 's/version_//') - local backup_date=$(podman exec -u root omnia_core stat -c '%y' "${backup_dirs[$i]}" 2>/dev/null | cut -d' ' -f1,2 | cut -d'.' -f1) - echo " $((i+1)). 
Version $version (created: $backup_date)" + echo "Available rollback versions:" + echo "===========================" + for i in "${!available_versions[@]}"; do + local version="${available_versions[$i]}" + local container_tag=$(get_container_tag_from_version "$version") + + # Check if target image exists locally + local image_status="✓ Available" + if ! podman inspect "omnia_core:$container_tag" >/dev/null 2>&1; then + image_status="✗ Missing (build required)" + fi + + echo " $((i+1)). Rollback to version $version (container tag: $container_tag) [$image_status]" done - # Prompt for backup selection + # Prompt for rollback selection echo "" - echo -n "Select backup to restore from (1-${#backup_dirs[@]}): " + echo -n "Select rollback version (1-${#available_versions[@]}): " read -r selection # Validate selection - if ! [[ "$selection" =~ ^[0-9]+$ ]] || [ "$selection" -lt 1 ] || [ "$selection" -gt ${#backup_dirs[@]} ]; then + if ! [[ "$selection" =~ ^[0-9]+$ ]] || [ "$selection" -lt 1 ] || [ "$selection" -gt ${#available_versions[@]} ]; then echo -e "${RED}ERROR: Invalid selection.${NC}" exit 1 fi - local selected_backup="${backup_dirs[$((selection-1))]}" - local backup_version=$(basename "$selected_backup" | sed 's/version_//') + local selected_version="${available_versions[$((selection-1))]}" + local selected_container_tag=$(get_container_tag_from_version "$selected_version") echo "" - echo "Selected backup: Version $backup_version" - echo -n "Are you sure you want to rollback to version $backup_version? [y/N]: " + echo "Selected rollback: Version $selected_version" + echo -n "Are you sure you want to rollback to version $selected_version? [y/N]: " read -r confirm if [[ ! "$confirm" =~ ^[yY] ]]; then @@ -1708,50 +2117,99 @@ rollback_omnia_core() { exit 0 fi - # Validate selected backup - only check if directory exists without podman exec - if ! 
podman exec -u root omnia_core test -d "$selected_backup" 2>/dev/null; then - # Try to check on host if container check fails - # Get shared path from metadata to check on host - local shared_path=$(podman exec -u root omnia_core grep '^oim_shared_path:' /opt/omnia/.data/oim_metadata.yml 2>/dev/null | cut -d':' -f2- | tr -d ' \t\n\r') - local host_backup_path="${selected_backup#/opt/omnia}" - if [ -z "$shared_path" ] || [ ! -d "$shared_path$host_backup_path" ]; then - echo -e "${RED}ERROR: Backup directory does not exist: $selected_backup${NC}" + # Pre-validation: Check if target container image exists locally + if ! validate_container_image "$selected_version" "$selected_container_tag" "rollback"; then + exit 1 + fi + + # Check if container tag change is needed + local current_container_tag=$(get_container_tag_from_version "$current_version") + if [ "$current_container_tag" = "$selected_container_tag" ]; then + echo -e "${BLUE}Rollback within same container tag ($selected_container_tag)${NC}" + echo -e "${BLUE}Will restart container instead of swapping${NC}" + + # Perform same-tag rollback (container restart only) + if ! rollback_same_tag "$selected_version" "$current_version"; then + echo "[ERROR] [ROLLBACK] Rollback failed in same-tag rollback" exit 1 fi + + echo "[INFO] [ROLLBACK] Rollback completed successfully" + echo "[INFO] [ROLLBACK] Version rolled back to: $selected_version" + exit 0 + else + echo -e "${BLUE}Container tag change required ($current_container_tag -> $selected_container_tag)${NC}" + echo -e "${BLUE}Will perform full container swap${NC}" + # Continue with existing container swap logic + fi + + # List available backups for selected version + echo "[INFO] [ROLLBACK] Scanning for available backups for version $selected_version..." 
+ local backup_dirs=() + while IFS= read -r line; do + backup_dirs+=("$line") + done < <(podman exec -u root omnia_core find /opt/omnia/backups/upgrade -maxdepth 1 -type d -name "version_${selected_version}*" 2>/dev/null | sort -r) + + if [ ${#backup_dirs[@]} -eq 0 ]; then + echo -e "${RED}ERROR: No backup directories found for version $selected_version.${NC}" + exit 1 + fi + + echo "" + echo "Available backups for version $selected_version:" + for i in "${!backup_dirs[@]}"; do + local backup_path="${backup_dirs[$i]}" + local backup_date=$(podman exec -u root omnia_core stat -c '%y' "$backup_path" 2>/dev/null | cut -d' ' -f1,2 | cut -d'.' -f1) + echo " $((i+1)). Backup created: $backup_date" + done + + # Prompt for backup selection + echo "" + echo -n "Select backup to restore from (1-${#backup_dirs[@]}): " + read -r backup_selection + + # Validate backup selection + if ! [[ "$backup_selection" =~ ^[0-9]+$ ]] || [ "$backup_selection" -lt 1 ] || [ "$backup_selection" -gt ${#backup_dirs[@]} ]; then + echo -e "${RED}ERROR: Invalid backup selection.${NC}" + exit 1 + fi + + local selected_backup="${backup_dirs[$((backup_selection-1))]}" + + # Validate selected backup exists + if ! podman exec -u root omnia_core test -d "$selected_backup" 2>/dev/null; then + echo -e "${RED}ERROR: Backup directory does not exist: $selected_backup${NC}" + exit 1 fi echo "" echo "[INFO] [ROLLBACK] Starting rollback process..." - # Step 1: Stop 2.1 container gracefully + # Step 1: Stop current container gracefully echo "" - echo "[INFO] [ROLLBACK] Step 1: Stopping Omnia core 2.1 container..." + echo "[INFO] [ROLLBACK] Step 1: Stopping Omnia core $current_container_tag container..." if ! 
stop_container_gracefully "omnia_core" 30; then echo -e "${RED}ERROR: Failed to stop container.${NC}" display_cleanup_instructions exit 1 fi - # Step 2: Check for 1.0 image + # Step 2: Update Quadlet file to use target container tag echo "" - echo "[INFO] [ROLLBACK] Step 2: Checking for Omnia core 1.0 image..." - if ! podman inspect omnia_core:1.0 >/dev/null 2>&1; then - echo -e "${YELLOW}WARNING: Omnia core 1.0 image not found locally.${NC}" - echo -e "${YELLOW}Attempting to tag image...${NC}" - - # Try to tag latest as 1.0 if available - if podman inspect omnia_core:latest >/dev/null 2>&1; then - podman tag omnia_core:latest omnia_core:1.0 - else - echo -e "${RED}ERROR: Omnia core 1.0 image not available. Please load the image first.${NC}" - display_cleanup_instructions - exit 1 - fi + echo "[INFO] [ROLLBACK] Step 2: Updating Quadlet file to use container tag $selected_container_tag..." + local quadlet_file="/etc/containers/systemd/omnia_core.container" + + if ! sed -i "s/^Image=omnia_core:.*/Image=omnia_core:$selected_container_tag/" "$quadlet_file"; then + echo -e "${RED}ERROR: Failed to update Image to $selected_container_tag in quadlet file${NC}" + display_cleanup_instructions + exit 1 fi - # Step 3: Start 1.0 container + echo "[INFO] [ROLLBACK] Quadlet file updated to use omnia_core:$selected_container_tag" + + # Step 3: Start target container echo "" - echo "[INFO] [ROLLBACK] Step 3: Starting Omnia core 1.0 container..." + echo "[INFO] [ROLLBACK] Step 3: Starting Omnia core $selected_container_tag container..." systemctl daemon-reload if ! systemctl start omnia_core.service; then echo -e "${RED}ERROR: Failed to start container service.${NC}" @@ -1805,8 +2263,8 @@ rollback_omnia_core() { echo "[INFO] [ROLLBACK] Step 7: Verifying container version..." 
local verify_version=$(podman exec -u root omnia_core grep '^omnia_version:' /opt/omnia/.data/oim_metadata.yml 2>/dev/null | cut -d':' -f2 | tr -d ' \t\n\r') - if [ "$verify_version" != "$backup_version" ]; then - echo -e "${RED}ERROR: Version verification failed. Expected: $backup_version, Found: $verify_version${NC}" + if [ "$verify_version" != "$selected_version" ]; then + echo -e "${RED}ERROR: Version verification failed. Expected: $selected_version, Found: $verify_version${NC}" display_cleanup_instructions exit 1 fi @@ -1814,18 +2272,22 @@ rollback_omnia_core() { # Audit log end local rollback_end=$(date -Iseconds) echo "[AUDIT] Rollback operation completed at: $rollback_end" - echo "[AUDIT] Rolled back from version $current_version to $backup_version" + echo "[AUDIT] Rolled back from version $current_version to $selected_version" echo "" echo -e "${GREEN}================================================================================${NC}" echo -e "${GREEN} ROLLBACK COMPLETED SUCCESSFULLY${NC}" echo -e "${GREEN}================================================================================${NC}" echo "" - echo -e "${GREEN}✓ Omnia core has been rolled back to version $backup_version${NC}" + echo -e "${GREEN}✓ Omnia core has been rolled back to version $selected_version${NC}" echo -e "${GREEN}✓ Container is running and healthy${NC}" echo -e "${GREEN}✓ Configuration restored from backup${NC}" echo "" + # Clean up lock file before starting long-running ssh session + rm -f "$lock_file" >/dev/null 2>&1 || true + echo "[INFO] Rollback lock file removed before starting container session" + # Initialize SSH config and start container session init_ssh_config start_container_session From 2078496e82aa5525bfc6255373f8f42ca4a51fa2 Mon Sep 17 00:00:00 2001 From: sakshi-singla-1735 Date: Tue, 17 Feb 2026 12:35:42 +0530 Subject: [PATCH 38/77] LDMS Slurm node add /delete (#3976) * LDMS slurm node add/delete * pr review comments update --- 
.../telemetry/tasks/check_pxe_changes.yml | 88 ++++++++++ discovery/roles/telemetry/tasks/main.yml | 10 ++ .../telemetry/tasks/restart_ldms_configs.yml | 151 ++++++++++++++++++ discovery/roles/telemetry/vars/main.yml | 21 +++ 4 files changed, 270 insertions(+) create mode 100644 discovery/roles/telemetry/tasks/check_pxe_changes.yml create mode 100644 discovery/roles/telemetry/tasks/restart_ldms_configs.yml diff --git a/discovery/roles/telemetry/tasks/check_pxe_changes.yml b/discovery/roles/telemetry/tasks/check_pxe_changes.yml new file mode 100644 index 0000000000..398c831961 --- /dev/null +++ b/discovery/roles/telemetry/tasks/check_pxe_changes.yml @@ -0,0 +1,88 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- + +- name: Check if current PXE mapping file exists + ansible.builtin.stat: + path: "{{ hostvars['localhost']['pxe_mapping_file_path'] }}" + delegate_to: localhost + register: current_pxe_file + +- name: Check if backup PXE mapping file exists + ansible.builtin.stat: + path: "{{ backup_pxe_mapping_ldms_path }}" + delegate_to: localhost + register: backup_pxe_file + +- name: Handle first discovery run (no backup exists) + when: + - current_pxe_file.stat.exists + - not backup_pxe_file.stat.exists + block: + - name: Create backup of PXE mapping file + ansible.builtin.copy: + src: "{{ hostvars['localhost']['pxe_mapping_file_path'] }}" + dest: "{{ backup_pxe_mapping_ldms_path }}" + remote_src: true + mode: preserve + delegate_to: localhost + + - name: Set pxe_changed to false for first run + ansible.builtin.set_fact: + pxe_changed: false + + - name: Display first run message + ansible.builtin.debug: + msg: "{{ pxe_first_run_msg }}" + +- name: Compare PXE mapping files when backup exists + when: + - current_pxe_file.stat.exists + - backup_pxe_file.stat.exists + block: + - name: Get checksum of current PXE mapping file + ansible.builtin.stat: + path: "{{ hostvars['localhost']['pxe_mapping_file_path'] }}" + checksum_algorithm: sha256 + delegate_to: localhost + register: current_pxe_checksum + + - name: Get checksum of backup PXE mapping file + ansible.builtin.stat: + path: "{{ backup_pxe_mapping_ldms_path }}" + checksum_algorithm: sha256 + delegate_to: localhost + register: backup_pxe_checksum + + - name: Set pxe_changed based on checksum comparison + ansible.builtin.set_fact: + pxe_changed: "{{ current_pxe_checksum.stat.checksum != backup_pxe_checksum.stat.checksum }}" + + - name: Update backup PXE mapping file when changed + ansible.builtin.copy: + src: "{{ hostvars['localhost']['pxe_mapping_file_path'] }}" + dest: "{{ backup_pxe_mapping_ldms_path }}" + remote_src: true + mode: preserve + delegate_to: localhost + when: pxe_changed | bool + + - name: Display PXE 
change status + ansible.builtin.debug: + msg: "{{ pxe_changed_msg if (pxe_changed | bool) else pxe_no_change_msg }}" + +- name: Set pxe_changed to false when PXE file is missing + ansible.builtin.set_fact: + pxe_changed: false + when: not current_pxe_file.stat.exists diff --git a/discovery/roles/telemetry/tasks/main.yml b/discovery/roles/telemetry/tasks/main.yml index 825c3988d7..e4e3d1846a 100644 --- a/discovery/roles/telemetry/tasks/main.yml +++ b/discovery/roles/telemetry/tasks/main.yml @@ -55,3 +55,13 @@ - name: Update ldms agg configuration ansible.builtin.include_tasks: update_ldms_agg_config.yml when: hostvars['localhost']['ldms_support'] + +- name: Check if PXE mapping has changed since last run + ansible.builtin.include_tasks: check_pxe_changes.yml + when: hostvars['localhost']['ldms_support'] + +- name: Restart LDMS configs for node addition and deletion + ansible.builtin.include_tasks: restart_ldms_configs.yml + when: + - hostvars['localhost']['ldms_support'] + - pxe_changed | default(false) | bool diff --git a/discovery/roles/telemetry/tasks/restart_ldms_configs.yml b/discovery/roles/telemetry/tasks/restart_ldms_configs.yml new file mode 100644 index 0000000000..0a176118f0 --- /dev/null +++ b/discovery/roles/telemetry/tasks/restart_ldms_configs.yml @@ -0,0 +1,151 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +- name: Load high availability config + ansible.builtin.include_vars: + file: "{{ hostvars['localhost']['input_project_dir'] }}/high_availability_config.yml" + name: ha_config + +- name: Set kube_vip fact + ansible.builtin.set_fact: + kube_vip: "{{ ha_config.service_k8s_cluster_ha[0].virtual_ip_address | default('') }}" + +- name: Test SSH connectivity to kube VIP only when PXE has changed + when: + - kube_vip | length > 0 + - pxe_changed | default(false) | bool + block: + - name: SSH test to kube VIP + ansible.builtin.command: + cmd: "ssh -o StrictHostKeyChecking=no -o ConnectTimeout=10 -o BatchMode=yes {{ kube_vip }} echo reachable" + delegate_to: localhost + register: kube_vip_ssh_check + changed_when: false + + - name: Set kube VIP reachable fact + ansible.builtin.set_fact: + kube_vip_reachable: "{{ kube_vip_ssh_check.rc == 0 }}" + + rescue: + - name: Display kube VIP unreachable message + ansible.builtin.debug: + msg: "{{ kube_vip_unreachable_msg }}" + + - name: Set kube VIP reachable fact to false + ansible.builtin.set_fact: + kube_vip_reachable: false + +- name: Restart LDMS aggregator when PXE has changed + when: pxe_changed | default(false) | bool + block: + - name: Check if LDMS aggregator is running on service k8s cluster + kubernetes.core.k8s_info: + api_version: apps/v1 + kind: StatefulSet + name: nersc-ldms-aggr + namespace: "{{ telemetry_namespace }}" + delegate_to: "{{ kube_vip }}" + register: ldms_statefulset_info + failed_when: false + when: + - kube_vip_reachable | bool + + - name: Set LDMS running state + ansible.builtin.set_fact: + ldms_running: "{{ ldms_statefulset_info.resources is defined and ldms_statefulset_info.resources | length > 0 }}" + when: + - kube_vip_reachable | bool + + - name: Check if LDMS conf ConfigMap file exists + ansible.builtin.stat: + path: "{{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry/ldms/nersc-ldms-aggr/nersc-ldms-aggr/templates/cm.nersc-ldms-conf.yaml" + register: ldms_conf_file + when: 
ldms_running | default(false) | bool + + - name: Check if LDMS bin ConfigMap file exists + ansible.builtin.stat: + path: "{{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry/ldms/nersc-ldms-aggr/nersc-ldms-aggr/templates/cm.nersc-ldms-bin.yaml" + register: ldms_bin_file + when: ldms_running | default(false) | bool + + - name: Apply LDMS configuration ConfigMap + kubernetes.core.k8s: + state: present + src: "{{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry/ldms/nersc-ldms-aggr/nersc-ldms-aggr/templates/cm.nersc-ldms-conf.yaml" + namespace: "{{ telemetry_namespace }}" + delegate_to: "{{ kube_vip }}" + failed_when: false + when: + - ldms_running | default(false) | bool + - ldms_conf_file.stat.exists | default(false) + + - name: Apply LDMS scripts ConfigMap + kubernetes.core.k8s: + state: present + src: "{{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry/ldms/nersc-ldms-aggr/nersc-ldms-aggr/templates/cm.nersc-ldms-bin.yaml" + namespace: "{{ telemetry_namespace }}" + delegate_to: "{{ kube_vip }}" + failed_when: false + when: + - ldms_running | default(false) | bool + - ldms_bin_file.stat.exists | default(false) + + - name: Restart LDMS aggregator StatefulSet + kubernetes.core.k8s: + state: present + definition: + apiVersion: apps/v1 + kind: StatefulSet + metadata: + name: nersc-ldms-aggr + namespace: "{{ telemetry_namespace }}" + spec: + template: + metadata: + annotations: + kubectl.kubernetes.io/restartedAt: "{{ ansible_date_time.iso8601 }}" + delegate_to: "{{ kube_vip }}" + failed_when: false + when: + - ldms_running | default(false) | bool + - ldms_conf_file.stat.exists | default(false) + - ldms_bin_file.stat.exists | default(false) + + - name: Wait for LDMS aggregator pod to be ready after restart + kubernetes.core.k8s_info: + api_version: v1 + kind: Pod + namespace: "{{ telemetry_namespace }}" + label_selectors: + - "app=nersc-ldms-aggr" + wait: true + wait_condition: + type: Ready + status: "True" + wait_timeout: 120 + 
delegate_to: "{{ kube_vip }}" + register: ldms_pod_ready + failed_when: false + when: + - ldms_running | default(false) | bool + - ldms_conf_file.stat.exists | default(false) + - ldms_bin_file.stat.exists | default(false) + + - name: Display LDMS aggregator restart status + ansible.builtin.debug: + msg: "{{ ldms_pod_ready_msg if (ldms_pod_ready.resources | default([]) | length > 0) else ldms_pod_not_ready_msg }}" + when: + - ldms_running | default(false) | bool + - ldms_conf_file.stat.exists | default(false) + - ldms_bin_file.stat.exists | default(false) diff --git a/discovery/roles/telemetry/vars/main.yml b/discovery/roles/telemetry/vars/main.yml index 5c5838ce29..69b0c0c0ac 100644 --- a/discovery/roles/telemetry/vars/main.yml +++ b/discovery/roles/telemetry/vars/main.yml @@ -252,3 +252,24 @@ common_templates: skip_when: "{{ cluster_id_present | default(false) }}" - src: 'telemetry/kustomization.yaml.j2' dest: 'kustomization.yaml' + +# Usage: check_pxe_changes.yml +backup_pxe_mapping_ldms_path: "/opt/omnia/telemetry/backup_pxe_mapping_ldms.csv" +pxe_first_run_msg: "First discovery run detected. Saving PXE mapping backup. LDMS restart not required." +pxe_no_change_msg: "PXE mapping file has not changed since last run. Skipping LDMS restart." +pxe_changed_msg: "PXE mapping file has changed. LDMS restart will be triggered." + +# Usage: restart_ldms_configs.yml +kube_vip_unreachable_msg: >- + Kube VIP ({{ kube_vip }}) is not reachable via SSH. + There might be issues with the k8s cluster. + LDMS aggregator restart will be skipped. + + After discovery completes, manually restart the LDMS aggregator pod with: + + ssh {{ kube_vip }} + kubectl rollout restart statefulset nersc-ldms-aggr -n {{ telemetry_namespace }} + kubectl get pods -n {{ telemetry_namespace }} -l app=nersc-ldms-aggr -w + +ldms_pod_ready_msg: "LDMS aggregator pod is ready." +ldms_pod_not_ready_msg: "WARNING: LDMS aggregator pod did not become ready within 120s." 
From 7953e3c519a8bd5ba72e820832ef596f062b1357 Mon Sep 17 00:00:00 2001 From: Jagadeesh N V Date: Wed, 18 Feb 2026 00:31:55 +0530 Subject: [PATCH 39/77] Node drain logic for deletion --- .../slurm_config/tasks/check_ctld_running.yml | 12 ++- .../tasks/drain_and_remove_node.yml | 102 ++++++++++++++++++ discovery/roles/slurm_config/vars/main.yml | 3 + 3 files changed, 116 insertions(+), 1 deletion(-) create mode 100644 discovery/roles/slurm_config/tasks/drain_and_remove_node.yml diff --git a/discovery/roles/slurm_config/tasks/check_ctld_running.yml b/discovery/roles/slurm_config/tasks/check_ctld_running.yml index 7d908169ab..ce27d3c362 100644 --- a/discovery/roles/slurm_config/tasks/check_ctld_running.yml +++ b/discovery/roles/slurm_config/tasks/check_ctld_running.yml @@ -22,6 +22,16 @@ register: ssh_check ignore_errors: true +- name: Drain and remove nodes if any + ansible.builtin.include_tasks: drain_and_remove_node.yml + loop: "{{ nodes_in_normal_not_in_cmpt }}" + loop_control: + loop_var: node_to_remove + when: + - ssh_check is success + - nodes_in_normal_not_in_cmpt is defined + - nodes_in_normal_not_in_cmpt | length > 0 + - name: Enter slurm controller when pingable when: - ssh_check is success @@ -37,7 +47,7 @@ register: service_facts ignore_unreachable: true - - name: Fail if slurmctld is unreachable + - name: Check slurmctld is reachable ansible.builtin.fail: msg: "Failed to connect to {{ ctld }}." when: service_facts is unreachable diff --git a/discovery/roles/slurm_config/tasks/drain_and_remove_node.yml b/discovery/roles/slurm_config/tasks/drain_and_remove_node.yml new file mode 100644 index 0000000000..7b40363808 --- /dev/null +++ b/discovery/roles/slurm_config/tasks/drain_and_remove_node.yml @@ -0,0 +1,102 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +- name: Check if node exists in Slurm cluster + ansible.builtin.command: scontrol show node {{ node_to_remove }} + register: node_exists_check + failed_when: false + ignore_unreachable: true + changed_when: false + delegate_to: "{{ ctld }}" + +- name: Skip if node does not exist + ansible.builtin.debug: + msg: "Node {{ node_to_remove }} not found in cluster, skipping removal" + when: + - node_exists_check is reachable + - node_exists_check.rc != 0 + +- name: Process node removal + when: + - node_exists_check is reachable + - node_exists_check.rc == 0 + ignore_unreachable: true + block: + - name: Get current job count on node + ansible.builtin.command: squeue -w {{ node_to_remove }} -h | wc -l + register: current_jobs + changed_when: false + delegate_to: "{{ ctld }}" + + - name: Display job information + ansible.builtin.debug: + msg: "Node {{ node_to_remove }} currently has {{ current_jobs.stdout }} running job(s)" + + - name: Drain the node to prevent new job assignments + ansible.builtin.command: > + scontrol update NodeName={{ node_to_remove }} + State=DRAIN + Reason="Scheduled removal - waiting for jobs to complete" + changed_when: true + delegate_to: "{{ ctld }}" + + - name: Wait for all jobs to complete on the node + ansible.builtin.command: squeue -w {{ node_to_remove }} -h | wc -l + register: job_count_check + until: job_count_check.stdout | int == 0 + retries: "{{ (node_drain_timeout / node_drain_delay) | int }}" + delay: "{{ node_drain_delay }}" + changed_when: false + delegate_to: "{{ ctld }}" + when: current_jobs.stdout | int > 0 + + - 
name: Confirm jobs completed + ansible.builtin.debug: + msg: "All jobs on {{ node_to_remove }} have completed" + when: current_jobs.stdout | int > 0 + + - name: Log node removal + ansible.builtin.debug: + msg: "Node {{ node_to_remove }} has been drained, jobs completed, and set to DOWN state" + + rescue: + - name: Log node removal failure + ansible.builtin.debug: + msg: "Failed to drain node {{ node_to_remove }}" + + - name: Prompt for node with running job after timeout + ansible.builtin.pause: + prompt: | + Jobs are still running on {{ node_to_remove }}. + Options: + 1. Press Ctrl+C then 'A' to abort + 2. Press Enter to force removal (jobs will be killed) + when: not force_scancel_node + + - name: Force cancel jobs if timeout reached + ansible.builtin.command: scancel -f -w {{ node_to_remove }} + changed_when: true + failed_when: false + delegate_to: "{{ ctld }}" + + always: + - name: Set node to DOWN state + ansible.builtin.command: > + scontrol update NodeName={{ node_to_remove }} + State=DOWN + Reason="Node removed from cluster" + changed_when: true + failed_when: false + delegate_to: "{{ ctld }}" + when: node_exists_check.rc == 0 diff --git a/discovery/roles/slurm_config/vars/main.yml b/discovery/roles/slurm_config/vars/main.yml index 1593f791cb..39311ca64d 100644 --- a/discovery/roles/slurm_config/vars/main.yml +++ b/discovery/roles/slurm_config/vars/main.yml @@ -117,6 +117,9 @@ munge_dir_mode: "0700" common_mode: "0755" slurm_dbd_mode: "0600" slurm_db_cnf_mode: "0600" +node_drain_timeout: 900 +node_drain_delay: 30 +force_scancel_node: false dbd_slurm_conf: AccountingStoragePort: "{{ slurm_dbd_port }}" AccountingStorageType: accounting_storage/slurmdbd From 4dbc6a978fdbcbd74c7a7c62e75ab47c399784be Mon Sep 17 00:00:00 2001 From: Vrinda_Marwah Date: Wed, 18 Feb 2026 07:32:41 +0000 Subject: [PATCH 40/77] mask docker credentials in local_repo logs Signed-off-by: Vrinda_Marwah --- .../library/module_utils/local_repo/parse_and_download.py | 8 ++++---- 1 file 
changed, 4 insertions(+), 4 deletions(-) diff --git a/common/library/module_utils/local_repo/parse_and_download.py b/common/library/module_utils/local_repo/parse_and_download.py index 15bed1efb3..d5192e2bbe 100644 --- a/common/library/module_utils/local_repo/parse_and_download.py +++ b/common/library/module_utils/local_repo/parse_and_download.py @@ -84,16 +84,16 @@ def execute_command(cmd_string, logger, type_json=False): logger.error(f"Raw output was: {status['stdout']}") return False - logger.info(f"Command succeeded: {cmd_string}") + logger.info(f"Command succeeded: {safe_cmd_string}") return status except subprocess.CalledProcessError as e: - logger.error(f"Command failed: {cmd_string} - {e}") + logger.error(f"Command failed: {safe_cmd_string} - {e}") return False except subprocess.TimeoutExpired as e: - logger.error(f"Command timed out: {cmd_string} - {e}") + logger.error(f"Command timed out: {safe_cmd_string} - {e}") return False except OSError as e: - logger.error(f"OS error during command: {cmd_string} - {e}") + logger.error(f"OS error during command: {safe_cmd_string} - {e}") return False finally: From f657612bbaab6891b2b6342cbb186fecf45cf43f Mon Sep 17 00:00:00 2001 From: Jagadeesh N V Date: Wed, 18 Feb 2026 13:19:27 +0530 Subject: [PATCH 41/77] Shell instead of command for piping --- discovery/roles/slurm_config/tasks/drain_and_remove_node.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/discovery/roles/slurm_config/tasks/drain_and_remove_node.yml b/discovery/roles/slurm_config/tasks/drain_and_remove_node.yml index 7b40363808..2de076a5a0 100644 --- a/discovery/roles/slurm_config/tasks/drain_and_remove_node.yml +++ b/discovery/roles/slurm_config/tasks/drain_and_remove_node.yml @@ -34,7 +34,7 @@ ignore_unreachable: true block: - name: Get current job count on node - ansible.builtin.command: squeue -w {{ node_to_remove }} -h | wc -l + ansible.builtin.shell: squeue -w {{ node_to_remove }} -h | wc -l register: current_jobs 
changed_when: false delegate_to: "{{ ctld }}" @@ -52,7 +52,7 @@ delegate_to: "{{ ctld }}" - name: Wait for all jobs to complete on the node - ansible.builtin.command: squeue -w {{ node_to_remove }} -h | wc -l + ansible.builtin.shell: squeue -w {{ node_to_remove }} -h | wc -l register: job_count_check until: job_count_check.stdout | int == 0 retries: "{{ (node_drain_timeout / node_drain_delay) | int }}" From d8bbd64b31daa7d6a540ea514662f639ea8a1641 Mon Sep 17 00:00:00 2001 From: Jagadeesh N V Date: Wed, 18 Feb 2026 13:35:41 +0530 Subject: [PATCH 42/77] lint fixes --- .../slurm_config/tasks/drain_and_remove_node.yml | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/discovery/roles/slurm_config/tasks/drain_and_remove_node.yml b/discovery/roles/slurm_config/tasks/drain_and_remove_node.yml index 2de076a5a0..da1c41d3fe 100644 --- a/discovery/roles/slurm_config/tasks/drain_and_remove_node.yml +++ b/discovery/roles/slurm_config/tasks/drain_and_remove_node.yml @@ -34,7 +34,10 @@ ignore_unreachable: true block: - name: Get current job count on node - ansible.builtin.shell: squeue -w {{ node_to_remove }} -h | wc -l + ansible.builtin.shell: + cmd: | + set -o pipefail + squeue -w {{ node_to_remove }} -h | wc -l register: current_jobs changed_when: false delegate_to: "{{ ctld }}" @@ -52,7 +55,10 @@ delegate_to: "{{ ctld }}" - name: Wait for all jobs to complete on the node - ansible.builtin.shell: squeue -w {{ node_to_remove }} -h | wc -l + ansible.builtin.shell: + cmd: | + set -o pipefail + squeue -w {{ node_to_remove }} -h | wc -l register: job_count_check until: job_count_check.stdout | int == 0 retries: "{{ (node_drain_timeout / node_drain_delay) | int }}" @@ -75,10 +81,11 @@ ansible.builtin.debug: msg: "Failed to drain node {{ node_to_remove }}" - - name: Prompt for node with running job after timeout + - name: Remove slurm node with running job after timeout ansible.builtin.pause: prompt: | - Jobs are still running on {{ node_to_remove }}. 
+ Node {{ node_to_remove }} has been DRAINED to prevent new job assignments. + Jobs are still running on {{ node_to_remove }} after wait of {{ node_drain_timeout }} seconds. Options: 1. Press Ctrl+C then 'A' to abort 2. Press Enter to force removal (jobs will be killed) From 76d7f3cd0c9c77fd0467a18249be12edc4236b34 Mon Sep 17 00:00:00 2001 From: Nethravathi M G <146437298+nethramg@users.noreply.github.com> Date: Thu, 19 Feb 2026 13:04:12 +0530 Subject: [PATCH 43/77] Removing the IP's from the Activated IP list (#3992) --- telemetry/roles/idrac_telemetry/templates/telemetry_report.j2 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/telemetry/roles/idrac_telemetry/templates/telemetry_report.j2 b/telemetry/roles/idrac_telemetry/templates/telemetry_report.j2 index 06bf230980..54986f418f 100644 --- a/telemetry/roles/idrac_telemetry/templates/telemetry_report.j2 +++ b/telemetry/roles/idrac_telemetry/templates/telemetry_report.j2 @@ -2,9 +2,9 @@ ----- Telemetry Report for Cluster ----- -Total IP count with Telemetry activated: {{ (db_idrac_ip_list | length) + (telemetry_idrac | length) }} +Total IP count with Telemetry activated: {{ ((db_idrac_ip_list + telemetry_idrac) | difference(deleted_idrac_ips | default([]))) | length }} Telemetry activated IPs List: -{% for item in db_idrac_ip_list + telemetry_idrac %} +{% for item in (db_idrac_ip_list + telemetry_idrac) | difference(deleted_idrac_ips | default([])) %} - {{ item }} {% endfor %} From 272bfb51c94fe7283bc3256c32894882b7b032e8 Mon Sep 17 00:00:00 2001 From: pullan1 Date: Thu, 19 Feb 2026 14:41:35 +0530 Subject: [PATCH 44/77] Fix for local_repo.yml allows passes even with invalid package names in JSON files. 
Signed-off-by: pullan1 --- .../library/module_utils/local_repo/config.py | 6 +- .../local_repo/container_repo_utils.py | 161 ++++++++++-------- .../module_utils/local_repo/download_rpm.py | 89 +++++++++- 3 files changed, 178 insertions(+), 78 deletions(-) diff --git a/common/library/module_utils/local_repo/config.py b/common/library/module_utils/local_repo/config.py index a731c8528d..7bfea4b301 100644 --- a/common/library/module_utils/local_repo/config.py +++ b/common/library/module_utils/local_repo/config.py @@ -64,6 +64,10 @@ "x86_64": ["dnf", "download", "--resolve", "--alldeps", "--arch=x86_64,noarch"], "aarch64": ["dnf", "download", "--forcearch", "aarch64", "--resolve", "--alldeps", "--exclude=*.x86_64"] } +DNF_INFO_COMMANDS = { + "x86_64": ["dnf", "info", "--quiet"], + "aarch64": ["dnf", "info", "--quiet", "--forcearch=aarch64"] +} # ---------------------------- # Used by download_common.py @@ -222,7 +226,7 @@ # Naming convention: _omnia-additional to match existing filter patterns # ---------------------------- ADDITIONAL_REPOS_KEY = "additional_repos" -AGGREGATED_REPO_NAME_TEMPLATE = "{arch}_omnia-additional-repo" +AGGREGATED_REPO_NAME_TEMPLATE = "{arch}_omnia-additional" AGGREGATED_REMOTE_NAME_TEMPLATE = "{arch}_omnia-additional-{name}" AGGREGATED_DISTRIBUTION_NAME_TEMPLATE = "{arch}_omnia-additional" AGGREGATED_BASE_PATH_TEMPLATE = "opt/omnia/offline_repo/cluster/{arch}/rhel/10.0/rpms/omnia-additional" diff --git a/common/library/module_utils/local_repo/container_repo_utils.py b/common/library/module_utils/local_repo/container_repo_utils.py index 0a4abb35fb..e3f47869af 100644 --- a/common/library/module_utils/local_repo/container_repo_utils.py +++ b/common/library/module_utils/local_repo/container_repo_utils.py @@ -13,6 +13,13 @@ # limitations under the License. #pylint: disable=import-error,no-name-in-module +""" +Container repository utilities for Pulp operations. 
+ +This module provides functions for creating, syncing, and managing +container repositories and distributions in Pulp. +""" + import multiprocessing from ansible.module_utils.local_repo.parse_and_download import execute_command from ansible.module_utils.local_repo.config import ( @@ -114,109 +121,119 @@ def sync_container_repository(repo_name, remote_name, package_content, logger, t logger.info(f"Getting repository version before sync for {repo_name}") verify_command = pulp_container_commands["show_container_repo"] % repo_name verify_result_before = execute_command(verify_command, logger, type_json=True) - + version_before = None - if verify_result_before and isinstance(verify_result_before, dict) and "stdout" in verify_result_before: + if (verify_result_before and isinstance(verify_result_before, dict) and + "stdout" in verify_result_before): repo_data_before = verify_result_before["stdout"] if isinstance(repo_data_before, dict): version_before = repo_data_before.get("latest_version_href") logger.info(f"Repository version before sync: {version_before}") - + command = pulp_container_commands["sync_container_repository"] % (repo_name, remote_name) result = execute_command(command,logger) if result is False or (isinstance(result, dict) and result.get("returncode", 1) != 0): logger.error(f"Sync command failed for repository {repo_name}") return False - + logger.info(f"Validating sync result for repository {repo_name}") verify_result_after = execute_command(verify_command, logger, type_json=True) - - if verify_result_after and isinstance(verify_result_after, dict) and "stdout" in verify_result_after: + + if (verify_result_after and isinstance(verify_result_after, dict) and + "stdout" in verify_result_after): repo_data_after = verify_result_after["stdout"] if isinstance(repo_data_after, dict): version_after = repo_data_after.get("latest_version_href") logger.info(f"Repository version after sync: {version_after}") - + if not version_after or 
version_after.endswith("/versions/0/"): logger.error(f"Sync completed but no content was downloaded for {repo_name}. " f"The specified image tag likely does not exist in the upstream registry.") return False - + if version_before and version_after and version_before == version_after: # Check if tag actually exists using precise Pulp commands try: # Step 1: Get distribution to find repository href dist_command = f"pulp container distribution show --name {repo_name}" dist_result = execute_command(dist_command, logger, type_json=True) - + if not dist_result or not isinstance(dist_result, dict) or "stdout" not in dist_result: - logger.error(f"Failed to get distribution info for {repo_name}. Assuming tag doesn't exist.") - return False - - dist_data = dist_result["stdout"] - if not isinstance(dist_data, dict) or "repository" not in dist_data: - logger.error(f"Invalid distribution data for {repo_name}. Assuming tag doesn't exist.") - return False - - repo_href = dist_data["repository"] - logger.info(f"Found repository href: {repo_href}") - - # Step 2: Get repository version href - repo_command = f"pulp container repository show --href {repo_href}" - repo_result = execute_command(repo_command, logger, type_json=True) - - if not repo_result or not isinstance(repo_result, dict) or "stdout" not in repo_result: - logger.error(f"Failed to get repository info for {repo_href}. Assuming tag doesn't exist.") - return False - - repo_data = repo_result["stdout"] - if not isinstance(repo_data, dict) or "latest_version_href" not in repo_data: - logger.error(f"Invalid repository data for {repo_href}. 
Assuming tag doesn't exist.") - return False - - repo_ver_href = repo_data["latest_version_href"] - logger.info(f"Found repository version href: {repo_ver_href}") - - # Step 3: Check if tag exists in content - tags_command = f"pulp show --href '/pulp/api/v3/content/container/tags/?repository_version={repo_ver_href}'" - tags_result = execute_command(tags_command, logger, type_json=True) - - if not tags_result or not isinstance(tags_result, dict) or "stdout" not in tags_result: - logger.error(f"Failed to get content tags for {repo_ver_href}. Assuming tag doesn't exist.") - return False - - tags_data = tags_result["stdout"] - if not isinstance(tags_data, dict) or "results" not in tags_data: - logger.error(f"Invalid tags data for {repo_ver_href}. Assuming tag doesn't exist.") - return False - - tags = tags_data["results"] - tag_exists = False - - # Use the tag parameter if provided, otherwise fall back to checking package_content - tag_to_check = tag if tag else package_content - - for tag_item in tags: - if isinstance(tag_item, dict) and "name" in tag_item and tag_item["name"] == tag_to_check: - tag_exists = True - break - - if tag_exists: - logger.info(f"Tag '{tag_to_check}' already exists in Pulp repository {repo_name}. No sync needed - image is already available.") + logger.info(f"Distribution {repo_name} does not exist yet - skipping tag validation, will create distribution") + # Skip tag validation but continue to create distribution at line 221 else: - logger.error(f"Sync completed but repository version did not change for {repo_name}. " - f"Version remained at {version_after}. " - f"Tag '{tag_to_check}' does not exist in Pulp repository content. " - f"This indicates the tag likely does not exist in the upstream registry.") - return False + # Distribution exists, validate the tag + dist_data = dist_result["stdout"] + if not isinstance(dist_data, dict) or "repository" not in dist_data: + logger.error(f"Invalid distribution data for {repo_name}. 
Assuming tag doesn't exist.") + return False + repo_href = dist_data["repository"] + logger.info(f"Found repository href: {repo_href}") + + # Step 2: Get repository version href + repo_command = f"pulp container repository show --href {repo_href}" + repo_result = execute_command(repo_command, logger, type_json=True) + + if not repo_result or not isinstance(repo_result, dict) or "stdout" not in repo_result: + logger.error(f"Failed to get repository info for {repo_href}. Assuming tag doesn't exist.") + return False + + repo_data = repo_result["stdout"] + if not isinstance(repo_data, dict) or "latest_version_href" not in repo_data: + logger.error(f"Invalid repository data for {repo_href}. Assuming tag doesn't exist.") + return False + + repo_ver_href = repo_data["latest_version_href"] + logger.info(f"Found repository version href: {repo_ver_href}") + + # Step 3: Check if tag exists in content + tags_command = ( + f"pulp show --href " + f"'/pulp/api/v3/content/container/tags/" + f"?repository_version={repo_ver_href}'" + ) + tags_result = execute_command(tags_command, logger, type_json=True) + + if not tags_result or not isinstance(tags_result, dict) or "stdout" not in tags_result: + logger.error(f"Failed to get content tags for {repo_ver_href}. Assuming tag doesn't exist.") + return False + + tags_data = tags_result["stdout"] + if not isinstance(tags_data, dict) or "results" not in tags_data: + logger.error(f"Invalid tags data for {repo_ver_href}. Assuming tag doesn't exist.") + return False + + tags = tags_data["results"] + tag_exists = False + + # Use the tag parameter if provided, otherwise fall back to checking package_content + tag_to_check = tag if tag else package_content + + for tag_item in tags: + if isinstance(tag_item, dict) and "name" in tag_item and tag_item["name"] == tag_to_check: + tag_exists = True + break + + if tag_exists: + logger.info(f"Tag '{tag_to_check}' already exists in Pulp repository {repo_name}. 
No sync needed - image is already available.") + else: + logger.error(f"Sync completed but repository version did not change for {repo_name}. " + f"Version remained at {version_after}. " + f"Tag '{tag_to_check}' does not exist in Pulp repository content. " + f"This indicates the tag likely does not exist in the upstream registry.") + return False except Exception as e: - logger.error(f"Error checking repository tag existence: {e}. Assuming tag doesn't exist.") + logger.error( + f"Error checking repository tag existence: {e}. Assuming tag doesn't exist." + ) return False - - logger.info(f"Sync validation successful: repository {repo_name} version changed from {version_before} to {version_after}") - - result = create_container_distribution(repo_name,package_content,logger) + + logger.info( + f"Sync validation successful: repository {repo_name} version changed " + f"from {version_before} to {version_after}" + ) + result = create_container_distribution(repo_name, package_content, logger) return result except Exception as e: logger.error(f"Failed to synchronize repository {repo_name} with remote {remote_name}. 
Error: {e}") diff --git a/common/library/module_utils/local_repo/download_rpm.py b/common/library/module_utils/local_repo/download_rpm.py index 95b354dd6b..44b56c1799 100644 --- a/common/library/module_utils/local_repo/download_rpm.py +++ b/common/library/module_utils/local_repo/download_rpm.py @@ -20,7 +20,8 @@ import shutil from pathlib import Path from ansible.module_utils.local_repo.config import ( - DNF_COMMANDS + DNF_COMMANDS, + DNF_INFO_COMMANDS ) from multiprocessing import Lock from ansible.module_utils.local_repo.parse_and_download import write_status_to_file @@ -95,11 +96,30 @@ def process_rpm(package, repo_store_path, status_file_path, cluster_os_type, for pkg in rpm_list: # Get repo_name for this specific RPM from mapping pkg_repo_name = repo_mapping.get(pkg, "") - if any(pkg in line and ".rpm" in line for line in stdout_lines + stderr_lines): + # Check if package was downloaded successfully + # Look for "Already downloaded" or actual .rpm file in output + pkg_downloaded = False + for line in stdout_lines + stderr_lines: + if pkg in line and (".rpm" in line or "Already downloaded" in line): + pkg_downloaded = True + break + + # Also check for "No match for argument" or "No package" errors + pkg_not_found = False + for line in stderr_lines: + if pkg in line and ("No match for argument" in line or + "No package" in line or + "not found" in line.lower()): + pkg_not_found = True + break + + if pkg_downloaded and not pkg_not_found: downloaded.append(pkg) write_status_to_file(status_file_path, pkg, "rpm", "Success", logger, file_lock, pkg_repo_name) else: failed.append(pkg) + if pkg_not_found: + logger.warning(f"Package '{pkg}' not found in configured repositories") # Retry failed ones individually if failed: @@ -110,6 +130,15 @@ def process_rpm(package, repo_store_path, status_file_path, cluster_os_type, # Get repo_name for this specific RPM from mapping pkg_repo_name = repo_mapping.get(pkg, "") + # Check for package not found errors + retry_stderr = 
retry_res.stderr.lower() + pkg_invalid = any(err in retry_stderr for err in [ + "no match for argument", + "no package", + "not found", + "unable to find a match" + ]) + if retry_res.returncode == 0 and ".rpm" in retry_res.stdout + retry_res.stderr: downloaded.append(pkg) failed.remove(pkg) @@ -117,7 +146,10 @@ def process_rpm(package, repo_store_path, status_file_path, cluster_os_type, logger.info(f"Package '{pkg}' downloaded successfully on retry.") else: write_status_to_file(status_file_path, pkg, "rpm", "Failed", logger, file_lock, pkg_repo_name) - logger.error(f"Package '{pkg}' still failed after retry.") + if pkg_invalid: + logger.error(f"Package '{pkg}' does not exist in configured repositories.") + else: + logger.error(f"Package '{pkg}' still failed after retry.") # Determine final status if not failed: @@ -128,12 +160,59 @@ def process_rpm(package, repo_store_path, status_file_path, cluster_os_type, status = "Failed" else: - status = "Success" logger.info("RPM won't be downloaded when repo_config is partial or never") + logger.info("Validating package availability using dnf info...") + + arch_key = "x86_64" if arc.lower() in ("x86_64") else "aarch64" + valid_packages = [] + invalid_packages = [] + for pkg in package["rpm_list"]: + # Validate package using dnf info + dnf_info_command = DNF_INFO_COMMANDS[arch_key] + [ + "--repo=*", # Search all enabled repositories + pkg + ] + result = subprocess.run( + dnf_info_command, + check=False, + capture_output=True, + text=True + ) # Get repo_name for this specific RPM from mapping pkg_repo_name = repo_mapping.get(pkg, "") - write_status_to_file(status_file_path, pkg, "rpm", "Success", logger, file_lock, pkg_repo_name) + if result.returncode == 0: + # Package exists and is available + valid_packages.append(pkg) + write_status_to_file( + status_file_path, pkg, "rpm", "Success", + logger, file_lock, pkg_repo_name + ) + logger.info(f"Package '{pkg}' validated successfully") + else: + # Package not found or invalid + 
invalid_packages.append(pkg) + write_status_to_file( + status_file_path, pkg, "rpm", "Failed", + logger, file_lock, pkg_repo_name + ) + logger.error( + f"Package '{pkg}' validation failed. " + f"Package may not exist in configured repositories." + ) + + # Determine final status based on validation results + if not invalid_packages: + status = "Success" + elif valid_packages: + status = "Partial" + else: + status = "Failed" + + logger.info( + f"Validation complete - Valid: {len(valid_packages)}, " + f"Invalid: {len(invalid_packages)}" + ) except Exception as e: logger.error(f"Exception occurred: {e}") From 5a03ffcf03c7dc7612ccd59f501b495fa105d4d1 Mon Sep 17 00:00:00 2001 From: Vrinda_Marwah Date: Thu, 19 Feb 2026 09:40:22 +0000 Subject: [PATCH 45/77] checkmarx fixes Signed-off-by: Vrinda_Marwah --- .../module_utils/local_repo/software_utils.py | 38 ++++++++++++++++++- .../local_repo/user_image_utility.py | 8 ++-- 2 files changed, 42 insertions(+), 4 deletions(-) diff --git a/common/library/module_utils/local_repo/software_utils.py b/common/library/module_utils/local_repo/software_utils.py index 3e06ddc7cd..126020f930 100644 --- a/common/library/module_utils/local_repo/software_utils.py +++ b/common/library/module_utils/local_repo/software_utils.py @@ -21,6 +21,7 @@ import json import csv import re +import shlex import yaml from jinja2 import Template import requests @@ -526,6 +527,37 @@ def get_failed_software(file_path): ] return failed_software +def _sanitize_shell_arg(value, logger, field_name="value"): + """ + Sanitize a value before using it in a shell command to prevent argument injection. + + Validates the value against a strict allowlist of characters that are safe + for shell interpolation, then applies shlex.quote for safe shell escaping. + + Args: + value (str): The value to sanitize. + logger (logging.Logger): Logger instance. + field_name (str): Name of the field being sanitized (for logging). + + Returns: + str: The sanitized, shell-quoted value. 
+ + Raises: + ValueError: If the value contains disallowed characters. + """ + if not isinstance(value, str) or not value: + raise ValueError(f"Invalid {field_name}: must be a non-empty string") + value = value.strip().strip('"') + safe_pattern = re.compile(r'^[a-zA-Z0-9._\-/:@=?&\[\]]+$') + if not safe_pattern.match(value): + logger.error("Potentially unsafe characters detected in %s: %s", field_name, value) + raise ValueError( + f"Invalid {field_name}{value}: contains disallowed characters. " + f"Only alphanumeric characters and ._-/:@=?&[] are allowed." + ) + return shlex.quote(value) + + def check_additional_image_in_pulp(image_entry, logger): """ Checks if image present in additional_packages.json is configured in Pulp. @@ -536,6 +568,8 @@ def check_additional_image_in_pulp(image_entry, logger): logger.info("Checking if %s is present in Pulp", image_name) + _sanitize_shell_arg(image_name, logger, "image_name") + dist_name_prefix = "container_repo_" transformed_dist_name = (f"{dist_name_prefix}{image_name.replace('/', '_').replace(':', '_')}") @@ -543,7 +577,7 @@ def check_additional_image_in_pulp(image_entry, logger): latest_version_href_result = None tags_output_result = None - show_dist_cmd = (pulp_container_commands["container_distribution_show"] % transformed_dist_name) + show_dist_cmd = (pulp_container_commands["container_distribution_show"] % shlex.quote(transformed_dist_name)) repo_href_result = execute_command(show_dist_cmd, logger) logger.info("repo_href_result: %s", repo_href_result) @@ -557,6 +591,7 @@ def check_additional_image_in_pulp(image_entry, logger): else: logger.info("Distribution %s found in Pulp", transformed_dist_name) repo_href = repo_href_result["stdout"] + repo_href = _sanitize_shell_arg(repo_href, logger, "repo_href") show_repo_cmd = (pulp_container_commands["show_repository_version"] % repo_href) latest_version_href_result = execute_command(show_repo_cmd, logger) logger.info("latest_version_href_result: %s", 
latest_version_href_result) @@ -570,6 +605,7 @@ def check_additional_image_in_pulp(image_entry, logger): else: logger.info("Repository version found in Pulp") latest_version_href = latest_version_href_result["stdout"] + latest_version_href = _sanitize_shell_arg(latest_version_href, logger, "latest_version_href") show_tags_cmd = (pulp_container_commands["list_image_tags"] % latest_version_href) tags_output_result = execute_command(show_tags_cmd, logger, type_json=True) logger.info("tags_output_result: %s", tags_output_result) diff --git a/common/library/module_utils/local_repo/user_image_utility.py b/common/library/module_utils/local_repo/user_image_utility.py index e97e9411dd..4c68cd1803 100644 --- a/common/library/module_utils/local_repo/user_image_utility.py +++ b/common/library/module_utils/local_repo/user_image_utility.py @@ -58,9 +58,11 @@ def check_image_in_registry( """ if not host.startswith(("http://", "https://")): - protocol = "https" if (cacert and key) else "http" - host = f"{protocol}://{host}" - image_url = f"{host}/v2/{image}/manifests/{tag}" + if cacert and key: + image_url = f"https://{host}/v2/{image}/manifests/{tag}" + else: + image_url = f"http://{host}/v2/{image}/manifests/{tag}" + logger.info(f"Checking image existence at: {image_url}") try: From fa0cd325ee7b38bafedc794c6bb47242d88323f1 Mon Sep 17 00:00:00 2001 From: Jagadeesh N V Date: Thu, 19 Feb 2026 17:19:13 +0530 Subject: [PATCH 46/77] Updated permission for slurmdbd Added new force_conf option for allowing confs pass through validation --- .../input_validation/schema/omnia_config.json | 4 ++ .../validation_flows/common_validation.py | 9 ++- .../slurm_config/tasks/build_slurm_conf.yml | 5 ++ discovery/roles/slurm_config/tasks/confs.yml | 14 ++-- .../slurm_config/tasks/create_slurm_dir.yml | 1 + .../tasks/handle_forced_confs.yml | 64 +++++++++++++++++++ .../roles/slurm_config/tasks/remove_node.yml | 2 +- discovery/roles/slurm_config/vars/main.yml | 3 +- input/omnia_config.yml | 10 +++ 9 
files changed, 102 insertions(+), 10 deletions(-) create mode 100644 discovery/roles/slurm_config/tasks/handle_forced_confs.yml diff --git a/common/library/module_utils/input_validation/schema/omnia_config.json b/common/library/module_utils/input_validation/schema/omnia_config.json index f53485770f..f7771d9441 100644 --- a/common/library/module_utils/input_validation/schema/omnia_config.json +++ b/common/library/module_utils/input_validation/schema/omnia_config.json @@ -19,6 +19,10 @@ "minLength": 1, "description": "Name of the nfs storage in storage_config.yml" }, + "force_conf": { + "type": "boolean", + "description": "Variable indicates whether a specific configuration file path under config_sources should be used as-is without merging" + }, "config_sources": { "type": "object", "description": "Config can be a file path or inline mapping", diff --git a/common/library/module_utils/input_validation/validation_flows/common_validation.py b/common/library/module_utils/input_validation/validation_flows/common_validation.py index f577a4e9b8..7726df24fb 100644 --- a/common/library/module_utils/input_validation/validation_flows/common_validation.py +++ b/common/library/module_utils/input_validation/validation_flows/common_validation.py @@ -1074,9 +1074,12 @@ def validate_omnia_config( "slurm NFS not provided", f"NFS name {', '.join(diff_set)} required for slurm is not defined in {storage_config}" )) - cnfg_src = [clst.get('config_sources', {}) for clst in data.get('slurm_cluster')] + skip_conf_validation = os.path.exists("/opt/omnia/input/.skip_slurm_conf_validation") - for cfg_path_dict in cnfg_src: + cnfg_src = [clst.get('config_sources', {}) for clst in data.get('slurm_cluster')] + force_conf_list = [clst.get('force_conf', False) for clst in data.get('slurm_cluster')] + for idx, cfg_path_dict in enumerate(cnfg_src): + force_conf = force_conf_list[idx] for k,v in cfg_path_dict.items(): conf_dict = None if isinstance(v, str): @@ -1086,7 +1089,7 @@ def 
validate_omnia_config( f"provided conf path for {k} - {v} does not exist")) continue else: # path exists - if not skip_conf_validation: + if not force_conf and not skip_conf_validation: conf_dict, duplicate_keys = parse_slurm_conf(v, k, False) if duplicate_keys: errors.append( diff --git a/discovery/roles/slurm_config/tasks/build_slurm_conf.yml b/discovery/roles/slurm_config/tasks/build_slurm_conf.yml index 9d5d0f0944..40b6137172 100644 --- a/discovery/roles/slurm_config/tasks/build_slurm_conf.yml +++ b/discovery/roles/slurm_config/tasks/build_slurm_conf.yml @@ -12,6 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. --- +- name: Read NodeName parameters from iDRAC + ansible.builtin.include_tasks: read_node_idrac.yml + when: cmpt_list + loop: "{{ cmpt_list }}" + - name: Append node_params list into NodeName list ansible.builtin.set_fact: apply_config: "{{ apply_config | default({}) diff --git a/discovery/roles/slurm_config/tasks/confs.yml b/discovery/roles/slurm_config/tasks/confs.yml index c5f7953b0d..3764ecc18a 100644 --- a/discovery/roles/slurm_config/tasks/confs.yml +++ b/discovery/roles/slurm_config/tasks/confs.yml @@ -17,13 +17,16 @@ apply_config: "{{ __default_config }}" no_log: true -- name: Read NodeName parameters - ansible.builtin.include_tasks: read_node_idrac.yml - when: cmpt_list - loop: "{{ cmpt_list }}" +- name: Remove keys from conf_files if they have string values in configs_input (when force_conf is true) + ansible.builtin.set_fact: + conf_files: "{{ conf_files | difference(configs_input | dict2items | selectattr('value', 'string') | map(attribute='key') | list) }}" + when: + - force_conf | default(false) + - configs_input is defined - name: Build slurm.conf ansible.builtin.include_tasks: build_slurm_conf.yml + when: "'slurm' in conf_files" - name: Slurm dbd opts ansible.builtin.set_fact: @@ -167,12 +170,13 @@ - name: Generate slurmd opts for Configless # TODO: Move to $SLURMD_OPTIONS 
/etc/default/slurmd ansible.builtin.set_fact: conf_server: "--conf-server {{ ctld_list | map('regex_replace', '$', ':' ~ (slurm_conf_dict.get('SlurmctldPort', '6817') | string)) | join(',') }}" + when: slurm_conf_dict is defined - name: Write merged .conf ansible.builtin.copy: content: "{{ item.ini_lines | join('\n') }}\n" dest: "{{ slurm_config_path }}/{{ ctld_list[0] }}/etc/slurm/{{ item.item.key }}.conf" - mode: "0640" + mode: "{{ slurm_dbd_mode if item.item.key == 'slurmdbd' else slurm_mode }}" owner: "{{ slurm_user }}" group: "{{ slurm_user_group }}" remote_src: "{{ copy_from_oim }}" diff --git a/discovery/roles/slurm_config/tasks/create_slurm_dir.yml b/discovery/roles/slurm_config/tasks/create_slurm_dir.yml index e4ac760d77..f2182db18e 100644 --- a/discovery/roles/slurm_config/tasks/create_slurm_dir.yml +++ b/discovery/roles/slurm_config/tasks/create_slurm_dir.yml @@ -60,6 +60,7 @@ ansible.builtin.set_fact: cluster_name: "{{ slurm_cluster[0].cluster_name }}" configs_input: "{{ slurm_cluster[0].config_sources | default({}) | dict2items | rejectattr('value', 'falsy') | list | items2dict }}" + force_conf: "{{ slurm_cluster[0].force_conf | default(false) }}" slurm_config_path: "{{ share_path }}/{{ slurm_dir_name }}" controller_trackfile_path: "{{ share_path }}/ctld_track" diff --git a/discovery/roles/slurm_config/tasks/handle_forced_confs.yml b/discovery/roles/slurm_config/tasks/handle_forced_confs.yml new file mode 100644 index 0000000000..1862359cb1 --- /dev/null +++ b/discovery/roles/slurm_config/tasks/handle_forced_confs.yml @@ -0,0 +1,64 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +- name: Parse forced conf file from localhost + slurm_conf: + op: parse + conf_name: "{{ forced_conf }}" + path: "{{ configs_input[forced_conf] }}" + delegate_to: localhost + register: forced_conf_parsed + no_log: true + when: + - configs_input[forced_conf] is string + +- name: Use forced conf dict directly + ansible.builtin.set_fact: + forced_conf_dict: "{{ configs_input[forced_conf] }}" + no_log: true + when: + - configs_input[forced_conf] is mapping + +- name: Use parsed forced conf dict + ansible.builtin.set_fact: + forced_conf_dict: "{{ forced_conf_parsed.conf_dict }}" + no_log: true + when: + - configs_input[forced_conf] is string + - forced_conf_parsed is success + +- name: Convert forced conf to ini format + slurm_conf: + op: merge + conf_sources: "{{ [forced_conf_dict] }}" + conf_name: "{{ forced_conf }}" + register: forced_conf_result + delegate_to: localhost + no_log: true + when: + - forced_conf_dict is defined + +- name: Write forced .conf file as-is + ansible.builtin.copy: + content: "{{ forced_conf_result.ini_lines | join('\n') }}\n" + dest: "{{ slurm_config_path }}/{{ ctld_list[0] }}/etc/slurm/{{ forced_conf }}.conf" + mode: "0640" + owner: "{{ slurm_user }}" + group: "{{ slurm_user_group }}" + remote_src: "{{ copy_from_oim }}" + register: forced_conf_written + no_log: true + when: + - forced_conf_result is defined + - forced_conf_result.ini_lines is defined diff --git a/discovery/roles/slurm_config/tasks/remove_node.yml b/discovery/roles/slurm_config/tasks/remove_node.yml index 4dc0217559..ba93bb086a 100644 --- 
a/discovery/roles/slurm_config/tasks/remove_node.yml +++ b/discovery/roles/slurm_config/tasks/remove_node.yml @@ -30,7 +30,7 @@ - name: Update normal partition Nodes to match cmpt_list ansible.builtin.set_fact: updated_partitions: "{{ updated_partitions | default([]) - + [item | combine({'Nodes': cmpt_list | join(',')}) if item.PartitionName == slurm_partition_name else item] }}" + + [item | combine({'Nodes': (cmpt_list | join(',')) if cmpt_list | length > 0 else 'ALL'}) if item.PartitionName == slurm_partition_name else item] }}" loop: "{{ slurm_conf_dict.PartitionName | default([]) }}" when: - "'slurm' in conf_merge_dict" diff --git a/discovery/roles/slurm_config/vars/main.yml b/discovery/roles/slurm_config/vars/main.yml index 39311ca64d..d708eb0777 100644 --- a/discovery/roles/slurm_config/vars/main.yml +++ b/discovery/roles/slurm_config/vars/main.yml @@ -68,6 +68,7 @@ gpu_slurm_conf: SlurmdParameters: l3cache_as_socket innodb_buffer_pool_size: 4G innodb_lock_wait_timeout: 900 +conf_server: "--conf-server {{ ctld_list | join(',') }}" # TODO tmp nodes_yaml: "{{ hostvars['localhost']['oim_shared_path'] }}/omnia/openchami/workdir/nodes/nodes.yaml" bmc_username: "{{ hostvars['localhost']['bmc_username'] }}" @@ -125,7 +126,7 @@ dbd_slurm_conf: AccountingStorageType: accounting_storage/slurmdbd partition_params: PartitionName: "{{ slurm_partition_name }}" - Nodes: "{{ cmpt_list | join(',') }}" + Nodes: "{{ cmpt_list | join(',') if cmpt_list else 'ALL' }}" MaxTime: "INFINITE" State: "UP" Default: "YES" diff --git a/input/omnia_config.yml b/input/omnia_config.yml index bb5a4f06fa..75cc599c81 100644 --- a/input/omnia_config.yml +++ b/input/omnia_config.yml @@ -27,6 +27,15 @@ # Storage name corresponding to the NFS share to be used by slurm cluster # This should match with exactly with a entry in storage_config.yml +# force_conf +# Variable indicates whether a specific configuration file path +# under config_sources should be used as-is without merging +# If force_conf is 
set to true for a configuration source path, +# that configuration file will be applied directly +# without merging with defaults or existing configurations +# It accepts true and false values +# Default value is false + # config_sources # defines how the Slurm configuration files are provided to the cluster. # : @@ -50,6 +59,7 @@ slurm_cluster: - cluster_name: slurm_cluster nfs_storage_name: nfs_slurm + # force_conf: true # config_sources: # slurm: # SlurmctldTimeout: 60 From 123df9514617a76a2fc6b376baca3070d92cf951 Mon Sep 17 00:00:00 2001 From: Jagadeesh N V Date: Thu, 19 Feb 2026 17:24:44 +0530 Subject: [PATCH 47/77] removede new file --- .../tasks/handle_forced_confs.yml | 64 ------------------- 1 file changed, 64 deletions(-) delete mode 100644 discovery/roles/slurm_config/tasks/handle_forced_confs.yml diff --git a/discovery/roles/slurm_config/tasks/handle_forced_confs.yml b/discovery/roles/slurm_config/tasks/handle_forced_confs.yml deleted file mode 100644 index 1862359cb1..0000000000 --- a/discovery/roles/slurm_config/tasks/handle_forced_confs.yml +++ /dev/null @@ -1,64 +0,0 @@ -# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
---- -- name: Parse forced conf file from localhost - slurm_conf: - op: parse - conf_name: "{{ forced_conf }}" - path: "{{ configs_input[forced_conf] }}" - delegate_to: localhost - register: forced_conf_parsed - no_log: true - when: - - configs_input[forced_conf] is string - -- name: Use forced conf dict directly - ansible.builtin.set_fact: - forced_conf_dict: "{{ configs_input[forced_conf] }}" - no_log: true - when: - - configs_input[forced_conf] is mapping - -- name: Use parsed forced conf dict - ansible.builtin.set_fact: - forced_conf_dict: "{{ forced_conf_parsed.conf_dict }}" - no_log: true - when: - - configs_input[forced_conf] is string - - forced_conf_parsed is success - -- name: Convert forced conf to ini format - slurm_conf: - op: merge - conf_sources: "{{ [forced_conf_dict] }}" - conf_name: "{{ forced_conf }}" - register: forced_conf_result - delegate_to: localhost - no_log: true - when: - - forced_conf_dict is defined - -- name: Write forced .conf file as-is - ansible.builtin.copy: - content: "{{ forced_conf_result.ini_lines | join('\n') }}\n" - dest: "{{ slurm_config_path }}/{{ ctld_list[0] }}/etc/slurm/{{ forced_conf }}.conf" - mode: "0640" - owner: "{{ slurm_user }}" - group: "{{ slurm_user_group }}" - remote_src: "{{ copy_from_oim }}" - register: forced_conf_written - no_log: true - when: - - forced_conf_result is defined - - forced_conf_result.ini_lines is defined From d78a74a04c8fb9e0555196a6be0287d0e2f4326d Mon Sep 17 00:00:00 2001 From: mithileshreddy04 Date: Thu, 19 Feb 2026 17:40:37 +0530 Subject: [PATCH 48/77] Lock Mechanism added for Upgrade Sequence Integrity --- build_image_aarch64/build_image_aarch64.yml | 3 + build_image_x86_64/build_image_x86_64.yml | 3 + discovery/discovery.yml | 3 + local_repo/local_repo.yml | 3 + omnia.sh | 87 ++++++++++++++----- prepare_oim/prepare_oim.yml | 3 + .../tasks/display_warnings.yml | 2 + upgrade/upgrade_omnia.yml | 10 +++ utils/upgrade_checkup.yml | 33 +++++++ 9 files changed, 125 insertions(+), 22 
deletions(-) create mode 100644 utils/upgrade_checkup.yml diff --git a/build_image_aarch64/build_image_aarch64.yml b/build_image_aarch64/build_image_aarch64.yml index 08ee0b4ad8..d5dc76a82d 100644 --- a/build_image_aarch64/build_image_aarch64.yml +++ b/build_image_aarch64/build_image_aarch64.yml @@ -13,6 +13,9 @@ # limitations under the License. --- +- name: Check if upgrade is in progress + ansible.builtin.import_playbook: ../utils/upgrade_checkup.yml + - name: Set_fact for fetch omnia config credentials hosts: localhost connection: local diff --git a/build_image_x86_64/build_image_x86_64.yml b/build_image_x86_64/build_image_x86_64.yml index 676d8adbd6..8f56b86ef6 100644 --- a/build_image_x86_64/build_image_x86_64.yml +++ b/build_image_x86_64/build_image_x86_64.yml @@ -13,6 +13,9 @@ # limitations under the License. --- +- name: Check if upgrade is in progress + ansible.builtin.import_playbook: ../utils/upgrade_checkup.yml + - name: Set_fact for fetch omnia config credentials hosts: localhost connection: local diff --git a/discovery/discovery.yml b/discovery/discovery.yml index 75efadb47c..40fd00123c 100644 --- a/discovery/discovery.yml +++ b/discovery/discovery.yml @@ -12,6 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. --- +- name: Check if upgrade is in progress + ansible.builtin.import_playbook: ../utils/upgrade_checkup.yml + - name: Include input project directory when: not project_dir_status | default(false) | bool ansible.builtin.import_playbook: ../utils/include_input_dir.yml diff --git a/local_repo/local_repo.yml b/local_repo/local_repo.yml index 3a743c3f47..963715b5e3 100644 --- a/local_repo/local_repo.yml +++ b/local_repo/local_repo.yml @@ -13,6 +13,9 @@ # limitations under the License. 
--- +- name: Check if upgrade is in progress + ansible.builtin.import_playbook: ../utils/upgrade_checkup.yml + - name: Set_fact for fetch omnia config credentials hosts: localhost connection: local diff --git a/omnia.sh b/omnia.sh index 3b320b0bf6..25cfb01dec 100755 --- a/omnia.sh +++ b/omnia.sh @@ -398,6 +398,19 @@ cleanup_omnia_core() { # Fetch the configuration from the Omnia core container. fetch_config + # Clear upgrade guard lock if present (shared path visible to container and host) + local upgrade_guard_lock_container="/opt/omnia/.data/upgrade_in_progress.lock" + local upgrade_guard_lock_host + upgrade_guard_lock_host=$(podman exec -u root omnia_core grep '^oim_shared_path:' /opt/omnia/.data/oim_metadata.yml 2>/dev/null | cut -d':' -f2- | tr -d ' \t\n\r') + if [ -n "$upgrade_guard_lock_host" ]; then + upgrade_guard_lock_host="$upgrade_guard_lock_host/omnia/.data/upgrade_in_progress.lock" + else + upgrade_guard_lock_host="$upgrade_guard_lock_container" + fi + + rm -f "$upgrade_guard_lock_host" >/dev/null 2>&1 || true + echo "[INFO] [CLEANUP] Cleared upgrade guard lock (if present): $upgrade_guard_lock_host" + # Remove the container remove_container @@ -1837,6 +1850,22 @@ upgrade_omnia_core() { touch "$lock_file" trap 'rm -f "$lock_file"' EXIT + # Create upgrade guard lock in shared path so other playbooks can block during upgrade + local upgrade_guard_lock_container="/opt/omnia/.data/upgrade_in_progress.lock" + local upgrade_guard_lock_host + upgrade_guard_lock_host=$(podman exec -u root omnia_core grep '^oim_shared_path:' /opt/omnia/.data/oim_metadata.yml 2>/dev/null | cut -d':' -f2- | tr -d ' \t\n\r') + if [ -n "$upgrade_guard_lock_host" ]; then + upgrade_guard_lock_host="$upgrade_guard_lock_host/omnia/.data/upgrade_in_progress.lock" + else + upgrade_guard_lock_host="$upgrade_guard_lock_container" + fi + + mkdir -p "$(dirname "$upgrade_guard_lock_host")" 2>/dev/null || true + echo "Upgrade in progress. Complete upgrade_omnia.yml or rollback to clear." 
> "$upgrade_guard_lock_host" || { + echo -e "${RED}ERROR: Failed to create upgrade guard lock: $upgrade_guard_lock_host${NC}" + exit 1 + } + # Run upgrade phases if ! phase1_validate; then echo "[ERROR] [ORCHESTRATOR] Upgrade failed in Phase 1" @@ -1874,8 +1903,10 @@ upgrade_omnia_core() { echo "[INFO] [ORCHESTRATOR] Upgrade completed successfully" echo "[INFO] [ORCHESTRATOR] Backup location (inside omnia_core container): $backup_base" + # Seed inputs and defaults after upgrade + post_setup_config + show_post_upgrade_instructions "$TARGET_OMNIA_VERSION" - # Initialize SSH config and start container session init_ssh_config start_container_session @@ -1885,15 +1916,15 @@ upgrade_omnia_core() { # Validate backup directory structure and files validate_backup_directory() { local backup_path="$1" - + echo "[INFO] [ROLLBACK] Validating backup directory: $backup_path" - + # Check if backup directory exists if ! podman exec -u root omnia_core test -d "$backup_path"; then echo "[ERROR] [ROLLBACK] Backup directory does not exist: $backup_path" return 1 fi - + # Check for required subdirectories for subdir in input metadata configs; do if ! podman exec -u root omnia_core test -d "$backup_path/$subdir"; then @@ -1901,24 +1932,24 @@ validate_backup_directory() { return 1 fi done - + # Check for required files if ! podman exec -u root omnia_core test -f "$backup_path/metadata/oim_metadata.yml"; then echo "[ERROR] [ROLLBACK] Missing metadata file: $backup_path/metadata/oim_metadata.yml" return 1 fi - + if ! podman exec -u root omnia_core test -f "$backup_path/configs/omnia_core.container"; then echo "[ERROR] [ROLLBACK] Missing container config: $backup_path/configs/omnia_core.container" return 1 fi - + # Verify metadata contains version information if ! 
podman exec -u root omnia_core grep -q "^omnia_version:" "$backup_path/metadata/oim_metadata.yml"; then echo "[ERROR] [ROLLBACK] Metadata file does not contain version information" return 1 fi - + echo "[INFO] [ROLLBACK] Backup validation successful" return 0 } @@ -1927,15 +1958,15 @@ validate_backup_directory() { stop_container_gracefully() { local container_name="$1" local timeout="${2:-30}" - + echo "[INFO] [ROLLBACK] Stopping $container_name container gracefully..." - + # Try graceful stop first if podman stop -t "$timeout" "$container_name" >/dev/null 2>&1; then echo "[INFO] [ROLLBACK] Container stopped gracefully" return 0 fi - + # Check if container is still running if podman ps --format '{{.Names}}' | grep -qw "$container_name"; then echo "[WARN] [ROLLBACK] Graceful stop failed, force stopping container..." @@ -1947,16 +1978,16 @@ stop_container_gracefully() { return 1 fi fi - + return 0 } # Restore files from backup restore_from_backup() { local backup_path="$1" - + echo "[INFO] [ROLLBACK] Restoring from backup: $backup_path" - + # Restore input files if ! podman exec -u root omnia_core bash -c " set -e @@ -1966,19 +1997,19 @@ restore_from_backup() { echo "[ERROR] [ROLLBACK] Failed to restore input files" return 1 fi - + # Restore metadata if ! podman exec -u root omnia_core cp -a "$backup_path/metadata/oim_metadata.yml" /opt/omnia/.data/; then echo "[ERROR] [ROLLBACK] Failed to restore metadata" return 1 fi - + # Restore container config on host if ! podman cp "omnia_core:$backup_path/configs/omnia_core.container" /etc/containers/systemd/; then echo "[ERROR] [ROLLBACK] Failed to restore container config" return 1 fi - + echo "[INFO] [ROLLBACK] Files restored successfully" return 0 } @@ -2006,8 +2037,8 @@ display_cleanup_instructions() { echo -e "${YELLOW}1. Remove all container definitions: cd /etc/containers/systemd${NC}" echo -e "${YELLOW}2. Delete all container files: rm -rf *${NC}" echo -e "${YELLOW}3. 
Reload systemd daemon: systemctl daemon-reload${NC}" - echo -e "${YELLOW}4. Stop all containers: podman stop \$(podman ps -aq)${NC}" - echo -e "${YELLOW}5. Remove all containers: podman rm -f \$(podman ps -aq)${NC}" + echo -e "${YELLOW}4. Stop all containers: podman stop $(podman ps -aq)${NC}" + echo -e "${YELLOW}5. Remove all containers: podman rm -f $(podman ps -aq)${NC}" echo -e "${YELLOW}6. Clean shared path: rm -rf ${NC}" echo -e "${YELLOW}7. Install required version: ./omnia.sh --install${NC}" echo "" @@ -2015,7 +2046,6 @@ display_cleanup_instructions() { echo "" } -# Main rollback function rollback_omnia_core() { echo -e "${GREEN}================================================================================${NC}" echo -e "${GREEN} OMNIA CORE ROLLBACK${NC}" @@ -2287,7 +2317,20 @@ rollback_omnia_core() { # Clean up lock file before starting long-running ssh session rm -f "$lock_file" >/dev/null 2>&1 || true echo "[INFO] Rollback lock file removed before starting container session" - + + # Clear upgrade guard lock if it exists (shared path visible to container and host) + local upgrade_guard_lock_container="/opt/omnia/.data/upgrade_in_progress.lock" + local upgrade_guard_lock_host + upgrade_guard_lock_host=$(podman exec -u root omnia_core grep '^oim_shared_path:' /opt/omnia/.data/oim_metadata.yml 2>/dev/null | cut -d':' -f2- | tr -d ' \t\n\r') + if [ -n "$upgrade_guard_lock_host" ]; then + upgrade_guard_lock_host="$upgrade_guard_lock_host/omnia/.data/upgrade_in_progress.lock" + else + upgrade_guard_lock_host="$upgrade_guard_lock_container" + fi + + rm -f "$upgrade_guard_lock_host" >/dev/null 2>&1 || true + echo "[INFO] [ROLLBACK] Cleared upgrade guard lock: $upgrade_guard_lock_host" + # Initialize SSH config and start container session init_ssh_config start_container_session @@ -2325,4 +2368,4 @@ main() { } # Call the main function -main "$1" +main "$1" \ No newline at end of file diff --git a/prepare_oim/prepare_oim.yml b/prepare_oim/prepare_oim.yml index 
50c48fd3e5..f5ea607994 100644 --- a/prepare_oim/prepare_oim.yml +++ b/prepare_oim/prepare_oim.yml @@ -13,6 +13,9 @@ # limitations under the License. --- +- name: Check if upgrade is in progress + ansible.builtin.import_playbook: ../utils/upgrade_checkup.yml + - name: Set_fact for fetch omnia config credentials hosts: localhost connection: local diff --git a/upgrade/roles/import_input_parameters/tasks/display_warnings.yml b/upgrade/roles/import_input_parameters/tasks/display_warnings.yml index 2cc6dfed26..444869291b 100644 --- a/upgrade/roles/import_input_parameters/tasks/display_warnings.yml +++ b/upgrade/roles/import_input_parameters/tasks/display_warnings.yml @@ -29,6 +29,7 @@ - name: Pause for user to review warnings ansible.builtin.pause: + seconds: 30 prompt: | ╔════════════════════════════════════════════╗ ║ ⚠️ UPGRADE WARNINGS REVIEW ⚠️ ║ @@ -42,6 +43,7 @@ Please review these warnings carefully. Press ENTER to continue or CTRL+C to abort. + Continuing automatically in 30 seconds... when: - upgrade_warnings is defined - upgrade_warnings | length > 0 diff --git a/upgrade/upgrade_omnia.yml b/upgrade/upgrade_omnia.yml index 61050ec244..ade6b1f173 100644 --- a/upgrade/upgrade_omnia.yml +++ b/upgrade/upgrade_omnia.yml @@ -18,3 +18,13 @@ - name: Upgrade cluster tasks ansible.builtin.import_playbook: upgrade_cluster.yml + +- name: Clear upgrade guard lock + hosts: localhost + connection: local + gather_facts: false + tasks: + - name: Remove upgrade guard lock + ansible.builtin.file: + path: /opt/omnia/.data/upgrade_in_progress.lock + state: absent diff --git a/utils/upgrade_checkup.yml b/utils/upgrade_checkup.yml new file mode 100644 index 0000000000..5fb8582000 --- /dev/null +++ b/utils/upgrade_checkup.yml @@ -0,0 +1,33 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +- name: "Guard: block if upgrade is in progress" + hosts: localhost + connection: local + gather_facts: false + tasks: + - name: Check upgrade lock file + ansible.builtin.stat: + path: /opt/omnia/.data/upgrade_in_progress.lock + register: upgrade_lock + + - name: Block playbook while upgrade is in progress + ansible.builtin.fail: + msg: >- + Upgrade is not completed fully. + Please run upgrade_omnia.yml to complete upgrade before running any other playbook using the below command: + "ansible-playbook /omnia/upgrade/upgrade_omnia.yml" + If you don't require input files to be migrated, reconfigure the default input files, remove the lock file using the following command + "rm /opt/omnia/.data/upgrade_in_progress.lock" and then proceed. 
+ when: upgrade_lock.stat.exists From 3a9ef0a8a231bf308ab5953450d01014dcab518f Mon Sep 17 00:00:00 2001 From: Jagadeesh N V Date: Thu, 19 Feb 2026 17:58:30 +0530 Subject: [PATCH 49/77] Renamed force_conf to skip_merge --- .../input_validation/schema/omnia_config.json | 2 +- .../validation_flows/common_validation.py | 8 ++++---- discovery/roles/slurm_config/tasks/confs.yml | 4 ++-- discovery/roles/slurm_config/tasks/create_slurm_dir.yml | 2 +- input/omnia_config.yml | 6 +++--- 5 files changed, 11 insertions(+), 11 deletions(-) diff --git a/common/library/module_utils/input_validation/schema/omnia_config.json b/common/library/module_utils/input_validation/schema/omnia_config.json index f7771d9441..ca7266124c 100644 --- a/common/library/module_utils/input_validation/schema/omnia_config.json +++ b/common/library/module_utils/input_validation/schema/omnia_config.json @@ -19,7 +19,7 @@ "minLength": 1, "description": "Name of the nfs storage in storage_config.yml" }, - "force_conf": { + "skip_merge": { "type": "boolean", "description": "Variable indicates whether a specific configuration file path under config_sources should be used as-is without merging" }, diff --git a/common/library/module_utils/input_validation/validation_flows/common_validation.py b/common/library/module_utils/input_validation/validation_flows/common_validation.py index 7726df24fb..36f55130d4 100644 --- a/common/library/module_utils/input_validation/validation_flows/common_validation.py +++ b/common/library/module_utils/input_validation/validation_flows/common_validation.py @@ -1074,12 +1074,12 @@ def validate_omnia_config( "slurm NFS not provided", f"NFS name {', '.join(diff_set)} required for slurm is not defined in {storage_config}" )) - + skip_conf_validation = os.path.exists("/opt/omnia/input/.skip_slurm_conf_validation") cnfg_src = [clst.get('config_sources', {}) for clst in data.get('slurm_cluster')] - force_conf_list = [clst.get('force_conf', False) for clst in data.get('slurm_cluster')] + 
skip_merge_list = [clst.get('skip_merge', False) for clst in data.get('slurm_cluster')] for idx, cfg_path_dict in enumerate(cnfg_src): - force_conf = force_conf_list[idx] + skip_merge = skip_merge_list[idx] for k,v in cfg_path_dict.items(): conf_dict = None if isinstance(v, str): @@ -1089,7 +1089,7 @@ def validate_omnia_config( f"provided conf path for {k} - {v} does not exist")) continue else: # path exists - if not force_conf and not skip_conf_validation: + if not skip_merge and not skip_conf_validation: conf_dict, duplicate_keys = parse_slurm_conf(v, k, False) if duplicate_keys: errors.append( diff --git a/discovery/roles/slurm_config/tasks/confs.yml b/discovery/roles/slurm_config/tasks/confs.yml index 3764ecc18a..1e5a4e507e 100644 --- a/discovery/roles/slurm_config/tasks/confs.yml +++ b/discovery/roles/slurm_config/tasks/confs.yml @@ -17,11 +17,11 @@ apply_config: "{{ __default_config }}" no_log: true -- name: Remove keys from conf_files if they have string values in configs_input (when force_conf is true) +- name: Remove keys from conf_files if they have string values in configs_input (when skip_merge is true) ansible.builtin.set_fact: conf_files: "{{ conf_files | difference(configs_input | dict2items | selectattr('value', 'string') | map(attribute='key') | list) }}" when: - - force_conf | default(false) + - skip_merge | default(false) - configs_input is defined - name: Build slurm.conf diff --git a/discovery/roles/slurm_config/tasks/create_slurm_dir.yml b/discovery/roles/slurm_config/tasks/create_slurm_dir.yml index f2182db18e..b68bcbbded 100644 --- a/discovery/roles/slurm_config/tasks/create_slurm_dir.yml +++ b/discovery/roles/slurm_config/tasks/create_slurm_dir.yml @@ -60,7 +60,7 @@ ansible.builtin.set_fact: cluster_name: "{{ slurm_cluster[0].cluster_name }}" configs_input: "{{ slurm_cluster[0].config_sources | default({}) | dict2items | rejectattr('value', 'falsy') | list | items2dict }}" - force_conf: "{{ slurm_cluster[0].force_conf | default(false) }}" + 
skip_merge: "{{ slurm_cluster[0].skip_merge | default(false) }}" slurm_config_path: "{{ share_path }}/{{ slurm_dir_name }}" controller_trackfile_path: "{{ share_path }}/ctld_track" diff --git a/input/omnia_config.yml b/input/omnia_config.yml index 75cc599c81..943d70e530 100644 --- a/input/omnia_config.yml +++ b/input/omnia_config.yml @@ -27,10 +27,10 @@ # Storage name corresponding to the NFS share to be used by slurm cluster # This should match with exactly with a entry in storage_config.yml -# force_conf +# skip_merge # Variable indicates whether a specific configuration file path # under config_sources should be used as-is without merging -# If force_conf is set to true for a configuration source path, +# If skip_merge is set to true for a configuration source path, # that configuration file will be applied directly # without merging with defaults or existing configurations # It accepts true and false values @@ -59,7 +59,7 @@ slurm_cluster: - cluster_name: slurm_cluster nfs_storage_name: nfs_slurm - # force_conf: true + # skip_merge: true # config_sources: # slurm: # SlurmctldTimeout: 60 From f12996cec9561ad0b027a3ff149468e284d760ed Mon Sep 17 00:00:00 2001 From: mithileshreddy04 Date: Fri, 20 Feb 2026 10:49:42 +0530 Subject: [PATCH 50/77] Update omnia.sh --- omnia.sh | 59 ++++++++++++++++++++++++-------------------------------- 1 file changed, 25 insertions(+), 34 deletions(-) diff --git a/omnia.sh b/omnia.sh index 25cfb01dec..530c168e7d 100755 --- a/omnia.sh +++ b/omnia.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright © 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -299,7 +299,18 @@ update_metadata_upgrade_backup_dir() { " } - +# Resolve the upgrade guard lock path (container or host shared path) +get_upgrade_guard_lock_path() { + local upgrade_guard_lock_container="/opt/omnia/.data/upgrade_in_progress.lock" + local upgrade_guard_lock_host + upgrade_guard_lock_host=$(podman exec -u root omnia_core grep '^oim_shared_path:' /opt/omnia/.data/oim_metadata.yml 2>/dev/null | cut -d':' -f2- | tr -d ' \t\n\r') + if [ -n "$upgrade_guard_lock_host" ]; then + upgrade_guard_lock_host="$upgrade_guard_lock_host/omnia/.data/upgrade_in_progress.lock" + else + upgrade_guard_lock_host="$upgrade_guard_lock_container" + fi + echo "$upgrade_guard_lock_host" +} check_internal_nfs_export() { nfs_server_ip=$1 @@ -399,17 +410,9 @@ cleanup_omnia_core() { fetch_config # Clear upgrade guard lock if present (shared path visible to container and host) - local upgrade_guard_lock_container="/opt/omnia/.data/upgrade_in_progress.lock" - local upgrade_guard_lock_host - upgrade_guard_lock_host=$(podman exec -u root omnia_core grep '^oim_shared_path:' /opt/omnia/.data/oim_metadata.yml 2>/dev/null | cut -d':' -f2- | tr -d ' \t\n\r') - if [ -n "$upgrade_guard_lock_host" ]; then - upgrade_guard_lock_host="$upgrade_guard_lock_host/omnia/.data/upgrade_in_progress.lock" - else - upgrade_guard_lock_host="$upgrade_guard_lock_container" - fi - - rm -f "$upgrade_guard_lock_host" >/dev/null 2>&1 || true - echo "[INFO] [CLEANUP] Cleared upgrade guard lock (if present): $upgrade_guard_lock_host" + local upgrade_guard_lock_path=$(get_upgrade_guard_lock_path) + rm -f "$upgrade_guard_lock_path" >/dev/null 2>&1 || true + echo "[INFO] [CLEANUP] Cleared upgrade guard lock (if present): $upgrade_guard_lock_path" # Remove the container remove_container @@ -1851,18 +1854,12 @@ upgrade_omnia_core() { trap 'rm -f "$lock_file"' EXIT # Create upgrade guard lock in shared path so other playbooks can block during upgrade - local 
upgrade_guard_lock_container="/opt/omnia/.data/upgrade_in_progress.lock" - local upgrade_guard_lock_host - upgrade_guard_lock_host=$(podman exec -u root omnia_core grep '^oim_shared_path:' /opt/omnia/.data/oim_metadata.yml 2>/dev/null | cut -d':' -f2- | tr -d ' \t\n\r') - if [ -n "$upgrade_guard_lock_host" ]; then - upgrade_guard_lock_host="$upgrade_guard_lock_host/omnia/.data/upgrade_in_progress.lock" - else - upgrade_guard_lock_host="$upgrade_guard_lock_container" - fi + local upgrade_guard_lock_path + upgrade_guard_lock_path=$(get_upgrade_guard_lock_path) - mkdir -p "$(dirname "$upgrade_guard_lock_host")" 2>/dev/null || true - echo "Upgrade in progress. Complete upgrade_omnia.yml or rollback to clear." > "$upgrade_guard_lock_host" || { - echo -e "${RED}ERROR: Failed to create upgrade guard lock: $upgrade_guard_lock_host${NC}" + mkdir -p "$(dirname "$upgrade_guard_lock_path")" 2>/dev/null || true + echo "Upgrade in progress. Complete upgrade_omnia.yml or rollback to clear." > "$upgrade_guard_lock_path" || { + echo -e "${RED}ERROR: Failed to create upgrade guard lock: $upgrade_guard_lock_path${NC}" exit 1 } @@ -2319,17 +2316,11 @@ rollback_omnia_core() { echo "[INFO] Rollback lock file removed before starting container session" # Clear upgrade guard lock if it exists (shared path visible to container and host) - local upgrade_guard_lock_container="/opt/omnia/.data/upgrade_in_progress.lock" - local upgrade_guard_lock_host - upgrade_guard_lock_host=$(podman exec -u root omnia_core grep '^oim_shared_path:' /opt/omnia/.data/oim_metadata.yml 2>/dev/null | cut -d':' -f2- | tr -d ' \t\n\r') - if [ -n "$upgrade_guard_lock_host" ]; then - upgrade_guard_lock_host="$upgrade_guard_lock_host/omnia/.data/upgrade_in_progress.lock" - else - upgrade_guard_lock_host="$upgrade_guard_lock_container" - fi + local upgrade_guard_lock_path + upgrade_guard_lock_path=$(get_upgrade_guard_lock_path) - rm -f "$upgrade_guard_lock_host" >/dev/null 2>&1 || true - echo "[INFO] [ROLLBACK] Cleared 
upgrade guard lock: $upgrade_guard_lock_host" + rm -f "$upgrade_guard_lock_path" >/dev/null 2>&1 || true + echo "[INFO] [ROLLBACK] Cleared upgrade guard lock: $upgrade_guard_lock_path" # Initialize SSH config and start container session init_ssh_config From 08dd3e9c06dd86f6b3314cc0f96572218cd42a5f Mon Sep 17 00:00:00 2001 From: Nagachandan-P Date: Fri, 20 Feb 2026 05:30:41 +0000 Subject: [PATCH 51/77] auto-backup of slurm-confs --- discovery/roles/slurm_config/tasks/confs.yml | 57 ++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/discovery/roles/slurm_config/tasks/confs.yml b/discovery/roles/slurm_config/tasks/confs.yml index c5f7953b0d..1885347260 100644 --- a/discovery/roles/slurm_config/tasks/confs.yml +++ b/discovery/roles/slurm_config/tasks/confs.yml @@ -191,6 +191,63 @@ loop_control: loop_var: extra_conf +- name: Backup Slurm configuration files when changed + when: + - ctld_conf_files is changed + - ctld_list is defined + - ctld_list | length > 0 + block: + - name: Set backup timestamp + ansible.builtin.set_fact: + backup_timestamp: "{{ ansible_date_time.date }}_{{ ansible_date_time.time | replace(':', '-') }}" + backup_base_name: "auto_backup_discovery" + + - name: Set backup name suffix + ansible.builtin.set_fact: + backup_name_suffix: "{{ backup_base_name ~ '_' ~ backup_timestamp }}" + + - name: Set backup directories + ansible.builtin.set_fact: + slurm_backups_root: "{{ share_path }}/slurm_backups" + backup_dir: "{{ share_path }}/slurm_backups/{{ backup_base_name ~ '_' ~ backup_timestamp }}" + + - name: Ensure slurm backups root exists + ansible.builtin.file: + path: "{{ slurm_backups_root }}" + state: directory + mode: '0755' + + - name: Create backup directory + ansible.builtin.file: + path: "{{ backup_dir }}" + state: directory + mode: '0755' + + - name: Create backup config directories + ansible.builtin.file: + path: "{{ backup_dir }}/{{ ctld_list[0] }}/{{ item }}" + state: directory + mode: '0755' + loop: + - etc/slurm + - etc/munge 
+ - etc/my.cnf.d + + - name: Backup controller config directories + ansible.builtin.command: >- + cp -a "{{ slurm_config_path }}/{{ ctld_list[0] }}/{{ item }}/." "{{ backup_dir }}/{{ ctld_list[0] }}/{{ item }}/" + loop: + - etc/slurm + - etc/munge + - etc/my.cnf.d + changed_when: true + failed_when: false + + - name: Display backup location + ansible.builtin.debug: + msg: "Slurm config backup created at: {{ backup_dir }}/{{ ctld_list[0] }}" + + - name: Check if cluster running ansible.builtin.include_tasks: check_ctld_running.yml when: From 7169855081ecefbf6183cc218cd41896683ce49b Mon Sep 17 00:00:00 2001 From: mithileshreddy04 Date: Fri, 20 Feb 2026 15:09:06 +0530 Subject: [PATCH 52/77] upgrade utility added to oim_cleanup and credential utility --- utils/credential_utility/get_config_credentials.yml | 4 ++++ utils/oim_cleanup.yml | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/utils/credential_utility/get_config_credentials.yml b/utils/credential_utility/get_config_credentials.yml index 0e4c323b94..b77ba14b9b 100644 --- a/utils/credential_utility/get_config_credentials.yml +++ b/utils/credential_utility/get_config_credentials.yml @@ -13,6 +13,10 @@ # limitations under the License. --- +- name: Check if upgrade is in progress + ansible.builtin.import_playbook: ../upgrade_checkup.yml + tags: always + - name: Include input project directory when: not project_dir_status | default(false) | bool ansible.builtin.import_playbook: ../include_input_dir.yml diff --git a/utils/oim_cleanup.yml b/utils/oim_cleanup.yml index edb9cfb207..4d959d5ea4 100644 --- a/utils/oim_cleanup.yml +++ b/utils/oim_cleanup.yml @@ -13,6 +13,10 @@ # limitations under the License. 
--- +- name: Check if upgrade is in progress + ansible.builtin.import_playbook: upgrade_checkup.yml + tags: always + - name: Include input project directory when: not project_dir_status | default(false) | bool ansible.builtin.import_playbook: include_input_dir.yml From c42782c8481703c5d0c10ba3e36ee7e242bd0304 Mon Sep 17 00:00:00 2001 From: mithileshreddy04 Date: Fri, 20 Feb 2026 17:54:24 +0530 Subject: [PATCH 53/77] Lock Mechanism for Upgrade Sequence Integrity (#3994) --- build_image_aarch64/build_image_aarch64.yml | 3 + build_image_x86_64/build_image_x86_64.yml | 3 + discovery/discovery.yml | 3 + local_repo/local_repo.yml | 3 + omnia.sh | 82 +++++++++++++------ prepare_oim/prepare_oim.yml | 3 + .../tasks/display_warnings.yml | 2 + upgrade/upgrade_omnia.yml | 10 +++ .../get_config_credentials.yml | 4 + utils/oim_cleanup.yml | 4 + utils/upgrade_checkup.yml | 33 ++++++++ 11 files changed, 126 insertions(+), 24 deletions(-) create mode 100644 utils/upgrade_checkup.yml diff --git a/build_image_aarch64/build_image_aarch64.yml b/build_image_aarch64/build_image_aarch64.yml index 08ee0b4ad8..d5dc76a82d 100644 --- a/build_image_aarch64/build_image_aarch64.yml +++ b/build_image_aarch64/build_image_aarch64.yml @@ -13,6 +13,9 @@ # limitations under the License. --- +- name: Check if upgrade is in progress + ansible.builtin.import_playbook: ../utils/upgrade_checkup.yml + - name: Set_fact for fetch omnia config credentials hosts: localhost connection: local diff --git a/build_image_x86_64/build_image_x86_64.yml b/build_image_x86_64/build_image_x86_64.yml index 676d8adbd6..8f56b86ef6 100644 --- a/build_image_x86_64/build_image_x86_64.yml +++ b/build_image_x86_64/build_image_x86_64.yml @@ -13,6 +13,9 @@ # limitations under the License. 
--- +- name: Check if upgrade is in progress + ansible.builtin.import_playbook: ../utils/upgrade_checkup.yml + - name: Set_fact for fetch omnia config credentials hosts: localhost connection: local diff --git a/discovery/discovery.yml b/discovery/discovery.yml index 75efadb47c..40fd00123c 100644 --- a/discovery/discovery.yml +++ b/discovery/discovery.yml @@ -12,6 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. --- +- name: Check if upgrade is in progress + ansible.builtin.import_playbook: ../utils/upgrade_checkup.yml + - name: Include input project directory when: not project_dir_status | default(false) | bool ansible.builtin.import_playbook: ../utils/include_input_dir.yml diff --git a/local_repo/local_repo.yml b/local_repo/local_repo.yml index 3a743c3f47..963715b5e3 100644 --- a/local_repo/local_repo.yml +++ b/local_repo/local_repo.yml @@ -13,6 +13,9 @@ # limitations under the License. --- +- name: Check if upgrade is in progress + ansible.builtin.import_playbook: ../utils/upgrade_checkup.yml + - name: Set_fact for fetch omnia config credentials hosts: localhost connection: local diff --git a/omnia.sh b/omnia.sh index 3b320b0bf6..530c168e7d 100755 --- a/omnia.sh +++ b/omnia.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright © 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -299,7 +299,18 @@ update_metadata_upgrade_backup_dir() { " } - +# Resolve the upgrade guard lock path (container or host shared path) +get_upgrade_guard_lock_path() { + local upgrade_guard_lock_container="/opt/omnia/.data/upgrade_in_progress.lock" + local upgrade_guard_lock_host + upgrade_guard_lock_host=$(podman exec -u root omnia_core grep '^oim_shared_path:' /opt/omnia/.data/oim_metadata.yml 2>/dev/null | cut -d':' -f2- | tr -d ' \t\n\r') + if [ -n "$upgrade_guard_lock_host" ]; then + upgrade_guard_lock_host="$upgrade_guard_lock_host/omnia/.data/upgrade_in_progress.lock" + else + upgrade_guard_lock_host="$upgrade_guard_lock_container" + fi + echo "$upgrade_guard_lock_host" +} check_internal_nfs_export() { nfs_server_ip=$1 @@ -398,6 +409,11 @@ cleanup_omnia_core() { # Fetch the configuration from the Omnia core container. fetch_config + # Clear upgrade guard lock if present (shared path visible to container and host) + local upgrade_guard_lock_path=$(get_upgrade_guard_lock_path) + rm -f "$upgrade_guard_lock_path" >/dev/null 2>&1 || true + echo "[INFO] [CLEANUP] Cleared upgrade guard lock (if present): $upgrade_guard_lock_path" + # Remove the container remove_container @@ -1837,6 +1853,16 @@ upgrade_omnia_core() { touch "$lock_file" trap 'rm -f "$lock_file"' EXIT + # Create upgrade guard lock in shared path so other playbooks can block during upgrade + local upgrade_guard_lock_path + upgrade_guard_lock_path=$(get_upgrade_guard_lock_path) + + mkdir -p "$(dirname "$upgrade_guard_lock_path")" 2>/dev/null || true + echo "Upgrade in progress. Complete upgrade_omnia.yml or rollback to clear." > "$upgrade_guard_lock_path" || { + echo -e "${RED}ERROR: Failed to create upgrade guard lock: $upgrade_guard_lock_path${NC}" + exit 1 + } + # Run upgrade phases if ! 
phase1_validate; then echo "[ERROR] [ORCHESTRATOR] Upgrade failed in Phase 1" @@ -1874,8 +1900,10 @@ upgrade_omnia_core() { echo "[INFO] [ORCHESTRATOR] Upgrade completed successfully" echo "[INFO] [ORCHESTRATOR] Backup location (inside omnia_core container): $backup_base" + # Seed inputs and defaults after upgrade + post_setup_config + show_post_upgrade_instructions "$TARGET_OMNIA_VERSION" - # Initialize SSH config and start container session init_ssh_config start_container_session @@ -1885,15 +1913,15 @@ upgrade_omnia_core() { # Validate backup directory structure and files validate_backup_directory() { local backup_path="$1" - + echo "[INFO] [ROLLBACK] Validating backup directory: $backup_path" - + # Check if backup directory exists if ! podman exec -u root omnia_core test -d "$backup_path"; then echo "[ERROR] [ROLLBACK] Backup directory does not exist: $backup_path" return 1 fi - + # Check for required subdirectories for subdir in input metadata configs; do if ! podman exec -u root omnia_core test -d "$backup_path/$subdir"; then @@ -1901,24 +1929,24 @@ validate_backup_directory() { return 1 fi done - + # Check for required files if ! podman exec -u root omnia_core test -f "$backup_path/metadata/oim_metadata.yml"; then echo "[ERROR] [ROLLBACK] Missing metadata file: $backup_path/metadata/oim_metadata.yml" return 1 fi - + if ! podman exec -u root omnia_core test -f "$backup_path/configs/omnia_core.container"; then echo "[ERROR] [ROLLBACK] Missing container config: $backup_path/configs/omnia_core.container" return 1 fi - + # Verify metadata contains version information if ! 
podman exec -u root omnia_core grep -q "^omnia_version:" "$backup_path/metadata/oim_metadata.yml"; then echo "[ERROR] [ROLLBACK] Metadata file does not contain version information" return 1 fi - + echo "[INFO] [ROLLBACK] Backup validation successful" return 0 } @@ -1927,15 +1955,15 @@ validate_backup_directory() { stop_container_gracefully() { local container_name="$1" local timeout="${2:-30}" - + echo "[INFO] [ROLLBACK] Stopping $container_name container gracefully..." - + # Try graceful stop first if podman stop -t "$timeout" "$container_name" >/dev/null 2>&1; then echo "[INFO] [ROLLBACK] Container stopped gracefully" return 0 fi - + # Check if container is still running if podman ps --format '{{.Names}}' | grep -qw "$container_name"; then echo "[WARN] [ROLLBACK] Graceful stop failed, force stopping container..." @@ -1947,16 +1975,16 @@ stop_container_gracefully() { return 1 fi fi - + return 0 } # Restore files from backup restore_from_backup() { local backup_path="$1" - + echo "[INFO] [ROLLBACK] Restoring from backup: $backup_path" - + # Restore input files if ! podman exec -u root omnia_core bash -c " set -e @@ -1966,19 +1994,19 @@ restore_from_backup() { echo "[ERROR] [ROLLBACK] Failed to restore input files" return 1 fi - + # Restore metadata if ! podman exec -u root omnia_core cp -a "$backup_path/metadata/oim_metadata.yml" /opt/omnia/.data/; then echo "[ERROR] [ROLLBACK] Failed to restore metadata" return 1 fi - + # Restore container config on host if ! podman cp "omnia_core:$backup_path/configs/omnia_core.container" /etc/containers/systemd/; then echo "[ERROR] [ROLLBACK] Failed to restore container config" return 1 fi - + echo "[INFO] [ROLLBACK] Files restored successfully" return 0 } @@ -2006,8 +2034,8 @@ display_cleanup_instructions() { echo -e "${YELLOW}1. Remove all container definitions: cd /etc/containers/systemd${NC}" echo -e "${YELLOW}2. Delete all container files: rm -rf *${NC}" echo -e "${YELLOW}3. 
Reload systemd daemon: systemctl daemon-reload${NC}" - echo -e "${YELLOW}4. Stop all containers: podman stop \$(podman ps -aq)${NC}" - echo -e "${YELLOW}5. Remove all containers: podman rm -f \$(podman ps -aq)${NC}" + echo -e "${YELLOW}4. Stop all containers: podman stop $(podman ps -aq)${NC}" + echo -e "${YELLOW}5. Remove all containers: podman rm -f $(podman ps -aq)${NC}" echo -e "${YELLOW}6. Clean shared path: rm -rf ${NC}" echo -e "${YELLOW}7. Install required version: ./omnia.sh --install${NC}" echo "" @@ -2015,7 +2043,6 @@ display_cleanup_instructions() { echo "" } -# Main rollback function rollback_omnia_core() { echo -e "${GREEN}================================================================================${NC}" echo -e "${GREEN} OMNIA CORE ROLLBACK${NC}" @@ -2287,7 +2314,14 @@ rollback_omnia_core() { # Clean up lock file before starting long-running ssh session rm -f "$lock_file" >/dev/null 2>&1 || true echo "[INFO] Rollback lock file removed before starting container session" - + + # Clear upgrade guard lock if it exists (shared path visible to container and host) + local upgrade_guard_lock_path + upgrade_guard_lock_path=$(get_upgrade_guard_lock_path) + + rm -f "$upgrade_guard_lock_path" >/dev/null 2>&1 || true + echo "[INFO] [ROLLBACK] Cleared upgrade guard lock: $upgrade_guard_lock_path" + # Initialize SSH config and start container session init_ssh_config start_container_session @@ -2325,4 +2359,4 @@ main() { } # Call the main function -main "$1" +main "$1" \ No newline at end of file diff --git a/prepare_oim/prepare_oim.yml b/prepare_oim/prepare_oim.yml index 50c48fd3e5..f5ea607994 100644 --- a/prepare_oim/prepare_oim.yml +++ b/prepare_oim/prepare_oim.yml @@ -13,6 +13,9 @@ # limitations under the License. 
--- +- name: Check if upgrade is in progress + ansible.builtin.import_playbook: ../utils/upgrade_checkup.yml + - name: Set_fact for fetch omnia config credentials hosts: localhost connection: local diff --git a/upgrade/roles/import_input_parameters/tasks/display_warnings.yml b/upgrade/roles/import_input_parameters/tasks/display_warnings.yml index 2cc6dfed26..444869291b 100644 --- a/upgrade/roles/import_input_parameters/tasks/display_warnings.yml +++ b/upgrade/roles/import_input_parameters/tasks/display_warnings.yml @@ -29,6 +29,7 @@ - name: Pause for user to review warnings ansible.builtin.pause: + seconds: 30 prompt: | ╔════════════════════════════════════════════╗ ║ ⚠️ UPGRADE WARNINGS REVIEW ⚠️ ║ @@ -42,6 +43,7 @@ Please review these warnings carefully. Press ENTER to continue or CTRL+C to abort. + Continuing automatically in 30 seconds... when: - upgrade_warnings is defined - upgrade_warnings | length > 0 diff --git a/upgrade/upgrade_omnia.yml b/upgrade/upgrade_omnia.yml index 61050ec244..ade6b1f173 100644 --- a/upgrade/upgrade_omnia.yml +++ b/upgrade/upgrade_omnia.yml @@ -18,3 +18,13 @@ - name: Upgrade cluster tasks ansible.builtin.import_playbook: upgrade_cluster.yml + +- name: Clear upgrade guard lock + hosts: localhost + connection: local + gather_facts: false + tasks: + - name: Remove upgrade guard lock + ansible.builtin.file: + path: /opt/omnia/.data/upgrade_in_progress.lock + state: absent diff --git a/utils/credential_utility/get_config_credentials.yml b/utils/credential_utility/get_config_credentials.yml index 0e4c323b94..b77ba14b9b 100644 --- a/utils/credential_utility/get_config_credentials.yml +++ b/utils/credential_utility/get_config_credentials.yml @@ -13,6 +13,10 @@ # limitations under the License. 
--- +- name: Check if upgrade is in progress + ansible.builtin.import_playbook: ../upgrade_checkup.yml + tags: always + - name: Include input project directory when: not project_dir_status | default(false) | bool ansible.builtin.import_playbook: ../include_input_dir.yml diff --git a/utils/oim_cleanup.yml b/utils/oim_cleanup.yml index edb9cfb207..4d959d5ea4 100644 --- a/utils/oim_cleanup.yml +++ b/utils/oim_cleanup.yml @@ -13,6 +13,10 @@ # limitations under the License. --- +- name: Check if upgrade is in progress + ansible.builtin.import_playbook: upgrade_checkup.yml + tags: always + - name: Include input project directory when: not project_dir_status | default(false) | bool ansible.builtin.import_playbook: include_input_dir.yml diff --git a/utils/upgrade_checkup.yml b/utils/upgrade_checkup.yml new file mode 100644 index 0000000000..5fb8582000 --- /dev/null +++ b/utils/upgrade_checkup.yml @@ -0,0 +1,33 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +- name: "Guard: block if upgrade is in progress" + hosts: localhost + connection: local + gather_facts: false + tasks: + - name: Check upgrade lock file + ansible.builtin.stat: + path: /opt/omnia/.data/upgrade_in_progress.lock + register: upgrade_lock + + - name: Block playbook while upgrade is in progress + ansible.builtin.fail: + msg: >- + Upgrade is not completed fully. 
+ Please run upgrade_omnia.yml to complete upgrade before running any other playbook using the below command: + "ansible-playbook /omnia/upgrade/upgrade_omnia.yml" + If you don't require input files to be migrated, reconfigure the default input files, remove the lock file using the following command + "rm /opt/omnia/.data/upgrade_in_progress.lock" and then proceed. + when: upgrade_lock.stat.exists From d11fde8e868837f3c5403bd3b55f36b72ee60ae5 Mon Sep 17 00:00:00 2001 From: Jagadeesh N V <39791839+jagadeeshnv@users.noreply.github.com> Date: Fri, 20 Feb 2026 18:14:34 +0530 Subject: [PATCH 54/77] Slurm delete node - drain node before delete - skip_merge new option (#3986) * Node drain logic for deletion * Shell instead of command for piping * lint fixes * Updated permission for slurmdbd Added new force_conf option for allowing confs pass through validation * removede new file * Renamed force_conf to skip_merge --- .../input_validation/schema/omnia_config.json | 4 + .../validation_flows/common_validation.py | 9 +- .../slurm_config/tasks/build_slurm_conf.yml | 5 + .../slurm_config/tasks/check_ctld_running.yml | 12 +- discovery/roles/slurm_config/tasks/confs.yml | 14 ++- .../slurm_config/tasks/create_slurm_dir.yml | 1 + .../tasks/drain_and_remove_node.yml | 109 ++++++++++++++++++ .../roles/slurm_config/tasks/remove_node.yml | 2 +- discovery/roles/slurm_config/vars/main.yml | 6 +- input/omnia_config.yml | 10 ++ 10 files changed, 161 insertions(+), 11 deletions(-) create mode 100644 discovery/roles/slurm_config/tasks/drain_and_remove_node.yml diff --git a/common/library/module_utils/input_validation/schema/omnia_config.json b/common/library/module_utils/input_validation/schema/omnia_config.json index f53485770f..ca7266124c 100644 --- a/common/library/module_utils/input_validation/schema/omnia_config.json +++ b/common/library/module_utils/input_validation/schema/omnia_config.json @@ -19,6 +19,10 @@ "minLength": 1, "description": "Name of the nfs storage in 
storage_config.yml" }, + "skip_merge": { + "type": "boolean", + "description": "Variable indicates whether a specific configuration file path under config_sources should be used as-is without merging" + }, "config_sources": { "type": "object", "description": "Config can be a file path or inline mapping", diff --git a/common/library/module_utils/input_validation/validation_flows/common_validation.py b/common/library/module_utils/input_validation/validation_flows/common_validation.py index f577a4e9b8..36f55130d4 100644 --- a/common/library/module_utils/input_validation/validation_flows/common_validation.py +++ b/common/library/module_utils/input_validation/validation_flows/common_validation.py @@ -1074,9 +1074,12 @@ def validate_omnia_config( "slurm NFS not provided", f"NFS name {', '.join(diff_set)} required for slurm is not defined in {storage_config}" )) - cnfg_src = [clst.get('config_sources', {}) for clst in data.get('slurm_cluster')] + skip_conf_validation = os.path.exists("/opt/omnia/input/.skip_slurm_conf_validation") - for cfg_path_dict in cnfg_src: + cnfg_src = [clst.get('config_sources', {}) for clst in data.get('slurm_cluster')] + skip_merge_list = [clst.get('skip_merge', False) for clst in data.get('slurm_cluster')] + for idx, cfg_path_dict in enumerate(cnfg_src): + skip_merge = skip_merge_list[idx] for k,v in cfg_path_dict.items(): conf_dict = None if isinstance(v, str): @@ -1086,7 +1089,7 @@ def validate_omnia_config( f"provided conf path for {k} - {v} does not exist")) continue else: # path exists - if not skip_conf_validation: + if not skip_merge and not skip_conf_validation: conf_dict, duplicate_keys = parse_slurm_conf(v, k, False) if duplicate_keys: errors.append( diff --git a/discovery/roles/slurm_config/tasks/build_slurm_conf.yml b/discovery/roles/slurm_config/tasks/build_slurm_conf.yml index 9d5d0f0944..40b6137172 100644 --- a/discovery/roles/slurm_config/tasks/build_slurm_conf.yml +++ b/discovery/roles/slurm_config/tasks/build_slurm_conf.yml @@ 
-12,6 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. --- +- name: Read NodeName parameters from iDRAC + ansible.builtin.include_tasks: read_node_idrac.yml + when: cmpt_list + loop: "{{ cmpt_list }}" + - name: Append node_params list into NodeName list ansible.builtin.set_fact: apply_config: "{{ apply_config | default({}) diff --git a/discovery/roles/slurm_config/tasks/check_ctld_running.yml b/discovery/roles/slurm_config/tasks/check_ctld_running.yml index 7d908169ab..ce27d3c362 100644 --- a/discovery/roles/slurm_config/tasks/check_ctld_running.yml +++ b/discovery/roles/slurm_config/tasks/check_ctld_running.yml @@ -22,6 +22,16 @@ register: ssh_check ignore_errors: true +- name: Drain and remove nodes if any + ansible.builtin.include_tasks: drain_and_remove_node.yml + loop: "{{ nodes_in_normal_not_in_cmpt }}" + loop_control: + loop_var: node_to_remove + when: + - ssh_check is success + - nodes_in_normal_not_in_cmpt is defined + - nodes_in_normal_not_in_cmpt | length > 0 + - name: Enter slurm controller when pingable when: - ssh_check is success @@ -37,7 +47,7 @@ register: service_facts ignore_unreachable: true - - name: Fail if slurmctld is unreachable + - name: Check slurmctld is reachable ansible.builtin.fail: msg: "Failed to connect to {{ ctld }}." 
when: service_facts is unreachable diff --git a/discovery/roles/slurm_config/tasks/confs.yml b/discovery/roles/slurm_config/tasks/confs.yml index c5f7953b0d..1e5a4e507e 100644 --- a/discovery/roles/slurm_config/tasks/confs.yml +++ b/discovery/roles/slurm_config/tasks/confs.yml @@ -17,13 +17,16 @@ apply_config: "{{ __default_config }}" no_log: true -- name: Read NodeName parameters - ansible.builtin.include_tasks: read_node_idrac.yml - when: cmpt_list - loop: "{{ cmpt_list }}" +- name: Remove keys from conf_files if they have string values in configs_input (when skip_merge is true) + ansible.builtin.set_fact: + conf_files: "{{ conf_files | difference(configs_input | dict2items | selectattr('value', 'string') | map(attribute='key') | list) }}" + when: + - skip_merge | default(false) + - configs_input is defined - name: Build slurm.conf ansible.builtin.include_tasks: build_slurm_conf.yml + when: "'slurm' in conf_files" - name: Slurm dbd opts ansible.builtin.set_fact: @@ -167,12 +170,13 @@ - name: Generate slurmd opts for Configless # TODO: Move to $SLURMD_OPTIONS /etc/default/slurmd ansible.builtin.set_fact: conf_server: "--conf-server {{ ctld_list | map('regex_replace', '$', ':' ~ (slurm_conf_dict.get('SlurmctldPort', '6817') | string)) | join(',') }}" + when: slurm_conf_dict is defined - name: Write merged .conf ansible.builtin.copy: content: "{{ item.ini_lines | join('\n') }}\n" dest: "{{ slurm_config_path }}/{{ ctld_list[0] }}/etc/slurm/{{ item.item.key }}.conf" - mode: "0640" + mode: "{{ slurm_dbd_mode if item.item.key == 'slurmdbd' else slurm_mode }}" owner: "{{ slurm_user }}" group: "{{ slurm_user_group }}" remote_src: "{{ copy_from_oim }}" diff --git a/discovery/roles/slurm_config/tasks/create_slurm_dir.yml b/discovery/roles/slurm_config/tasks/create_slurm_dir.yml index e4ac760d77..b68bcbbded 100644 --- a/discovery/roles/slurm_config/tasks/create_slurm_dir.yml +++ b/discovery/roles/slurm_config/tasks/create_slurm_dir.yml @@ -60,6 +60,7 @@ 
ansible.builtin.set_fact: cluster_name: "{{ slurm_cluster[0].cluster_name }}" configs_input: "{{ slurm_cluster[0].config_sources | default({}) | dict2items | rejectattr('value', 'falsy') | list | items2dict }}" + skip_merge: "{{ slurm_cluster[0].skip_merge | default(false) }}" slurm_config_path: "{{ share_path }}/{{ slurm_dir_name }}" controller_trackfile_path: "{{ share_path }}/ctld_track" diff --git a/discovery/roles/slurm_config/tasks/drain_and_remove_node.yml b/discovery/roles/slurm_config/tasks/drain_and_remove_node.yml new file mode 100644 index 0000000000..da1c41d3fe --- /dev/null +++ b/discovery/roles/slurm_config/tasks/drain_and_remove_node.yml @@ -0,0 +1,109 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +- name: Check if node exists in Slurm cluster + ansible.builtin.command: scontrol show node {{ node_to_remove }} + register: node_exists_check + failed_when: false + ignore_unreachable: true + changed_when: false + delegate_to: "{{ ctld }}" + +- name: Skip if node does not exist + ansible.builtin.debug: + msg: "Node {{ node_to_remove }} not found in cluster, skipping removal" + when: + - node_exists_check is reachable + - node_exists_check.rc != 0 + +- name: Process node removal + when: + - node_exists_check is reachable + - node_exists_check.rc == 0 + ignore_unreachable: true + block: + - name: Get current job count on node + ansible.builtin.shell: + cmd: | + set -o pipefail + squeue -w {{ node_to_remove }} -h | wc -l + register: current_jobs + changed_when: false + delegate_to: "{{ ctld }}" + + - name: Display job information + ansible.builtin.debug: + msg: "Node {{ node_to_remove }} currently has {{ current_jobs.stdout }} running job(s)" + + - name: Drain the node to prevent new job assignments + ansible.builtin.command: > + scontrol update NodeName={{ node_to_remove }} + State=DRAIN + Reason="Scheduled removal - waiting for jobs to complete" + changed_when: true + delegate_to: "{{ ctld }}" + + - name: Wait for all jobs to complete on the node + ansible.builtin.shell: + cmd: | + set -o pipefail + squeue -w {{ node_to_remove }} -h | wc -l + register: job_count_check + until: job_count_check.stdout | int == 0 + retries: "{{ (node_drain_timeout / node_drain_delay) | int }}" + delay: "{{ node_drain_delay }}" + changed_when: false + delegate_to: "{{ ctld }}" + when: current_jobs.stdout | int > 0 + + - name: Confirm jobs completed + ansible.builtin.debug: + msg: "All jobs on {{ node_to_remove }} have completed" + when: current_jobs.stdout | int > 0 + + - name: Log node removal + ansible.builtin.debug: + msg: "Node {{ node_to_remove }} has been drained, jobs completed, and set to DOWN state" + + rescue: + - name: Log node removal failure + ansible.builtin.debug: 
+ msg: "Failed to drain node {{ node_to_remove }}" + + - name: Remove slurm node with running job after timeout + ansible.builtin.pause: + prompt: | + Node {{ node_to_remove }} has been DRAINED to prevent new job assignments. + Jobs are still running on {{ node_to_remove }} after wait of {{ node_drain_timeout }} seconds. + Options: + 1. Press Ctrl+C then 'A' to abort + 2. Press Enter to force removal (jobs will be killed) + when: not force_scancel_node + + - name: Force cancel jobs if timeout reached + ansible.builtin.command: scancel -f -w {{ node_to_remove }} + changed_when: true + failed_when: false + delegate_to: "{{ ctld }}" + + always: + - name: Set node to DOWN state + ansible.builtin.command: > + scontrol update NodeName={{ node_to_remove }} + State=DOWN + Reason="Node removed from cluster" + changed_when: true + failed_when: false + delegate_to: "{{ ctld }}" + when: node_exists_check.rc == 0 diff --git a/discovery/roles/slurm_config/tasks/remove_node.yml b/discovery/roles/slurm_config/tasks/remove_node.yml index 4dc0217559..ba93bb086a 100644 --- a/discovery/roles/slurm_config/tasks/remove_node.yml +++ b/discovery/roles/slurm_config/tasks/remove_node.yml @@ -30,7 +30,7 @@ - name: Update normal partition Nodes to match cmpt_list ansible.builtin.set_fact: updated_partitions: "{{ updated_partitions | default([]) - + [item | combine({'Nodes': cmpt_list | join(',')}) if item.PartitionName == slurm_partition_name else item] }}" + + [item | combine({'Nodes': (cmpt_list | join(',')) if cmpt_list | length > 0 else 'ALL'}) if item.PartitionName == slurm_partition_name else item] }}" loop: "{{ slurm_conf_dict.PartitionName | default([]) }}" when: - "'slurm' in conf_merge_dict" diff --git a/discovery/roles/slurm_config/vars/main.yml b/discovery/roles/slurm_config/vars/main.yml index 1593f791cb..d708eb0777 100644 --- a/discovery/roles/slurm_config/vars/main.yml +++ b/discovery/roles/slurm_config/vars/main.yml @@ -68,6 +68,7 @@ gpu_slurm_conf: SlurmdParameters: 
l3cache_as_socket innodb_buffer_pool_size: 4G innodb_lock_wait_timeout: 900 +conf_server: "--conf-server {{ ctld_list | join(',') }}" # TODO tmp nodes_yaml: "{{ hostvars['localhost']['oim_shared_path'] }}/omnia/openchami/workdir/nodes/nodes.yaml" bmc_username: "{{ hostvars['localhost']['bmc_username'] }}" @@ -117,12 +118,15 @@ munge_dir_mode: "0700" common_mode: "0755" slurm_dbd_mode: "0600" slurm_db_cnf_mode: "0600" +node_drain_timeout: 900 +node_drain_delay: 30 +force_scancel_node: false dbd_slurm_conf: AccountingStoragePort: "{{ slurm_dbd_port }}" AccountingStorageType: accounting_storage/slurmdbd partition_params: PartitionName: "{{ slurm_partition_name }}" - Nodes: "{{ cmpt_list | join(',') }}" + Nodes: "{{ cmpt_list | join(',') if cmpt_list else 'ALL' }}" MaxTime: "INFINITE" State: "UP" Default: "YES" diff --git a/input/omnia_config.yml b/input/omnia_config.yml index bb5a4f06fa..943d70e530 100644 --- a/input/omnia_config.yml +++ b/input/omnia_config.yml @@ -27,6 +27,15 @@ # Storage name corresponding to the NFS share to be used by slurm cluster # This should match with exactly with a entry in storage_config.yml +# skip_merge +# Variable indicates whether a specific configuration file path +# under config_sources should be used as-is without merging +# If skip_merge is set to true for a configuration source path, +# that configuration file will be applied directly +# without merging with defaults or existing configurations +# It accepts true and false values +# Default value is false + # config_sources # defines how the Slurm configuration files are provided to the cluster. 
# : @@ -50,6 +59,7 @@ slurm_cluster: - cluster_name: slurm_cluster nfs_storage_name: nfs_slurm + # skip_merge: true # config_sources: # slurm: # SlurmctldTimeout: 60 From 8e08eabe0f7787a33c5d3d27c211e479e4627aed Mon Sep 17 00:00:00 2001 From: Vrinda_Marwah Date: Sat, 21 Feb 2026 14:51:35 +0000 Subject: [PATCH 55/77] checkmarx fixes - II Signed-off-by: Vrinda_Marwah --- .../module_utils/local_repo/registry_utils.py | 13 ++++++++++--- .../module_utils/local_repo/user_image_utility.py | 12 ++++++------ 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/common/library/module_utils/local_repo/registry_utils.py b/common/library/module_utils/local_repo/registry_utils.py index 2e7da2f659..6abd75b6dc 100644 --- a/common/library/module_utils/local_repo/registry_utils.py +++ b/common/library/module_utils/local_repo/registry_utils.py @@ -27,14 +27,21 @@ def is_https(host, timeout=1): context.check_hostname = False context.verify_mode = ssl.CERT_NONE + sock = None + wrapped_sock = None try: - with socket.create_connection((ip, port), timeout=timeout) as sock: - with context.wrap_socket(sock, server_hostname=ip): - return True + sock = socket.create_connection((ip, port), timeout=timeout) + wrapped_sock = context.wrap_socket(sock, server_hostname=ip) + return True except ssl.SSLError: return False except Exception: return False + finally: + if wrapped_sock: + wrapped_sock.close() + if sock: + sock.close() def validate_user_registry(user_registry): """ diff --git a/common/library/module_utils/local_repo/user_image_utility.py b/common/library/module_utils/local_repo/user_image_utility.py index 4c68cd1803..d50ea41df7 100644 --- a/common/library/module_utils/local_repo/user_image_utility.py +++ b/common/library/module_utils/local_repo/user_image_utility.py @@ -58,11 +58,12 @@ def check_image_in_registry( """ if not host.startswith(("http://", "https://")): - if cacert and key: - image_url = f"https://{host}/v2/{image}/manifests/{tag}" - else: - image_url = 
f"http://{host}/v2/{image}/manifests/{tag}" - + # Checkmarx: Communication_Over_HTTP + # HTTP is intentionally allowed here because this function must support + # insecure user registries. + protocol = "https" if (cacert and key) else "http" + host = f"{protocol}://{host}" + image_url = f"{host}/v2/{image}/manifests/{tag}" logger.info(f"Checking image existence at: {image_url}") try: @@ -409,4 +410,3 @@ def handle_user_image_registry(package, package_content, version_variables, user logger.info("#" * 30 + f" {handle_user_image_registry.__name__} end " + "#" * 30) return result, package_info - From 62529b64bebdbaeccf9a54fbeb0be5f0b14c2885 Mon Sep 17 00:00:00 2001 From: Vrinda_Marwah Date: Sun, 22 Feb 2026 07:32:32 +0000 Subject: [PATCH 56/77] fix for security issue - improper resource shutdown issue Signed-off-by: Vrinda_Marwah --- .../module_utils/local_repo/registry_utils.py | 48 +++++++++++++++---- 1 file changed, 38 insertions(+), 10 deletions(-) diff --git a/common/library/module_utils/local_repo/registry_utils.py b/common/library/module_utils/local_repo/registry_utils.py index 6abd75b6dc..965a0880f8 100644 --- a/common/library/module_utils/local_repo/registry_utils.py +++ b/common/library/module_utils/local_repo/registry_utils.py @@ -19,29 +19,57 @@ from ansible.module_utils.local_repo.common_functions import is_file_exists def is_https(host, timeout=1): + """ + Check whether the given host is serving HTTPS (TLS). + + Attempts a TLS handshake without verifying the server certificate. + + Args: + host (str): The host address in "ip:port" format. + timeout (int, optional): Connection timeout in seconds. Defaults to 1. + + Returns: + bool: True if the host supports HTTPS/TLS, False otherwise. 
+ """ ip, port = host.rsplit(":", 1) port = int(port) - # Don't verify server cert; just see if TLS works context = ssl.create_default_context() context.check_hostname = False context.verify_mode = ssl.CERT_NONE + result = False sock = None wrapped_sock = None + try: sock = socket.create_connection((ip, port), timeout=timeout) wrapped_sock = context.wrap_socket(sock, server_hostname=ip) - return True - except ssl.SSLError: - return False - except Exception: - return False + result = True + + except (ssl.SSLError, OSError): + result = False + finally: - if wrapped_sock: - wrapped_sock.close() - if sock: - sock.close() + # Close wrapped socket first + if wrapped_sock is not None: + try: + wrapped_sock.shutdown(socket.SHUT_RDWR) + except Exception: + pass + try: + wrapped_sock.close() + except Exception: + pass + + # Then explicitly close original socket + if sock is not None: + try: + sock.close() + except Exception: + pass + + return result def validate_user_registry(user_registry): """ From 95429340d3a67c102043cfced6b097d837b4de47 Mon Sep 17 00:00:00 2001 From: Abhishek S A Date: Mon, 23 Feb 2026 14:09:02 +0530 Subject: [PATCH 57/77] mapping file update --- examples/pxe_mapping_file.csv | 8 ++++---- input/pxe_mapping_file.csv | 10 +++++----- input/software_config.json | 1 + 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/examples/pxe_mapping_file.csv b/examples/pxe_mapping_file.csv index f9dfdf0cee..4d1c4775ed 100644 --- a/examples/pxe_mapping_file.csv +++ b/examples/pxe_mapping_file.csv @@ -1,11 +1,11 @@ FUNCTIONAL_GROUP_NAME,GROUP_NAME,SERVICE_TAG,PARENT_SERVICE_TAG,HOSTNAME,ADMIN_MAC,ADMIN_IP,BMC_MAC,BMC_IP slurm_control_node_x86_64,grp0,ABCD12,,slurm-control-node1,xx:yy:zz:aa:bb:cc,172.16.107.52,xx:yy:zz:aa:bb:dd,172.17.107.52 -slurm_node_x86_64,grp1,ABCD34,ABFL82,slurm-node1,aa:bb:cc:dd:ee:ff,172.16.107.43,aa:bb:cc:dd:ee:gg,172.17.107.43 
-slurm_node_x86_64,grp1,ABFG34,ABKD88,slurm-node2,aa:bb:cc:dd:ee:ff,172.16.107.44,aa:bb:cc:dd:ff:gg,172.17.107.44 -login_compiler_node_x86_64,grp8,ABCD78,,login-compiler-node1,aa:bb:cc:dd:ee:gg,172.16.107.41,aa:bb:cc:dd:ee:bb,172.17.107.41 +slurm_node_aarch64,grp1,ABCD34,ABFL82,slurm-node1,aa:bb:cc:dd:ee:ff,172.16.107.43,aa:bb:cc:dd:ee:gg,172.17.107.43 +slurm_node_aarch64,grp2,ABFG34,ABKD88,slurm-node2,aa:bb:cc:dd:ee:ff,172.16.107.44,aa:bb:cc:dd:ff:gg,172.17.107.44 +login_compiler_node_aarch64,grp8,ABCD78,,login-compiler-node1,aa:bb:cc:dd:ee:gg,172.16.107.41,aa:bb:cc:dd:ee:bb,172.17.107.41 login_node_x86_64,grp9,ABFG78,,login-compiler-node2,aa:bb:cc:dd:ee:gg,172.16.107.42,aa:bb:cc:dd:ee:bb,172.17.107.42 service_kube_control_plane_x86_64,grp3,ABFG79,,service-kube-control-plane1,aa:bb:cc:dd:ee:ff,172.16.107.53,xx:yy:zz:aa:bb:ff,172.17.107.53 service_kube_control_plane_x86_64,grp4,ABFH78,,service-kube-control-plane2,aa:bb:cc:dd:ee:hh,172.16.107.54,xx:yy:zz:aa:bb:hh,172.17.107.54 service_kube_control_plane_x86_64,grp4,ABFH80,,service-kube-control-plane3,aa:bb:cc:dd:ee:ii,172.16.107.55,xx:yy:zz:aa:bb:ii,172.17.107.55 service_kube_node_x86_64,grp5,ABFL82,,service-kube-node1,aa:bb:cc:dd:ee:jj,172.16.107.56,xx:yy:zz:aa:bb:jj,172.17.107.56 -service_kube_node_x86_64,grp5,ABKD88,,service-kube-node2,aa:bb:cc:dd:ee:kk,172.16.107.57,xx:yy:zz:aa:bb:ff,172.17.107.57 +service_kube_node_x86_64,grp5,ABKD88,,service-kube-node2,aa:bb:cc:dd:ee:kk,172.16.107.57,xx:yy:zz:aa:bb:ff,172.17.107.57 \ No newline at end of file diff --git a/input/pxe_mapping_file.csv b/input/pxe_mapping_file.csv index 849e3a2168..4d1c4775ed 100644 --- a/input/pxe_mapping_file.csv +++ b/input/pxe_mapping_file.csv @@ -1,11 +1,11 @@ FUNCTIONAL_GROUP_NAME,GROUP_NAME,SERVICE_TAG,PARENT_SERVICE_TAG,HOSTNAME,ADMIN_MAC,ADMIN_IP,BMC_MAC,BMC_IP slurm_control_node_x86_64,grp0,ABCD12,,slurm-control-node1,xx:yy:zz:aa:bb:cc,172.16.107.52,xx:yy:zz:aa:bb:dd,172.17.107.52 
-slurm_node_x86_64,grp1,ABCD34,ABFL82,slurm-node1,aa:bb:cc:dd:ee:ff,172.16.107.43,aa:bb:cc:dd:ee:gg,172.17.107.43 -slurm_node_x86_64,grp1,ABFG34,ABKD88,slurm-node2,aa:bb:cc:dd:ee:ff,172.16.107.44,aa:bb:cc:dd:ff:gg,172.17.107.44 -login_compiler_node_x86_64,grp8,ABCD78,,login-compiler-node1,aa:bb:cc:dd:ee:gg,172.16.107.41,aa:bb:cc:dd:ee:bb,172.17.107.41 -login_compiler_node_x86_64,grp8,ABFG78,,login-compiler-node2,aa:bb:cc:dd:ee:gg,172.16.107.42,aa:bb:cc:dd:ee:bb,172.17.107.42 +slurm_node_aarch64,grp1,ABCD34,ABFL82,slurm-node1,aa:bb:cc:dd:ee:ff,172.16.107.43,aa:bb:cc:dd:ee:gg,172.17.107.43 +slurm_node_aarch64,grp2,ABFG34,ABKD88,slurm-node2,aa:bb:cc:dd:ee:ff,172.16.107.44,aa:bb:cc:dd:ff:gg,172.17.107.44 +login_compiler_node_aarch64,grp8,ABCD78,,login-compiler-node1,aa:bb:cc:dd:ee:gg,172.16.107.41,aa:bb:cc:dd:ee:bb,172.17.107.41 +login_node_x86_64,grp9,ABFG78,,login-compiler-node2,aa:bb:cc:dd:ee:gg,172.16.107.42,aa:bb:cc:dd:ee:bb,172.17.107.42 service_kube_control_plane_x86_64,grp3,ABFG79,,service-kube-control-plane1,aa:bb:cc:dd:ee:ff,172.16.107.53,xx:yy:zz:aa:bb:ff,172.17.107.53 service_kube_control_plane_x86_64,grp4,ABFH78,,service-kube-control-plane2,aa:bb:cc:dd:ee:hh,172.16.107.54,xx:yy:zz:aa:bb:hh,172.17.107.54 service_kube_control_plane_x86_64,grp4,ABFH80,,service-kube-control-plane3,aa:bb:cc:dd:ee:ii,172.16.107.55,xx:yy:zz:aa:bb:ii,172.17.107.55 service_kube_node_x86_64,grp5,ABFL82,,service-kube-node1,aa:bb:cc:dd:ee:jj,172.16.107.56,xx:yy:zz:aa:bb:jj,172.17.107.56 -service_kube_node_x86_64,grp5,ABKD88,,service-kube-node2,aa:bb:cc:dd:ee:kk,172.16.107.57,xx:yy:zz:aa:bb:ff,172.17.107.57 +service_kube_node_x86_64,grp5,ABKD88,,service-kube-node2,aa:bb:cc:dd:ee:kk,172.16.107.57,xx:yy:zz:aa:bb:ff,172.17.107.57 \ No newline at end of file diff --git a/input/software_config.json b/input/software_config.json index 0d7f62acc3..4683376057 100644 --- a/input/software_config.json +++ b/input/software_config.json @@ -9,6 +9,7 @@ {"name": "service_k8s","version": "1.34.1", 
"arch": ["x86_64"]}, {"name": "slurm_custom", "arch": ["x86_64","aarch64"]}, {"name": "ldms", "arch": ["x86_64","aarch64"]}, + {"name": "csi_driver_powerscale", "version":"v2.15.0", "arch": ["x86_64"]}, {"name": "additional_packages", "arch": ["x86_64","aarch64"]} ], "slurm_custom": [ From 1499181edd3186ea2c26f6d8842dd1f7fe2db02c Mon Sep 17 00:00:00 2001 From: pullan1 Date: Mon, 23 Feb 2026 14:28:59 +0530 Subject: [PATCH 58/77] To configure per-repository sync and caching policies in localrepo Signed-off-by: pullan1 --- .../schema/local_repo_config.json | 278 ++++++++++++++++++ .../validation_flows/local_repo_validation.py | 30 +- .../library/module_utils/local_repo/config.py | 11 +- .../module_utils/local_repo/software_utils.py | 59 +++- input/local_repo_config.yml | 60 +++- .../tasks/configure_rhel_os_urls.yml | 104 ++++++- .../deploy_containers/pulp/vars/main.yml | 9 +- 7 files changed, 521 insertions(+), 30 deletions(-) diff --git a/common/library/module_utils/input_validation/schema/local_repo_config.json b/common/library/module_utils/input_validation/schema/local_repo_config.json index e44cf44df7..587851d0b3 100644 --- a/common/library/module_utils/input_validation/schema/local_repo_config.json +++ b/common/library/module_utils/input_validation/schema/local_repo_config.json @@ -1136,6 +1136,284 @@ ] }, "description": "Optional list of additional repository URLs for aarch64 architecture. These repos are aggregated into a single Pulp repository." 
+ }, + "rhel_subscription_repo_config_x86_64": { + "type": [ + "array", + "null" + ], + "items": { + "type": "object", + "properties": { + "url": { + "type": "string", + "minLength": 1, + "pattern": "^(https?:\\/\\/).+" + }, + "gpgkey": { + "type": "string", + "pattern": "^(|[a-zA-Z][a-zA-Z0-9+.-]*:\\/\\/\\S+)$" + }, + "name": { + "type": "string", + "minLength": 1, + "pattern": "^(?!\\s*$).+" + }, + "policy": { + "type": "string", + "enum": [ + "always", + "partial" + ] + }, + "caching": { + "type": "boolean" + }, + "sslcacert": { + "type": [ + "string", + "null" + ] + }, + "sslclientkey": { + "type": [ + "string", + "null" + ] + }, + "sslclientcert": { + "type": [ + "string", + "null" + ] + } + }, + "required": [ + "url", + "gpgkey", + "name" + ], + "allOf": [ + { + "if": { + "required": [ + "sslcacert" + ], + "properties": { + "sslcacert": { + "minLength": 1 + } + } + }, + "then": { + "required": [ + "sslclientkey", + "sslclientcert" + ], + "properties": { + "sslclientkey": { + "minLength": 1 + }, + "sslclientcert": { + "minLength": 1 + } + } + } + }, + { + "if": { + "required": [ + "sslclientkey" + ], + "properties": { + "sslclientkey": { + "minLength": 1 + } + } + }, + "then": { + "required": [ + "sslcacert", + "sslclientcert" + ], + "properties": { + "sslcacert": { + "minLength": 1 + }, + "sslclientcert": { + "minLength": 1 + } + } + } + }, + { + "if": { + "required": [ + "sslclientcert" + ], + "properties": { + "sslclientcert": { + "minLength": 1 + } + } + }, + "then": { + "required": [ + "sslcacert", + "sslclientkey" + ], + "properties": { + "sslcacert": { + "minLength": 1 + }, + "sslclientkey": { + "minLength": 1 + } + } + } + } + ] + }, + "description": "Optional configuration for overriding policy and caching settings for RHEL subscription-based repositories on x86_64 architecture." 
+ }, + "rhel_subscription_repo_config_aarch64": { + "type": [ + "array", + "null" + ], + "items": { + "type": "object", + "properties": { + "url": { + "type": "string", + "minLength": 1, + "pattern": "^(https?:\\/\\/).+" + }, + "gpgkey": { + "type": "string", + "pattern": "^(|[a-zA-Z][a-zA-Z0-9+.-]*:\\/\\/\\S+)$" + }, + "name": { + "type": "string", + "minLength": 1, + "pattern": "^(?!\\s*$).+" + }, + "policy": { + "type": "string", + "enum": [ + "always", + "partial" + ] + }, + "caching": { + "type": "boolean" + }, + "sslcacert": { + "type": [ + "string", + "null" + ] + }, + "sslclientkey": { + "type": [ + "string", + "null" + ] + }, + "sslclientcert": { + "type": [ + "string", + "null" + ] + } + }, + "required": [ + "url", + "gpgkey", + "name" + ], + "allOf": [ + { + "if": { + "required": [ + "sslcacert" + ], + "properties": { + "sslcacert": { + "minLength": 1 + } + } + }, + "then": { + "required": [ + "sslclientkey", + "sslclientcert" + ], + "properties": { + "sslclientkey": { + "minLength": 1 + }, + "sslclientcert": { + "minLength": 1 + } + } + } + }, + { + "if": { + "required": [ + "sslclientkey" + ], + "properties": { + "sslclientkey": { + "minLength": 1 + } + } + }, + "then": { + "required": [ + "sslcacert", + "sslclientcert" + ], + "properties": { + "sslcacert": { + "minLength": 1 + }, + "sslclientcert": { + "minLength": 1 + } + } + } + }, + { + "if": { + "required": [ + "sslclientcert" + ], + "properties": { + "sslclientcert": { + "minLength": 1 + } + } + }, + "then": { + "required": [ + "sslcacert", + "sslclientkey" + ], + "properties": { + "sslcacert": { + "minLength": 1 + }, + "sslclientkey": { + "minLength": 1 + } + } + } + } + ] + }, + "description": "Optional configuration for overriding policy and caching settings for RHEL subscription-based repositories on aarch64 architecture." 
} }, "required": [ diff --git a/common/library/module_utils/input_validation/validation_flows/local_repo_validation.py b/common/library/module_utils/input_validation/validation_flows/local_repo_validation.py index 343a4f3de1..88e02845d2 100644 --- a/common/library/module_utils/input_validation/validation_flows/local_repo_validation.py +++ b/common/library/module_utils/input_validation/validation_flows/local_repo_validation.py @@ -137,19 +137,41 @@ def validate_local_repo_config(input_file_path, data, arch_repo_names = [] arch_list = url_list + [url+'_'+arch for url in url_list] # define base repos dynamically for this arch if subscription registered - if sub_result: - base_repo_names = [f"{arch}_baseos",f"{arch}_appstream",f"{arch}_codeready-builder"] - logger.info(f"Adding base repos for {arch}: {base_repo_names}") + if sub_result: + base_subscription_repos = [f"{arch}_baseos", f"{arch}_appstream", f"{arch}_codeready-builder"] + logger.info(f"Base subscription repos for {arch}: {base_subscription_repos}") + + # Collect repo names from standard repo lists for repurl in arch_list: repos = data.get(repurl) if repos: arch_repo_names = arch_repo_names + [x.get('name') for x in repos] + + # Handle rhel_subscription_repo_config separately + # Only add non-base repos to the name list (base repos are overrides, not duplicates) + subscription_config_key = f"rhel_subscription_repo_config_{arch}" + subscription_config = data.get(subscription_config_key, []) + if subscription_config: + for repo in subscription_config: + repo_name = repo.get('name') + if repo_name and repo_name not in base_subscription_repos: + # This is a new repo, not an override of base repos + arch_repo_names.append(repo_name) + logger.info(f"Adding new subscription config repo: {repo_name}") + else: + logger.info(f"Skipping base repo override from duplicate check: {repo_name}") + # Add additional_repos names for this arch additional_repos_key = f"additional_repos_{arch}" additional_repos = 
data.get(additional_repos_key) if additional_repos: arch_repo_names = arch_repo_names + [x.get('name') for x in additional_repos] - repo_names[arch] = repo_names.get(arch, []) + arch_repo_names + base_repo_names + + # Add base subscription repos to the final list (they will be dynamically generated) + if sub_result: + arch_repo_names = arch_repo_names + base_subscription_repos + + repo_names[arch] = arch_repo_names logger.info(f"Total repos for {arch}: {repo_names[arch]}") for k,v in repo_names.items(): diff --git a/common/library/module_utils/local_repo/config.py b/common/library/module_utils/local_repo/config.py index 7bfea4b301..3e812a6e47 100644 --- a/common/library/module_utils/local_repo/config.py +++ b/common/library/module_utils/local_repo/config.py @@ -58,8 +58,17 @@ RHEL_OS_URL = "rhel_os_url" SOFTWARES_KEY = "softwares" USER_REPO_URL = "user_repo_url" -REPO_CONFIG = { "always": "on_demand", "partial": "on_demand", "never": "streamed" } ARCH_SUFFIXES = {"x86_64", "aarch64"} +DEFAULT_POLICY = "on_demand" +DEFAULT_CACHING = True +POLICY_CACHING_MAP = { + ("always", False): "immediate", + ("always", True): "on_demand", + ("partial", False): "streamed", + ("partial", True): "on_demand", + ("never", False): "streamed", + ("never", True): "streamed" +} DNF_COMMANDS = { "x86_64": ["dnf", "download", "--resolve", "--alldeps", "--arch=x86_64,noarch"], "aarch64": ["dnf", "download", "--forcearch", "aarch64", "--resolve", "--alldeps", "--exclude=*.x86_64"] diff --git a/common/library/module_utils/local_repo/software_utils.py b/common/library/module_utils/local_repo/software_utils.py index 3e06ddc7cd..90faf88a7e 100644 --- a/common/library/module_utils/local_repo/software_utils.py +++ b/common/library/module_utils/local_repo/software_utils.py @@ -36,7 +36,9 @@ RPM_LABEL_TEMPLATE, RHEL_OS_URL, SOFTWARES_KEY, - REPO_CONFIG, + POLICY_CACHING_MAP, + DEFAULT_POLICY, + DEFAULT_CACHING, ARCH_SUFFIXES, ADDITIONAL_REPOS_KEY, pulp_container_commands @@ -210,6 +212,32 @@ def 
transform_package_dict(data, arch_val,logger): logger.info("Transformation complete for arch '%s'. Final result keys: %s", arch_val, list(final_result.keys())) return final_result +def resolve_pulp_policy(policy_str, caching_val, logger=None): + """ + Resolve user-facing policy and caching into Pulp download policy. + Args: + policy_str (str): User policy ('always', 'on_demand', 'partial'). + caching_val: Caching flag (bool, str 'true'/'false', or None). + logger: Optional logger instance. + Returns: + str: Pulp download policy ('immediate', 'on_demand', 'streamed'). + """ + policy = str(policy_str).lower() if policy_str else DEFAULT_POLICY + if isinstance(caching_val, str): + caching = caching_val.lower() in ('true', '1', 'yes') + elif isinstance(caching_val, bool): + caching = caching_val + else: + caching = DEFAULT_CACHING + pulp_policy = POLICY_CACHING_MAP.get( + (policy, caching), "on_demand" + ) + if logger: + logger.info( + f"Resolved policy='{policy}', caching={caching}" + f" -> pulp_policy='{pulp_policy}'" + ) + return pulp_policy def parse_repo_urls(repo_config, local_repo_config_path, version_variables, vault_key_path, sub_urls,logger,sw_archs=None): @@ -271,7 +299,10 @@ def parse_repo_urls(repo_config, local_repo_config_path, client_key = url_.get("sslclientkey", "") client_cert = url_.get("sslclientcert", "") policy_given = url_.get("policy", repo_config) - policy = REPO_CONFIG.get(policy_given) + caching_given = url_.get("caching", True) + policy = resolve_pulp_policy( + policy_given, caching_given, logger + ) logger.info(f"Processing user repo '{name}' for arch '{arch}' - URL: {url}") @@ -302,7 +333,7 @@ def parse_repo_urls(repo_config, local_repo_config_path, logger.info(f"Added user repo entry: {name}") - # Handle RHEL repositories + # Handle RHEL repositories (includes subscription-based repos) for arch, repo_list in rhel_repo_entry.items(): for url_ in repo_list: name = url_.get("name", "unknown") @@ -312,7 +343,10 @@ def 
parse_repo_urls(repo_config, local_repo_config_path, client_key = url_.get("sslclientkey", "") client_cert = url_.get("sslclientcert", "") policy_given = url_.get("policy", repo_config) - policy = REPO_CONFIG.get(policy_given) + caching_given = url_.get("caching", True) + policy = resolve_pulp_policy( + policy_given, caching_given, logger + ) logger.info(f"Processing RHEL repo '{name}' for arch '{arch}' - URL: {url}") @@ -357,7 +391,10 @@ def parse_repo_urls(repo_config, local_repo_config_path, url = repo.get("url", "") gpgkey = repo.get("gpgkey", "") policy_given = repo.get("policy", repo_config) - policy = REPO_CONFIG.get(policy_given) + caching_given = repo.get("caching", True) + policy = resolve_pulp_policy( + policy_given, caching_given, logger + ) logger.info(f"Processing OMNIA repo '{name}' for arch '{arch}' - Template URL: {url}") # Find unresolved template vars in URL @@ -476,17 +513,11 @@ def get_subgroup_dict(user_data,logger): def get_csv_software(file_name): """ - Retrieves a list of software names from a CSV file. - Parameters: - file_name (str): The name of the CSV file. - Returns: - list: A list of software names. 
- """ csv_software = [] @@ -892,7 +923,9 @@ def parse_additional_repos(local_repo_config_path, repo_config, vault_key_path, local_yaml = load_yaml(local_repo_config_path) additional_repos_config = {} - policy = REPO_CONFIG.get(repo_config, "on_demand") + global_policy = resolve_pulp_policy( + repo_config, True, logger + ) vault_key_full_path = os.path.join(vault_key_path, ".local_repo_credentials_key") @@ -949,7 +982,7 @@ def parse_additional_repos(local_repo_config_path, repo_config, vault_key_path, "ca_cert": ca_cert, "client_key": client_key, "client_cert": client_cert, - "policy": policy, + "policy": global_policy, "arch": arch }) logger.info(f"Added additional repo entry: {name}") diff --git a/input/local_repo_config.yml b/input/local_repo_config.yml index 8428e6d94c..f81d62640c 100644 --- a/input/local_repo_config.yml +++ b/input/local_repo_config.yml @@ -43,7 +43,10 @@ # sslcacert : Path to SSL CA certificate (if using SSL) # sslclientkey: Path to SSL client key (if using SSL) # sslclientcert: Path to SSL client certificate (if using SSL) -# policy : Repository policy (always, partial) +# policy : Repository sync policy. Allowed values: always, partial (OPTIONAL) +# If not provided, uses repo_config from software_config.json +# caching : Enable or disable local caching. Allowed values: true, false (OPTIONAL) +# If not provided, defaults to true # Notes: # - Do not use Jinja variables in this configuration. # - Omit SSL fields entirely if SSL is not in use. @@ -63,7 +66,10 @@ # sslcacert : Path to SSL CA certificate (if using SSL) # sslclientkey: Path to SSL client key (if using SSL) # sslclientcert: Path to SSL client certificate (if using SSL) -# policy : Repository policy if mentioned allowed values (always, partial). IF not mentioned will consider from software_config.json +# policy : Repository policy if mentioned allowed values (always, partial). +# If not provided, uses repo_config from software_config.json +# caching : Enable or disable local caching. 
Allowed values: true, false (OPTIONAL) +# If not provided, defaults to true # name : Name of the repository [ Allowed repo names _codeready-builder, _appstream, _baseos # Notes: # - Do not use Jinja variables in this configuration. @@ -75,8 +81,35 @@ #---------------------------- # Same as above but for aarch64 architecture. # +# 6. rhel_subscription_repo_config_x86_64 +#------------------------------------------- +# Optional configuration for overriding policy and caching settings for RHEL +# subscription-based repositories on x86_64 architecture. +# When subscription is enabled, this config takes precedence over dynamically +# generated URLs for matching repositories and adds any additional repositories. +# Fields: +# url : Base URL of the repository (REQUIRED) +# gpgkey : GPG key URL (REQUIRED, can be empty to disable gpgcheck) +# name : Repository name for matching (REQUIRED) +# policy : Repository sync policy. Allowed values: always, partial (OPTIONAL) +# If not provided, uses repo_config from software_config.json +# caching : Enable or disable local caching. Allowed values: true, false (OPTIONAL) +# If not provided, defaults to true +# sslcacert : Path to SSL CA certificate (optional) +# sslclientkey: Path to SSL client key (optional) +# sslclientcert: Path to SSL client certificate (optional) +# Notes: +# - Do not use Jinja variables in this configuration. +# - Omit SSL fields entirely if SSL is not in use. +# - Matching is done by repository name (e.g., x86_64_appstream) +# - Non-matching repositories are added as additional repos +# +# 7. rhel_subscription_repo_config_aarch64 +#-------------------------------------------- +# Same as above but for aarch64 architecture. +# #### ADVANCE CONFIGURATIONS FOR LOCAL REPO ### -# 6. omnia_repo_url_rhel_x86_64 +# 8. omnia_repo_url_rhel_x86_64 #------------------------------- # Mandatory repository URLs for downloading RPMS for Omnia features on RHEL x86_64. # Each entry includes url, gpgkey, and name. 
@@ -88,12 +121,15 @@ # gpgkey : URL of the GPG key for the repository. # If left empty, gpgcheck=0 for that repository. # name : A unique identifier for the repository or registry. -# -# 7. omnia_repo_url_rhel_aarch64 +# policy : Repository sync policy. Allowed values: always, partial (OPTIONAL) +# If not provided, uses repo_config from software_config.json +# caching : Enable or disable local caching. Allowed values: true, false (OPTIONAL) +# If not provided, defaults to true +# 9. omnia_repo_url_rhel_aarch64 #-------------------------------- # Same as above but for RHEL aarch64. # -# 8. additional_repos_x86_64 +# 10. additional_repos_x86_64 #---------------------------- # Optional list of additional repository URLs for x86_64 architecture. # These repos are aggregated into a single Pulp repository, allowing dynamic @@ -105,6 +141,10 @@ # sslcacert : Path to SSL CA certificate (optional) # sslclientkey : Path to SSL client key (optional) # sslclientcert : Path to SSL client certificate (optional) +# policy : Repository sync policy. Allowed values: always, partial (OPTIONAL) +# If not provided, uses repo_config from software_config.json +# caching : Enable or disable local caching. Allowed values: true, false (OPTIONAL) +# If not provided, defaults to true # Notes: # - All repos are synced into a single aggregated Pulp repository # - Compute nodes are configured once with a fixed URL that never changes @@ -112,7 +152,7 @@ # - Name must be unique within this list and must not conflict with names in other repo keys # - Packages from these repos can only be used via additional_packages.json # -# 9. additional_repos_aarch64 +# 11. additional_repos_aarch64 #----------------------------- # Same as above but for aarch64 architecture. 
@@ -133,6 +173,12 @@ user_repo_url_aarch64: # - { url: "http://AppStream.com/AppStream/x86_64/os/", gpgkey: "http://AppStream.com/AppStream/x86_64/os/RPM-GPG-KEY", sslcacert: "", sslclientkey: "", sslclientcert: "", name: "x86_64_appstream" } rhel_os_url_x86_64: rhel_os_url_aarch64: +# Example: +# rhel_subscription_repo_config_x86_64: +# - { url: "https://example.com/appstream", gpgkey: "", sslcacert: "", sslclientkey: "", sslclientcert: "", name: "x86_64_appstream", policy: "always", caching: true } +# - { url: "https://cdn.redhat.com/content/dist/rhel10/10.0/x86_64/supplementary/os/", gpgkey: "file:///etc/pki/rpm-gpg/RPM-GPG-KEY-redhat-release", sslcacert: "", sslclientkey: "", sslclientcert: "", name: "x86_64_supplementary", policy: "always", caching: false } +rhel_subscription_repo_config_x86_64: +rhel_subscription_repo_config_aarch64: # Making incorrect changes to this variable can cause omnia failure. Please edit cautiously. omnia_repo_url_rhel_x86_64: - { url: "https://download.docker.com/linux/centos/10/x86_64/stable/", gpgkey: "https://download.docker.com/linux/centos/gpg", name: "docker-ce"} diff --git a/local_repo/roles/validation/tasks/configure_rhel_os_urls.yml b/local_repo/roles/validation/tasks/configure_rhel_os_urls.yml index ec6766f2a3..9464284758 100644 --- a/local_repo/roles/validation/tasks/configure_rhel_os_urls.yml +++ b/local_repo/roles/validation/tasks/configure_rhel_os_urls.yml @@ -1,4 +1,4 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -51,6 +51,11 @@ sslclientkey: "{{ lookup('pipe', 'ls {{ omnia_rhel_cert_dir }}/*-key.pem | head -n1') }}" sslclientcert: "{{ lookup('pipe', 'ls {{ omnia_rhel_cert_dir }}/*.pem | grep -v -- -key.pem | head -n1') }}" sub_rhel_x86_64_urls: [] + sub_rhel_aarch64_urls: [] + sub_policy_default: "{{ sw_config.repo_config | default('on_demand') }}" + sub_caching_default: true + sub_x86_64_override_config: "{{ local_config.rhel_subscription_repo_config_x86_64 | default([]) }}" + sub_aarch64_override_config: "{{ local_config.rhel_subscription_repo_config_aarch64 | default([]) }}" - name: Append repo entries to x86_64 list ansible.builtin.set_fact: @@ -64,7 +69,8 @@ 'sslcacert' : sslcacert, 'sslclientkey' : sslclientkey, 'sslclientcert': sslclientcert, - 'policy': 'partial', + 'policy': sub_policy_default, + 'caching': sub_caching_default, 'name': ( arch ~ '_appstream' if 'appstream' in repo_url else arch ~ '_baseos' if 'baseos' in repo_url else @@ -83,12 +89,106 @@ 'name': item.name | replace('x86_64', 'aarch64'), 'gpgkey': item.gpgkey, 'policy': item.policy, + 'caching': item.caching, 'sslcacert': item.sslcacert, 'sslclientcert': item.sslclientcert, 'sslclientkey': item.sslclientkey}] }}" loop: "{{ sub_rhel_x86_64_urls }}" loop_control: loop_var: item + + # 3️ Apply override configurations and merge additional repositories + - name: Create name mapping for x86_64 dynamic repos + ansible.builtin.set_fact: + x86_64_dynamic_names: "{{ sub_rhel_x86_64_urls | map(attribute='name') | list }}" + + - name: Apply x86_64 overrides to matching repos + ansible.builtin.set_fact: + sub_rhel_x86_64_urls: >- + {%- set result = [] -%} + {%- for repo in sub_rhel_x86_64_urls -%} + {%- set override = (sub_x86_64_override_config | selectattr('name', 'equalto', repo.name) | first | default({})) -%} + {%- set updated_repo = repo | combine({ + 'policy': override.policy | default(repo.policy), + 'caching': override.caching | default(repo.caching), + 'url': override.url | default(repo.url), + 
'gpgkey': override.gpgkey | default(repo.gpgkey) + }) -%} + {%- set _ = result.append(updated_repo) -%} + {%- endfor -%} + {{ result }} + + - name: Identify non-matching x86_64 override repos + ansible.builtin.set_fact: + additional_x86_64_repos: >- + {{ + sub_x86_64_override_config | rejectattr('name', 'in', x86_64_dynamic_names) | list + }} + + - name: Add non-matching x86_64 override repos as additional + ansible.builtin.set_fact: + sub_rhel_x86_64_urls: >- + {%- set result = sub_rhel_x86_64_urls -%} + {%- for repo in additional_x86_64_repos -%} + {%- set new_repo = { + 'url': repo.url, + 'gpgkey': repo.gpgkey | default(''), + 'name': repo.name, + 'policy': repo.policy | default(sub_policy_default), + 'caching': repo.caching | default(sub_caching_default), + 'sslcacert': sslcacert, + 'sslclientcert': sslclientcert, + 'sslclientkey': sslclientkey + } -%} + {%- set _ = result.append(new_repo) -%} + {%- endfor -%} + {{ result }} + + - name: Apply aarch64 overrides to matching repos + ansible.builtin.set_fact: + sub_rhel_aarch64_urls: >- + {%- set result = [] -%} + {%- for repo in sub_rhel_aarch64_urls -%} + {%- set override = (sub_aarch64_override_config | selectattr('name', 'equalto', repo.name) | first | default({})) -%} + {%- set updated_repo = repo | combine({ + 'policy': override.policy | default(repo.policy), + 'caching': override.caching | default(repo.caching), + 'url': override.url | default(repo.url), + 'gpgkey': override.gpgkey | default(repo.gpgkey) + }) -%} + {%- set _ = result.append(updated_repo) -%} + {%- endfor -%} + {{ result }} + + - name: Identify non-matching aarch64 override repos + ansible.builtin.set_fact: + aarch64_dynamic_names: "{{ sub_rhel_aarch64_urls | map(attribute='name') | list }}" + additional_aarch64_repos: >- + {{ + sub_aarch64_override_config | rejectattr('name', 'in', aarch64_dynamic_names) | list + }} + when: "'aarch64' in archs" + + - name: Add non-matching aarch64 override repos as additional + ansible.builtin.set_fact: + 
sub_rhel_aarch64_urls: >- + {%- set result = sub_rhel_aarch64_urls -%} + {%- for repo in additional_aarch64_repos -%} + {%- set new_repo = { + 'url': repo.url, + 'gpgkey': repo.gpgkey | default(''), + 'name': repo.name, + 'policy': repo.policy | default(sub_policy_default), + 'caching': repo.caching | default(sub_caching_default), + 'sslcacert': sslcacert, + 'sslclientcert': sslclientcert, + 'sslclientkey': sslclientkey + } -%} + {%- set _ = result.append(new_repo) -%} + {%- endfor -%} + {{ result }} + when: "'aarch64' in archs" + - name: Build final repo dict ansible.builtin.set_fact: sub_final_repo_urls: diff --git a/prepare_oim/roles/deploy_containers/pulp/vars/main.yml b/prepare_oim/roles/deploy_containers/pulp/vars/main.yml index 26dbec2dae..bcca679033 100644 --- a/prepare_oim/roles/deploy_containers/pulp/vars/main.yml +++ b/prepare_oim/roles/deploy_containers/pulp/vars/main.yml @@ -40,9 +40,12 @@ arg_list: - "-e PULP_API_WORKERS_MAX_REQUESTS_JITTER=50" pulp_deployed_msg: "The {{ pulp_container_name }} container has been successfully deployed." pulp_deployed_fail_msg: - The deployment of the {{ pulp_container_name }} container has failed. To resolve this issue, - please run the utility/oim_cleanup.yml playbook to clean up any existing OIM resources. - After the cleanup, you can re-run the original playbook to deploy the {{ pulp_container_name }} container successfully. + "The {{ pulp_container_name }} container deployment failed. Common causes: + • Missing or inaccessible pulp container image + • Pulp service not starting successfully + • NFS storage not reachable or not mounted + Run utility/oim_cleanup.yml to cleanup, then re-run the playbook to deploy the {{ pulp_container_name }} + container successfully." 
retries_var: 8 delay_var: 30 delay_var_sixty: 30 From 381a64055d4734e6b29b0c1432919e86bb9466ea Mon Sep 17 00:00:00 2001 From: mithileshreddy04 Date: Mon, 23 Feb 2026 15:48:27 +0530 Subject: [PATCH 59/77] SSH pemission and access issue fix for upgrade after prepare_oim --- omnia.sh | 38 ++++---- .../prepare_oim_validation/tasks/main.yml | 3 + .../tasks/validate_ssh_permissions.yml | 93 +++++++++++++++++++ .../templates/network_spec.j2 | 4 +- .../templates/omnia_config.j2 | 2 + 5 files changed, 121 insertions(+), 19 deletions(-) create mode 100644 prepare_oim/roles/prepare_oim_validation/tasks/validate_ssh_permissions.yml diff --git a/omnia.sh b/omnia.sh index 530c168e7d..a9d35defd6 100755 --- a/omnia.sh +++ b/omnia.sh @@ -381,6 +381,8 @@ setup_omnia_core() { # Post container setup configuration post_setup_config + remove_container_omnia_sh + # Start the container start_container_session } @@ -1102,8 +1104,6 @@ EOF firewall-cmd --permanent --zone=public --add-port=2222/tcp firewall-cmd --reload } - -# This function sets up the configuration for the Omnia core. # post_setup_config is a function that sets up the configuration for the Omnia core. # It creates the necessary directories and files, copies input files from the Omnia container, # and creates the oim_metadata.yml file. @@ -1117,7 +1117,6 @@ post_setup_config() { mkdir -p "$OMNIA_INPUT_DIR/" # Create the default.yml file if it does not exist. - # This file contains the name of the project. if [ ! 
-f "$OMNIA_INPUT_DIR/default.yml" ]; then echo -e "${BLUE} Creating default.yml file.${NC}" { @@ -1140,33 +1139,38 @@ post_setup_config() { } validate_nfs_server() { - - # Validate NFS server permission if [ "$share_option" = "NFS" ]; then - # Create a temporary file inside $omnia_path - temp_file="$omnia_path/temp_file" + local temp_file="$omnia_path/temp_file" touch "$temp_file" - # Check if the file can be chown to root if chown root:root "$temp_file"; then - rm "$temp_file" + rm -f "$temp_file" else echo "Error: Unable to chown file to root in $omnia_path. NFS server permission validation failed. Please ensure no_root_squash option is enabled in the NFS export configuration." exit 1 fi + if [ "`ls -ld $omnia_path/omnia/ssh_config/.ssh/id_rsa | awk '{print $3 ":" $4}'`" != "root:root" ]; then echo "Error: The $omnia_path/omnia/ssh_config/.ssh/id_rsa file should be owned by root:root. NFS server permission validation failed. Please verify the NFS export configuration." exit 1 fi fi +} +refresh_known_hosts() { + local ssh_port=2222 + + mkdir -p "$HOME/.ssh" + touch "$HOME/.ssh/known_hosts" + ssh-keygen -R "[localhost]:$ssh_port" >/dev/null 2>&1 || true + ssh-keyscan -p "$ssh_port" localhost 2>/dev/null | grep -v "^#" >> "$HOME/.ssh/known_hosts" || true } init_ssh_config() { - mkdir -p "$HOME/.ssh" - touch $HOME/.ssh/known_hosts - # Add entry to /root/.ssh/known_hosts file to prevent errors caused by Known host - ssh-keygen -R "[localhost]:2222" >/dev/null 2>&1 # Remove existing entry if it exists - ssh-keyscan -p 2222 localhost 2>/dev/null | grep -v "^#" >> $HOME/.ssh/known_hosts # Scan and add the new key + refresh_known_hosts +} + +remove_container_omnia_sh() { + podman exec -u root omnia_core bash -c 'if [ -f /omnia/omnia.sh ]; then rm -f /omnia/omnia.sh; fi' >/dev/null 2>&1 || true } start_container_session() { @@ -1213,8 +1217,8 @@ show_help() { echo "Usage: $0 [--install | --uninstall | --upgrade | --rollback | --version | --help]" echo " -i, --install 
Install and start the Omnia core container" echo " -u, --uninstall Uninstall the Omnia core container and clean up configuration" - echo " --upgrade Upgrade the Omnia core container to newer version - echo " --rollback Rollback the Omnia core container to previous version + echo " --upgrade Upgrade the Omnia core container to newer version" + echo " --rollback Rollback the Omnia core container to previous version" echo " -v, --version Display Omnia version information" echo " -h, --help More information about usage" } @@ -1906,6 +1910,7 @@ upgrade_omnia_core() { show_post_upgrade_instructions "$TARGET_OMNIA_VERSION" # Initialize SSH config and start container session init_ssh_config + remove_container_omnia_sh start_container_session exit 0 } @@ -2324,6 +2329,7 @@ rollback_omnia_core() { # Initialize SSH config and start container session init_ssh_config + remove_container_omnia_sh start_container_session } diff --git a/prepare_oim/roles/prepare_oim_validation/tasks/main.yml b/prepare_oim/roles/prepare_oim_validation/tasks/main.yml index 5a252a6114..7da0078e44 100644 --- a/prepare_oim/roles/prepare_oim_validation/tasks/main.yml +++ b/prepare_oim/roles/prepare_oim_validation/tasks/main.yml @@ -13,6 +13,9 @@ # limitations under the License. --- +- name: Validate SSH permissions and ownership + ansible.builtin.include_tasks: validate_ssh_permissions.yml + - name: Validate passwordless ssh host ansible.builtin.include_tasks: validate_passwordless_ssh_oim.yml diff --git a/prepare_oim/roles/prepare_oim_validation/tasks/validate_ssh_permissions.yml b/prepare_oim/roles/prepare_oim_validation/tasks/validate_ssh_permissions.yml new file mode 100644 index 0000000000..aa5a019b93 --- /dev/null +++ b/prepare_oim/roles/prepare_oim_validation/tasks/validate_ssh_permissions.yml @@ -0,0 +1,93 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +- name: Ensure SSH critical paths have safe ownership and permissions + block: + - name: Define SSH critical paths + ansible.builtin.set_fact: + ssh_critical_paths: + - { path: "/root/.ssh", state: "directory", mode: "0700" } + - { path: "/root/.ssh/authorized_keys", state: "file", mode: "0600" } + - { path: "/root/.ssh/id_rsa", state: "file", mode: "0600" } + - { path: "/root/.ssh/id_rsa.pub", state: "file", mode: "0644" } + - { path: "/root/.ssh/known_hosts", state: "file", mode: "0644" } + - { path: "/root/.ssh/config", state: "file", mode: "0600" } + + - name: Ensure SSH directory exists with secure mode + ansible.builtin.file: + path: "/root/.ssh" + state: directory + mode: "0700" + owner: root + group: root + register: ssh_dir_result + + - name: Stat SSH critical paths + ansible.builtin.stat: + path: "{{ item.path }}" + get_checksum: false + register: ssh_path_stats + loop: "{{ ssh_critical_paths }}" + loop_control: + label: "{{ item.path }}" + + - name: Enforce SSH ownership and permissions for existing files + ansible.builtin.file: + path: "{{ item.item.path }}" + state: "{{ item.item.state }}" + mode: "{{ item.item.mode }}" + owner: root + group: root + loop: "{{ ssh_path_stats.results }}" + loop_control: + label: "{{ item.item.path }}" + when: + - item.stat.exists | default(false) + - item.item.state == 'file' + register: ssh_path_fixes + + - name: Log SSH permission adjustments + ansible.builtin.debug: 
+ msg: "Adjusted SSH permissions for {{ item.item.path }} to mode {{ item.item.mode }} with owner root:root" + loop: "{{ (ssh_path_fixes.results | default([]))\ + | selectattr('item', 'defined')\ + | selectattr('changed', 'defined')\ + | selectattr('changed')\ + | list }}" + loop_control: + label: "{{ item.item.path | default('unknown path') }}" + + - name: Log SSH directory adjustments + ansible.builtin.debug: + msg: "Ensured SSH directory {{ ssh_dir_result.path | default('/root/.ssh') }} mode 0700 owner root:root" + when: ssh_dir_result.changed | default(false) + + - name: Validate SSH permission state + ansible.builtin.assert: + that: + - not (item.stat.exists | default(false)) or (item.stat.pw_name == 'root' and item.stat.gr_name == 'root') + - not (item.stat.exists | default(false)) or (item.stat.mode | string)[-4:] == item_mode_expected[item.item.path] + fail_msg: "SSH path {{ item.item.path }} has invalid ownership or mode. Expected root:root with mode {{ item_mode_expected[item.item.path] }}. Fix manually or rerun prepare_oim." + vars: + item_mode_expected: "{{ dict(ssh_critical_paths | map(attribute='path') | zip(ssh_critical_paths | map(attribute='mode'))) }}" + loop: "{{ ssh_path_stats.results }}" + loop_control: + label: "{{ item.item.path }}" + when: item.stat.exists | default(false) + + rescue: + - name: Fail upgrade due to SSH permission issues + ansible.builtin.fail: + msg: "SSH permission validation failed: {{ ansible_failed_result.msg | default('Unknown error') }}. Correct SSH file permissions/ownership and rerun prepare_oim." 
diff --git a/upgrade/roles/import_input_parameters/templates/network_spec.j2 b/upgrade/roles/import_input_parameters/templates/network_spec.j2 index d9e41ba469..564c057db4 100644 --- a/upgrade/roles/import_input_parameters/templates/network_spec.j2 +++ b/upgrade/roles/import_input_parameters/templates/network_spec.j2 @@ -43,9 +43,7 @@ Networks: oim_nic_name: "{{ admin_network.oim_nic_name | default('') }}" netmask_bits: "{{ admin_network.netmask_bits | default('24') }}" primary_oim_admin_ip: "{{ admin_network.primary_oim_admin_ip | default('') }}" -{% if (admin_network.primary_oim_bmc_ip is defined) and ((admin_network.primary_oim_bmc_ip | string | trim) != '') %} - primary_oim_bmc_ip: "{{ admin_network.primary_oim_bmc_ip }}" -{% endif %} + primary_oim_bmc_ip: "{{ admin_network.primary_oim_bmc_ip | default('') }}" dynamic_range: "{{ admin_network.dynamic_range | default('') }}" dns: {{ admin_network.dns | default([]) }} ntp_servers: {{ admin_network.ntp_servers | default([]) }} diff --git a/upgrade/roles/import_input_parameters/templates/omnia_config.j2 b/upgrade/roles/import_input_parameters/templates/omnia_config.j2 index aec7a05ab7..eff82ee1c5 100644 --- a/upgrade/roles/import_input_parameters/templates/omnia_config.j2 +++ b/upgrade/roles/import_input_parameters/templates/omnia_config.j2 @@ -47,6 +47,7 @@ slurm_cluster: - cluster_name: {{ _cluster.cluster_name | default('') }} nfs_storage_name: {{ _cluster.nfs_storage_name | default('') }} {% if _cluster.config_sources is defined and (_cluster.config_sources | length > 0) %} + skip_merge: {{ _cluster.skip_merge | default(true) }} config_sources: {% set _supported = ['slurm', 'cgroup', 'slurmdbd', 'gres'] %} {% for _conf_name, _conf_val in _cluster.config_sources.items() %} @@ -84,6 +85,7 @@ slurm_cluster: # slurmdbd: /path/to/custom_slurmdbd.conf # gres: /path/to/custom_gres.conf {% else %} + # skip_merge: True # config_sources: # slurm: # SlurmctldTimeout: 60 From f397f00701f5b6b0c1bdce13bd112c3b2c70ec63 Mon 
Sep 17 00:00:00 2001 From: pullan1 Date: Mon, 23 Feb 2026 16:28:31 +0530 Subject: [PATCH 60/77] ansible lint fix Signed-off-by: pullan1 --- prepare_oim/roles/deploy_containers/pulp/vars/main.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/prepare_oim/roles/deploy_containers/pulp/vars/main.yml b/prepare_oim/roles/deploy_containers/pulp/vars/main.yml index bcca679033..da17b168d3 100644 --- a/prepare_oim/roles/deploy_containers/pulp/vars/main.yml +++ b/prepare_oim/roles/deploy_containers/pulp/vars/main.yml @@ -42,9 +42,9 @@ pulp_deployed_msg: "The {{ pulp_container_name }} container has been successfull pulp_deployed_fail_msg: "The {{ pulp_container_name }} container deployment failed. Common causes: • Missing or inaccessible pulp container image - • Pulp service not starting successfully + • Pulp service not starting successfully • NFS storage not reachable or not mounted - Run utility/oim_cleanup.yml to cleanup, then re-run the playbook to deploy the {{ pulp_container_name }} + Run utility/oim_cleanup.yml to cleanup, then re-run the playbook to deploy the {{ pulp_container_name }} container successfully." 
retries_var: 8 delay_var: 30 From f0a6461a589523cda229ecd6ccbd797863812e32 Mon Sep 17 00:00:00 2001 From: mithileshreddy04 Date: Mon, 23 Feb 2026 16:53:08 +0530 Subject: [PATCH 61/77] Update validate_ssh_permissions.yml --- .../tasks/validate_ssh_permissions.yml | 24 +++++++++++++------ 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/prepare_oim/roles/prepare_oim_validation/tasks/validate_ssh_permissions.yml b/prepare_oim/roles/prepare_oim_validation/tasks/validate_ssh_permissions.yml index aa5a019b93..6c01a95f47 100644 --- a/prepare_oim/roles/prepare_oim_validation/tasks/validate_ssh_permissions.yml +++ b/prepare_oim/roles/prepare_oim_validation/tasks/validate_ssh_permissions.yml @@ -61,10 +61,11 @@ - name: Log SSH permission adjustments ansible.builtin.debug: msg: "Adjusted SSH permissions for {{ item.item.path }} to mode {{ item.item.mode }} with owner root:root" - loop: "{{ (ssh_path_fixes.results | default([]))\ - | selectattr('item', 'defined')\ - | selectattr('changed', 'defined')\ - | selectattr('changed')\ + loop: "{{ (ssh_path_fixes.results | default([])) + | selectattr('item', 'defined') + | selectattr('item.path', 'defined') + | selectattr('changed', 'defined') + | selectattr('changed') | list }}" loop_control: label: "{{ item.item.path | default('unknown path') }}" @@ -73,13 +74,20 @@ ansible.builtin.debug: msg: "Ensured SSH directory {{ ssh_dir_result.path | default('/root/.ssh') }} mode 0700 owner root:root" when: ssh_dir_result.changed | default(false) + changed_when: false - name: Validate SSH permission state ansible.builtin.assert: that: - not (item.stat.exists | default(false)) or (item.stat.pw_name == 'root' and item.stat.gr_name == 'root') - - not (item.stat.exists | default(false)) or (item.stat.mode | string)[-4:] == item_mode_expected[item.item.path] - fail_msg: "SSH path {{ item.item.path }} has invalid ownership or mode. Expected root:root with mode {{ item_mode_expected[item.item.path] }}. 
Fix manually or rerun prepare_oim." + - >- + not (item.stat.exists | default(false)) or + (item.item.path == '/root/.ssh/known_hosts' and (item.stat.mode | string)[-4:] in ['0600', '0644']) or + (item.item.path != '/root/.ssh/known_hosts' and (item.stat.mode | string)[-4:] == item_mode_expected[item.item.path]) + fail_msg: >- + SSH path {{ item.item.path }} has invalid ownership or mode. + Expected root:root with mode {{ item_mode_expected[item.item.path] }}{% if item.item.path == '/root/.ssh/known_hosts' %} (or 0600/0644 for known_hosts){% endif %}. + Fix manually or rerun prepare_oim. vars: item_mode_expected: "{{ dict(ssh_critical_paths | map(attribute='path') | zip(ssh_critical_paths | map(attribute='mode'))) }}" loop: "{{ ssh_path_stats.results }}" @@ -90,4 +98,6 @@ rescue: - name: Fail upgrade due to SSH permission issues ansible.builtin.fail: - msg: "SSH permission validation failed: {{ ansible_failed_result.msg | default('Unknown error') }}. Correct SSH file permissions/ownership and rerun prepare_oim." + msg: >- + SSH permission validation failed: {{ ansible_failed_result.msg | default('Unknown error') }}. + Correct SSH file permissions/ownership and rerun prepare_oim. 
From d7f2cef080f952a7eae3d4da2f72907ce7b7d094 Mon Sep 17 00:00:00 2001 From: mithileshreddy04 Date: Mon, 23 Feb 2026 17:04:46 +0530 Subject: [PATCH 62/77] updated validate_ssh_permissions.yml --- .../tasks/validate_ssh_permissions.yml | 13 ++++--------- .../roles/prepare_oim_validation/vars/main.yml | 12 ++++++++++++ 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/prepare_oim/roles/prepare_oim_validation/tasks/validate_ssh_permissions.yml b/prepare_oim/roles/prepare_oim_validation/tasks/validate_ssh_permissions.yml index 6c01a95f47..6d01c94a5e 100644 --- a/prepare_oim/roles/prepare_oim_validation/tasks/validate_ssh_permissions.yml +++ b/prepare_oim/roles/prepare_oim_validation/tasks/validate_ssh_permissions.yml @@ -60,7 +60,7 @@ - name: Log SSH permission adjustments ansible.builtin.debug: - msg: "Adjusted SSH permissions for {{ item.item.path }} to mode {{ item.item.mode }} with owner root:root" + msg: "{{ ssh_file_log_msg }}" loop: "{{ (ssh_path_fixes.results | default([])) | selectattr('item', 'defined') | selectattr('item.path', 'defined') @@ -72,7 +72,7 @@ - name: Log SSH directory adjustments ansible.builtin.debug: - msg: "Ensured SSH directory {{ ssh_dir_result.path | default('/root/.ssh') }} mode 0700 owner root:root" + msg: "{{ ssh_dir_log_msg }}" when: ssh_dir_result.changed | default(false) changed_when: false @@ -84,10 +84,7 @@ not (item.stat.exists | default(false)) or (item.item.path == '/root/.ssh/known_hosts' and (item.stat.mode | string)[-4:] in ['0600', '0644']) or (item.item.path != '/root/.ssh/known_hosts' and (item.stat.mode | string)[-4:] == item_mode_expected[item.item.path]) - fail_msg: >- - SSH path {{ item.item.path }} has invalid ownership or mode. - Expected root:root with mode {{ item_mode_expected[item.item.path] }}{% if item.item.path == '/root/.ssh/known_hosts' %} (or 0600/0644 for known_hosts){% endif %}. - Fix manually or rerun prepare_oim. 
+ fail_msg: "{{ ssh_permission_fail_msg }}" vars: item_mode_expected: "{{ dict(ssh_critical_paths | map(attribute='path') | zip(ssh_critical_paths | map(attribute='mode'))) }}" loop: "{{ ssh_path_stats.results }}" @@ -98,6 +95,4 @@ rescue: - name: Fail upgrade due to SSH permission issues ansible.builtin.fail: - msg: >- - SSH permission validation failed: {{ ansible_failed_result.msg | default('Unknown error') }}. - Correct SSH file permissions/ownership and rerun prepare_oim. + msg: "{{ ssh_validation_fail_msg }}" diff --git a/prepare_oim/roles/prepare_oim_validation/vars/main.yml b/prepare_oim/roles/prepare_oim_validation/vars/main.yml index 79bd5f5b4d..5eda60a210 100644 --- a/prepare_oim/roles/prepare_oim_validation/vars/main.yml +++ b/prepare_oim/roles/prepare_oim_validation/vars/main.yml @@ -84,3 +84,15 @@ functional_groups_config_syntax_fail_msg: "Failed. Syntax errors present in func telemetry_config_file: "telemetry_config.yml" fail_msg_telemetry_config_file: "telemetry_config.yml file doesn't exist in the input folder." telemetry_config_syntax_fail_msg: "Failed. Syntax errors present in telemetry_config.yml. Fix errors and re-run playbook again. Common syntax Errors:" + +# Usage: validate_ssh_permissions.yml +ssh_dir_log_msg: "Ensured SSH directory {{ ssh_dir_result.path | default('/root/.ssh') }} mode 0700 owner root:root" +ssh_file_log_msg: "Adjusted SSH permissions for {{ item.item.path }} to mode {{ item.item.mode }} with owner root:root" +ssh_permission_fail_msg: >- + SSH path {{ item.item.path }} has invalid ownership or mode. + Expected root:root with mode {{ item_mode_expected[item.item.path] }} + {% if item.item.path == '/root/.ssh/known_hosts' %} (or 0600/0644 for known_hosts){% endif %}. + Fix manually or rerun prepare_oim. +ssh_validation_fail_msg: >- + SSH permission validation failed: {{ ansible_failed_result.msg | default('Unknown error') }}. + Correct SSH file permissions/ownership and rerun prepare_oim. 
From b606a13e91de44a56d2fb86c9c1aef19ca814322 Mon Sep 17 00:00:00 2001 From: mithileshreddy04 Date: Mon, 23 Feb 2026 17:17:04 +0530 Subject: [PATCH 63/77] Update validate_ssh_permissions.yml --- .../prepare_oim_validation/tasks/validate_ssh_permissions.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/prepare_oim/roles/prepare_oim_validation/tasks/validate_ssh_permissions.yml b/prepare_oim/roles/prepare_oim_validation/tasks/validate_ssh_permissions.yml index 6d01c94a5e..a3581b39e1 100644 --- a/prepare_oim/roles/prepare_oim_validation/tasks/validate_ssh_permissions.yml +++ b/prepare_oim/roles/prepare_oim_validation/tasks/validate_ssh_permissions.yml @@ -73,7 +73,6 @@ - name: Log SSH directory adjustments ansible.builtin.debug: msg: "{{ ssh_dir_log_msg }}" - when: ssh_dir_result.changed | default(false) changed_when: false - name: Validate SSH permission state From 1d6e7e31cf18205ea4927e0f39bf7b29402e85e6 Mon Sep 17 00:00:00 2001 From: Nagachandan-P Date: Mon, 23 Feb 2026 12:48:35 +0000 Subject: [PATCH 64/77] ARM nodes gpu detection --- .../slurm_config/tasks/read_node_idrac.yml | 93 +++++++++++++++++++ 1 file changed, 93 insertions(+) diff --git a/discovery/roles/slurm_config/tasks/read_node_idrac.yml b/discovery/roles/slurm_config/tasks/read_node_idrac.yml index 8424f69603..a713863438 100644 --- a/discovery/roles/slurm_config/tasks/read_node_idrac.yml +++ b/discovery/roles/slurm_config/tasks/read_node_idrac.yml @@ -41,6 +41,99 @@ | selectattr('Manufacturer', 'defined') | selectattr('Manufacturer', 'search', '(?i)nvidia') | list }}" # TODO: other GPUs also +- name: Fallback - Read PCIe Devices for GPU detection (when no GPUs found via Processors) + ansible.builtin.uri: + url: "https://{{ bmc_ip_map[item] }}/redfish/v1/Chassis/System.Embedded.1/PCIeDevices" + user: "{{ bmc_username }}" + password: "{{ bmc_password }}" + method: GET + force_basic_auth: true + validate_certs: false + return_content: true + body_format: json + timeout: 60 + headers: + Accept: 
"application/json" + Content-Type: "application/json" + OData-Version: "4.0" + status_code: + - 200 + register: pcie_devices + failed_when: false + when: gpus | length == 0 + +- name: Debug - Show PCIe devices structure + ansible.builtin.debug: + var: pcie_devices.json.Members + when: gpus | length == 0 and pcie_devices.json.Members is defined + +- name: Fallback - Extract PCIe device URLs + ansible.builtin.set_fact: + pcie_device_urls: "{{ pcie_devices.json.Members | default([]) | json_query('[*].\"@odata.id\"') }}" + when: gpus | length == 0 + +- name: Fallback - Get PCIe Device details for GPU detection + ansible.builtin.uri: + url: "https://{{ bmc_ip_map[item.0] }}{{ item.1 }}" + user: "{{ bmc_username }}" + password: "{{ bmc_password }}" + method: GET + force_basic_auth: true + validate_certs: false + return_content: true + body_format: json + timeout: 60 + headers: + Accept: "application/json" + Content-Type: "application/json" + OData-Version: "4.0" + status_code: + - 200 + register: pcie_device_details + with_nested: + - ["{{ item }}"] + - "{{ pcie_device_urls | default([]) }}" + loop_control: + label: "{{ item.1 }}" + failed_when: false + when: gpus | length == 0 and pcie_device_urls is defined and pcie_device_urls | length > 0 + +- name: Fallback - Detect GPUs from PCIe devices + ansible.builtin.set_fact: + fallback_gpus: "{{ pcie_device_details.results | default([]) + | selectattr('json', 'defined') + | map(attribute='json') + | selectattr('ClassCode', 'defined') + | selectattr('VendorId', 'defined') + | selectattr('ClassCode', 'equalto', '0x0300') | list }}" + when: gpus | length == 0 + +- name: Fallback - Detect GPUs from PCIe devices (additional criteria) + ansible.builtin.set_fact: + fallback_gpus_additional: "{{ pcie_device_details.results | default([]) + | selectattr('json', 'defined') + | map(attribute='json') + | selectattr('ClassCode', 'defined') + | selectattr('VendorId', 'defined') + | selectattr('ClassCode', 'equalto', '0x0302') | list }}" + 
when: gpus | length == 0 and fallback_gpus | default([]) | length == 0 + +- name: Fallback - Detect GPUs from Manufacturer/Name (NVIDIA only) + ansible.builtin.set_fact: + fallback_gpus_manufacturer: "{{ pcie_device_details.results | default([]) + | selectattr('json', 'defined') + | map(attribute='json') + | selectattr('Manufacturer', 'defined') + | selectattr('Name', 'defined') + | selectattr('Manufacturer', 'search', '(?i)NVIDIA') + | selectattr('Name', 'search', '(?i)GPU|RTX|TESLA|A100|H100|L40|GB') | list }}" + when: gpus | length == 0 and fallback_gpus | default([]) | length == 0 and fallback_gpus_additional | default([]) | length == 0 + +- name: Fallback - Update GPUs list if PCIe detection found GPUs + ansible.builtin.set_fact: + gpus: "{{ (fallback_gpus | default([])) or (fallback_gpus_additional | default([])) or (fallback_gpus_manufacturer | default([])) }}" + when: gpus | length == 0 + - name: Read Memory NodeParams ansible.builtin.uri: From 513443ebb62348834fcd35ba71a4a0d1bc9c27ca Mon Sep 17 00:00:00 2001 From: mithileshreddy04 Date: Mon, 23 Feb 2026 18:50:55 +0530 Subject: [PATCH 65/77] Update omnia.sh --- omnia.sh | 13 +-- .../prepare_oim_validation/tasks/main.yml | 3 - .../tasks/validate_ssh_permissions.yml | 97 ------------------- .../prepare_oim_validation/vars/main.yml | 12 --- 4 files changed, 5 insertions(+), 120 deletions(-) delete mode 100644 prepare_oim/roles/prepare_oim_validation/tasks/validate_ssh_permissions.yml diff --git a/omnia.sh b/omnia.sh index a9d35defd6..9de277a56d 100755 --- a/omnia.sh +++ b/omnia.sh @@ -1156,7 +1156,7 @@ validate_nfs_server() { fi } -refresh_known_hosts() { +init_ssh_config() { local ssh_port=2222 mkdir -p "$HOME/.ssh" @@ -1165,10 +1165,6 @@ refresh_known_hosts() { ssh-keyscan -p "$ssh_port" localhost 2>/dev/null | grep -v "^#" >> "$HOME/.ssh/known_hosts" || true } -init_ssh_config() { - refresh_known_hosts -} - remove_container_omnia_sh() { podman exec -u root omnia_core bash -c 'if [ -f /omnia/omnia.sh ]; then 
rm -f /omnia/omnia.sh; fi' >/dev/null 2>&1 || true } @@ -1235,14 +1231,15 @@ install_omnia_core() { exit 1 fi fi - + local omnia_core_tag="2.1" local omnia_core_registry="" # Check if local omnia_core image exists using validate function - if validate_container_image "" "$omnia_core_tag" "install"; then - echo -e "${GREEN}✓ Omnia core image (omnia_core:${omnia_core_tag}) found locally.${NC}" + if ! validate_container_image "" "$omnia_core_tag" "install"; then + exit 1 fi + echo -e "${GREEN}✓ Omnia core image (omnia_core:${omnia_core_tag}) found locally.${NC}" # Check if any other containers with 'omnia' in their name are running other_containers=$(podman ps -a --format '{{.Names}}' | grep -E 'omnia' | grep -v 'omnia_core') diff --git a/prepare_oim/roles/prepare_oim_validation/tasks/main.yml b/prepare_oim/roles/prepare_oim_validation/tasks/main.yml index 7da0078e44..5a252a6114 100644 --- a/prepare_oim/roles/prepare_oim_validation/tasks/main.yml +++ b/prepare_oim/roles/prepare_oim_validation/tasks/main.yml @@ -13,9 +13,6 @@ # limitations under the License. --- -- name: Validate SSH permissions and ownership - ansible.builtin.include_tasks: validate_ssh_permissions.yml - - name: Validate passwordless ssh host ansible.builtin.include_tasks: validate_passwordless_ssh_oim.yml diff --git a/prepare_oim/roles/prepare_oim_validation/tasks/validate_ssh_permissions.yml b/prepare_oim/roles/prepare_oim_validation/tasks/validate_ssh_permissions.yml deleted file mode 100644 index a3581b39e1..0000000000 --- a/prepare_oim/roles/prepare_oim_validation/tasks/validate_ssh_permissions.yml +++ /dev/null @@ -1,97 +0,0 @@ -# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. ---- - -- name: Ensure SSH critical paths have safe ownership and permissions - block: - - name: Define SSH critical paths - ansible.builtin.set_fact: - ssh_critical_paths: - - { path: "/root/.ssh", state: "directory", mode: "0700" } - - { path: "/root/.ssh/authorized_keys", state: "file", mode: "0600" } - - { path: "/root/.ssh/id_rsa", state: "file", mode: "0600" } - - { path: "/root/.ssh/id_rsa.pub", state: "file", mode: "0644" } - - { path: "/root/.ssh/known_hosts", state: "file", mode: "0644" } - - { path: "/root/.ssh/config", state: "file", mode: "0600" } - - - name: Ensure SSH directory exists with secure mode - ansible.builtin.file: - path: "/root/.ssh" - state: directory - mode: "0700" - owner: root - group: root - register: ssh_dir_result - - - name: Stat SSH critical paths - ansible.builtin.stat: - path: "{{ item.path }}" - get_checksum: false - register: ssh_path_stats - loop: "{{ ssh_critical_paths }}" - loop_control: - label: "{{ item.path }}" - - - name: Enforce SSH ownership and permissions for existing files - ansible.builtin.file: - path: "{{ item.item.path }}" - state: "{{ item.item.state }}" - mode: "{{ item.item.mode }}" - owner: root - group: root - loop: "{{ ssh_path_stats.results }}" - loop_control: - label: "{{ item.item.path }}" - when: - - item.stat.exists | default(false) - - item.item.state == 'file' - register: ssh_path_fixes - - - name: Log SSH permission adjustments - ansible.builtin.debug: - msg: "{{ ssh_file_log_msg }}" - loop: "{{ (ssh_path_fixes.results | default([])) - | selectattr('item', 'defined') - | 
selectattr('item.path', 'defined') - | selectattr('changed', 'defined') - | selectattr('changed') - | list }}" - loop_control: - label: "{{ item.item.path | default('unknown path') }}" - - - name: Log SSH directory adjustments - ansible.builtin.debug: - msg: "{{ ssh_dir_log_msg }}" - changed_when: false - - - name: Validate SSH permission state - ansible.builtin.assert: - that: - - not (item.stat.exists | default(false)) or (item.stat.pw_name == 'root' and item.stat.gr_name == 'root') - - >- - not (item.stat.exists | default(false)) or - (item.item.path == '/root/.ssh/known_hosts' and (item.stat.mode | string)[-4:] in ['0600', '0644']) or - (item.item.path != '/root/.ssh/known_hosts' and (item.stat.mode | string)[-4:] == item_mode_expected[item.item.path]) - fail_msg: "{{ ssh_permission_fail_msg }}" - vars: - item_mode_expected: "{{ dict(ssh_critical_paths | map(attribute='path') | zip(ssh_critical_paths | map(attribute='mode'))) }}" - loop: "{{ ssh_path_stats.results }}" - loop_control: - label: "{{ item.item.path }}" - when: item.stat.exists | default(false) - - rescue: - - name: Fail upgrade due to SSH permission issues - ansible.builtin.fail: - msg: "{{ ssh_validation_fail_msg }}" diff --git a/prepare_oim/roles/prepare_oim_validation/vars/main.yml b/prepare_oim/roles/prepare_oim_validation/vars/main.yml index 5eda60a210..79bd5f5b4d 100644 --- a/prepare_oim/roles/prepare_oim_validation/vars/main.yml +++ b/prepare_oim/roles/prepare_oim_validation/vars/main.yml @@ -84,15 +84,3 @@ functional_groups_config_syntax_fail_msg: "Failed. Syntax errors present in func telemetry_config_file: "telemetry_config.yml" fail_msg_telemetry_config_file: "telemetry_config.yml file doesn't exist in the input folder." telemetry_config_syntax_fail_msg: "Failed. Syntax errors present in telemetry_config.yml. Fix errors and re-run playbook again. 
Common syntax Errors:" - -# Usage: validate_ssh_permissions.yml -ssh_dir_log_msg: "Ensured SSH directory {{ ssh_dir_result.path | default('/root/.ssh') }} mode 0700 owner root:root" -ssh_file_log_msg: "Adjusted SSH permissions for {{ item.item.path }} to mode {{ item.item.mode }} with owner root:root" -ssh_permission_fail_msg: >- - SSH path {{ item.item.path }} has invalid ownership or mode. - Expected root:root with mode {{ item_mode_expected[item.item.path] }} - {% if item.item.path == '/root/.ssh/known_hosts' %} (or 0600/0644 for known_hosts){% endif %}. - Fix manually or rerun prepare_oim. -ssh_validation_fail_msg: >- - SSH permission validation failed: {{ ansible_failed_result.msg | default('Unknown error') }}. - Correct SSH file permissions/ownership and rerun prepare_oim. From f2498e45ca1cf7c8be34d33bcfb127e65622d7c9 Mon Sep 17 00:00:00 2001 From: mithileshreddy04 Date: Mon, 23 Feb 2026 19:09:29 +0530 Subject: [PATCH 66/77] SSH pemission and access issue fix for upgrade after prepare_oim (#4019) --- omnia.sh | 41 ++++++++++--------- .../templates/network_spec.j2 | 4 +- .../templates/omnia_config.j2 | 2 + 3 files changed, 25 insertions(+), 22 deletions(-) diff --git a/omnia.sh b/omnia.sh index 530c168e7d..9de277a56d 100755 --- a/omnia.sh +++ b/omnia.sh @@ -381,6 +381,8 @@ setup_omnia_core() { # Post container setup configuration post_setup_config + remove_container_omnia_sh + # Start the container start_container_session } @@ -1102,8 +1104,6 @@ EOF firewall-cmd --permanent --zone=public --add-port=2222/tcp firewall-cmd --reload } - -# This function sets up the configuration for the Omnia core. # post_setup_config is a function that sets up the configuration for the Omnia core. # It creates the necessary directories and files, copies input files from the Omnia container, # and creates the oim_metadata.yml file. @@ -1117,7 +1117,6 @@ post_setup_config() { mkdir -p "$OMNIA_INPUT_DIR/" # Create the default.yml file if it does not exist. 
- # This file contains the name of the project. if [ ! -f "$OMNIA_INPUT_DIR/default.yml" ]; then echo -e "${BLUE} Creating default.yml file.${NC}" { @@ -1140,33 +1139,34 @@ post_setup_config() { } validate_nfs_server() { - - # Validate NFS server permission if [ "$share_option" = "NFS" ]; then - # Create a temporary file inside $omnia_path - temp_file="$omnia_path/temp_file" + local temp_file="$omnia_path/temp_file" touch "$temp_file" - # Check if the file can be chown to root if chown root:root "$temp_file"; then - rm "$temp_file" + rm -f "$temp_file" else echo "Error: Unable to chown file to root in $omnia_path. NFS server permission validation failed. Please ensure no_root_squash option is enabled in the NFS export configuration." exit 1 fi + if [ "`ls -ld $omnia_path/omnia/ssh_config/.ssh/id_rsa | awk '{print $3 ":" $4}'`" != "root:root" ]; then echo "Error: The $omnia_path/omnia/ssh_config/.ssh/id_rsa file should be owned by root:root. NFS server permission validation failed. Please verify the NFS export configuration." 
exit 1 fi fi - } init_ssh_config() { + local ssh_port=2222 + mkdir -p "$HOME/.ssh" - touch $HOME/.ssh/known_hosts - # Add entry to /root/.ssh/known_hosts file to prevent errors caused by Known host - ssh-keygen -R "[localhost]:2222" >/dev/null 2>&1 # Remove existing entry if it exists - ssh-keyscan -p 2222 localhost 2>/dev/null | grep -v "^#" >> $HOME/.ssh/known_hosts # Scan and add the new key + touch "$HOME/.ssh/known_hosts" + ssh-keygen -R "[localhost]:$ssh_port" >/dev/null 2>&1 || true + ssh-keyscan -p "$ssh_port" localhost 2>/dev/null | grep -v "^#" >> "$HOME/.ssh/known_hosts" || true +} + +remove_container_omnia_sh() { + podman exec -u root omnia_core bash -c 'if [ -f /omnia/omnia.sh ]; then rm -f /omnia/omnia.sh; fi' >/dev/null 2>&1 || true } start_container_session() { @@ -1213,8 +1213,8 @@ show_help() { echo "Usage: $0 [--install | --uninstall | --upgrade | --rollback | --version | --help]" echo " -i, --install Install and start the Omnia core container" echo " -u, --uninstall Uninstall the Omnia core container and clean up configuration" - echo " --upgrade Upgrade the Omnia core container to newer version - echo " --rollback Rollback the Omnia core container to previous version + echo " --upgrade Upgrade the Omnia core container to newer version" + echo " --rollback Rollback the Omnia core container to previous version" echo " -v, --version Display Omnia version information" echo " -h, --help More information about usage" } @@ -1231,14 +1231,15 @@ install_omnia_core() { exit 1 fi fi - + local omnia_core_tag="2.1" local omnia_core_registry="" # Check if local omnia_core image exists using validate function - if validate_container_image "" "$omnia_core_tag" "install"; then - echo -e "${GREEN}✓ Omnia core image (omnia_core:${omnia_core_tag}) found locally.${NC}" + if ! 
validate_container_image "" "$omnia_core_tag" "install"; then + exit 1 fi + echo -e "${GREEN}✓ Omnia core image (omnia_core:${omnia_core_tag}) found locally.${NC}" # Check if any other containers with 'omnia' in their name are running other_containers=$(podman ps -a --format '{{.Names}}' | grep -E 'omnia' | grep -v 'omnia_core') @@ -1906,6 +1907,7 @@ upgrade_omnia_core() { show_post_upgrade_instructions "$TARGET_OMNIA_VERSION" # Initialize SSH config and start container session init_ssh_config + remove_container_omnia_sh start_container_session exit 0 } @@ -2324,6 +2326,7 @@ rollback_omnia_core() { # Initialize SSH config and start container session init_ssh_config + remove_container_omnia_sh start_container_session } diff --git a/upgrade/roles/import_input_parameters/templates/network_spec.j2 b/upgrade/roles/import_input_parameters/templates/network_spec.j2 index d9e41ba469..564c057db4 100644 --- a/upgrade/roles/import_input_parameters/templates/network_spec.j2 +++ b/upgrade/roles/import_input_parameters/templates/network_spec.j2 @@ -43,9 +43,7 @@ Networks: oim_nic_name: "{{ admin_network.oim_nic_name | default('') }}" netmask_bits: "{{ admin_network.netmask_bits | default('24') }}" primary_oim_admin_ip: "{{ admin_network.primary_oim_admin_ip | default('') }}" -{% if (admin_network.primary_oim_bmc_ip is defined) and ((admin_network.primary_oim_bmc_ip | string | trim) != '') %} - primary_oim_bmc_ip: "{{ admin_network.primary_oim_bmc_ip }}" -{% endif %} + primary_oim_bmc_ip: "{{ admin_network.primary_oim_bmc_ip | default('') }}" dynamic_range: "{{ admin_network.dynamic_range | default('') }}" dns: {{ admin_network.dns | default([]) }} ntp_servers: {{ admin_network.ntp_servers | default([]) }} diff --git a/upgrade/roles/import_input_parameters/templates/omnia_config.j2 b/upgrade/roles/import_input_parameters/templates/omnia_config.j2 index aec7a05ab7..eff82ee1c5 100644 --- a/upgrade/roles/import_input_parameters/templates/omnia_config.j2 +++ 
b/upgrade/roles/import_input_parameters/templates/omnia_config.j2 @@ -47,6 +47,7 @@ slurm_cluster: - cluster_name: {{ _cluster.cluster_name | default('') }} nfs_storage_name: {{ _cluster.nfs_storage_name | default('') }} {% if _cluster.config_sources is defined and (_cluster.config_sources | length > 0) %} + skip_merge: {{ _cluster.skip_merge | default(true) }} config_sources: {% set _supported = ['slurm', 'cgroup', 'slurmdbd', 'gres'] %} {% for _conf_name, _conf_val in _cluster.config_sources.items() %} @@ -84,6 +85,7 @@ slurm_cluster: # slurmdbd: /path/to/custom_slurmdbd.conf # gres: /path/to/custom_gres.conf {% else %} + # skip_merge: True # config_sources: # slurm: # SlurmctldTimeout: 60 From 7710bd435a8fae845778fbddf295143286e22777 Mon Sep 17 00:00:00 2001 From: Katakam-Rakesh Date: Mon, 23 Feb 2026 19:21:05 +0530 Subject: [PATCH 67/77] updating pulp_cleanup and timeout for tarball and iso Signed-off-by: pullan1 --- .../library/module_utils/local_repo/config.py | 9 ++- common/library/modules/pulp_cleanup.py | 55 ++++++++++++------- 2 files changed, 42 insertions(+), 22 deletions(-) diff --git a/common/library/module_utils/local_repo/config.py b/common/library/module_utils/local_repo/config.py index 3e812a6e47..d8e5593778 100644 --- a/common/library/module_utils/local_repo/config.py +++ b/common/library/module_utils/local_repo/config.py @@ -78,6 +78,11 @@ "aarch64": ["dnf", "info", "--quiet", "--forcearch=aarch64"] } +# ---------------------------- +# Cleanup File Types +# Used by pulp_cleanup.py +# ---------------------------- +CLEANUP_FILE_TYPES = ["iso", "manifest", "pip_module", "tarball", "git", "ansible_galaxy_collection"] # ---------------------------- # Used by download_common.py # ---------------------------- @@ -116,9 +121,9 @@ CLI_FILE_PATH = "/root/.config/pulp/cli.toml" POST_TIMEOUT = 3600 # seconds -TAR_POLL_VAL = 25 # minutes +TAR_POLL_VAL = 45 # minutes FILE_POLL_VAL = 1 # minutes -ISO_POLL_VAL = 15 # minutes +ISO_POLL_VAL = 45 # minutes 
FILE_URI = "/pulp/api/v3/content/file/files/" PULP_SSL_CA_CERT = "/etc/pki/ca-trust/source/anchors/pulp_webserver.crt" # ---------------------------- diff --git a/common/library/modules/pulp_cleanup.py b/common/library/modules/pulp_cleanup.py index a3c155ebdb..a97f6d28d2 100644 --- a/common/library/modules/pulp_cleanup.py +++ b/common/library/modules/pulp_cleanup.py @@ -36,6 +36,7 @@ from ansible.module_utils.local_repo.config import ( CLEANUP_BASE_PATH_DEFAULT, CLEANUP_STATUS_FILE_PATH_DEFAULT, + CLEANUP_FILE_TYPES, pulp_rpm_commands, pulp_container_commands, pulp_file_commands, @@ -173,26 +174,40 @@ def convert_to_pulp_container_name(image_name: str) -> str: # TYPE DETECTION # ============================================================================= -def detect_file_type(name: str) -> str: - """Detect artifact type from name.""" - # Pip module: contains == (e.g., cffi==1.17.1) - if '==' in name: - return "pip_module" - # Ansible Galaxy collection: contains . but no / or == (e.g., community.general, ansible.posix) - if '.' in name and '/' not in name and '==' not in name and any( - x in name.lower() for x in ['ansible', 'community', 'galaxy'] - ): - return "ansible_galaxy_collection" - if name.startswith('ansible_galaxy_collection'): - return "ansible_galaxy_collection" - if any(x in name.lower() for x in ['chart', 'tar', 'tgz', 'helm', 'bundle']): - return "tarball" - if any(x in name.lower() for x in ['git', 'repo', 'source', 'scm']): - return "git" - if any(x in name.lower() for x in ['manifest', 'calico', 'yml', 'yaml']): - return "manifest" - return "file" - +def detect_file_type(name: str, base_path: str = "/opt/omnia/offline_repo/cluster") -> str: + """Detect artifact type by searching for the package name in the filesystem. + + Searches in base_path////{type_folder}/name + and returns the folder type where the package is found. 
+ + Storage structure: + - iso/ : ISO files, run files (e.g., cuda-run) + - manifest/ : Kubernetes manifests (e.g., calico-v3.30.3, metallb-native-v0.15.2) + - pip_module/ : Python pip packages (e.g., PyMySQL==1.1.2, kubernetes==33.1.0) + - tarball/ : Tarballs, helm charts (e.g., helm-v3.19.0-amd64, nvhpc_2025_2511_Linux_x86_64_cuda_13.0) + - git/ : Git repositories + - ansible_galaxy_collection/ : Ansible Galaxy collections + + Args: + name: Package name from JSON (e.g., "calico-v3.30.3", "helm-v3.19.0-amd64") + base_path: Base path to search (default: /opt/omnia/offline_repo/cluster) + + Returns: + str: Type based on folder where package is found, or fallback to name-based detection + """ + + # Search for the package name in the filesystem + # Pattern: base_path/*/*/*/{type_folder}/name + for file_type in CLEANUP_FILE_TYPES: + pattern = f"{base_path}/*/*/*/{file_type}/{name}" + matches = glob.glob(pattern) + if matches: + # Extract the parent folder name and return it + parent_folder = os.path.basename(os.path.dirname(matches[0])) + return parent_folder + + # If not found in filesystem, return None + return None # ============================================================================= # EXISTENCE CHECKS From b0ca27f9b3893f68380903d913fa74d926daddeb Mon Sep 17 00:00:00 2001 From: Nethra mg Date: Mon, 23 Feb 2026 23:01:41 +0530 Subject: [PATCH 68/77] Input validation fix for duplicate admin IP in pxe mapping file --- .../validation_flows/provision_validation.py | 47 +++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/common/library/module_utils/input_validation/validation_flows/provision_validation.py b/common/library/module_utils/input_validation/validation_flows/provision_validation.py index cc6b4d8e76..4ba1515129 100644 --- a/common/library/module_utils/input_validation/validation_flows/provision_validation.py +++ b/common/library/module_utils/input_validation/validation_flows/provision_validation.py @@ -225,6 +225,52 @@ def 
validate_duplicate_service_tags_in_mapping_file(pxe_mapping_file_path): raise ValueError(f"Duplicate SERVICE_TAG found in PXE mapping file: {'; '.join(duplicates)}") +def validate_duplicate_admin_ips_in_mapping_file(pxe_mapping_file_path): + """Validates that ADMIN_IP values in the mapping file are unique.""" + if not pxe_mapping_file_path or not os.path.isfile(pxe_mapping_file_path): + raise ValueError(f"PXE mapping file not found: {pxe_mapping_file_path}") + + with open(pxe_mapping_file_path, "r", encoding="utf-8") as fh: + raw_lines = fh.readlines() + + non_comment_lines = [ln for ln in raw_lines if ln.strip()] + reader = csv.DictReader(non_comment_lines) + + fieldname_map = {fn.strip().upper(): fn for fn in reader.fieldnames} + admin_ip_col = fieldname_map.get("ADMIN_IP") + hostname_col = fieldname_map.get("HOSTNAME") + + if not admin_ip_col: + raise ValueError("ADMIN_IP column not found in PXE mapping file") + + seen_admin_ips = {} + duplicates = [] + + for row_idx, row in enumerate(reader, start=2): + admin_ip = row.get(admin_ip_col, "").strip() if row.get(admin_ip_col) else "" + hostname = "" + if hostname_col: + hostname = row.get(hostname_col, "").strip() if row.get(hostname_col) else "" + + if not admin_ip: + continue + + if admin_ip in seen_admin_ips: + first_row = seen_admin_ips[admin_ip]["row"] + first_host = seen_admin_ips[admin_ip]["hostname"] + dup_host = hostname or "" + first_host_disp = first_host or "" + duplicates.append( + f"'{admin_ip}' at CSV rows {first_row} ({first_host_disp}) and {row_idx} ({dup_host})" + ) + continue + + seen_admin_ips[admin_ip] = {"row": row_idx, "hostname": hostname} + + if duplicates: + raise ValueError(f"Duplicate ADMIN_IP found in PXE mapping file: {'; '.join(duplicates)}") + + def validate_group_parent_service_tag_consistency_in_mapping_file(pxe_mapping_file_path): """Validates that GROUP_NAME has a consistent PARENT_SERVICE_TAG across the mapping file.""" if not pxe_mapping_file_path or not 
os.path.isfile(pxe_mapping_file_path): @@ -740,6 +786,7 @@ def validate_provision_config( validate_functional_groups_in_mapping_file(pxe_mapping_file_path) validate_duplicate_service_tags_in_mapping_file(pxe_mapping_file_path) validate_duplicate_hostnames_in_mapping_file(pxe_mapping_file_path) + validate_duplicate_admin_ips_in_mapping_file(pxe_mapping_file_path) validate_group_parent_service_tag_consistency_in_mapping_file(pxe_mapping_file_path) validate_functional_groups_separation(pxe_mapping_file_path) validate_parent_service_tag_hierarchy(pxe_mapping_file_path) From c1fc2315cee81e065abd9a70b7c21517bbf379a8 Mon Sep 17 00:00:00 2001 From: Jagadeesh N V Date: Tue, 24 Feb 2026 03:05:53 +0530 Subject: [PATCH 69/77] Handled delete scenarios along with /etc/hosts --- .../slurm_config/tasks/build_slurm_conf.yml | 12 +-- discovery/roles/slurm_config/tasks/confs.yml | 20 ++-- .../tasks/drain_and_remove_node.yml | 96 ++++++++++--------- .../tasks/extract_path_overrides.yml | 2 +- .../slurm_config/tasks/handle_extra_confs.yml | 4 +- .../slurm_config/tasks/update_hosts_munge.yml | 48 ++++++++-- discovery/roles/slurm_config/vars/main.yml | 1 + utils/roles/idrac_pxe_boot/vars/main.yml | 2 +- 8 files changed, 111 insertions(+), 74 deletions(-) diff --git a/discovery/roles/slurm_config/tasks/build_slurm_conf.yml b/discovery/roles/slurm_config/tasks/build_slurm_conf.yml index 40b6137172..84bb493442 100644 --- a/discovery/roles/slurm_config/tasks/build_slurm_conf.yml +++ b/discovery/roles/slurm_config/tasks/build_slurm_conf.yml @@ -23,7 +23,7 @@ | combine({'slurm': (apply_config['slurm'] | combine({'NodeName': (apply_config['slurm'].NodeName | default([])) + (node_params | default([]))}))}) }}" when: node_params is defined and node_params - no_log: true + no_log: "{{ _no_log }}" - name: Append login nodes to NodeName list ansible.builtin.set_fact: @@ -32,7 +32,7 @@ | combine({'NodeName': (apply_config['slurm'].NodeName | default([])) + [{'NodeName': item}]}))}) }}" loop: "{{ 
login_list }}" when: login_list is defined and login_list - no_log: true + no_log: "{{ _no_log }}" - name: Append compiler login nodes to NodeName list ansible.builtin.set_fact: @@ -41,7 +41,7 @@ | combine({'NodeName': (apply_config['slurm'].NodeName | default([])) + [{'NodeName': item}]}))}) }}" loop: "{{ compiler_login_list }}" when: compiler_login_list is defined and compiler_login_list - no_log: true + no_log: "{{ _no_log }}" - name: Append Partition ansible.builtin.set_fact: @@ -49,16 +49,16 @@ | combine({'slurm': (apply_config['slurm'] | combine({'PartitionName': (apply_config['slurm'].PartitionName | default([])) + [partition_params]}))}) }}" when: node_params is defined and node_params - no_log: true + no_log: "{{ _no_log }}" - name: Add gpu parameters to slurm conf ansible.builtin.set_fact: apply_config: "{{ apply_config | default({}) | combine({'slurm': (apply_config['slurm'] | combine(gpu_slurm_conf))}) }}" when: gpu_params is defined and gpu_params - no_log: true + no_log: "{{ _no_log }}" - name: Add dbd parameters to slurm conf ansible.builtin.set_fact: apply_config: "{{ apply_config | default({}) | combine({'slurm': (apply_config['slurm'] | combine(dbd_slurm_conf))}) }}" when: dbd_list is defined and dbd_list - no_log: true + no_log: "{{ _no_log }}" diff --git a/discovery/roles/slurm_config/tasks/confs.yml b/discovery/roles/slurm_config/tasks/confs.yml index 1e5a4e507e..d2069497eb 100644 --- a/discovery/roles/slurm_config/tasks/confs.yml +++ b/discovery/roles/slurm_config/tasks/confs.yml @@ -15,7 +15,7 @@ - name: Slurm dict ops ansible.builtin.set_fact: apply_config: "{{ __default_config }}" - no_log: true + no_log: "{{ _no_log }}" - name: Remove keys from conf_files if they have string values in configs_input (when skip_merge is true) ansible.builtin.set_fact: @@ -34,7 +34,7 @@ | combine({'slurmdbd': (apply_config['slurmdbd'] | combine({'DbdHost': ctld_list[0], 'StorageHost': ctld_list[0]}))}) }}" when: ctld_list - no_log: true + no_log: "{{ _no_log 
}}" - name: Check .conf files existence ansible.builtin.stat: @@ -51,7 +51,7 @@ delegate_to: localhost loop: "{{ configs_input | default({}) | dict2items }}" register: parsed_configs_input_results - no_log: true + no_log: "{{ _no_log }}" when: - configs_input is defined - configs_input @@ -62,7 +62,7 @@ ansible.builtin.set_fact: parsed_configs_input: "{{ parsed_configs_input | default({}) | combine({item.item.key: item.conf_dict}) }}" loop: "{{ parsed_configs_input_results.results }}" - no_log: true + no_log: "{{ _no_log }}" when: - parsed_configs_input_results is defined - not item.skipped | default(false) @@ -71,7 +71,7 @@ ansible.builtin.set_fact: parsed_configs_input: "{{ parsed_configs_input | default({}) | combine({item.key: item.value}) }}" loop: "{{ configs_input | default({}) | dict2items }}" - no_log: true + no_log: "{{ _no_log }}" when: - configs_input is defined - configs_input @@ -94,7 +94,7 @@ loop_control: loop_var: existing_conf_set register: prepared_conf_lists - no_log: true + no_log: "{{ _no_log }}" # All the updates to the confs follow after this point before merge - name: Prepend ClusterName and SlurmctldHost to slurm conf sources @@ -102,14 +102,14 @@ conf_merge_dict: "{{ conf_merge_dict | combine({'slurm': [{'ClusterName': cluster_name, 'AccountingStorageHost': dbd_list[0], 'SlurmctldHost': ctld_list}] + conf_merge_dict['slurm']}) }}" when: "'slurm' in conf_merge_dict" - no_log: true + no_log: "{{ _no_log }}" - name: Slurm dbd - DbdHost and StorageHost ansible.builtin.set_fact: conf_merge_dict: "{{ conf_merge_dict | combine({'slurmdbd': [{'DbdHost': ctld_list[0], 'StorageHost': ctld_list[0]}] + conf_merge_dict['slurmdbd']}) }}" when: "'slurmdbd' in conf_merge_dict" - no_log: true + no_log: "{{ _no_log }}" - name: Merge the confs slurm_conf: @@ -118,7 +118,7 @@ conf_name: "{{ item.key }}" loop: "{{ conf_merge_dict | dict2items }}" register: merged_conf - no_log: true + no_log: "{{ _no_log }}" - name: Update slurm_conf_dict with merged 
configuration for cloud_init read. # TODO: Remove cloud init dependency ansible.builtin.set_fact: @@ -182,7 +182,7 @@ remote_src: "{{ copy_from_oim }}" loop: "{{ merged_conf.results }}" register: ctld_conf_files - no_log: true + no_log: "{{ _no_log }}" when: - item.ini_lines diff --git a/discovery/roles/slurm_config/tasks/drain_and_remove_node.yml b/discovery/roles/slurm_config/tasks/drain_and_remove_node.yml index da1c41d3fe..1c60299ed2 100644 --- a/discovery/roles/slurm_config/tasks/drain_and_remove_node.yml +++ b/discovery/roles/slurm_config/tasks/drain_and_remove_node.yml @@ -46,64 +46,72 @@ ansible.builtin.debug: msg: "Node {{ node_to_remove }} currently has {{ current_jobs.stdout }} running job(s)" - - name: Drain the node to prevent new job assignments - ansible.builtin.command: > - scontrol update NodeName={{ node_to_remove }} - State=DRAIN - Reason="Scheduled removal - waiting for jobs to complete" - changed_when: true - delegate_to: "{{ ctld }}" + - name: Prompt for user input when jobs are running + ansible.builtin.pause: + prompt: | + ================================================================================ + WARNING: ACTIVE JOBS DETECTED ON NODE {{ node_to_remove }} + ================================================================================ - - name: Wait for all jobs to complete on the node - ansible.builtin.shell: - cmd: | - set -o pipefail - squeue -w {{ node_to_remove }} -h | wc -l - register: job_count_check - until: job_count_check.stdout | int == 0 - retries: "{{ (node_drain_timeout / node_drain_delay) | int }}" - delay: "{{ node_drain_delay }}" - changed_when: false - delegate_to: "{{ ctld }}" - when: current_jobs.stdout | int > 0 + Current Status: + - Node: {{ node_to_remove }} + - Running Jobs: {{ current_jobs.stdout }} + - Node State: Will be set to DOWN and removed from cluster + - Impact: All running jobs on this node will be terminated - - name: Confirm jobs completed - ansible.builtin.debug: - msg: "All jobs on {{ 
node_to_remove }} have completed" - when: current_jobs.stdout | int > 0 + To view job details, run: + squeue -w {{ node_to_remove }} - - name: Log node removal - ansible.builtin.debug: - msg: "Node {{ node_to_remove }} has been drained, jobs completed, and set to DOWN state" + Available Options: + 1. ABORT AND CANCEL MANUALLY (Recommended) + - Press Ctrl+C, then 'A' to abort this playbook + - Manually cancel jobs: scancel -w {{ node_to_remove }} + - Or wait for jobs to complete naturally + - Then re-run this playbook - rescue: - - name: Log node removal failure - ansible.builtin.debug: - msg: "Failed to drain node {{ node_to_remove }}" + 2. FORCE REMOVAL (Destructive) + - Press Enter to proceed with immediate node removal + - All {{ current_jobs.stdout }} job(s) will be forcefully terminated + - Users will lose any unsaved work + - Job data may be incomplete or corrupted - - name: Remove slurm node with running job after timeout - ansible.builtin.pause: - prompt: | - Node {{ node_to_remove }} has been DRAINED to prevent new job assignments. - Jobs are still running on {{ node_to_remove }} after wait of {{ node_drain_timeout }} seconds. - Options: - 1. Press Ctrl+C then 'A' to abort - 2. 
Press Enter to force removal (jobs will be killed) - when: not force_scancel_node + ================================================================================ + Your choice (Ctrl+C then 'A' to abort, or Enter to force remove): + when: + - current_jobs.stdout | int > 0 + - not force_scancel_node - - name: Force cancel jobs if timeout reached - ansible.builtin.command: scancel -f -w {{ node_to_remove }} + - name: Force cancel jobs on the node to be removed from cluster + ansible.builtin.command: scancel -f -w {{ node_to_remove }} # Safe does not fail if no jobs are running changed_when: true - failed_when: false + register: scancel_result + failed_when: scancel_result.rc != 0 delegate_to: "{{ ctld }}" - always: - name: Set node to DOWN state ansible.builtin.command: > scontrol update NodeName={{ node_to_remove }} State=DOWN - Reason="Node removed from cluster" + Reason="Node removed from cluster via OMNIA discovery.yml" changed_when: true failed_when: false delegate_to: "{{ ctld }}" when: node_exists_check.rc == 0 + + - name: Stop the slurmd service on node + ansible.builtin.service: + name: slurmd + state: stopped + delegate_to: "{{ node_to_remove }}" + ignore_unreachable: true + failed_when: false + + - name: Delete the dir from NFS + ansible.builtin.file: + path: "{{ slurm_config_path }}/{{ node_to_remove }}" + state: absent + rescue: + - name: Failure to remove node + ansible.builtin.fail: + msg: "Node {{ node_to_remove }} failed to be removed from slurm cluster, + as task {{ ansible_failed_task.name }} failed." 
diff --git a/discovery/roles/slurm_config/tasks/extract_path_overrides.yml b/discovery/roles/slurm_config/tasks/extract_path_overrides.yml index 0efcf18962..9e4ae518a2 100644 --- a/discovery/roles/slurm_config/tasks/extract_path_overrides.yml +++ b/discovery/roles/slurm_config/tasks/extract_path_overrides.yml @@ -24,7 +24,7 @@ ansible.builtin.set_fact: slurmdbd_merged_dict: "{{ (merged_conf.results | selectattr('item.key', 'equalto', 'slurmdbd') | first).conf_dict }}" when: "'slurmdbd' in conf_merge_dict" - no_log: true + no_log: "{{ _no_log }}" - name: Extract cgroup.conf merged dict ansible.builtin.set_fact: diff --git a/discovery/roles/slurm_config/tasks/handle_extra_confs.yml b/discovery/roles/slurm_config/tasks/handle_extra_confs.yml index 544822ec28..d7a0b4f382 100644 --- a/discovery/roles/slurm_config/tasks/handle_extra_confs.yml +++ b/discovery/roles/slurm_config/tasks/handle_extra_confs.yml @@ -19,7 +19,7 @@ conf_name: "{{ extra_conf }}" register: ex_conf delegate_to: localhost - no_log: true + no_log: "{{ _no_log }}" when: - "'.' not in extra_conf" @@ -31,7 +31,7 @@ owner: "{{ slurm_user }}" group: "{{ slurm_user_group }}" remote_src: "{{ copy_from_oim }}" - no_log: true + no_log: "{{ _no_log }}" when: - "'.' not in extra_conf" - ex_conf is success diff --git a/discovery/roles/slurm_config/tasks/update_hosts_munge.yml b/discovery/roles/slurm_config/tasks/update_hosts_munge.yml index 64c36dbeaf..147f1b484c 100644 --- a/discovery/roles/slurm_config/tasks/update_hosts_munge.yml +++ b/discovery/roles/slurm_config/tasks/update_hosts_munge.yml @@ -12,18 +12,45 @@ # See the License for the specific language governing permissions and # limitations under the License. 
--- -- name: Update /etc/hosts with controller hostname and IP - ansible.builtin.lineinfile: - path: /etc/hosts - regexp: '^{{ host_entry.value }}\s+{{ host_entry.key }}' - line: "{{ host_entry.value }} {{ host_entry.key }}" - state: present - loop: "{{ ip_name_map | dict2items | list }}" - loop_control: - loop_var: host_entry +- name: Edit /etc/hosts file till DNS ignore_unreachable: true - failed_when: false delegate_to: "{{ slurmhost_ip }}" + block: + - name: Remove deleted nodes if any hostname exists in /etc/hosts + ansible.builtin.lineinfile: + path: "/etc/hosts" + regexp: '(\b{{ node_to_remove }}\b)' + state: absent + loop: "{{ nodes_in_normal_not_in_cmpt }}" + loop_control: + loop_var: node_to_remove + when: + - nodes_in_normal_not_in_cmpt is defined + - nodes_in_normal_not_in_cmpt | length > 0 + + - name: Remove existing /etc/hosts entries containing the IP or hostname + ansible.builtin.lineinfile: + path: "/etc/hosts" + regexp: '(\b{{ host_entry.value }}\b|\b{{ host_entry.key }}\b)' + state: absent + loop: "{{ ip_name_map | dict2items | list }}" + loop_control: + loop_var: host_entry + + - name: Add correct /etc/hosts entry for controller hostname and IP + ansible.builtin.lineinfile: + path: "/etc/hosts" + line: "{{ host_entry.value }} {{ host_entry.key }}" + state: present + mode: '0644' + create: true + loop: "{{ ip_name_map | dict2items | list }}" + loop_control: + loop_var: host_entry + rescue: + - name: Print error if editing /etc/hosts fails + ansible.builtin.debug: + msg: "Failed to edit /etc/hosts file on {{ slurmhost_ip }}" - name: Get munge changes ansible.builtin.set_fact: @@ -37,6 +64,7 @@ - munge_key_changed[name_ip_map[slurmhost_ip]]['changed'] | default(false) - restart_slurm_services delegate_to: "{{ slurmhost_ip }}" + no_log: "{{ _no_log }}" ignore_unreachable: true block: - name: Update munge key permissions diff --git a/discovery/roles/slurm_config/vars/main.yml b/discovery/roles/slurm_config/vars/main.yml index d708eb0777..cc57a984da 
100644 --- a/discovery/roles/slurm_config/vars/main.yml +++ b/discovery/roles/slurm_config/vars/main.yml @@ -121,6 +121,7 @@ slurm_db_cnf_mode: "0600" node_drain_timeout: 900 node_drain_delay: 30 force_scancel_node: false +_no_log: true dbd_slurm_conf: AccountingStoragePort: "{{ slurm_dbd_port }}" AccountingStorageType: accounting_storage/slurmdbd diff --git a/utils/roles/idrac_pxe_boot/vars/main.yml b/utils/roles/idrac_pxe_boot/vars/main.yml index bebd2b4a42..53de8aa0e9 100644 --- a/utils/roles/idrac_pxe_boot/vars/main.yml +++ b/utils/roles/idrac_pxe_boot/vars/main.yml @@ -16,7 +16,7 @@ restart_host: true # Change to true for forceful reboot. by default graceful will happen -force_restart: false +force_restart: true # Set boot source override mode. Valid values are once, continuous, or disabled boot_source_override_enabled: continuous From a6722a966e462791b51d77c584377eaee084e23c Mon Sep 17 00:00:00 2001 From: Jagadeesh N V Date: Tue, 24 Feb 2026 12:45:30 +0530 Subject: [PATCH 70/77] Delete node removal of service and NFS data --- .../slurm_config/tasks/check_ctld_running.yml | 10 ---------- .../tasks/drain_and_remove_node.yml | 8 ++++---- .../roles/slurm_config/tasks/remove_node.yml | 20 +++++++++++++++++++ 3 files changed, 24 insertions(+), 14 deletions(-) diff --git a/discovery/roles/slurm_config/tasks/check_ctld_running.yml b/discovery/roles/slurm_config/tasks/check_ctld_running.yml index ce27d3c362..92ba39376e 100644 --- a/discovery/roles/slurm_config/tasks/check_ctld_running.yml +++ b/discovery/roles/slurm_config/tasks/check_ctld_running.yml @@ -22,16 +22,6 @@ register: ssh_check ignore_errors: true -- name: Drain and remove nodes if any - ansible.builtin.include_tasks: drain_and_remove_node.yml - loop: "{{ nodes_in_normal_not_in_cmpt }}" - loop_control: - loop_var: node_to_remove - when: - - ssh_check is success - - nodes_in_normal_not_in_cmpt is defined - - nodes_in_normal_not_in_cmpt | length > 0 - - name: Enter slurm controller when pingable when: - 
ssh_check is success diff --git a/discovery/roles/slurm_config/tasks/drain_and_remove_node.yml b/discovery/roles/slurm_config/tasks/drain_and_remove_node.yml index 1c60299ed2..cf62b156aa 100644 --- a/discovery/roles/slurm_config/tasks/drain_and_remove_node.yml +++ b/discovery/roles/slurm_config/tasks/drain_and_remove_node.yml @@ -18,7 +18,7 @@ failed_when: false ignore_unreachable: true changed_when: false - delegate_to: "{{ ctld }}" + delegate_to: "{{ ctld_list[0] }}" - name: Skip if node does not exist ansible.builtin.debug: @@ -40,7 +40,7 @@ squeue -w {{ node_to_remove }} -h | wc -l register: current_jobs changed_when: false - delegate_to: "{{ ctld }}" + delegate_to: "{{ ctld_list[0] }}" - name: Display job information ansible.builtin.debug: @@ -86,7 +86,7 @@ changed_when: true register: scancel_result failed_when: scancel_result.rc != 0 - delegate_to: "{{ ctld }}" + delegate_to: "{{ ctld_list[0] }}" - name: Set node to DOWN state ansible.builtin.command: > @@ -95,7 +95,7 @@ Reason="Node removed from cluster via OMNIA discovery.yml" changed_when: true failed_when: false - delegate_to: "{{ ctld }}" + delegate_to: "{{ ctld_list[0] }}" when: node_exists_check.rc == 0 - name: Stop the slurmd service on node diff --git a/discovery/roles/slurm_config/tasks/remove_node.yml b/discovery/roles/slurm_config/tasks/remove_node.yml index ba93bb086a..eecf6d4f1b 100644 --- a/discovery/roles/slurm_config/tasks/remove_node.yml +++ b/discovery/roles/slurm_config/tasks/remove_node.yml @@ -12,6 +12,26 @@ # See the License for the specific language governing permissions and # limitations under the License. 
--- +- name: Check if controller is reachable via SSH + ansible.builtin.wait_for: + host: "{{ ctld_list[0] }}" + port: 22 # TODO: make it configurable + timeout: 10 + state: started + delegate_to: localhost + register: ssh_check + ignore_errors: true + +- name: Drain and remove nodes if any + ansible.builtin.include_tasks: drain_and_remove_node.yml + loop: "{{ nodes_in_normal_not_in_cmpt }}" + loop_control: + loop_var: node_to_remove + when: + - ssh_check is success + - nodes_in_normal_not_in_cmpt is defined + - nodes_in_normal_not_in_cmpt | length > 0 + - name: Remove nodes from NodeName list that are not in cmpt_list ansible.builtin.set_fact: filtered_nodenames: "{{ slurm_conf_dict.NodeName | rejectattr('NodeName', 'in', nodes_in_normal_not_in_cmpt) | list }}" From 8e198f40ad2aca07d96080ba9255695239a6a6eb Mon Sep 17 00:00:00 2001 From: "balajikumaran.cs" Date: Tue, 24 Feb 2026 13:05:52 +0530 Subject: [PATCH 71/77] Fix admin dynamic_range subnet validation and remove flawed netmask check (#4010) --- .../common_utils/validation_utils.py | 59 ++++++++----------- .../validation_flows/provision_validation.py | 25 ++++---- 2 files changed, 34 insertions(+), 50 deletions(-) diff --git a/common/library/module_utils/input_validation/common_utils/validation_utils.py b/common/library/module_utils/input_validation/common_utils/validation_utils.py index 21a54cabe5..7f5693e9c9 100644 --- a/common/library/module_utils/input_validation/common_utils/validation_utils.py +++ b/common/library/module_utils/input_validation/common_utils/validation_utils.py @@ -527,6 +527,30 @@ def validate_netmask_bits(bits): except (ValueError, TypeError): return False +def is_range_within_subnet(ip_range, reference_ip, netmask_bits): + """ + Validates that the given IP range falls within the subnet + derived from reference_ip and netmask_bits. + + Args: + ip_range (str): IP range in "start_ip-end_ip" format. + reference_ip (str): A reference IP in the subnet (e.g., primary_oim_admin_ip). 
+ netmask_bits (str or int): The CIDR prefix length (e.g., "24"). + + Returns: + bool: True if both start and end IPs are within the subnet, False otherwise. + """ + try: + network = ipaddress.IPv4Network(f"{reference_ip}/{netmask_bits}", strict=False) + parts = ip_range.split("-") + if len(parts) != 2: + return False + start_ip = ipaddress.IPv4Address(parts[0].strip()) + end_ip = ipaddress.IPv4Address(parts[1].strip()) + return start_ip in network and end_ip in network + except (ValueError, TypeError): + return False + def check_bmc_static_range_overlap(static_range, static_range_group_mapping) -> list: """ Checks if the given static BMC range overlaps with any of the ranges in other groups. @@ -625,41 +649,6 @@ def check_port_ranges(port_ranges) -> bool: return True -def is_range_within_netmask(ip_range, netmask_bits): - """ - Check if a given IP range falls within the valid IP address range for a given netmask. - - Args: - ip_range (str): The IP range in format "start_ip-end_ip" - (e.g., "192.168.1.10-192.168.1.50"). - netmask_bits (int or str): The netmask bits (e.g., 20 for /20). - - Returns: - bool: True if the IP range is valid for the given netmask, False otherwise. 
- """ - try: - # Parse the IP range - start_ip, end_ip = ip_range.split('-') - start_ip_obj = ipaddress.ip_address(start_ip) - end_ip_obj = ipaddress.ip_address(end_ip) - - # Ensure start_ip <= end_ip - if start_ip_obj > end_ip_obj: - return False - - # Create network from start_ip with the given netmask - network = ipaddress.ip_network(f"{start_ip}/{netmask_bits}", strict=False) - - # Get first and last usable addresses (excluding network and broadcast) - first_usable = network.network_address + 1 - last_usable = network.broadcast_address - 1 - - # Check if both start and end IPs are within the usable range - return (first_usable <= start_ip_obj <= last_usable and - first_usable <= end_ip_obj <= last_usable) - except (ValueError, TypeError): - return False - def is_ip_within_range(ip_range, ip): """ Check if a given IP falls within a specified IP range. diff --git a/common/library/module_utils/input_validation/validation_flows/provision_validation.py b/common/library/module_utils/input_validation/validation_flows/provision_validation.py index cc6b4d8e76..63c8c25387 100644 --- a/common/library/module_utils/input_validation/validation_flows/provision_validation.py +++ b/common/library/module_utils/input_validation/validation_flows/provision_validation.py @@ -905,6 +905,16 @@ def _validate_admin_network(network): ) ) + # Ensure dynamic_range is inside the admin subnet (primary_oim_admin_ip/netmask_bits) + if not validation_utils.is_range_within_subnet(admin_net["dynamic_range"], primary_oim_admin_ip, netmask): + errors.append( + create_error_msg( + "admin_network.dynamic_range", + admin_net["dynamic_range"], + en_us_validation_msg.RANGE_NETMASK_BOUNDARY_FAIL_MSG, + ) + ) + # Admin and BMC IP should not be the same errors.extend(validate_admin_bmc_ip_not_same(primary_oim_admin_ip, primary_oim_bmc_ip)) @@ -1034,20 +1044,5 @@ def _validate_ip_ranges(dynamic_range, network_type, netmask_bits): ) ) - # Validate that IP ranges are within the netmask boundaries - if 
netmask_bits: - # Check dynamic range - if (validation_utils.validate_ipv4_range(dynamic_range) and - not validation_utils.is_range_within_netmask( - dynamic_range, netmask_bits - )): - errors.append( - create_error_msg( - f"{network_type}.dynamic_range", - dynamic_range, - en_us_validation_msg.RANGE_NETMASK_BOUNDARY_FAIL_MSG, - ) - ) - return errors From 6a5d4f0ba6ca729f89b3ef9a266754ce44ca5315 Mon Sep 17 00:00:00 2001 From: Vrinda_Marwah Date: Tue, 24 Feb 2026 09:32:58 +0000 Subject: [PATCH 72/77] fix input validation for high_availability_config.yml Signed-off-by: Vrinda_Marwah --- .../common_utils/en_us_validation_msg.py | 2 ++ .../validation_flows/high_availability_validation.py | 10 ++++++++++ 2 files changed, 12 insertions(+) diff --git a/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py b/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py index e72c474513..a8027529b9 100644 --- a/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py +++ b/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py @@ -384,6 +384,8 @@ def server_spec_network_key_fail_msg(nic_device): "roles_config.yml") FEILD_MUST_BE_EMPTY = "feild must be empty." DUPLICATE_VIRTUAL_IP = "is already used. Please give unique virtual ip address" +VIRTUAL_IP_SAME_AS_PRIMARY_OIM_ADMIN_IP = ("virtual_ip_address provided in high_availability_config.yml must not be the same as primary_oim_admin_ip in network_spec.yml. " + "Please provide a different virtual IP address.") INVALID_PASSIVE_NODE_SERVICE_TAG = "active node and passive node service tag cannot be same." GROUP_NOT_FOUND = "is not defined in the roles_config.yml. Please define the group in roles_config.yml" ROLE_NODE_FOUND = "is not defined in roles_config.yml. 
Please define the role in roles_config.yml" diff --git a/common/library/module_utils/input_validation/validation_flows/high_availability_validation.py b/common/library/module_utils/input_validation/validation_flows/high_availability_validation.py index 4b67f09789..d7d2415b2b 100644 --- a/common/library/module_utils/input_validation/validation_flows/high_availability_validation.py +++ b/common/library/module_utils/input_validation/validation_flows/high_availability_validation.py @@ -310,6 +310,16 @@ def validate_vip_address( - None: The function does not return any value, it only appends error messages to the errors list. """ + + if vip_address == oim_admin_ip: + errors.append( + create_error_msg( + f"{config_type} virtual_ip_address", + vip_address, + en_us_validation_msg.VIRTUAL_IP_SAME_AS_PRIMARY_OIM_ADMIN_IP, + ) + ) + # virtual_ip_address is mutually exclusive with admin dynamic ranges vip_within_dynamic_range = validation_utils.is_ip_within_range( admin_network["dynamic_range"], vip_address From 663db575afa003886307229cf139fee8b7ce19e8 Mon Sep 17 00:00:00 2001 From: Vrinda_Marwah Date: Tue, 24 Feb 2026 09:39:09 +0000 Subject: [PATCH 73/77] updating copyrights Signed-off-by: Vrinda_Marwah --- .../input_validation/common_utils/en_us_validation_msg.py | 2 +- .../validation_flows/high_availability_validation.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py b/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py index a8027529b9..15a8537ac5 100644 --- a/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py +++ b/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py @@ -1,4 +1,4 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/common/library/module_utils/input_validation/validation_flows/high_availability_validation.py b/common/library/module_utils/input_validation/validation_flows/high_availability_validation.py index d7d2415b2b..5e222d04b5 100644 --- a/common/library/module_utils/input_validation/validation_flows/high_availability_validation.py +++ b/common/library/module_utils/input_validation/validation_flows/high_availability_validation.py @@ -1,4 +1,4 @@ -# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From 4e2b39196df305724aa1baf926bb85f7f7e8e7cd Mon Sep 17 00:00:00 2001 From: mithileshreddy04 Date: Tue, 24 Feb 2026 15:32:08 +0530 Subject: [PATCH 74/77] Updating Reprovision guidance, update success msg and rollback flow --- omnia.sh | 46 ++++++++------------ upgrade/roles/upgrade_cluster/tasks/main.yml | 40 +++++++++++++---- upgrade/roles/upgrade_cluster/vars/main.yml | 4 ++ 3 files changed, 55 insertions(+), 35 deletions(-) diff --git a/omnia.sh b/omnia.sh index 9de277a56d..e380de2745 100755 --- a/omnia.sh +++ b/omnia.sh @@ -255,19 +255,17 @@ show_post_upgrade_instructions() { echo -e "${YELLOW} IMPORTANT POST-UPGRADE STEP${NC}" echo -e "${YELLOW}================================================================================${NC}" echo "" - echo -e "${GREEN}✓ Omnia core container has been successfully upgraded${NC}" - echo -e "${GREEN}✓ Version updated to: $upgraded_version${NC}" - echo "" echo -e "${BLUE}NEXT REQUIRED ACTION:${NC}" echo -e "${YELLOW}You must now run the upgrade playbook inside the omnia_core container:${NC}" echo "" - echo -e "${GREEN}podman exec -it omnia_core ansible-playbook 
/omnia/upgrade/upgrade_omnia.yml${NC}" + echo -e "${GREEN}ansible-playbook /omnia/upgrade/upgrade_omnia.yml${NC}" echo "" echo -e "${BLUE}This playbook will:${NC}" - echo -e "• Update input files" - echo -e "• Update internal configurations" + echo -e "• Update input files based on the previous version inputs" + echo -e "• Provide further steps to follow" + echo -e "• Provide user guidance for provisioning nodes" echo "" - echo -e "${YELLOW}Note: Run this command after the container is fully healthy and stable${NC}" + echo -e "${YELLOW}Note: Run the above command after the container is fully healthy and stable${NC}" echo -e "${YELLOW}================================================================================${NC}" echo "" } @@ -1167,6 +1165,7 @@ init_ssh_config() { remove_container_omnia_sh() { podman exec -u root omnia_core bash -c 'if [ -f /omnia/omnia.sh ]; then rm -f /omnia/omnia.sh; fi' >/dev/null 2>&1 || true + podman exec -u root omnia_core bash -c 'if [ -d /omnia/input ]; then rm -rf /omnia/input; fi' >/dev/null 2>&1 || true } start_container_session() { @@ -1904,6 +1903,16 @@ upgrade_omnia_core() { # Seed inputs and defaults after upgrade post_setup_config + echo "" + echo -e "${GREEN}================================================================================${NC}" + echo -e "${GREEN} UPGRADE COMPLETED SUCCESSFULLY${NC}" + echo -e "${GREEN}================================================================================${NC}" + echo "" + echo -e "${GREEN}✓ Omnia core has been upgraded to version $TARGET_OMNIA_VERSION${NC}" + echo -e "${GREEN}✓ Container is running and healthy${NC}" + echo -e "${GREEN}✓ Configuration backed up to: $backup_base${NC}" + echo "" + show_post_upgrade_instructions "$TARGET_OMNIA_VERSION" # Initialize SSH config and start container session init_ssh_config @@ -2184,26 +2193,9 @@ rollback_omnia_core() { exit 1 fi - echo "" - echo "Available backups for version $selected_version:" - for i in "${!backup_dirs[@]}"; do - local 
backup_path="${backup_dirs[$i]}" - local backup_date=$(podman exec -u root omnia_core stat -c '%y' "$backup_path" 2>/dev/null | cut -d' ' -f1,2 | cut -d'.' -f1) - echo " $((i+1)). Backup created: $backup_date" - done - - # Prompt for backup selection - echo "" - echo -n "Select backup to restore from (1-${#backup_dirs[@]}): " - read -r backup_selection - - # Validate backup selection - if ! [[ "$backup_selection" =~ ^[0-9]+$ ]] || [ "$backup_selection" -lt 1 ] || [ "$backup_selection" -gt ${#backup_dirs[@]} ]; then - echo -e "${RED}ERROR: Invalid backup selection.${NC}" - exit 1 - fi - - local selected_backup="${backup_dirs[$((backup_selection-1))]}" + # Auto-select the most recent backup (first in sorted list) + local selected_backup="${backup_dirs[0]}" + echo "Auto-selecting backup: $selected_backup" # Validate selected backup exists if ! podman exec -u root omnia_core test -d "$selected_backup" 2>/dev/null; then diff --git a/upgrade/roles/upgrade_cluster/tasks/main.yml b/upgrade/roles/upgrade_cluster/tasks/main.yml index 90b25611b5..ce91c4c598 100644 --- a/upgrade/roles/upgrade_cluster/tasks/main.yml +++ b/upgrade/roles/upgrade_cluster/tasks/main.yml @@ -12,56 +12,80 @@ # See the License for the specific language governing permissions and # limitations under the License. 
--- +- name: Gather NFS share paths from storage_config.yml + ansible.builtin.set_fact: + nfs_slurm_server_share_path: "{{ nfs_params | selectattr('nfs_name', 'equalto', 'nfs_slurm') | map(attribute='server_share_path') | first | default('not specified') }}" + nfs_k8s_server_share_path: "{{ nfs_params | selectattr('nfs_name', 'equalto', 'nfs_k8s') | map(attribute='server_share_path') | first | default('not specified') }}" - name: Display cluster reprovision guidance ansible.builtin.pause: prompt: "{{ '\x1b[32m' }}=================================================== CLUSTER REPROVISION REQUIRED - =========================================================== + ========================================================== Cluster reprovisioning is required after upgrade to enable new features. Review and update new 2.1 input fields present at /opt/omnia/input/project_default/ directory before reprovisioning: + 1. local_repo_config.yml - Set additional_repos_x86_64 (list of extra repo URLs or file paths for x86_64) - Set additional_repos_aarch64 (list of extra repo URLs or file paths for aarch64) + 2. network_spec.yml (ib_network section) - Define InfiniBand fabric settings (subnet manager/BMC, IP ranges, VLAN if applicable) - Ensure host IB interfaces map to the IB network entries + 3. 
omnia_config.yml (slurm_cluster.config_source) - Use the new structure: config_source: { type: , location: } - Populate location to point to your Slurm config bundle (local path or remote URL) - Do NFS cleanup (if NFS share is used for k8s/slurm) + - New variable: skip_merge (set to true to skip merging configs during upgrade when using external bundles) + + + Optional: NFS cleanup (only if you are reprovisioning the cluster) + + If you choose to reprovision the cluster and your setup uses an NFS share for Kubernetes and/or Slurm, you may optionally perform an NFS cleanup beforehand: + + Detected NFS share paths from storage_config.yml: - - Clean stale mounts and ensure the NFS share is accessible before reprovision + - Slurm (nfs_slurm) server_share_path: {{ nfs_slurm_server_share_path }} - - Remove any leftover cluster state on the NFS share that could conflict with fresh deployment + - Kubernetes (nfs_k8s) server_share_path: {{ nfs_k8s_server_share_path }} - Run the following playbooks in sequence from the Omnia root directory to reprovision the cluster: + Clean stale mounts and confirm the NFS share is reachable and accessible. + + Remove any leftover cluster state on the NFS share that could conflict with a fresh deployment. + + + Optional: Reprovision playbooks (run in order from the Omnia root directory) 1. ansible-playbook local_repo/local_repo.yml 2. ansible-playbook build_image_x86_64/build_image_x86_64.yml - 3. Only if the user is using aarch64 nodes, run the below playbook after build_image_x86_64: + 3. Only if using aarch64 nodes (run after x86_64 image build): - ansible-playbook build_image_aarch64/build_image_aarch64.yml + -> ansible-playbook build_image_aarch64/build_image_aarch64.yml 4. ansible-playbook discovery/discovery.yml - Please follow the omnia documentation for steps in more detail. + + For detailed steps and prerequisites, follow the official Omnia documentation. 
+ + + ================================================================== + ======================================================================== {{ '\x1b[0m' }}" seconds: 1 diff --git a/upgrade/roles/upgrade_cluster/vars/main.yml b/upgrade/roles/upgrade_cluster/vars/main.yml index f4c5b1b7cb..fc50eacddb 100644 --- a/upgrade/roles/upgrade_cluster/vars/main.yml +++ b/upgrade/roles/upgrade_cluster/vars/main.yml @@ -12,3 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. --- +storage_config_path: "/opt/omnia/input/project_default/storage_config.yml" +storage_content: "{{ lookup('file', storage_config_path, errors='ignore') | default('') }}" +storage_yaml: "{{ storage_content | length > 0 | ternary(storage_content | from_yaml, {}) }}" +nfs_params: "{{ storage_yaml.nfs_client_params | default([]) }}" From 986f21349b940c8f41b6e833c567a3f4c86b2edb Mon Sep 17 00:00:00 2001 From: mithileshreddy04 Date: Tue, 24 Feb 2026 15:41:15 +0530 Subject: [PATCH 75/77] Update main.yml --- upgrade/roles/upgrade_cluster/tasks/main.yml | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/upgrade/roles/upgrade_cluster/tasks/main.yml b/upgrade/roles/upgrade_cluster/tasks/main.yml index ce91c4c598..e1b5ec2a29 100644 --- a/upgrade/roles/upgrade_cluster/tasks/main.yml +++ b/upgrade/roles/upgrade_cluster/tasks/main.yml @@ -14,8 +14,12 @@ --- - name: Gather NFS share paths from storage_config.yml ansible.builtin.set_fact: - nfs_slurm_server_share_path: "{{ nfs_params | selectattr('nfs_name', 'equalto', 'nfs_slurm') | map(attribute='server_share_path') | first | default('not specified') }}" - nfs_k8s_server_share_path: "{{ nfs_params | selectattr('nfs_name', 'equalto', 'nfs_k8s') | map(attribute='server_share_path') | first | default('not specified') }}" + nfs_slurm_server_share_path: >- + {{ nfs_params | selectattr('nfs_name', 'equalto', 'nfs_slurm') + | map(attribute='server_share_path') | first | default('not 
specified') }} + nfs_k8s_server_share_path: >- + {{ nfs_params | selectattr('nfs_name', 'equalto', 'nfs_k8s') + | map(attribute='server_share_path') | first | default('not specified') }} - name: Display cluster reprovision guidance From d16bd01d8ddc7eccdb11ee74e208059c3b75d14f Mon Sep 17 00:00:00 2001 From: mithileshreddy04 Date: Tue, 24 Feb 2026 15:45:29 +0530 Subject: [PATCH 76/77] Update main.yml --- upgrade/roles/upgrade_cluster/tasks/main.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/upgrade/roles/upgrade_cluster/tasks/main.yml b/upgrade/roles/upgrade_cluster/tasks/main.yml index e1b5ec2a29..ada4408f2e 100644 --- a/upgrade/roles/upgrade_cluster/tasks/main.yml +++ b/upgrade/roles/upgrade_cluster/tasks/main.yml @@ -58,7 +58,8 @@ Optional: NFS cleanup (only if you are reprovisioning the cluster) - If you choose to reprovision the cluster and your setup uses an NFS share for Kubernetes and/or Slurm, you may optionally perform an NFS cleanup beforehand: + If you choose to reprovision the cluster and your setup uses an NFS share for Kubernetes and/or Slurm, you may optionally perform an NFS + cleanup beforehand: Detected NFS share paths from storage_config.yml: From 53c9023f27958f132c86b02d75dc3234c5aa13f4 Mon Sep 17 00:00:00 2001 From: sakshi-singla-1735 Date: Wed, 25 Feb 2026 12:23:27 +0530 Subject: [PATCH 77/77] Additional_Packages defect fix (#4042) * additional packages defect * reverse check * parent key check * removing the additional_package group --- .../input_validation/common_utils/config.py | 11 +++++ .../validation_flows/common_validation.py | 44 +++++++++++++++++-- .../validation_flows/local_repo_validation.py | 29 ++++++++++++ .../rhel/10.0/additional_packages.json | 15 ------- 4 files changed, 80 insertions(+), 19 deletions(-) diff --git a/common/library/module_utils/input_validation/common_utils/config.py b/common/library/module_utils/input_validation/common_utils/config.py index 0f369f3950..58cae556c4 100644 --- 
a/common/library/module_utils/input_validation/common_utils/config.py +++ b/common/library/module_utils/input_validation/common_utils/config.py @@ -33,6 +33,17 @@ OMNIA_ENTITLEMENT_PATH = '/opt/omnia/rhel_repo_certs/*.pem' OMNIA_REDHAT_REPO = '/opt/omnia/rhel_repo_certs/redhat.repo' +# Supported functional groups for additional_packages per architecture +ADDITIONAL_PACKAGES_SUPPORTED_SUBGROUPS = { + "x86_64": [ + "slurm_control_node", "slurm_node", "login_node", "login_compiler_node", + "service_kube_control_plane", "service_kube_control_plane_first", "service_kube_node" + ], + "aarch64": [ + "slurm_control_node", "slurm_node", "login_node", "login_compiler_node" + ] +} + # dict to hold the file names. If any file's name changes just change it here. files = { "local_repo_config": "local_repo_config.yml", diff --git a/common/library/module_utils/input_validation/validation_flows/common_validation.py b/common/library/module_utils/input_validation/validation_flows/common_validation.py index 36f55130d4..dcf812c929 100644 --- a/common/library/module_utils/input_validation/validation_flows/common_validation.py +++ b/common/library/module_utils/input_validation/validation_flows/common_validation.py @@ -246,14 +246,14 @@ def validate_software_config( ) ) + supported_subgroups = config.ADDITIONAL_PACKAGES_SUPPORTED_SUBGROUPS + for software_pkg in data['softwares']: software = software_pkg['name'] arch_list = software_pkg.get('arch') - json_paths = [] for arch in arch_list: - json_paths.append(get_json_file_path( - software, cluster_os_type, cluster_os_version, input_file_path, arch)) - for json_path in json_paths: + json_path = get_json_file_path( + software, cluster_os_type, cluster_os_version, input_file_path, arch) # Check if json_path is None or if the JSON syntax is invalid if not json_path: errors.append( @@ -266,7 +266,43 @@ def validate_software_config( try: subgroup_softwares = subgroup_dict.get(software, None) json_data = load_json(json_path) + # For 
additional_packages, validate subgroup keys in the JSON + if software == "additional_packages": + if "additional_packages" not in json_data: + errors.append( + create_error_msg( + software + '/' + arch, + json_path, + f"Required key 'additional_packages' is missing from the JSON file." + ) + ) + arch_supported = supported_subgroups.get(arch, []) + user_subgroups = [p.get('name') for p in data.get(software, [])] + for json_key in json_data: + if json_key == "additional_packages": + continue + if json_key not in arch_supported: + errors.append( + create_error_msg( + software + '/' + arch, + json_path, + f"Subgroup '{json_key}' is not supported for architecture {arch}." + ) + ) + elif json_key not in user_subgroups: + errors.append( + create_error_msg( + software + '/' + arch, + json_path, + f"Subgroup '{json_key}' is present in JSON but not listed under additional_packages in software_config.json." + ) + ) for subgroup_software in subgroup_softwares: + # For additional_packages, skip subgroups that are + # not supported for this arch + if software == "additional_packages": + if subgroup_software not in supported_subgroups.get(arch, []): + continue _, fail_data = validation_utils.validate_softwaresubgroup_entries( subgroup_software, json_path, json_data, validation_results, failures ) diff --git a/common/library/module_utils/input_validation/validation_flows/local_repo_validation.py b/common/library/module_utils/input_validation/validation_flows/local_repo_validation.py index 88e02845d2..447bd33c8d 100644 --- a/common/library/module_utils/input_validation/validation_flows/local_repo_validation.py +++ b/common/library/module_utils/input_validation/validation_flows/local_repo_validation.py @@ -208,6 +208,8 @@ def validate_local_repo_config(input_file_path, data, ) os_ver_path = f"/{software_config_json['cluster_os_type']}/{software_config_json['cluster_os_version']}/" + supported_subgroups = config.ADDITIONAL_PACKAGES_SUPPORTED_SUBGROUPS + for software in 
software_config_json["softwares"]: sw = software["name"] arch_list = software.get("arch") @@ -221,10 +223,37 @@ def validate_local_repo_config(input_file_path, data, else: curr_json = load_json(json_path) pkg_list = curr_json[sw]['cluster'] + # For additional_packages, validate subgroup keys in the JSON + if sw == "additional_packages": + if "additional_packages" not in curr_json: + errors.append( + create_error_msg(sw + '/' + arch, + json_path, + f"Required key 'additional_packages' is missing from the JSON file.")) + arch_supported = supported_subgroups.get(arch, []) + user_subgroups = [p.get('name') for p in software_config_json.get(sw, [])] + for json_key in curr_json: + if json_key == "additional_packages": + continue + if json_key not in arch_supported: + errors.append( + create_error_msg(sw + '/' + arch, + json_path, + f"Subgroup '{json_key}' is not supported for architecture {arch}.")) + elif json_key not in user_subgroups: + errors.append( + create_error_msg(sw + '/' + arch, + json_path, + f"Subgroup '{json_key}' is present in JSON but not listed under additional_packages in software_config.json.")) if sw in software_config_json: for sub_pkg in software_config_json[sw]: sub_sw = sub_pkg.get('name') if sub_sw not in curr_json: + # For additional_packages, skip subgroups that + # are not supported for this arch + if sw == "additional_packages": + if sub_sw not in supported_subgroups.get(arch, []): + continue errors.append( create_error_msg(sw + '/' + arch, json_path, diff --git a/input/config/aarch64/rhel/10.0/additional_packages.json b/input/config/aarch64/rhel/10.0/additional_packages.json index 0d6d9a0452..b01c3f78b5 100644 --- a/input/config/aarch64/rhel/10.0/additional_packages.json +++ b/input/config/aarch64/rhel/10.0/additional_packages.json @@ -4,21 +4,6 @@ ] }, - "service_kube_control_plane_first": { - "cluster": [ - - ] - }, - "service_kube_control_plane": { - "cluster": [ - - ] - }, - "service_kube_node": { - "cluster": [ - - ] - }, 
"slurm_control_node": { "cluster": [