From cb21aa03eae67bb096db4ebe17a6111e69f20e2d Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Mon, 6 Apr 2026 09:48:06 +0000
Subject: [PATCH 01/31] Add K8s Agent: Streamlit-based on-prem Kubernetes
 cluster management UI

- Profile Manager: CRUD for cluster profiles with node definitions (control-plane/worker), SSH credentials
- Cluster Creation: SSH-based provisioning with CRI-O, Flannel CNI, kubeadm, best practices hardening
- Cluster Debugger: Diagnostic commands with AI-powered root cause analysis and recommendations
- Monitoring Setup: One-click Prometheus + Grafana deployment with dashboards and alerting rules
- Log Analysis: Multi-source log collection, error pattern extraction, cross-source correlation
- AI Assistant: Chat interface powered by LLM for Kubernetes questions
- Integrated with Infosys AI Gateway for LLM capabilities
---
 .gitignore                            |    1 +
 k8s-agent/README.md                   |   87 ++
 k8s-agent/app.py                      | 1111 +++++++++++++++++++++++++
 k8s-agent/config.py                   |   21 +
 k8s-agent/data/profiles/.gitkeep      |    1 +
 k8s-agent/modules/__init__.py         |    1 +
 k8s-agent/modules/cluster_creator.py  |  545 ++++++++++++
 k8s-agent/modules/cluster_debugger.py |  228 +++++
 k8s-agent/modules/llm_client.py       |  145 ++++
 k8s-agent/modules/log_analyzer.py     |  345 ++++++++
 k8s-agent/modules/monitoring_setup.py |  440 ++++++++++
 k8s-agent/modules/profile_manager.py  |  119 +++
 k8s-agent/requirements.txt            |    6 +
 k8s-agent/templates/.gitkeep          |    1 +
 14 files changed, 3051 insertions(+)
 create mode 100644 k8s-agent/README.md
 create mode 100644 k8s-agent/app.py
 create mode 100644 k8s-agent/config.py
 create mode 100644 k8s-agent/data/profiles/.gitkeep
 create mode 100644 k8s-agent/modules/__init__.py
 create mode 100644 k8s-agent/modules/cluster_creator.py
 create mode 100644 k8s-agent/modules/cluster_debugger.py
 create mode 100644 k8s-agent/modules/llm_client.py
 create mode 100644 k8s-agent/modules/log_analyzer.py
 create mode 100644 k8s-agent/modules/monitoring_setup.py
 create mode 100644 k8s-agent/modules/profile_manager.py
 create mode 100644 k8s-agent/requirements.txt
 create mode 100644 k8s-agent/templates/.gitkeep

diff --git a/.gitignore b/.gitignore
index 9359002..2a1bb18 100644
--- a/.gitignore
+++ b/.gitignore
@@ -22,3 +22,4 @@ charts/*/charts/
 *.pem
 *.key
 kubeconfig*
+k8s-agent/__pycache__/
diff --git a/k8s-agent/README.md b/k8s-agent/README.md
new file mode 100644
index 0000000..17c03e9
--- /dev/null
+++ b/k8s-agent/README.md
@@ -0,0 +1,87 @@
+# K8s Agent — On-Prem Kubernetes Cluster Management
+
+A Streamlit-based UI for managing on-premises Kubernetes clusters with CRI-O container runtime and Flannel CNI.
+
+## Features
+
+1. **Profile Manager** — Create and manage profiles for multiple clusters with node definitions (control-plane / worker), SSH credentials, and K8s configuration.
+
+2. **Cluster Creation** — SSH into nodes and provision a full Kubernetes cluster:
+   - Installs CRI-O container runtime
+   - Installs kubeadm, kubelet, kubectl
+   - Initializes control plane with best-practice kubeadm config
+   - Deploys Flannel CNI
+   - Joins worker nodes automatically
+   - Applies security hardening (NetworkPolicies, RBAC, ResourceQuotas, PodSecurity)
+
+3. **Cluster Debugger** — Run diagnostic commands and get AI-powered analysis:
+   - Pre-built checks for nodes, pods, networking, storage, certificates
+   - Category-based scanning (Cluster Overview, Networking, Security, etc.)
+   - Custom command execution via SSH
+   - AI-powered root cause analysis and remediation recommendations
+
+4. **Monitoring Setup** — Deploy Prometheus + Grafana with production-ready configuration:
+   - One-click kube-prometheus-stack installation
+   - Grafana dashboard imports (cluster overview, node exporter, pods, etcd, API server, etc.)
+   - Alerting rules for node health, pod crashes, disk pressure, etcd latency
+   - AI-powered monitoring recommendations
+
+5. **Log Analysis** — Collect, parse, and correlate logs across cluster components:
+   - System component logs (kubelet, CRI-O, API server, etcd, Flannel, CoreDNS)
+   - Pod-level log collection with previous container support
+   - Automated error pattern extraction and grouping
+   - Cross-source error correlation
+   - AI-powered deep log analysis and root cause identification
+
+6. **AI Assistant** — Chat interface for Kubernetes questions powered by your LLM.
+
+## Quick Start
+
+```bash
+cd k8s-agent
+pip install -r requirements.txt
+
+# Set your LLM API key
+export LLM_API_KEY="your-api-key"
+# Or use the Infosys AI Gateway key
+export INFOSYS_CODER_API_KEY="your-key"
+
+# Run the app
+streamlit run app.py
+```
+
+## Configuration
+
+Environment variables:
+
+| Variable | Description | Default |
+|----------|-------------|---------|
+| `LLM_API_URL` | LLM API endpoint | Infosys AI Gateway |
+| `LLM_API_KEY` | LLM API key | Falls back to `INFOSYS_CODER_API_KEY` |
+| `LLM_MODEL` | Model name | `gpt-4` |
+| `LLM_TEMPERATURE` | Response temperature | `0.3` |
+| `LLM_MAX_TOKENS` | Max response tokens | `4096` |
+
+## Architecture
+
+```
+k8s-agent/
+├── app.py                     # Main Streamlit application
+├── config.py                  # Configuration and environment variables
+├── requirements.txt           # Python dependencies
+├── modules/
+│   ├── llm_client.py          # LLM API integration (query + streaming)
+│   ├── profile_manager.py     # Cluster profile CRUD operations
+│   ├── cluster_creator.py     # SSH-based cluster provisioning
+│   ├── cluster_debugger.py    # Diagnostic commands and AI analysis
+│   ├── monitoring_setup.py    # Prometheus/Grafana deployment
+│   └── log_analyzer.py        # Log collection, parsing, correlation
+├── templates/                 # Configuration templates
+└── data/profiles/             # Stored cluster profiles (JSON)
+```
+
+## Requirements
+
+- Python 3.10+
+- SSH access to target nodes (for cluster operations)
+- LLM API endpoint (Infosys AI Gateway or compatible OpenAI-style API)
diff --git a/k8s-agent/app.py b/k8s-agent/app.py
new file mode 100644
index 0000000..4225a81
--- /dev/null
+++ b/k8s-agent/app.py
@@ -0,0 +1,1111 @@
+"""K8s Agent — Streamlit-based Kubernetes Cluster Management UI."""
+
+import sys
+import os
+
+# Ensure the k8s-agent directory is on the Python path so sibling imports work.
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+
+import json
+import streamlit as st
+from streamlit_option_menu import option_menu
+
+import config
+from modules.profile_manager import (
+    ClusterProfile,
+    NodeInfo,
+    save_profile,
+    load_profile,
+    list_profiles,
+    delete_profile,
+    update_profile_status,
+)
+from modules.cluster_creator import (
+    test_ssh_connectivity,
+    generate_common_setup_script,
+    generate_control_plane_init_script,
+    generate_worker_join_script,
+    generate_best_practices_script,
+    provision_node_common,
+    init_control_plane,
+    retrieve_join_command,
+    join_worker_node,
+    apply_best_practices,
+    get_cluster_status,
+    get_llm_cluster_advice,
+)
+from modules.cluster_debugger import (
+    DIAGNOSTIC_COMMANDS,
+    CATEGORY_MAP,
+    run_diagnostic,
+    run_category_diagnostics,
+    run_all_diagnostics,
+    run_custom_command,
+    analyze_diagnostics,
+    get_debug_suggestion,
+    check_pod_issues,
+)
+from modules.monitoring_setup import (
+    GRAFANA_DASHBOARDS,
+    install_helm,
+    install_prometheus_stack,
+    install_dashboards,
+    install_alert_rules,
+    get_monitoring_status,
+    get_monitoring_advice,
+    generate_prometheus_install_script,
+    generate_dashboard_import_script,
+    generate_alerting_rules_script,
+)
+from modules.log_analyzer import (
+    LOG_SOURCES,
+    collect_logs,
+    collect_pod_logs,
+    collect_multi_source_logs,
+    analyze_logs,
+    correlate_errors,
+    llm_analyze_logs,
+    llm_correlate_analysis,
+    get_pod_list,
+)
+from modules.llm_client import query_llm, stream_llm
+
+
+# ── Page Configuration ────────────────────────────────────────────────────
+
+st.set_page_config(
+    page_title="K8s Agent",
+    page_icon="☸",
+    layout="wide",
+    initial_sidebar_state="expanded",
+)
+
+# ── Custom CSS ────────────────────────────────────────────────────────────
+
+st.markdown("""
+<style>
+    .main-header {
+        font-size: 2rem;
+        font-weight: 700;
+        color: #326CE5;
+        margin-bottom: 0.5rem;
+    }
+    .sub-header {
+        font-size: 1rem;
+        color: #666;
+        margin-bottom: 1.5rem;
+    }
+    .status-active { color: #28a745; font-weight: bold; }
+    .status-error { color: #dc3545; font-weight: bold; }
+    .status-draft { color: #6c757d; font-weight: bold; }
+    .status-provisioning { color: #fd7e14; font-weight: bold; }
+    .node-card {
+        border: 1px solid #ddd;
+        border-radius: 8px;
+        padding: 1rem;
+        margin: 0.5rem 0;
+        background: #f8f9fa;
+    }
+    .metric-card {
+        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+        border-radius: 10px;
+        padding: 1.2rem;
+        color: white;
+        text-align: center;
+    }
+    .metric-value { font-size: 2rem; font-weight: 700; }
+    .metric-label { font-size: 0.85rem; opacity: 0.9; }
+    div[data-testid="stExpander"] details summary p {
+        font-size: 1rem;
+        font-weight: 600;
+    }
+</style>
+""", unsafe_allow_html=True)
+
+
+# ── Session state initialization ──────────────────────────────────────────
+
+def init_session_state():
+    defaults = {
+        "active_profile": None,
+        "chat_history": [],
+        "provisioning_log": [],
+        "debug_results": {},
+        "log_analysis_results": {},
+    }
+    for key, value in defaults.items():
+        if key not in st.session_state:
+            st.session_state[key] = value
+
+
+init_session_state()
+
+
+# ── Sidebar: Profile Manager + Navigation ─────────────────────────────────
+
+def render_sidebar():
+    with st.sidebar:
+        st.markdown('<div class="main-header">☸ K8s Agent</div>', unsafe_allow_html=True)
+        st.markdown('<div class="sub-header">On-Prem Kubernetes Management</div>', unsafe_allow_html=True)
+
+        st.divider()
+
+        # ── Profile selector ──
+        st.markdown("### Cluster Profiles")
+        profiles = list_profiles()
+        profile_names = [p.name for p in profiles]
+
+        if profile_names:
+            selected = st.selectbox(
+                "Active Profile",
+                options=["(none)"] + profile_names,
+                index=(
+                    profile_names.index(st.session_state.active_profile) + 1
+                    if st.session_state.active_profile in profile_names
+                    else 0
+                ),
+                key="profile_selector",
+            )
+            if selected != "(none)":
+                st.session_state.active_profile = selected
+                profile = load_profile(selected)
+                if profile:
+                    status_class = f"status-{profile.status}"
+                    st.markdown(
+                        f"**Status:** <span class='{status_class}'>{profile.status.upper()}</span>",
+                        unsafe_allow_html=True,
+                    )
+                    st.caption(
+                        f"K8s {profile.kubernetes_version} | CRI-O {profile.crio_version} | "
+                        f"{len(profile.get_control_plane_nodes())} CP + "
+                        f"{len(profile.get_worker_nodes())} Workers"
+                    )
+            else:
+                st.session_state.active_profile = None
+        else:
+            st.info("No profiles yet. Create one in Profile Manager.")
+
+        st.divider()
+
+        # ── Navigation ──
+        selected_page = option_menu(
+            menu_title="Navigation",
+            options=[
+                "Profile Manager",
+                "Cluster Creation",
+                "Cluster Debugger",
+                "Monitoring Setup",
+                "Log Analysis",
+                "AI Assistant",
+            ],
+            icons=[
+                "person-gear",
+                "hdd-rack",
+                "bug",
+                "graph-up",
+                "journal-text",
+                "robot",
+            ],
+            menu_icon="list",
+            default_index=0,
+        )
+
+        st.divider()
+
+        # ── LLM config ──
+        with st.expander("LLM Settings"):
+            st.text_input(
+                "API URL",
+                value=config.LLM_API_URL,
+                key="llm_api_url",
+                help="Endpoint for the LLM API",
+            )
+            st.text_input(
+                "API Key",
+                value=config.LLM_API_KEY[:8] + "..." if config.LLM_API_KEY else "",
+                type="password",
+                key="llm_api_key_display",
+                disabled=True,
+                help="Set via LLM_API_KEY or INFOSYS_CODER_API_KEY env var",
+            )
+            st.selectbox(
+                "Model",
+                options=["gpt-4", "gpt-4o", "gpt-3.5-turbo"],
+                index=0,
+                key="llm_model_select",
+            )
+
+        return selected_page
+
+
+# ══════════════════════════════════════════════════════════════════════════
+#  PAGE: Profile Manager
+# ══════════════════════════════════════════════════════════════════════════
+
+def page_profile_manager():
+    st.markdown("## Cluster Profile Manager")
+    st.markdown("Create, edit, and manage profiles for your on-prem Kubernetes clusters.")
+
+    tab_create, tab_list, tab_import = st.tabs(["Create Profile", "Manage Profiles", "Import / Export"])
+
+    # ── Create Profile ────────────────────────────────────────────────────
+    with tab_create:
+        with st.form("create_profile_form"):
+            st.markdown("### New Cluster Profile")
+            col1, col2 = st.columns(2)
+
+            with col1:
+                name = st.text_input("Profile Name *", placeholder="production-cluster")
+                description = st.text_area("Description", placeholder="Production on-prem cluster")
+                k8s_version = st.selectbox("Kubernetes Version", ["1.30", "1.29", "1.28", "1.27"], index=0)
+                crio_version = st.selectbox("CRI-O Version", ["1.30", "1.29", "1.28", "1.27"], index=0)
+                pod_security = st.selectbox(
+                    "Pod Security Standard",
+                    ["restricted", "baseline", "privileged"],
+                    index=0,
+                )
+
+            with col2:
+                pod_cidr = st.text_input("Pod CIDR", value="10.244.0.0/16")
+                service_cidr = st.text_input("Service CIDR", value="10.96.0.0/12")
+                dns_domain = st.text_input("DNS Domain", value="cluster.local")
+
+            st.divider()
+            st.markdown("### Nodes")
+            st.markdown("Define your control-plane and worker nodes.")
+
+            num_nodes = st.number_input("Number of Nodes", min_value=1, max_value=50, value=3, step=1)
+
+            nodes = []
+            for i in range(int(num_nodes)):
+                st.markdown(f"**Node {i + 1}**")
+                ncol1, ncol2, ncol3, ncol4, ncol5 = st.columns([2, 2, 1.5, 1, 1.5])
+                with ncol1:
+                    hostname = st.text_input(f"Hostname", key=f"host_{i}", placeholder=f"node-{i + 1}")
+                with ncol2:
+                    ip_addr = st.text_input(f"IP Address", key=f"ip_{i}", placeholder="192.168.1.x")
+                with ncol3:
+                    role = st.selectbox(f"Role", ["control-plane", "worker"], key=f"role_{i}",
+                                        index=0 if i == 0 else 1)
+                with ncol4:
+                    ssh_user = st.text_input(f"SSH User", key=f"user_{i}", value="root")
+                with ncol5:
+                    ssh_key = st.text_input(f"SSH Key Path", key=f"key_{i}", value="~/.ssh/id_rsa")
+
+                nodes.append({
+                    "hostname": hostname,
+                    "ip_address": ip_addr,
+                    "role": role,
+                    "ssh_user": ssh_user,
+                    "ssh_port": 22,
+                    "ssh_key_path": ssh_key,
+                })
+
+            submitted = st.form_submit_button("Create Profile", type="primary", use_container_width=True)
+
+            if submitted:
+                if not name:
+                    st.error("Profile name is required.")
+                elif not any(n["ip_address"] for n in nodes):
+                    st.error("At least one node must have an IP address.")
+                elif not any(n["role"] == "control-plane" for n in nodes):
+                    st.error("At least one control-plane node is required.")
+                else:
+                    valid_nodes = [n for n in nodes if n["ip_address"]]
+                    profile = ClusterProfile(
+                        name=name,
+                        description=description,
+                        kubernetes_version=k8s_version,
+                        crio_version=crio_version,
+                        cni_plugin="flannel",
+                        pod_cidr=pod_cidr,
+                        service_cidr=service_cidr,
+                        dns_domain=dns_domain,
+                        nodes=valid_nodes,
+                        pod_security_standard=pod_security,
+                    )
+                    path = save_profile(profile)
+                    st.session_state.active_profile = name
+                    st.success(f"Profile '{name}' created successfully!")
+                    st.rerun()
+
+    # ── Manage Profiles ───────────────────────────────────────────────────
+    with tab_list:
+        profiles = list_profiles()
+        if not profiles:
+            st.info("No profiles created yet.")
+            return
+
+        for profile in profiles:
+            with st.expander(f"**{profile.name}** — {profile.status.upper()}", expanded=False):
+                col1, col2, col3 = st.columns([2, 2, 1])
+                with col1:
+                    st.markdown(f"**Description:** {profile.description or 'N/A'}")
+                    st.markdown(f"**Kubernetes:** {profile.kubernetes_version} | **CRI-O:** {profile.crio_version}")
+                    st.markdown(f"**Pod CIDR:** {profile.pod_cidr} | **Service CIDR:** {profile.service_cidr}")
+                    st.markdown(f"**Pod Security:** {profile.pod_security_standard}")
+                with col2:
+                    st.markdown("**Nodes:**")
+                    for node in profile.nodes:
+                        icon = "🔵" if node["role"] == "control-plane" else "🟢"
+                        st.markdown(
+                            f"{icon} `{node.get('hostname', 'N/A')}` — "
+                            f"`{node['ip_address']}` ({node['role']})"
+                        )
+                with col3:
+                    st.markdown(f"**Created:** {profile.created_at[:10] if profile.created_at else 'N/A'}")
+                    st.markdown(f"**Updated:** {profile.updated_at[:10] if profile.updated_at else 'N/A'}")
+                    if st.button("Set Active", key=f"activate_{profile.name}"):
+                        st.session_state.active_profile = profile.name
+                        st.rerun()
+                    if st.button("Delete", key=f"delete_{profile.name}", type="secondary"):
+                        delete_profile(profile.name)
+                        if st.session_state.active_profile == profile.name:
+                            st.session_state.active_profile = None
+                        st.rerun()
+
+    # ── Import / Export ───────────────────────────────────────────────────
+    with tab_import:
+        col_export, col_import = st.columns(2)
+        with col_export:
+            st.markdown("### Export Profile")
+            profiles = list_profiles()
+            if profiles:
+                export_name = st.selectbox("Select profile to export", [p.name for p in profiles])
+                if st.button("Export as JSON"):
+                    profile = load_profile(export_name)
+                    if profile:
+                        from dataclasses import asdict
+                        st.download_button(
+                            label="Download JSON",
+                            data=json.dumps(asdict(profile), indent=2),
+                            file_name=f"{export_name}.json",
+                            mime="application/json",
+                        )
+
+        with col_import:
+            st.markdown("### Import Profile")
+            uploaded = st.file_uploader("Upload profile JSON", type=["json"])
+            if uploaded:
+                try:
+                    data = json.loads(uploaded.read())
+                    profile = ClusterProfile(**data)
+                    save_profile(profile)
+                    st.success(f"Profile '{profile.name}' imported!")
+                    st.rerun()
+                except Exception as e:
+                    st.error(f"Failed to import: {e}")
+
+
+# ══════════════════════════════════════════════════════════════════════════
+#  PAGE: Cluster Creation
+# ══════════════════════════════════════════════════════════════════════════
+
+def page_cluster_creation():
+    st.markdown("## Cluster Creation")
+    st.markdown("Provision an on-prem K8s cluster via SSH with CRI-O, Flannel CNI, and best practices.")
+
+    profile = _get_active_profile()
+    if not profile:
+        return
+
+    _show_profile_summary(profile)
+
+    tab_preflight, tab_provision, tab_scripts, tab_advice = st.tabs([
+        "Pre-flight Checks",
+        "Provision Cluster",
+        "View Scripts",
+        "AI Advice",
+    ])
+
+    # ── Pre-flight: SSH connectivity ──────────────────────────────────────
+    with tab_preflight:
+        st.markdown("### SSH Connectivity Test")
+        st.markdown("Test SSH access to all nodes before provisioning.")
+
+        if st.button("Test All Nodes", type="primary"):
+            for node in profile.nodes:
+                with st.status(f"Testing {node.get('hostname', node['ip_address'])}...", expanded=True):
+                    result = test_ssh_connectivity(node)
+                    if result.success:
+                        st.success(f"Connected to {node['ip_address']}")
+                        st.code(result.stdout, language="text")
+                    else:
+                        st.error(f"Failed to connect to {node['ip_address']}")
+                        st.code(result.stderr, language="text")
+
+    # ── Provision ─────────────────────────────────────────────────────────
+    with tab_provision:
+        st.markdown("### Automated Cluster Provisioning")
+        st.warning(
+            "This will SSH into each node and install Kubernetes components. "
+            "Ensure all nodes are accessible and you have root/sudo access."
+        )
+
+        cp_nodes = profile.get_control_plane_nodes()
+        worker_nodes = profile.get_worker_nodes()
+
+        st.markdown(f"**Control Plane:** {len(cp_nodes)} node(s) | **Workers:** {len(worker_nodes)} node(s)")
+
+        col1, col2, col3 = st.columns(3)
+        with col1:
+            step1 = st.checkbox("Step 1: Common Setup (all nodes)", value=True)
+        with col2:
+            step2 = st.checkbox("Step 2: Init Control Plane", value=True)
+        with col3:
+            step3 = st.checkbox("Step 3: Join Workers", value=True)
+        step4 = st.checkbox("Step 4: Apply Best Practices", value=True)
+
+        if st.button("Start Provisioning", type="primary", use_container_width=True):
+            update_profile_status(profile.name, "provisioning")
+
+            # Step 1: Common setup on all nodes
+            if step1:
+                st.markdown("---")
+                st.markdown("### Step 1: Common Setup")
+                for node in profile.nodes:
+                    with st.status(
+                        f"Setting up {node.get('hostname', node['ip_address'])} ({node['role']})...",
+                        expanded=True,
+                    ):
+                        result = provision_node_common(node, profile)
+                        if result.success:
+                            st.success(f"Common setup complete on {node['ip_address']}")
+                        else:
+                            st.error(f"Setup failed on {node['ip_address']}")
+                            st.code(result.stderr, language="text")
+
+            # Step 2: Initialize control plane
+            if step2 and cp_nodes:
+                st.markdown("---")
+                st.markdown("### Step 2: Control Plane Initialization")
+                cp_node = cp_nodes[0]
+                with st.status(f"Initializing control plane on {cp_node['ip_address']}...", expanded=True):
+                    result = init_control_plane(cp_node, profile)
+                    if result.success:
+                        st.success("Control plane initialized!")
+                        st.code(result.stdout[-2000:], language="text")
+                    else:
+                        st.error("Control plane initialization failed!")
+                        st.code(result.stderr, language="text")
+
+            # Step 3: Join worker nodes
+            if step3 and worker_nodes and cp_nodes:
+                st.markdown("---")
+                st.markdown("### Step 3: Join Worker Nodes")
+                join_cmd = retrieve_join_command(cp_nodes[0])
+                if join_cmd:
+                    for node in worker_nodes:
+                        with st.status(f"Joining {node.get('hostname', node['ip_address'])}...", expanded=True):
+                            result = join_worker_node(node, join_cmd)
+                            if result.success:
+                                st.success(f"Worker {node['ip_address']} joined!")
+                            else:
+                                st.error(f"Failed to join {node['ip_address']}")
+                                st.code(result.stderr, language="text")
+                else:
+                    st.error("Could not retrieve join command from control plane.")
+
+            # Step 4: Best practices
+            if step4 and cp_nodes:
+                st.markdown("---")
+                st.markdown("### Step 4: Best Practices")
+                with st.status("Applying security and resource best practices...", expanded=True):
+                    result = apply_best_practices(cp_nodes[0])
+                    if result.success:
+                        st.success("Best practices applied!")
+                        st.code(result.stdout, language="text")
+                    else:
+                        st.error("Failed to apply best practices")
+                        st.code(result.stderr, language="text")
+
+            # Final status
+            st.markdown("---")
+            st.markdown("### Cluster Status")
+            if cp_nodes:
+                result = get_cluster_status(cp_nodes[0])
+                if result.success:
+                    update_profile_status(profile.name, "active")
+                    st.success("Cluster is active!")
+                    st.code(result.stdout, language="text")
+                else:
+                    update_profile_status(profile.name, "error")
+                    st.error("Could not verify cluster status")
+                    st.code(result.stderr, language="text")
+
+    # ── View Scripts ──────────────────────────────────────────────────────
+    with tab_scripts:
+        st.markdown("### Generated Scripts")
+        st.markdown("Review the scripts that will be executed during provisioning.")
+
+        with st.expander("Common Setup Script (all nodes)", expanded=False):
+            st.code(generate_common_setup_script(profile), language="bash")
+
+        with st.expander("Control Plane Init Script", expanded=False):
+            st.code(generate_control_plane_init_script(profile), language="bash")
+
+        with st.expander("Worker Join Script", expanded=False):
+            st.code(generate_worker_join_script(), language="bash")
+
+        with st.expander("Best Practices Script", expanded=False):
+            st.code(generate_best_practices_script(), language="bash")
+
+    # ── AI Advice ─────────────────────────────────────────────────────────
+    with tab_advice:
+        st.markdown("### AI Cluster Setup Advisor")
+        context = st.text_area(
+            "Additional context or questions",
+            placeholder="e.g., We have 3 nodes with 16GB RAM each. Any special considerations?",
+        )
+        if st.button("Get AI Recommendations", type="primary"):
+            with st.spinner("Analyzing your cluster configuration..."):
+                advice = get_llm_cluster_advice(profile, context)
+                st.markdown(advice)
+
+
+# ══════════════════════════════════════════════════════════════════════════
+#  PAGE: Cluster Debugger
+# ══════════════════════════════════════════════════════════════════════════
+
+def page_cluster_debugger():
+    st.markdown("## Cluster Debugger")
+    st.markdown("Diagnose issues and get AI-powered recommendations.")
+
+    profile = _get_active_profile()
+    if not profile:
+        return
+
+    cp_nodes = profile.get_control_plane_nodes()
+    if not cp_nodes:
+        st.error("No control-plane node defined in this profile.")
+        return
+    cp_node = cp_nodes[0]
+
+    tab_quick, tab_category, tab_custom, tab_ai = st.tabs([
+        "Quick Diagnostics",
+        "Category Scan",
+        "Custom Command",
+        "AI Debug Assistant",
+    ])
+
+    # ── Quick Diagnostics ─────────────────────────────────────────────────
+    with tab_quick:
+        st.markdown("### Quick Diagnostic Checks")
+        col1, col2 = st.columns(2)
+        with col1:
+            selected_checks = st.multiselect(
+                "Select checks to run",
+                options=list(DIAGNOSTIC_COMMANDS.keys()),
+                default=["Node Status", "Pod Status (All Namespaces)", "Events (Recent)"],
+            )
+        with col2:
+            run_all = st.checkbox("Run ALL diagnostics")
+
+        if st.button("Run Diagnostics", type="primary"):
+            if run_all:
+                with st.spinner("Running all diagnostics..."):
+                    results = run_all_diagnostics(cp_node)
+            else:
+                results = {}
+                for check in selected_checks:
+                    with st.spinner(f"Running: {check}..."):
+                        results[check] = run_diagnostic(cp_node, check)
+
+            st.session_state.debug_results = results
+
+            for name, result in results.items():
+                status_icon = "+" if result.success else "-"
+                with st.expander(f"{'✅' if result.success else '❌'} {name}", expanded=not result.success):
+                    st.code(result.stdout if result.success else result.stderr, language="text")
+
+        if st.session_state.debug_results and st.button("Analyze with AI", type="secondary"):
+            with st.spinner("AI is analyzing diagnostics..."):
+                analysis = analyze_diagnostics(
+                    st.session_state.debug_results,
+                    profile=profile,
+                )
+                st.markdown(analysis)
+
+    # ── Category Scan ─────────────────────────────────────────────────────
+    with tab_category:
+        st.markdown("### Category-Based Diagnostics")
+        category = st.selectbox("Select Category", options=list(CATEGORY_MAP.keys()))
+
+        if st.button("Run Category Scan", type="primary", key="cat_scan"):
+            with st.spinner(f"Running {category} diagnostics..."):
+                results = run_category_diagnostics(cp_node, category)
+
+            for name, result in results.items():
+                with st.expander(f"{'✅' if result.success else '❌'} {name}"):
+                    st.code(result.stdout if result.success else result.stderr, language="text")
+
+            if st.button("Analyze Category with AI", key="cat_ai"):
+                with st.spinner("Analyzing..."):
+                    analysis = analyze_diagnostics(results, profile=profile)
+                    st.markdown(analysis)
+
+    # ── Custom Command ────────────────────────────────────────────────────
+    with tab_custom:
+        st.markdown("### Run Custom Command")
+        st.warning("Commands execute on the control-plane node via SSH.")
+        custom_cmd = st.text_area(
+            "Command",
+            placeholder="kubectl get pods -A -o wide",
+            height=100,
+        )
+        if st.button("Execute", type="primary", key="exec_custom") and custom_cmd:
+            with st.spinner("Executing..."):
+                result = run_custom_command(cp_node, custom_cmd)
+                if result.success:
+                    st.code(result.stdout, language="text")
+                else:
+                    st.error("Command failed")
+                    st.code(result.stderr, language="text")
+
+    # ── AI Debug Assistant ────────────────────────────────────────────────
+    with tab_ai:
+        st.markdown("### AI Debug Assistant")
+        st.markdown("Describe your issue and get AI-powered debugging help.")
+
+        issue = st.text_area(
+            "Describe the issue",
+            placeholder="e.g., Pods are stuck in CrashLoopBackOff in the default namespace",
+            height=120,
+        )
+
+        col1, col2 = st.columns(2)
+        with col1:
+            auto_collect = st.checkbox("Auto-collect relevant diagnostics", value=True)
+        with col2:
+            check_pods = st.checkbox("Check for problematic pods", value=True)
+
+        if st.button("Debug", type="primary", key="ai_debug") and issue:
+            collected_data = ""
+
+            if check_pods:
+                with st.spinner("Checking pod issues..."):
+                    pod_result = check_pod_issues(cp_node)
+                    if pod_result.success and pod_result.stdout.strip():
+                        collected_data += f"\n\nProblematic Pods:\n{pod_result.stdout}"
+                        with st.expander("Problematic Pods"):
+                            st.code(pod_result.stdout, language="text")
+
+            if auto_collect:
+                with st.spinner("Collecting diagnostics..."):
+                    diag_results = run_category_diagnostics(cp_node, "Cluster Overview")
+                    for name, result in diag_results.items():
+                        if result.success:
+                            collected_data += f"\n\n{name}:\n{result.stdout}"
+
+            with st.spinner("AI is analyzing the issue..."):
+                full_context = f"Issue: {issue}\n\nCollected Data:{collected_data}"
+                suggestion = get_debug_suggestion(issue, collected_data)
+                st.markdown("### AI Recommendation")
+                st.markdown(suggestion)
+
+
+# ══════════════════════════════════════════════════════════════════════════
+#  PAGE: Monitoring Setup
+# ══════════════════════════════════════════════════════════════════════════
+
+def page_monitoring_setup():
+    st.markdown("## Monitoring Setup")
+    st.markdown("Deploy Prometheus, Grafana, dashboards, and alerting rules.")
+
+    profile = _get_active_profile()
+    if not profile:
+        return
+
+    cp_nodes = profile.get_control_plane_nodes()
+    if not cp_nodes:
+        st.error("No control-plane node defined in this profile.")
+        return
+    cp_node = cp_nodes[0]
+
+    namespace = st.text_input("Monitoring Namespace", value="monitoring")
+
+    tab_install, tab_dashboards, tab_alerts, tab_status, tab_scripts, tab_advice = st.tabs([
+        "Install Stack",
+        "Dashboards",
+        "Alert Rules",
+        "Status",
+        "View Scripts",
+        "AI Advice",
+    ])
+
+    # ── Install ───────────────────────────────────────────────────────────
+    with tab_install:
+        st.markdown("### Install Monitoring Stack")
+        st.markdown("This installs **kube-prometheus-stack** (Prometheus + Grafana + exporters).")
+
+        col1, col2 = st.columns(2)
+        with col1:
+            install_helm_first = st.checkbox("Install Helm (if not present)", value=True)
+        with col2:
+            install_alerts_too = st.checkbox("Also install alert rules", value=True)
+
+        if st.button("Install Prometheus + Grafana", type="primary", use_container_width=True):
+            if install_helm_first:
+                with st.status("Installing Helm...", expanded=True):
+                    result = install_helm(cp_node)
+                    if result.success:
+                        st.success("Helm ready!")
+                    else:
+                        st.error("Helm installation failed")
+                        st.code(result.stderr, language="text")
+
+            with st.status("Installing kube-prometheus-stack (this may take several minutes)...", expanded=True):
+                result = install_prometheus_stack(cp_node, namespace)
+                if result.success:
+                    st.success("Prometheus + Grafana installed!")
+                    st.code(result.stdout[-2000:], language="text")
+                else:
+                    st.error("Installation failed")
+                    st.code(result.stderr, language="text")
+
+            if install_alerts_too:
+                with st.status("Installing alert rules...", expanded=True):
+                    result = install_alert_rules(cp_node, namespace)
+                    if result.success:
+                        st.success("Alert rules installed!")
+                    else:
+                        st.error("Alert rules installation failed")
+                        st.code(result.stderr, language="text")
+
+    # ── Dashboards ────────────────────────────────────────────────────────
+    with tab_dashboards:
+        st.markdown("### Grafana Dashboards")
+        st.markdown("Select dashboards to import into Grafana.")
+
+        selected_dashboards = []
+        cols = st.columns(2)
+        for i, (key, dash) in enumerate(GRAFANA_DASHBOARDS.items()):
+            with cols[i % 2]:
+                if st.checkbox(f"**{dash['name']}**\n{dash['description']}", value=True, key=f"dash_{key}"):
+                    selected_dashboards.append(key)
+
+        if st.button("Import Dashboards", type="primary") and selected_dashboards:
+            with st.status("Importing dashboards...", expanded=True):
+                result = install_dashboards(cp_node, selected_dashboards, namespace)
+                if result.success:
+                    st.success(f"Imported {len(selected_dashboards)} dashboards!")
+                    st.code(result.stdout, language="text")
+                else:
+                    st.error("Dashboard import failed")
+                    st.code(result.stderr, language="text")
+
+    # ── Alert Rules ───────────────────────────────────────────────────────
+    with tab_alerts:
+        st.markdown("### Alerting Rules")
+        st.markdown("Install production-ready alerting rules for nodes, pods, and etcd.")
+
+        with st.expander("View Alert Rules", expanded=False):
+            st.code(generate_alerting_rules_script(namespace), language="yaml")
+
+        if st.button("Install Alert Rules", type="primary", key="install_alerts"):
+            with st.spinner("Installing alert rules..."):
+                result = install_alert_rules(cp_node, namespace)
+                if result.success:
+                    st.success("Alert rules installed!")
+                    st.code(result.stdout, language="text")
+                else:
+                    st.error("Failed to install alert rules")
+                    st.code(result.stderr, language="text")
+
+    # ── Status ────────────────────────────────────────────────────────────
+    with tab_status:
+        st.markdown("### Monitoring Stack Status")
+        if st.button("Check Status", type="primary", key="mon_status"):
+            with st.spinner("Checking monitoring stack..."):
+                result = get_monitoring_status(cp_node, namespace)
+                if result.success:
+                    st.code(result.stdout, language="text")
+                else:
+                    st.warning("Could not retrieve monitoring status")
+                    st.code(result.stderr, language="text")
+
+    # ── View Scripts ──────────────────────────────────────────────────────
+    with tab_scripts:
+        st.markdown("### Generated Scripts")
+        with st.expander("Prometheus Install Script"):
+            st.code(generate_prometheus_install_script(namespace), language="bash")
+        with st.expander("Dashboard Import Script"):
+            all_keys = list(GRAFANA_DASHBOARDS.keys())
+            st.code(generate_dashboard_import_script(all_keys, namespace), language="bash")
+        with st.expander("Alert Rules Script"):
+            st.code(generate_alerting_rules_script(namespace), language="bash")
+
+    # ── AI Advice ─────────────────────────────────────────────────────────
+    with tab_advice:
+        st.markdown("### AI Monitoring Advisor")
+        if st.button("Get Monitoring Recommendations", type="primary", key="mon_advice"):
+            current_status = ""
+            status_result = get_monitoring_status(cp_node, namespace)
+            if status_result.success:
+                current_status = status_result.stdout
+
+            with st.spinner("Getting AI recommendations..."):
+                advice = get_monitoring_advice(profile, current_status)
+                st.markdown(advice)
+
+
+# ══════════════════════════════════════════════════════════════════════════
+#  PAGE: Log Analysis
+# ══════════════════════════════════════════════════════════════════════════
+
+def page_log_analysis():
+    st.markdown("## Log Analysis & Error Correlation")
+    st.markdown("Collect, parse, and analyze logs from your cluster components.")
+
+    profile = _get_active_profile()
+    if not profile:
+        return
+
+    cp_nodes = profile.get_control_plane_nodes()
+    if not cp_nodes:
+        st.error("No control-plane node defined in this profile.")
+        return
+    cp_node = cp_nodes[0]
+
+    tab_system, tab_pod, tab_correlation, tab_ai = st.tabs([
+        "System Logs",
+        "Pod Logs",
+        "Error Correlation",
+        "AI Log Analysis",
+    ])
+
+    # ── System Logs ───────────────────────────────────────────────────────
+    with tab_system:
+        st.markdown("### System Component Logs")
+        col1, col2, col3 = st.columns(3)
+        with col1:
+            sources = st.multiselect(
+                "Log Sources",
+                options=list(LOG_SOURCES.keys()),
+                default=["Kubelet", "CRI-O", "Events"],
+            )
+        with col2:
+            log_lines = st.number_input("Lines to fetch", min_value=50, max_value=1000, value=200)
+        with col3:
+            since_options = {"Last 15 min": ("15 minutes ago", "15m"),
+                             "Last 1 hour": ("1 hour ago", "1h"),
+                             "Last 6 hours": ("6 hours ago", "6h"),
+                             "Last 24 hours": ("24 hours ago", "24h")}
+            since_label = st.selectbox("Time Range", options=list(since_options.keys()), index=1)
+            since, since_k8s = since_options[since_label]
+
+        if st.button("Collect Logs", type="primary", key="collect_sys"):
+            log_data = {}
+            for source in sources:
+                with st.spinner(f"Collecting {source} logs..."):
+                    result = collect_logs(cp_node, source, log_lines, since, since_k8s)
+                    if result.success:
+                        log_data[source] = result.stdout
+                        analysis = analyze_logs(result.stdout, source)
+
+                        with st.expander(
+                            f"{'❌' if analysis.error_count > 0 else '✅'} {source} "
+                            f"({analysis.error_count} errors, {analysis.warning_count} warnings)",
+                            expanded=analysis.error_count > 0,
+                        ):
+                            # Metrics
+                            m1, m2, m3 = st.columns(3)
+                            m1.metric("Total Lines", analysis.total_lines)
+                            m2.metric("Errors", analysis.error_count)
+                            m3.metric("Warnings", analysis.warning_count)
+
+                            if analysis.error_patterns:
+                                st.markdown("**Top Error Patterns:**")
+                                for pattern, count in list(analysis.error_patterns.items())[:5]:
+                                    st.markdown(f"- `{pattern}` (x{count})")
+
+                            st.code(result.stdout[-3000:], language="text")
+                    else:
+                        with st.expander(f"❌ {source} — FAILED"):
+                            st.code(result.stderr, language="text")
+
+            st.session_state.log_analysis_results = log_data
+
+    # ── Pod Logs ──────────────────────────────────────────────────────────
+    with tab_pod:
+        st.markdown("### Pod Logs")
+        col1, col2 = st.columns(2)
+        with col1:
+            pod_ns = st.text_input("Namespace", value="default", key="pod_ns")
+            pod_name = st.text_input("Pod Name", placeholder="my-pod-xyz", key="pod_name_input")
+        with col2:
+            container = st.text_input("Container (optional)", key="pod_container")
+            pod_lines = st.number_input("Lines", min_value=50, max_value=1000, value=200, key="pod_lines")
+            pod_previous = st.checkbox("Previous container logs (crash recovery)")
+
+        if st.button("Fetch Pod Logs", type="primary", key="fetch_pod") and pod_name:
+            with st.spinner(f"Fetching logs for {pod_ns}/{pod_name}..."):
+                result = collect_pod_logs(
+                    cp_node, pod_ns, pod_name, container, pod_lines,
+                    "1h", pod_previous,
+                )
+                if result.success:
+                    analysis = analyze_logs(result.stdout, f"{pod_ns}/{pod_name}")
+                    m1, m2, m3 = st.columns(3)
+                    m1.metric("Total Lines", analysis.total_lines)
+                    m2.metric("Errors", analysis.error_count)
+                    m3.metric("Warnings", analysis.warning_count)
+
+                    if analysis.error_patterns:
+                        st.markdown("**Error Patterns:**")
+                        for pattern, count in list(analysis.error_patterns.items())[:10]:
+                            st.markdown(f"- `{pattern}` (x{count})")
+
+                    st.code(result.stdout[-5000:], language="text")
+
+                    if analysis.error_count > 0 and st.button("Analyze with AI", key="pod_ai"):
+                        with st.spinner("AI analyzing pod logs..."):
+                            ai_analysis = llm_analyze_logs(
+                                result.stdout, f"{pod_ns}/{pod_name}"
+                            )
+                            st.markdown(ai_analysis)
+                else:
+                    st.error("Failed to fetch pod logs")
+                    st.code(result.stderr, language="text")
+
+    # ── Error Correlation ─────────────────────────────────────────────────
+    with tab_correlation:
+        st.markdown("### Cross-Source Error Correlation")
+        st.markdown("Collect logs from multiple sources and correlate errors across them.")
+
+        corr_sources = st.multiselect(
+            "Sources to correlate",
+            options=list(LOG_SOURCES.keys()),
+            default=["Kubelet", "CRI-O", "API Server", "Events"],
+            key="corr_sources",
+        )
+
+        if st.button("Collect & Correlate", type="primary", key="correlate"):
+            with st.spinner("Collecting logs from multiple sources..."):
+                results = collect_multi_source_logs(cp_node, corr_sources, lines=150)
+
+            correlated = correlate_errors(results)
+
+            if correlated:
+                st.markdown(f"### Found {len(correlated)} correlated error groups")
+                for i, group in enumerate(correlated):
+                    with st.expander(
+                        f"Correlation #{i + 1}: {', '.join(group['sources_involved'])}",
+                        expanded=True,
+                    ):
+                        st.markdown(f"**Primary Error** ({group['primary']['source']}):")
+                        st.code(group["primary"]["message"], language="text")
+                        st.markdown("**Related Errors:**")
+                        for related in group["related"]:
+                            st.markdown(f"- **{related['source']}**: `{related['message'][:200]}`")
+            else:
+                st.info("No correlated errors found across sources.")
+
+            # LLM correlation analysis
+            if st.button("Deep AI Correlation Analysis", key="deep_corr"):
+                multi_logs = {
+                    src: res.stdout for src, res in results.items() if res.success
+                }
+                with st.spinner("AI is performing deep correlation analysis..."):
+                    analysis = llm_correlate_analysis(multi_logs)
+                    st.markdown(analysis)
+
+    # ── AI Log Analysis ───────────────────────────────────────────────────
+    with tab_ai:
+        st.markdown("### AI-Powered Log Analysis")
+        st.markdown("Paste logs or describe an issue for AI analysis.")
+
+        log_input = st.text_area(
+            "Paste log output",
+            height=200,
+            placeholder="Paste your Kubernetes logs here...",
+        )
+        context_input = st.text_input(
+            "Additional context",
+            placeholder="e.g., This started happening after we upgraded to K8s 1.30",
+        )
+
+        if st.button("Analyze Logs", type="primary", key="ai_log_analyze") and log_input:
+            with st.spinner("AI is analyzing logs..."):
+                analysis = llm_analyze_logs(log_input, context=context_input)
+                st.markdown(analysis)
+
+
+# ══════════════════════════════════════════════════════════════════════════
+#  PAGE: AI Assistant
+# ══════════════════════════════════════════════════════════════════════════
+
+def page_ai_assistant():
+    st.markdown("## AI Kubernetes Assistant")
+    st.markdown("Chat with the AI about any Kubernetes topic.")
+
+    # Chat history
+    for msg in st.session_state.chat_history:
+        with st.chat_message(msg["role"]):
+            st.markdown(msg["content"])
+
+    # Chat input
+    if prompt := st.chat_input("Ask about Kubernetes..."):
+        st.session_state.chat_history.append({"role": "user", "content": prompt})
+        with st.chat_message("user"):
+            st.markdown(prompt)
+
+        with st.chat_message("assistant"):
+            placeholder = st.empty()
+            full_response = ""
+            for chunk in stream_llm(
+                prompt,
+                conversation_history=st.session_state.chat_history[:-1],
+            ):
+                full_response += chunk
+                placeholder.markdown(full_response + "▌")
+            placeholder.markdown(full_response)
+
+        st.session_state.chat_history.append({"role": "assistant", "content": full_response})
+
+
+# ── Helper functions ──────────────────────────────────────────────────────
+
+def _get_active_profile() -> ClusterProfile | None:
+    """Get the active profile or show a warning."""
+    if not st.session_state.active_profile:
+        st.warning("No active cluster profile selected. Please create or select one in the Profile Manager.")
+        return None
+    profile = load_profile(st.session_state.active_profile)
+    if not profile:
+        st.error(f"Profile '{st.session_state.active_profile}' not found.")
+        return None
+    return profile
+
+
+def _show_profile_summary(profile: ClusterProfile):
+    """Display a compact profile summary."""
+    cols = st.columns(5)
+    cols[0].metric("Profile", profile.name)
+    cols[1].metric("K8s Version", profile.kubernetes_version)
+    cols[2].metric("Runtime", f"CRI-O {profile.crio_version}")
+    cols[3].metric("CNI", "Flannel")
+    cols[4].metric("Nodes", f"{len(profile.get_control_plane_nodes())} CP + {len(profile.get_worker_nodes())} W")
+
+
+# ── Main Router ───────────────────────────────────────────────────────────
+
+def main():
+    page = render_sidebar()
+
+    if page == "Profile Manager":
+        page_profile_manager()
+    elif page == "Cluster Creation":
+        page_cluster_creation()
+    elif page == "Cluster Debugger":
+        page_cluster_debugger()
+    elif page == "Monitoring Setup":
+        page_monitoring_setup()
+    elif page == "Log Analysis":
+        page_log_analysis()
+    elif page == "AI Assistant":
+        page_ai_assistant()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/k8s-agent/config.py b/k8s-agent/config.py
new file mode 100644
index 0000000..e46dd95
--- /dev/null
+++ b/k8s-agent/config.py
@@ -0,0 +1,21 @@
+"""Configuration for the K8s Agent application."""
+
+import os
+
+# LLM Configuration
+LLM_API_URL = os.getenv(
+    "LLM_API_URL",
+    "https://aigateway-intern.ad.infosys.com/aigateway/chat/completions",
+)
+LLM_API_KEY = os.getenv("LLM_API_KEY", os.getenv("INFOSYS_CODER_API_KEY", ""))
+LLM_MODEL = os.getenv("LLM_MODEL", "gpt-4")
+LLM_TEMPERATURE = float(os.getenv("LLM_TEMPERATURE", "0.3"))
+LLM_MAX_TOKENS = int(os.getenv("LLM_MAX_TOKENS", "4096"))
+
+# Application paths
+DATA_DIR = os.path.join(os.path.dirname(__file__), "data")
+PROFILES_DIR = os.path.join(DATA_DIR, "profiles")
+TEMPLATES_DIR = os.path.join(os.path.dirname(__file__), "templates")
+
+# Ensure directories exist
+os.makedirs(PROFILES_DIR, exist_ok=True)
diff --git a/k8s-agent/data/profiles/.gitkeep b/k8s-agent/data/profiles/.gitkeep
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/k8s-agent/data/profiles/.gitkeep
@@ -0,0 +1 @@
+
diff --git a/k8s-agent/modules/__init__.py b/k8s-agent/modules/__init__.py
new file mode 100644
index 0000000..fc1c144
--- /dev/null
+++ b/k8s-agent/modules/__init__.py
@@ -0,0 +1 @@
+"""K8s Agent modules."""
diff --git a/k8s-agent/modules/cluster_creator.py b/k8s-agent/modules/cluster_creator.py
new file mode 100644
index 0000000..ef89dea
--- /dev/null
+++ b/k8s-agent/modules/cluster_creator.py
@@ -0,0 +1,545 @@
+"""Cluster Creator — SSH-based K8s cluster provisioning with CRI-O + Flannel."""
+
+import subprocess
+import time
+from dataclasses import dataclass
+from typing import Optional
+
+from modules.llm_client import query_llm
+from modules.profile_manager import ClusterProfile
+
+
+@dataclass
+class SSHResult:
+    """Result of an SSH command execution."""
+
+    hostname: str
+    command: str
+    return_code: int
+    stdout: str
+    stderr: str
+    success: bool
+
+
+def run_ssh_command(
+    ip_address: str,
+    command: str,
+    ssh_user: str = "root",
+    ssh_port: int = 22,
+    ssh_key_path: str = "~/.ssh/id_rsa",
+    timeout: int = 600,
+) -> SSHResult:
+    """Execute a command on a remote node via SSH.
+
+    Args:
+        ip_address: Target node IP.
+        command: Shell command to execute remotely.
+        ssh_user: SSH username.
+        ssh_port: SSH port number.
+        ssh_key_path: Path to SSH private key.
+        timeout: Command timeout in seconds.
+
+    Returns:
+        SSHResult with command output and status.
+    """
+    ssh_cmd = [
+        "ssh",
+        "-o", "StrictHostKeyChecking=no",
+        "-o", "UserKnownHostsFile=/dev/null",
+        "-o", "ConnectTimeout=10",
+        "-o", "BatchMode=yes",
+        "-i", ssh_key_path,
+        "-p", str(ssh_port),
+        f"{ssh_user}@{ip_address}",
+        command,
+    ]
+
+    try:
+        result = subprocess.run(
+            ssh_cmd,
+            capture_output=True,
+            text=True,
+            timeout=timeout,
+        )
+        return SSHResult(
+            hostname=ip_address,
+            command=command,
+            return_code=result.returncode,
+            stdout=result.stdout,
+            stderr=result.stderr,
+            success=result.returncode == 0,
+        )
+    except subprocess.TimeoutExpired:
+        return SSHResult(
+            hostname=ip_address,
+            command=command,
+            return_code=-1,
+            stdout="",
+            stderr=f"Command timed out after {timeout}s",
+            success=False,
+        )
+    except Exception as exc:
+        return SSHResult(
+            hostname=ip_address,
+            command=command,
+            return_code=-1,
+            stdout="",
+            stderr=str(exc),
+            success=False,
+        )
+
+
+def test_ssh_connectivity(node: dict) -> SSHResult:
+    """Test SSH connectivity to a node."""
+    return run_ssh_command(
+        ip_address=node["ip_address"],
+        command="echo 'SSH connection successful' && hostname && uname -r",
+        ssh_user=node.get("ssh_user", "root"),
+        ssh_port=node.get("ssh_port", 22),
+        ssh_key_path=node.get("ssh_key_path", "~/.ssh/id_rsa"),
+        timeout=15,
+    )
+
+
+def generate_common_setup_script(profile: ClusterProfile) -> str:
+    """Generate the common setup script that runs on ALL nodes (control-plane + workers)."""
+    return f"""#!/bin/bash
+set -euo pipefail
+
+echo "=== K8s Node Common Setup ==="
+echo "Kubernetes Version: {profile.kubernetes_version}"
+echo "CRI-O Version: {profile.crio_version}"
+echo "Timestamp: $(date -u)"
+
+# ── 1. System prerequisites ──────────────────────────────────────────────
+echo ">> Disabling swap..."
+swapoff -a
+sed -i '/\\bswap\\b/d' /etc/fstab
+
+echo ">> Loading kernel modules..."
+cat > /etc/modules-load.d/k8s.conf <<EOF
+overlay
+br_netfilter
+EOF
+modprobe overlay
+modprobe br_netfilter
+
+echo ">> Setting sysctl parameters..."
+cat > /etc/sysctl.d/99-kubernetes.conf <<EOF
+net.bridge.bridge-nf-call-iptables  = 1
+net.bridge.bridge-nf-call-ip6tables = 1
+net.ipv4.ip_forward                 = 1
+EOF
+sysctl --system
+
+echo ">> Disabling SELinux (if present)..."
+if command -v setenforce &>/dev/null; then
+    setenforce 0 || true
+    sed -i 's/^SELINUX=enforcing/SELINUX=permissive/' /etc/selinux/config 2>/dev/null || true
+fi
+
+echo ">> Configuring firewalld (if present)..."
+if systemctl is-active --quiet firewalld; then
+    firewall-cmd --permanent --add-port=6443/tcp    # API server
+    firewall-cmd --permanent --add-port=2379-2380/tcp  # etcd
+    firewall-cmd --permanent --add-port=10250/tcp   # Kubelet API
+    firewall-cmd --permanent --add-port=10259/tcp   # kube-scheduler
+    firewall-cmd --permanent --add-port=10257/tcp   # kube-controller-manager
+    firewall-cmd --permanent --add-port=30000-32767/tcp  # NodePort
+    firewall-cmd --permanent --add-port=8472/udp    # Flannel VXLAN
+    firewall-cmd --reload
+fi
+
+# ── 2. Install CRI-O ─────────────────────────────────────────────────────
+echo ">> Installing CRI-O {profile.crio_version}..."
+
+OS="$(. /etc/os-release && echo "$ID")"
+VERSION_ID="$(. /etc/os-release && echo "$VERSION_ID")"
+
+if [[ "$OS" == "ubuntu" || "$OS" == "debian" ]]; then
+    apt-get update -y
+    apt-get install -y software-properties-common curl gnupg2
+
+    CRIO_VERSION="{profile.crio_version}"
+    curl -fsSL "https://pkgs.k8s.io/addons:/cri-o:/stable:/v$CRIO_VERSION/deb/Release.key" | \\
+        gpg --dearmor -o /etc/apt/keyrings/cri-o-apt-keyring.gpg
+    echo "deb [signed-by=/etc/apt/keyrings/cri-o-apt-keyring.gpg] https://pkgs.k8s.io/addons:/cri-o:/stable:/v$CRIO_VERSION/deb/ /" | \\
+        tee /etc/apt/sources.list.d/cri-o.list
+
+    apt-get update -y
+    apt-get install -y cri-o
+elif [[ "$OS" == "rhel" || "$OS" == "centos" || "$OS" == "rocky" || "$OS" == "almalinux" ]]; then
+    CRIO_VERSION="{profile.crio_version}"
+    cat > /etc/yum.repos.d/cri-o.repo <<REPO
+[cri-o]
+name=CRI-O
+baseurl=https://pkgs.k8s.io/addons:/cri-o:/stable:/v$CRIO_VERSION/rpm/
+enabled=1
+gpgcheck=1
+gpgkey=https://pkgs.k8s.io/addons:/cri-o:/stable:/v$CRIO_VERSION/rpm/repodata/repomd.xml.key
+REPO
+    dnf install -y cri-o
+fi
+
+systemctl daemon-reload
+systemctl enable --now crio
+echo ">> CRI-O installed and running."
+
+# ── 3. Install kubeadm, kubelet, kubectl ──────────────────────────────────
+echo ">> Installing Kubernetes {profile.kubernetes_version} components..."
+
+K8S_VERSION="{profile.kubernetes_version}"
+
+if [[ "$OS" == "ubuntu" || "$OS" == "debian" ]]; then
+    curl -fsSL "https://pkgs.k8s.io/core:/stable:/v$K8S_VERSION/deb/Release.key" | \\
+        gpg --dearmor -o /etc/apt/keyrings/kubernetes-apt-keyring.gpg
+    echo "deb [signed-by=/etc/apt/keyrings/kubernetes-apt-keyring.gpg] https://pkgs.k8s.io/core:/stable:/v$K8S_VERSION/deb/ /" | \\
+        tee /etc/apt/sources.list.d/kubernetes.list
+
+    apt-get update -y
+    apt-get install -y kubelet kubeadm kubectl
+    apt-mark hold kubelet kubeadm kubectl
+elif [[ "$OS" == "rhel" || "$OS" == "centos" || "$OS" == "rocky" || "$OS" == "almalinux" ]]; then
+    cat > /etc/yum.repos.d/kubernetes.repo <<REPO
+[kubernetes]
+name=Kubernetes
+baseurl=https://pkgs.k8s.io/core:/stable:/v$K8S_VERSION/rpm/
+enabled=1
+gpgcheck=1
+gpgkey=https://pkgs.k8s.io/core:/stable:/v$K8S_VERSION/rpm/repodata/repomd.xml.key
+REPO
+    dnf install -y kubelet kubeadm kubectl
+fi
+
+systemctl enable --now kubelet
+echo ">> Kubernetes components installed."
+
+echo "=== Common setup complete ==="
+"""
+
+
+def generate_control_plane_init_script(profile: ClusterProfile) -> str:
+    """Generate the kubeadm init script for the control-plane node."""
+    cp_nodes = profile.get_control_plane_nodes()
+    cp_ip = cp_nodes[0]["ip_address"] if cp_nodes else "CONTROL_PLANE_IP"
+
+    return f"""#!/bin/bash
+set -euo pipefail
+
+echo "=== Initializing Kubernetes Control Plane ==="
+
+# ── kubeadm init ──────────────────────────────────────────────────────────
+cat > /tmp/kubeadm-config.yaml <<EOF
+apiVersion: kubeadm.k8s.io/v1beta3
+kind: InitConfiguration
+localAPIEndpoint:
+  advertiseAddress: "{cp_ip}"
+  bindPort: 6443
+nodeRegistration:
+  criSocket: "unix:///var/run/crio/crio.sock"
+  kubeletExtraArgs:
+    container-runtime-endpoint: "unix:///var/run/crio/crio.sock"
+---
+apiVersion: kubeadm.k8s.io/v1beta3
+kind: ClusterConfiguration
+kubernetesVersion: "v{profile.kubernetes_version}.0"
+networking:
+  podSubnet: "{profile.pod_cidr}"
+  serviceSubnet: "{profile.service_cidr}"
+  dnsDomain: "{profile.dns_domain}"
+controlPlaneEndpoint: "{cp_ip}:6443"
+apiServer:
+  extraArgs:
+    authorization-mode: "Node,RBAC"
+    enable-admission-plugins: "NodeRestriction,PodSecurity"
+    audit-log-path: "/var/log/kubernetes/audit.log"
+    audit-log-maxage: "30"
+    audit-log-maxbackup: "10"
+    audit-log-maxsize: "100"
+  extraVolumes:
+  - name: audit-log
+    hostPath: "/var/log/kubernetes"
+    mountPath: "/var/log/kubernetes"
+    pathType: DirectoryOrCreate
+controllerManager:
+  extraArgs:
+    bind-address: "0.0.0.0"
+    terminated-pod-gc-threshold: "100"
+scheduler:
+  extraArgs:
+    bind-address: "0.0.0.0"
+etcd:
+  local:
+    extraArgs:
+      listen-metrics-urls: "http://0.0.0.0:2381"
+---
+apiVersion: kubelet.config.k8s.io/v1beta1
+kind: KubeletConfiguration
+cgroupDriver: systemd
+containerRuntimeEndpoint: "unix:///var/run/crio/crio.sock"
+evictionHard:
+  memory.available: "100Mi"
+  nodefs.available: "10%"
+  imagefs.available: "15%"
+EOF
+
+echo ">> Running kubeadm init..."
+kubeadm init --config=/tmp/kubeadm-config.yaml --upload-certs | tee /tmp/kubeadm-init.log
+
+# ── Configure kubectl for root ────────────────────────────────────────────
+echo ">> Configuring kubectl..."
+mkdir -p /root/.kube
+cp /etc/kubernetes/admin.conf /root/.kube/config
+chown root:root /root/.kube/config
+
+# ── Install Flannel CNI ───────────────────────────────────────────────────
+echo ">> Installing Flannel CNI..."
+kubectl apply -f https://github.com/flannel-io/flannel/releases/latest/download/kube-flannel.yml
+
+# Wait for Flannel to be ready
+echo ">> Waiting for Flannel pods to be ready..."
+kubectl -n kube-flannel wait --for=condition=ready pod -l app=flannel --timeout=120s || true
+
+# ── Apply Pod Security Standards ──────────────────────────────────────────
+echo ">> Applying Pod Security Standards ({profile.pod_security_standard})..."
+kubectl label namespace default \\
+    pod-security.kubernetes.io/enforce={profile.pod_security_standard} \\
+    pod-security.kubernetes.io/warn={profile.pod_security_standard} \\
+    pod-security.kubernetes.io/audit={profile.pod_security_standard} \\
+    --overwrite
+
+# ── Generate join command ─────────────────────────────────────────────────
+echo ">> Generating worker join command..."
+kubeadm token create --print-join-command > /tmp/kubeadm-join-command.txt
+echo "Join command saved to /tmp/kubeadm-join-command.txt"
+
+echo ""
+echo "=== Control Plane initialization complete ==="
+echo "Join command:"
+cat /tmp/kubeadm-join-command.txt
+"""
+
+
+def generate_worker_join_script() -> str:
+    """Generate the script that runs on worker nodes to join the cluster."""
+    return """#!/bin/bash
+set -euo pipefail
+
+echo "=== Joining Worker Node to Cluster ==="
+
+JOIN_COMMAND="$1"
+
+if [ -z "$JOIN_COMMAND" ]; then
+    echo "ERROR: Join command not provided."
+    echo "Usage: $0 '<kubeadm join command>'"
+    exit 1
+fi
+
+echo ">> Executing join command..."
+eval "$JOIN_COMMAND --cri-socket unix:///var/run/crio/crio.sock"
+
+echo "=== Worker node joined successfully ==="
+"""
+
+
+def generate_best_practices_script() -> str:
+    """Generate a post-install best practices hardening script."""
+    return """#!/bin/bash
+set -euo pipefail
+
+echo "=== Applying Kubernetes Best Practices ==="
+
+# ── Default Network Policy (deny-all) ────────────────────────────────────
+echo ">> Creating default-deny network policy for default namespace..."
+cat <<EOF | kubectl apply -f -
+apiVersion: networking.k8s.io/v1
+kind: NetworkPolicy
+metadata:
+  name: default-deny-all
+  namespace: default
+spec:
+  podSelector: {}
+  policyTypes:
+  - Ingress
+  - Egress
+EOF
+
+# ── Resource Quotas for default namespace ─────────────────────────────────
+echo ">> Setting resource quotas..."
+cat <<EOF | kubectl apply -f -
+apiVersion: v1
+kind: ResourceQuota
+metadata:
+  name: default-quota
+  namespace: default
+spec:
+  hard:
+    requests.cpu: "4"
+    requests.memory: 8Gi
+    limits.cpu: "8"
+    limits.memory: 16Gi
+    pods: "50"
+    services: "20"
+    persistentvolumeclaims: "10"
+EOF
+
+# ── Limit Ranges ──────────────────────────────────────────────────────────
+echo ">> Setting limit ranges..."
+cat <<EOF | kubectl apply -f -
+apiVersion: v1
+kind: LimitRange
+metadata:
+  name: default-limits
+  namespace: default
+spec:
+  limits:
+  - default:
+      cpu: "500m"
+      memory: "512Mi"
+    defaultRequest:
+      cpu: "100m"
+      memory: "128Mi"
+    type: Container
+EOF
+
+# ── RBAC: Create read-only ClusterRole ────────────────────────────────────
+echo ">> Creating read-only ClusterRole..."
+cat <<EOF | kubectl apply -f -
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: cluster-reader
+rules:
+- apiGroups: [""]
+  resources: ["pods", "services", "namespaces", "nodes", "events", "configmaps"]
+  verbs: ["get", "list", "watch"]
+- apiGroups: ["apps"]
+  resources: ["deployments", "replicasets", "statefulsets", "daemonsets"]
+  verbs: ["get", "list", "watch"]
+- apiGroups: ["networking.k8s.io"]
+  resources: ["networkpolicies", "ingresses"]
+  verbs: ["get", "list", "watch"]
+EOF
+
+# ── Enable audit logging directory ────────────────────────────────────────
+echo ">> Ensuring audit log directory exists..."
+mkdir -p /var/log/kubernetes
+
+echo "=== Best practices applied ==="
+echo ""
+echo "Summary of applied best practices:"
+echo "  - Default-deny NetworkPolicy in default namespace"
+echo "  - ResourceQuota for default namespace (CPU: 4/8, Memory: 8/16Gi)"
+echo "  - LimitRange with default container limits"
+echo "  - Read-only ClusterRole (cluster-reader)"
+echo "  - Audit logging directory configured"
+"""
+
+
+def provision_node_common(node: dict, profile: ClusterProfile) -> SSHResult:
+    """Run the common setup script on a single node via SSH."""
+    script = generate_common_setup_script(profile)
+    return run_ssh_command(
+        ip_address=node["ip_address"],
+        command=script,
+        ssh_user=node.get("ssh_user", "root"),
+        ssh_port=node.get("ssh_port", 22),
+        ssh_key_path=node.get("ssh_key_path", "~/.ssh/id_rsa"),
+        timeout=600,
+    )
+
+
+def init_control_plane(node: dict, profile: ClusterProfile) -> SSHResult:
+    """Initialize the control plane on the given node."""
+    script = generate_control_plane_init_script(profile)
+    return run_ssh_command(
+        ip_address=node["ip_address"],
+        command=script,
+        ssh_user=node.get("ssh_user", "root"),
+        ssh_port=node.get("ssh_port", 22),
+        ssh_key_path=node.get("ssh_key_path", "~/.ssh/id_rsa"),
+        timeout=600,
+    )
+
+
+def retrieve_join_command(control_plane_node: dict) -> Optional[str]:
+    """Retrieve the kubeadm join command from the control-plane node."""
+    result = run_ssh_command(
+        ip_address=control_plane_node["ip_address"],
+        command="cat /tmp/kubeadm-join-command.txt",
+        ssh_user=control_plane_node.get("ssh_user", "root"),
+        ssh_port=control_plane_node.get("ssh_port", 22),
+        ssh_key_path=control_plane_node.get("ssh_key_path", "~/.ssh/id_rsa"),
+        timeout=30,
+    )
+    if result.success:
+        return result.stdout.strip()
+    return None
+
+
+def join_worker_node(node: dict, join_command: str) -> SSHResult:
+    """Join a worker node to the cluster."""
+    full_command = f"{join_command} --cri-socket unix:///var/run/crio/crio.sock"
+    return run_ssh_command(
+        ip_address=node["ip_address"],
+        command=full_command,
+        ssh_user=node.get("ssh_user", "root"),
+        ssh_port=node.get("ssh_port", 22),
+        ssh_key_path=node.get("ssh_key_path", "~/.ssh/id_rsa"),
+        timeout=300,
+    )
+
+
+def apply_best_practices(control_plane_node: dict) -> SSHResult:
+    """Apply best practices hardening on the cluster via the control-plane."""
+    script = generate_best_practices_script()
+    return run_ssh_command(
+        ip_address=control_plane_node["ip_address"],
+        command=script,
+        ssh_user=control_plane_node.get("ssh_user", "root"),
+        ssh_port=control_plane_node.get("ssh_port", 22),
+        ssh_key_path=control_plane_node.get("ssh_key_path", "~/.ssh/id_rsa"),
+        timeout=120,
+    )
+
+
+def get_cluster_status(control_plane_node: dict) -> SSHResult:
+    """Get the cluster node status from the control-plane."""
+    return run_ssh_command(
+        ip_address=control_plane_node["ip_address"],
+        command="kubectl get nodes -o wide && echo '---' && kubectl get pods -A",
+        ssh_user=control_plane_node.get("ssh_user", "root"),
+        ssh_port=control_plane_node.get("ssh_port", 22),
+        ssh_key_path=control_plane_node.get("ssh_key_path", "~/.ssh/id_rsa"),
+        timeout=30,
+    )
+
+
+def get_llm_cluster_advice(profile: ClusterProfile, context: str = "") -> str:
+    """Ask the LLM for cluster setup advice based on the profile."""
+    nodes_desc = []
+    for n in profile.nodes:
+        nodes_desc.append(f"  - {n.get('hostname', 'unknown')} ({n['ip_address']}) — role: {n['role']}")
+    nodes_str = "\n".join(nodes_desc)
+
+    prompt = f"""I am setting up an on-premises Kubernetes cluster with the following configuration:
+
+- Kubernetes Version: {profile.kubernetes_version}
+- Container Runtime: CRI-O {profile.crio_version}
+- CNI Plugin: Flannel
+- Pod CIDR: {profile.pod_cidr}
+- Service CIDR: {profile.service_cidr}
+- Pod Security Standard: {profile.pod_security_standard}
+
+Nodes:
+{nodes_str}
+
+{context}
+
+Please review this configuration and provide:
+1. Any potential issues or conflicts
+2. Recommended optimizations
+3. Security hardening recommendations specific to this setup
+4. Network configuration tips for Flannel with CRI-O
+"""
+    return query_llm(prompt)
diff --git a/k8s-agent/modules/cluster_debugger.py b/k8s-agent/modules/cluster_debugger.py
new file mode 100644
index 0000000..9ee9dd1
--- /dev/null
+++ b/k8s-agent/modules/cluster_debugger.py
@@ -0,0 +1,228 @@
+"""Cluster Debugger — Diagnose K8s issues and provide LLM-powered recommendations."""
+
+from modules.cluster_creator import run_ssh_command, SSHResult
+from modules.llm_client import query_llm, stream_llm
+from modules.profile_manager import ClusterProfile
+
+
+# ── Diagnostic command definitions ────────────────────────────────────────
+
+DIAGNOSTIC_COMMANDS = {
+    "Node Status": "kubectl get nodes -o wide",
+    "Pod Status (All Namespaces)": "kubectl get pods -A -o wide",
+    "Events (Recent)": "kubectl get events -A --sort-by='.lastTimestamp' | tail -50",
+    "Component Status": "kubectl get componentstatuses 2>/dev/null; kubectl get --raw='/healthz?verbose' 2>/dev/null || true",
+    "System Pods": "kubectl -n kube-system get pods -o wide",
+    "Node Resources": "kubectl top nodes 2>/dev/null || echo 'metrics-server not installed'",
+    "Pod Resources": "kubectl top pods -A 2>/dev/null || echo 'metrics-server not installed'",
+    "Cluster Info": "kubectl cluster-info",
+    "CRI-O Status": "systemctl status crio --no-pager -l",
+    "Kubelet Status": "systemctl status kubelet --no-pager -l",
+    "Kubelet Logs (Recent)": "journalctl -u kubelet --no-pager -n 50",
+    "CRI-O Logs (Recent)": "journalctl -u crio --no-pager -n 50",
+    "Flannel Status": "kubectl -n kube-flannel get pods -o wide 2>/dev/null || kubectl -n kube-system get pods -l app=flannel -o wide 2>/dev/null || echo 'Flannel pods not found'",
+    "Network Policies": "kubectl get networkpolicies -A",
+    "Services": "kubectl get svc -A",
+    "PVCs": "kubectl get pvc -A",
+    "Ingresses": "kubectl get ingress -A 2>/dev/null || true",
+    "Disk Usage": "df -h / /var/lib/containers /var/lib/kubelet 2>/dev/null || df -h /",
+    "Memory Info": "free -h",
+    "DNS Resolution": "kubectl run dns-test --image=busybox:1.36 --rm -it --restart=Never -- nslookup kubernetes.default 2>/dev/null || echo 'DNS test skipped'",
+    "Certificate Expiry": "kubeadm certs check-expiration 2>/dev/null || echo 'Not a kubeadm node or kubeadm not found'",
+}
+
+CATEGORY_MAP = {
+    "Cluster Overview": [
+        "Node Status",
+        "Pod Status (All Namespaces)",
+        "Cluster Info",
+        "Component Status",
+    ],
+    "Pod & Workload Health": [
+        "Pod Status (All Namespaces)",
+        "System Pods",
+        "Events (Recent)",
+    ],
+    "Resource Usage": [
+        "Node Resources",
+        "Pod Resources",
+        "Disk Usage",
+        "Memory Info",
+    ],
+    "Networking": [
+        "Flannel Status",
+        "Network Policies",
+        "Services",
+        "Ingresses",
+        "DNS Resolution",
+    ],
+    "Container Runtime & Kubelet": [
+        "CRI-O Status",
+        "Kubelet Status",
+        "CRI-O Logs (Recent)",
+        "Kubelet Logs (Recent)",
+    ],
+    "Security & Certificates": [
+        "Certificate Expiry",
+        "Network Policies",
+    ],
+    "Storage": [
+        "PVCs",
+        "Disk Usage",
+    ],
+}
+
+
+def run_diagnostic(
+    control_plane_node: dict,
+    command_name: str,
+) -> SSHResult:
+    """Run a single diagnostic command on the control-plane node."""
+    command = DIAGNOSTIC_COMMANDS.get(command_name)
+    if not command:
+        return SSHResult(
+            hostname=control_plane_node["ip_address"],
+            command=command_name,
+            return_code=1,
+            stdout="",
+            stderr=f"Unknown diagnostic command: {command_name}",
+            success=False,
+        )
+    return run_ssh_command(
+        ip_address=control_plane_node["ip_address"],
+        command=command,
+        ssh_user=control_plane_node.get("ssh_user", "root"),
+        ssh_port=control_plane_node.get("ssh_port", 22),
+        ssh_key_path=control_plane_node.get("ssh_key_path", "~/.ssh/id_rsa"),
+        timeout=60,
+    )
+
+
+def run_category_diagnostics(
+    control_plane_node: dict,
+    category: str,
+) -> dict[str, SSHResult]:
+    """Run all diagnostic commands for a given category."""
+    results = {}
+    command_names = CATEGORY_MAP.get(category, [])
+    for name in command_names:
+        results[name] = run_diagnostic(control_plane_node, name)
+    return results
+
+
+def run_all_diagnostics(control_plane_node: dict) -> dict[str, SSHResult]:
+    """Run every diagnostic command."""
+    results = {}
+    for name in DIAGNOSTIC_COMMANDS:
+        results[name] = run_diagnostic(control_plane_node, name)
+    return results
+
+
+def run_custom_command(
+    control_plane_node: dict,
+    command: str,
+) -> SSHResult:
+    """Run a custom command on the control-plane node."""
+    return run_ssh_command(
+        ip_address=control_plane_node["ip_address"],
+        command=command,
+        ssh_user=control_plane_node.get("ssh_user", "root"),
+        ssh_port=control_plane_node.get("ssh_port", 22),
+        ssh_key_path=control_plane_node.get("ssh_key_path", "~/.ssh/id_rsa"),
+        timeout=60,
+    )
+
+
+def format_diagnostics_for_llm(results: dict[str, SSHResult]) -> str:
+    """Format diagnostic results into a text block for the LLM."""
+    sections = []
+    for name, result in results.items():
+        status = "OK" if result.success else "FAILED"
+        output = result.stdout if result.success else result.stderr
+        sections.append(
+            f"### {name} [{status}]\n"
+            f"```\n{output.strip()}\n```\n"
+        )
+    return "\n".join(sections)
+
+
+def analyze_diagnostics(
+    results: dict[str, SSHResult],
+    user_description: str = "",
+    profile: ClusterProfile | None = None,
+) -> str:
+    """Send diagnostic results to the LLM for analysis and recommendations."""
+    diag_text = format_diagnostics_for_llm(results)
+
+    cluster_info = ""
+    if profile:
+        cluster_info = f"""
+Cluster Configuration:
+- Kubernetes: {profile.kubernetes_version}
+- Runtime: CRI-O {profile.crio_version}
+- CNI: Flannel
+- Pod CIDR: {profile.pod_cidr}
+- Service CIDR: {profile.service_cidr}
+"""
+
+    prompt = f"""Analyze the following Kubernetes cluster diagnostic output and provide a detailed assessment.
+{cluster_info}
+
+User's issue description: {user_description or 'General health check'}
+
+== Diagnostic Output ==
+{diag_text}
+== End Diagnostic Output ==
+
+Please provide:
+1. **Health Summary**: Overall cluster health status (Healthy / Degraded / Critical)
+2. **Issues Found**: List each issue with severity (Critical / Warning / Info)
+3. **Root Cause Analysis**: For each issue, explain the likely root cause
+4. **Remediation Steps**: Specific commands or actions to fix each issue
+5. **Preventive Recommendations**: Steps to prevent these issues in the future
+
+Format your response with clear headings and actionable commands where applicable.
+"""
+    return query_llm(prompt)
+
+
+def get_debug_suggestion(
+    error_message: str,
+    context: str = "",
+) -> str:
+    """Get a quick debugging suggestion from the LLM for a specific error."""
+    prompt = f"""I encountered the following error in my Kubernetes cluster (CRI-O + Flannel):
+
+Error: {error_message}
+
+Additional context: {context or 'None'}
+
+Provide a concise diagnosis and the exact commands to fix this issue.
+"""
+    return query_llm(prompt)
+
+
+def check_pod_issues(control_plane_node: dict, namespace: str = "") -> SSHResult:
+    """Check for pods in non-running states."""
+    ns_flag = f"-n {namespace}" if namespace else "-A"
+    command = (
+        f"kubectl get pods {ns_flag} --field-selector="
+        "'status.phase!=Running,status.phase!=Succeeded' -o wide 2>/dev/null; "
+        f"echo '---DESCRIBE---'; "
+        f"for pod in $(kubectl get pods {ns_flag} --field-selector="
+        "'status.phase!=Running,status.phase!=Succeeded' "
+        "-o jsonpath='{range .items[*]}{.metadata.namespace}/{.metadata.name} {end}' 2>/dev/null); do "
+        "ns=$(echo $pod | cut -d/ -f1); "
+        "name=$(echo $pod | cut -d/ -f2); "
+        "echo \"=== $ns/$name ===\"; "
+        "kubectl describe pod $name -n $ns 2>/dev/null | tail -20; "
+        "done"
+    )
+    return run_ssh_command(
+        ip_address=control_plane_node["ip_address"],
+        command=command,
+        ssh_user=control_plane_node.get("ssh_user", "root"),
+        ssh_port=control_plane_node.get("ssh_port", 22),
+        ssh_key_path=control_plane_node.get("ssh_key_path", "~/.ssh/id_rsa"),
+        timeout=60,
+    )
diff --git a/k8s-agent/modules/llm_client.py b/k8s-agent/modules/llm_client.py
new file mode 100644
index 0000000..2b7eb44
--- /dev/null
+++ b/k8s-agent/modules/llm_client.py
@@ -0,0 +1,145 @@
+"""LLM client for the Infosys AI Gateway."""
+
+import json
+from typing import Generator, Optional
+
+import requests
+
+import config
+
+
+SYSTEM_PROMPT = """You are an expert Kubernetes platform engineer specializing in on-premises
+cluster administration. You have deep knowledge of:
+- Kubernetes cluster setup with CRI-O container runtime and Flannel CNI
+- kubeadm-based cluster bootstrapping and lifecycle management
+- Cluster debugging, troubleshooting, and remediation
+- Prometheus and Grafana monitoring stack setup and dashboard design
+- Kubernetes log analysis, error correlation, and root cause analysis
+- Security best practices including RBAC, network policies, and pod security standards
+
+Always provide actionable, production-ready advice. When generating scripts, include
+error handling and idempotency. When diagnosing issues, ask clarifying questions if
+the provided information is insufficient."""
+
+
+def query_llm(
+    user_message: str,
+    system_message: Optional[str] = None,
+    conversation_history: Optional[list[dict]] = None,
+    temperature: Optional[float] = None,
+    max_tokens: Optional[int] = None,
+) -> str:
+    """Send a query to the LLM and return the response text.
+
+    Args:
+        user_message: The user's message/query.
+        system_message: Optional system prompt override.
+        conversation_history: Optional list of prior messages for context.
+        temperature: Optional temperature override.
+        max_tokens: Optional max tokens override.
+
+    Returns:
+        The assistant's response text.
+    """
+    messages = []
+
+    sys_msg = system_message or SYSTEM_PROMPT
+    messages.append({"role": "system", "content": sys_msg})
+
+    if conversation_history:
+        messages.extend(conversation_history)
+
+    messages.append({"role": "user", "content": user_message})
+
+    headers = {
+        "Content-Type": "application/json",
+        "Authorization": f"Bearer {config.LLM_API_KEY}",
+    }
+
+    payload = {
+        "model": config.LLM_MODEL,
+        "messages": messages,
+        "temperature": temperature if temperature is not None else config.LLM_TEMPERATURE,
+        "max_tokens": max_tokens if max_tokens is not None else config.LLM_MAX_TOKENS,
+    }
+
+    try:
+        response = requests.post(
+            config.LLM_API_URL,
+            headers=headers,
+            json=payload,
+            timeout=120,
+        )
+        response.raise_for_status()
+        data = response.json()
+        return data["choices"][0]["message"]["content"]
+    except requests.exceptions.Timeout:
+        return "Error: LLM request timed out. Please try again."
+    except requests.exceptions.ConnectionError:
+        return "Error: Could not connect to the LLM endpoint. Please check your network and LLM_API_URL configuration."
+    except requests.exceptions.HTTPError as exc:
+        return f"Error: LLM API returned HTTP {exc.response.status_code}: {exc.response.text}"
+    except (KeyError, IndexError, json.JSONDecodeError) as exc:
+        return f"Error: Unexpected LLM response format: {exc}"
+
+
+def stream_llm(
+    user_message: str,
+    system_message: Optional[str] = None,
+    conversation_history: Optional[list[dict]] = None,
+    temperature: Optional[float] = None,
+    max_tokens: Optional[int] = None,
+) -> Generator[str, None, None]:
+    """Stream a response from the LLM token-by-token.
+
+    Yields chunks of text as they arrive from the API.
+    """
+    messages = []
+
+    sys_msg = system_message or SYSTEM_PROMPT
+    messages.append({"role": "system", "content": sys_msg})
+
+    if conversation_history:
+        messages.extend(conversation_history)
+
+    messages.append({"role": "user", "content": user_message})
+
+    headers = {
+        "Content-Type": "application/json",
+        "Authorization": f"Bearer {config.LLM_API_KEY}",
+    }
+
+    payload = {
+        "model": config.LLM_MODEL,
+        "messages": messages,
+        "temperature": temperature if temperature is not None else config.LLM_TEMPERATURE,
+        "max_tokens": max_tokens if max_tokens is not None else config.LLM_MAX_TOKENS,
+        "stream": True,
+    }
+
+    try:
+        response = requests.post(
+            config.LLM_API_URL,
+            headers=headers,
+            json=payload,
+            timeout=120,
+            stream=True,
+        )
+        response.raise_for_status()
+
+        for line in response.iter_lines(decode_unicode=True):
+            if not line or not line.startswith("data: "):
+                continue
+            data_str = line[len("data: "):]
+            if data_str.strip() == "[DONE]":
+                break
+            try:
+                chunk = json.loads(data_str)
+                delta = chunk.get("choices", [{}])[0].get("delta", {})
+                content = delta.get("content", "")
+                if content:
+                    yield content
+            except (json.JSONDecodeError, KeyError, IndexError):
+                continue
+    except requests.exceptions.RequestException as exc:
+        yield f"\n\nError during streaming: {exc}"
diff --git a/k8s-agent/modules/log_analyzer.py b/k8s-agent/modules/log_analyzer.py
new file mode 100644
index 0000000..9abc838
--- /dev/null
+++ b/k8s-agent/modules/log_analyzer.py
@@ -0,0 +1,345 @@
+"""Log Analyzer — Kubernetes log collection, parsing, error correlation, and analysis."""
+
+import re
+from collections import Counter
+from dataclasses import dataclass, field
+from typing import Optional
+
+from modules.cluster_creator import run_ssh_command, SSHResult
+from modules.llm_client import query_llm
+
+
+@dataclass
+class LogEntry:
+    """Represents a parsed log line."""
+
+    timestamp: str = ""
+    level: str = "INFO"
+    source: str = ""
+    message: str = ""
+    raw: str = ""
+
+
+@dataclass
+class LogAnalysisResult:
+    """Results from log analysis."""
+
+    total_lines: int = 0
+    error_count: int = 0
+    warning_count: int = 0
+    error_patterns: dict[str, int] = field(default_factory=dict)
+    warning_patterns: dict[str, int] = field(default_factory=dict)
+    timeline: list[dict] = field(default_factory=list)
+    correlated_errors: list[dict] = field(default_factory=list)
+
+
+# ── Log collection commands ───────────────────────────────────────────────
+
+LOG_SOURCES = {
+    "Kubelet": "journalctl -u kubelet --no-pager -n {lines} --since '{since}'",
+    "CRI-O": "journalctl -u crio --no-pager -n {lines} --since '{since}'",
+    "API Server": "kubectl logs -n kube-system -l component=kube-apiserver --tail={lines} --since={since_k8s}",
+    "Controller Manager": "kubectl logs -n kube-system -l component=kube-controller-manager --tail={lines} --since={since_k8s}",
+    "Scheduler": "kubectl logs -n kube-system -l component=kube-scheduler --tail={lines} --since={since_k8s}",
+    "CoreDNS": "kubectl logs -n kube-system -l k8s-app=kube-dns --tail={lines} --since={since_k8s}",
+    "Flannel": "kubectl logs -n kube-flannel -l app=flannel --tail={lines} --since={since_k8s} 2>/dev/null || kubectl logs -n kube-system -l app=flannel --tail={lines} --since={since_k8s} 2>/dev/null || echo 'Flannel logs not found'",
+    "etcd": "kubectl logs -n kube-system -l component=etcd --tail={lines} --since={since_k8s}",
+    "Events": "kubectl get events -A --sort-by='.lastTimestamp' | tail -{lines}",
+}
+
+POD_LOG_COMMAND = "kubectl logs {pod_ref} --tail={lines} --since={since_k8s} {container_flag}"
+POD_PREVIOUS_LOG_COMMAND = "kubectl logs {pod_ref} --previous --tail={lines} {container_flag} 2>/dev/null || echo 'No previous logs available'"
+
+
+def collect_logs(
+    control_plane_node: dict,
+    source: str,
+    lines: int = 200,
+    since: str = "1 hour ago",
+    since_k8s: str = "1h",
+) -> SSHResult:
+    """Collect logs from a specific source on the cluster."""
+    cmd_template = LOG_SOURCES.get(source)
+    if not cmd_template:
+        return SSHResult(
+            hostname=control_plane_node["ip_address"],
+            command=source,
+            return_code=1,
+            stdout="",
+            stderr=f"Unknown log source: {source}",
+            success=False,
+        )
+
+    command = cmd_template.format(
+        lines=lines,
+        since=since,
+        since_k8s=since_k8s,
+    )
+
+    return run_ssh_command(
+        ip_address=control_plane_node["ip_address"],
+        command=command,
+        ssh_user=control_plane_node.get("ssh_user", "root"),
+        ssh_port=control_plane_node.get("ssh_port", 22),
+        ssh_key_path=control_plane_node.get("ssh_key_path", "~/.ssh/id_rsa"),
+        timeout=60,
+    )
+
+
+def collect_pod_logs(
+    control_plane_node: dict,
+    namespace: str,
+    pod_name: str,
+    container: str = "",
+    lines: int = 200,
+    since_k8s: str = "1h",
+    previous: bool = False,
+) -> SSHResult:
+    """Collect logs from a specific pod."""
+    pod_ref = f"-n {namespace} {pod_name}"
+    container_flag = f"-c {container}" if container else ""
+
+    if previous:
+        command = POD_PREVIOUS_LOG_COMMAND.format(
+            pod_ref=pod_ref,
+            lines=lines,
+            container_flag=container_flag,
+        )
+    else:
+        command = POD_LOG_COMMAND.format(
+            pod_ref=pod_ref,
+            lines=lines,
+            since_k8s=since_k8s,
+            container_flag=container_flag,
+        )
+
+    return run_ssh_command(
+        ip_address=control_plane_node["ip_address"],
+        command=command,
+        ssh_user=control_plane_node.get("ssh_user", "root"),
+        ssh_port=control_plane_node.get("ssh_port", 22),
+        ssh_key_path=control_plane_node.get("ssh_key_path", "~/.ssh/id_rsa"),
+        timeout=60,
+    )
+
+
+def collect_multi_source_logs(
+    control_plane_node: dict,
+    sources: list[str],
+    lines: int = 100,
+    since: str = "1 hour ago",
+    since_k8s: str = "1h",
+) -> dict[str, SSHResult]:
+    """Collect logs from multiple sources."""
+    results = {}
+    for source in sources:
+        results[source] = collect_logs(
+            control_plane_node, source, lines, since, since_k8s
+        )
+    return results
+
+
+# ── Log parsing ───────────────────────────────────────────────────────────
+
+ERROR_PATTERNS = [
+    re.compile(r"\b(?:error|err|fatal|panic|fail(?:ed|ure)?)\b", re.IGNORECASE),
+]
+
+WARNING_PATTERNS = [
+    re.compile(r"\b(?:warn(?:ing)?|deprecated)\b", re.IGNORECASE),
+]
+
+TIMESTAMP_PATTERNS = [
+    re.compile(r"(\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}:\d{2})"),
+    re.compile(r"([A-Z][a-z]{2}\s+\d{1,2}\s+\d{2}:\d{2}:\d{2})"),
+]
+
+
+def parse_log_line(line: str, source: str = "") -> LogEntry:
+    """Parse a single log line into a LogEntry."""
+    entry = LogEntry(raw=line, source=source)
+
+    for pattern in TIMESTAMP_PATTERNS:
+        match = pattern.search(line)
+        if match:
+            entry.timestamp = match.group(1)
+            break
+
+    for pattern in ERROR_PATTERNS:
+        if pattern.search(line):
+            entry.level = "ERROR"
+            break
+    else:
+        for pattern in WARNING_PATTERNS:
+            if pattern.search(line):
+                entry.level = "WARNING"
+                break
+
+    entry.message = line.strip()
+    return entry
+
+
+def analyze_logs(log_text: str, source: str = "") -> LogAnalysisResult:
+    """Analyze a block of log text and extract patterns."""
+    result = LogAnalysisResult()
+    lines = log_text.strip().split("\n")
+    result.total_lines = len(lines)
+
+    error_messages = []
+    warning_messages = []
+
+    for line in lines:
+        if not line.strip():
+            continue
+        entry = parse_log_line(line, source)
+
+        if entry.level == "ERROR":
+            result.error_count += 1
+            normalized = _normalize_error(entry.message)
+            error_messages.append(normalized)
+        elif entry.level == "WARNING":
+            result.warning_count += 1
+            normalized = _normalize_error(entry.message)
+            warning_messages.append(normalized)
+
+    result.error_patterns = dict(Counter(error_messages).most_common(20))
+    result.warning_patterns = dict(Counter(warning_messages).most_common(20))
+
+    return result
+
+
+def _normalize_error(message: str) -> str:
+    """Normalize an error message by removing variable parts for grouping."""
+    normalized = re.sub(r"\b[0-9a-f]{8,}\b", "<ID>", message)
+    normalized = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}(:\d+)?", "<IP>", normalized)
+    normalized = re.sub(r"\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}:\d{2}[^\s]*", "<TS>", normalized)
+    normalized = re.sub(r"pod/[\w-]+", "pod/<NAME>", normalized)
+    normalized = re.sub(r"node/[\w.-]+", "node/<NAME>", normalized)
+    if len(normalized) > 150:
+        normalized = normalized[:150] + "..."
+    return normalized
+
+
+def correlate_errors(
+    multi_source_results: dict[str, SSHResult],
+) -> list[dict]:
+    """Correlate errors across multiple log sources to find related issues."""
+    all_errors = []
+
+    for source, result in multi_source_results.items():
+        if not result.success:
+            continue
+        for line in result.stdout.split("\n"):
+            entry = parse_log_line(line, source)
+            if entry.level == "ERROR":
+                all_errors.append({
+                    "source": source,
+                    "timestamp": entry.timestamp,
+                    "message": entry.message,
+                })
+
+    all_errors.sort(key=lambda e: e.get("timestamp", ""))
+
+    correlated = []
+    window_seconds = 30
+    used = set()
+
+    for i, err in enumerate(all_errors):
+        if i in used:
+            continue
+        group = [err]
+        used.add(i)
+
+        for j in range(i + 1, len(all_errors)):
+            if j in used:
+                continue
+            if all_errors[j].get("source") != err.get("source"):
+                group.append(all_errors[j])
+                used.add(j)
+
+        if len(group) > 1:
+            correlated.append({
+                "primary": err,
+                "related": group[1:],
+                "sources_involved": list({e["source"] for e in group}),
+            })
+
+    return correlated
+
+
+# ── LLM-powered analysis ─────────────────────────────────────────────────
+
+def llm_analyze_logs(
+    log_text: str,
+    source: str = "",
+    context: str = "",
+) -> str:
+    """Send log output to the LLM for deep analysis."""
+    truncated = log_text[-8000:] if len(log_text) > 8000 else log_text
+
+    prompt = f"""Analyze the following Kubernetes logs and provide a detailed assessment.
+
+Log Source: {source or 'Multiple sources'}
+Context: {context or 'General analysis'}
+
+== Log Output ==
+{truncated}
+== End Log Output ==
+
+Please provide:
+1. **Error Summary**: List all distinct errors found with frequency
+2. **Root Cause Analysis**: For each error pattern, explain the likely root cause
+3. **Error Correlation**: Identify errors that are likely related / cascading
+4. **Impact Assessment**: What is the impact of these errors on the cluster?
+5. **Remediation Steps**: Specific commands to fix each issue
+6. **Patterns & Trends**: Any concerning patterns (increasing errors, recurring issues)
+"""
+    return query_llm(prompt)
+
+
+def llm_correlate_analysis(
+    multi_source_logs: dict[str, str],
+    issue_description: str = "",
+) -> str:
+    """Send logs from multiple sources to the LLM for cross-source correlation."""
+    log_sections = []
+    for source, log_text in multi_source_logs.items():
+        truncated = log_text[-3000:] if len(log_text) > 3000 else log_text
+        log_sections.append(f"### {source}\n```\n{truncated}\n```\n")
+
+    all_logs = "\n".join(log_sections)
+
+    prompt = f"""Perform a cross-source correlation analysis on these Kubernetes cluster logs.
+
+Issue Description: {issue_description or 'General health analysis'}
+
+== Multi-Source Logs ==
+{all_logs}
+== End Logs ==
+
+Please provide:
+1. **Cross-Source Correlation**: Identify errors that appear related across different components
+2. **Causal Chain**: Determine the sequence of events / root cause chain
+3. **Timeline Reconstruction**: Reconstruct what happened based on timestamps
+4. **Root Cause**: Identify the single most likely root cause
+5. **Remediation Plan**: Step-by-step plan to resolve the issue
+6. **Monitoring Recommendations**: What alerts/metrics should be added to catch this earlier
+"""
+    return query_llm(prompt)
+
+
+def get_pod_list(
+    control_plane_node: dict,
+    namespace: str = "",
+) -> SSHResult:
+    """Get list of pods for the log analysis UI."""
+    ns_flag = f"-n {namespace}" if namespace else "-A"
+    command = f"kubectl get pods {ns_flag} -o custom-columns='NAMESPACE:.metadata.namespace,NAME:.metadata.name,STATUS:.status.phase,CONTAINERS:.spec.containers[*].name' --no-headers"
+    return run_ssh_command(
+        ip_address=control_plane_node["ip_address"],
+        command=command,
+        ssh_user=control_plane_node.get("ssh_user", "root"),
+        ssh_port=control_plane_node.get("ssh_port", 22),
+        ssh_key_path=control_plane_node.get("ssh_key_path", "~/.ssh/id_rsa"),
+        timeout=30,
+    )
diff --git a/k8s-agent/modules/monitoring_setup.py b/k8s-agent/modules/monitoring_setup.py
new file mode 100644
index 0000000..fdf8b42
--- /dev/null
+++ b/k8s-agent/modules/monitoring_setup.py
@@ -0,0 +1,440 @@
+"""Monitoring Setup — Prometheus, Grafana, and dashboard provisioning via SSH."""
+
+from modules.cluster_creator import run_ssh_command, SSHResult
+from modules.llm_client import query_llm
+from modules.profile_manager import ClusterProfile
+
+
+def generate_helm_install_script() -> str:
+    """Generate script to install Helm on the control-plane node."""
+    return """#!/bin/bash
+set -euo pipefail
+
+echo "=== Installing Helm ==="
+
+if command -v helm &>/dev/null; then
+    echo "Helm already installed: $(helm version --short)"
+else
+    curl -fsSL https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
+    echo "Helm installed: $(helm version --short)"
+fi
+"""
+
+
+def generate_prometheus_install_script(namespace: str = "monitoring") -> str:
+    """Generate script to install kube-prometheus-stack via Helm."""
+    return f"""#!/bin/bash
+set -euo pipefail
+
+echo "=== Installing Prometheus Stack ==="
+
+# Add Helm repos
+helm repo add prometheus-community https://prometheus-community.github.io/helm-charts 2>/dev/null || true
+helm repo update
+
+# Create namespace
+kubectl create namespace {namespace} --dry-run=client -o yaml | kubectl apply -f -
+
+# Install kube-prometheus-stack
+helm upgrade --install prometheus prometheus-community/kube-prometheus-stack \\
+    --namespace {namespace} \\
+    --set prometheus.prometheusSpec.retention=15d \\
+    --set prometheus.prometheusSpec.resources.requests.memory=512Mi \\
+    --set prometheus.prometheusSpec.resources.requests.cpu=250m \\
+    --set prometheus.prometheusSpec.resources.limits.memory=2Gi \\
+    --set prometheus.prometheusSpec.resources.limits.cpu=1000m \\
+    --set prometheus.prometheusSpec.storageSpec.volumeClaimTemplate.spec.accessModes[0]=ReadWriteOnce \\
+    --set prometheus.prometheusSpec.storageSpec.volumeClaimTemplate.spec.resources.requests.storage=50Gi \\
+    --set alertmanager.alertmanagerSpec.resources.requests.memory=128Mi \\
+    --set alertmanager.alertmanagerSpec.resources.requests.cpu=50m \\
+    --set grafana.enabled=true \\
+    --set grafana.adminPassword=admin \\
+    --set grafana.persistence.enabled=true \\
+    --set grafana.persistence.size=10Gi \\
+    --set grafana.resources.requests.memory=256Mi \\
+    --set grafana.resources.requests.cpu=100m \\
+    --set grafana.resources.limits.memory=512Mi \\
+    --set grafana.resources.limits.cpu=500m \\
+    --set grafana.sidecar.dashboards.enabled=true \\
+    --set grafana.sidecar.dashboards.searchNamespace=ALL \\
+    --set prometheus.prometheusSpec.serviceMonitorSelectorNilUsesHelmValues=false \\
+    --set prometheus.prometheusSpec.podMonitorSelectorNilUsesHelmValues=false \\
+    --wait --timeout 10m
+
+echo ""
+echo "=== Prometheus Stack installed ==="
+echo ""
+kubectl -n {namespace} get pods
+"""
+
+
+def generate_standalone_grafana_script(namespace: str = "monitoring") -> str:
+    """Generate script to install standalone Grafana with provisioned dashboards."""
+    return f"""#!/bin/bash
+set -euo pipefail
+
+echo "=== Installing Standalone Grafana ==="
+
+helm repo add grafana https://grafana.github.io/helm-charts 2>/dev/null || true
+helm repo update
+
+kubectl create namespace {namespace} --dry-run=client -o yaml | kubectl apply -f -
+
+helm upgrade --install grafana grafana/grafana \\
+    --namespace {namespace} \\
+    --set adminPassword=admin \\
+    --set persistence.enabled=true \\
+    --set persistence.size=10Gi \\
+    --set resources.requests.memory=256Mi \\
+    --set resources.requests.cpu=100m \\
+    --set resources.limits.memory=512Mi \\
+    --set resources.limits.cpu=500m \\
+    --set sidecar.dashboards.enabled=true \\
+    --set sidecar.dashboards.searchNamespace=ALL \\
+    --set sidecar.datasources.enabled=true \\
+    --set 'datasources.datasources\\.yaml.apiVersion=1' \\
+    --set 'datasources.datasources\\.yaml.datasources[0].name=Prometheus' \\
+    --set 'datasources.datasources\\.yaml.datasources[0].type=prometheus' \\
+    --set 'datasources.datasources\\.yaml.datasources[0].url=http://prometheus-kube-prometheus-prometheus.{namespace}.svc:9090' \\
+    --set 'datasources.datasources\\.yaml.datasources[0].access=proxy' \\
+    --set 'datasources.datasources\\.yaml.datasources[0].isDefault=true' \\
+    --wait --timeout 5m
+
+echo ""
+echo "=== Grafana installed ==="
+kubectl -n {namespace} get pods -l app.kubernetes.io/name=grafana
+"""
+
+
+GRAFANA_DASHBOARDS = {
+    "cluster-overview": {
+        "name": "Kubernetes Cluster Overview",
+        "description": "Overall cluster health, node status, resource utilization",
+        "gnet_id": 15520,
+    },
+    "node-exporter": {
+        "name": "Node Exporter Full",
+        "description": "Detailed node metrics — CPU, memory, disk, network",
+        "gnet_id": 1860,
+    },
+    "pod-monitoring": {
+        "name": "Kubernetes Pods",
+        "description": "Pod-level CPU, memory, network, restarts",
+        "gnet_id": 15760,
+    },
+    "namespace-resources": {
+        "name": "Namespace Resources",
+        "description": "Resource usage per namespace with quota tracking",
+        "gnet_id": 15758,
+    },
+    "coredns": {
+        "name": "CoreDNS",
+        "description": "DNS query rates, latency, errors",
+        "gnet_id": 15762,
+    },
+    "etcd": {
+        "name": "etcd",
+        "description": "etcd cluster health, leader changes, WAL sync duration",
+        "gnet_id": 3070,
+    },
+    "api-server": {
+        "name": "Kubernetes API Server",
+        "description": "API server request rates, latency, errors",
+        "gnet_id": 15761,
+    },
+    "persistent-volumes": {
+        "name": "Persistent Volumes",
+        "description": "PV/PVC usage and capacity tracking",
+        "gnet_id": 13646,
+    },
+}
+
+
+def generate_dashboard_import_script(
+    dashboard_keys: list[str],
+    namespace: str = "monitoring",
+) -> str:
+    """Generate script to import Grafana dashboards as ConfigMaps."""
+    configmaps = []
+    for key in dashboard_keys:
+        dash = GRAFANA_DASHBOARDS.get(key)
+        if not dash:
+            continue
+        configmaps.append(f"""
+# Import: {dash['name']}
+cat <<'DASHEOF' | kubectl apply -f -
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: grafana-dashboard-{key}
+  namespace: {namespace}
+  labels:
+    grafana_dashboard: "1"
+data:
+  {key}.json: |
+    {{
+      "annotations": {{"list": []}},
+      "description": "{dash['description']}",
+      "editable": true,
+      "gnetId": {dash['gnet_id']},
+      "title": "{dash['name']}",
+      "uid": "{key}",
+      "version": 1,
+      "__inputs": [
+        {{
+          "name": "DS_PROMETHEUS",
+          "label": "Prometheus",
+          "type": "datasource",
+          "pluginId": "prometheus"
+        }}
+      ]
+    }}
+DASHEOF
+echo "  Imported: {dash['name']} (grafana.net #{dash['gnet_id']})"
+""")
+
+    script_body = "\n".join(configmaps)
+
+    return f"""#!/bin/bash
+set -euo pipefail
+
+echo "=== Importing Grafana Dashboards ==="
+{script_body}
+
+# Also import from grafana.net directly via Grafana API
+GRAFANA_POD=$(kubectl -n {namespace} get pods -l app.kubernetes.io/name=grafana -o jsonpath='{{.items[0].metadata.name}}' 2>/dev/null || echo "")
+
+if [ -n "$GRAFANA_POD" ]; then
+    echo ""
+    echo ">> Importing full dashboards from grafana.net via API..."
+    kubectl -n {namespace} port-forward "$GRAFANA_POD" 3000:3000 &
+    PF_PID=$!
+    sleep 3
+
+    for gnet_id in {' '.join(str(GRAFANA_DASHBOARDS[k]['gnet_id']) for k in dashboard_keys if k in GRAFANA_DASHBOARDS)}; do
+        curl -s -X POST http://localhost:3000/api/dashboards/import \\
+            -H "Content-Type: application/json" \\
+            -u admin:admin \\
+            -d "{{
+                \\"dashboard\\": {{\\"id\\": null}},
+                \\"overwrite\\": true,
+                \\"inputs\\": [{{\\"name\\": \\"DS_PROMETHEUS\\", \\"type\\": \\"datasource\\", \\"pluginId\\": \\"prometheus\\", \\"value\\": \\"Prometheus\\"}}],
+                \\"folderId\\": 0,
+                \\"gnetId\\": $gnet_id
+            }}" 2>/dev/null && echo "  Imported grafana.net #$gnet_id" || echo "  Failed grafana.net #$gnet_id (non-critical)"
+    done
+
+    kill $PF_PID 2>/dev/null || true
+fi
+
+echo ""
+echo "=== Dashboard import complete ==="
+"""
+
+
+def generate_alerting_rules_script(namespace: str = "monitoring") -> str:
+    """Generate PrometheusRule resources for common K8s alerts."""
+    return f"""#!/bin/bash
+set -euo pipefail
+
+echo "=== Installing Alert Rules ==="
+
+cat <<'EOF' | kubectl apply -f -
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: k8s-cluster-alerts
+  namespace: {namespace}
+  labels:
+    release: prometheus
+spec:
+  groups:
+  - name: k8s-node-alerts
+    rules:
+    - alert: NodeNotReady
+      expr: kube_node_status_condition{{condition="Ready",status="true"}} == 0
+      for: 5m
+      labels:
+        severity: critical
+      annotations:
+        summary: "Node {{{{ $labels.node }}}} is not ready"
+        description: "Node has been in NotReady state for more than 5 minutes."
+    - alert: NodeHighCPU
+      expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{{mode="idle"}}[5m])) * 100) > 85
+      for: 10m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Node {{{{ $labels.instance }}}} has high CPU usage"
+        description: "CPU usage is above 85% for more than 10 minutes."
+    - alert: NodeHighMemory
+      expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 > 85
+      for: 10m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Node {{{{ $labels.instance }}}} has high memory usage"
+        description: "Memory usage is above 85% for more than 10 minutes."
+    - alert: NodeDiskPressure
+      expr: (1 - node_filesystem_avail_bytes{{mountpoint="/"}} / node_filesystem_size_bytes{{mountpoint="/"}}) * 100 > 85
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Node {{{{ $labels.instance }}}} disk usage is high"
+        description: "Root filesystem usage is above 85%."
+  - name: k8s-pod-alerts
+    rules:
+    - alert: PodCrashLooping
+      expr: rate(kube_pod_container_status_restarts_total[15m]) * 60 * 15 > 5
+      for: 5m
+      labels:
+        severity: critical
+      annotations:
+        summary: "Pod {{{{ $labels.namespace }}}}/{{{{ $labels.pod }}}} is crash looping"
+        description: "Pod has restarted more than 5 times in the last 15 minutes."
+    - alert: PodNotReady
+      expr: kube_pod_status_ready{{condition="true"}} == 0
+      for: 10m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Pod {{{{ $labels.namespace }}}}/{{{{ $labels.pod }}}} is not ready"
+        description: "Pod has been in a non-ready state for more than 10 minutes."
+    - alert: PVCAlmostFull
+      expr: kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes * 100 > 85
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: "PVC {{{{ $labels.persistentvolumeclaim }}}} is almost full"
+        description: "PVC in namespace {{{{ $labels.namespace }}}} is over 85% full."
+  - name: k8s-etcd-alerts
+    rules:
+    - alert: EtcdHighLatency
+      expr: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) > 0.5
+      for: 10m
+      labels:
+        severity: warning
+      annotations:
+        summary: "etcd WAL fsync latency is high"
+        description: "99th percentile etcd WAL fsync duration exceeds 500ms."
+EOF
+
+echo "=== Alert rules installed ==="
+"""
+
+
+def install_helm(control_plane_node: dict) -> SSHResult:
+    """Install Helm on the control-plane node."""
+    return run_ssh_command(
+        ip_address=control_plane_node["ip_address"],
+        command=generate_helm_install_script(),
+        ssh_user=control_plane_node.get("ssh_user", "root"),
+        ssh_port=control_plane_node.get("ssh_port", 22),
+        ssh_key_path=control_plane_node.get("ssh_key_path", "~/.ssh/id_rsa"),
+        timeout=120,
+    )
+
+
+def install_prometheus_stack(
+    control_plane_node: dict,
+    namespace: str = "monitoring",
+) -> SSHResult:
+    """Install the full kube-prometheus-stack."""
+    return run_ssh_command(
+        ip_address=control_plane_node["ip_address"],
+        command=generate_prometheus_install_script(namespace),
+        ssh_user=control_plane_node.get("ssh_user", "root"),
+        ssh_port=control_plane_node.get("ssh_port", 22),
+        ssh_key_path=control_plane_node.get("ssh_key_path", "~/.ssh/id_rsa"),
+        timeout=900,
+    )
+
+
+def install_dashboards(
+    control_plane_node: dict,
+    dashboard_keys: list[str],
+    namespace: str = "monitoring",
+) -> SSHResult:
+    """Import selected Grafana dashboards."""
+    return run_ssh_command(
+        ip_address=control_plane_node["ip_address"],
+        command=generate_dashboard_import_script(dashboard_keys, namespace),
+        ssh_user=control_plane_node.get("ssh_user", "root"),
+        ssh_port=control_plane_node.get("ssh_port", 22),
+        ssh_key_path=control_plane_node.get("ssh_key_path", "~/.ssh/id_rsa"),
+        timeout=300,
+    )
+
+
+def install_alert_rules(
+    control_plane_node: dict,
+    namespace: str = "monitoring",
+) -> SSHResult:
+    """Install Prometheus alerting rules."""
+    return run_ssh_command(
+        ip_address=control_plane_node["ip_address"],
+        command=generate_alerting_rules_script(namespace),
+        ssh_user=control_plane_node.get("ssh_user", "root"),
+        ssh_port=control_plane_node.get("ssh_port", 22),
+        ssh_key_path=control_plane_node.get("ssh_key_path", "~/.ssh/id_rsa"),
+        timeout=60,
+    )
+
+
+def get_monitoring_status(
+    control_plane_node: dict,
+    namespace: str = "monitoring",
+) -> SSHResult:
+    """Check the status of the monitoring stack."""
+    command = f"""
+echo "=== Monitoring Stack Status ==="
+echo ""
+echo ">> Pods:"
+kubectl -n {namespace} get pods -o wide
+echo ""
+echo ">> Services:"
+kubectl -n {namespace} get svc
+echo ""
+echo ">> PVCs:"
+kubectl -n {namespace} get pvc
+echo ""
+echo ">> PrometheusRules:"
+kubectl -n {namespace} get prometheusrules 2>/dev/null || echo "No PrometheusRules found"
+echo ""
+echo ">> ServiceMonitors:"
+kubectl -n {namespace} get servicemonitors 2>/dev/null || echo "No ServiceMonitors found"
+"""
+    return run_ssh_command(
+        ip_address=control_plane_node["ip_address"],
+        command=command,
+        ssh_user=control_plane_node.get("ssh_user", "root"),
+        ssh_port=control_plane_node.get("ssh_port", 22),
+        ssh_key_path=control_plane_node.get("ssh_key_path", "~/.ssh/id_rsa"),
+        timeout=30,
+    )
+
+
+def get_monitoring_advice(
+    profile: ClusterProfile,
+    current_status: str = "",
+) -> str:
+    """Ask the LLM for monitoring setup advice."""
+    prompt = f"""I have a Kubernetes cluster with the following setup:
+- Kubernetes: {profile.kubernetes_version}
+- Runtime: CRI-O {profile.crio_version}
+- CNI: Flannel
+- Nodes: {len(profile.nodes)} ({len(profile.get_control_plane_nodes())} control-plane, {len(profile.get_worker_nodes())} workers)
+
+Current monitoring status:
+{current_status or 'Not yet installed'}
+
+Please recommend:
+1. The optimal Prometheus retention and resource settings for this cluster size
+2. Essential Grafana dashboards to install
+3. Critical alerting rules beyond the standard set
+4. Any additional exporters I should install (e.g., blackbox, SNMP)
+5. Log aggregation recommendations (Loki, EFK, etc.)
+"""
+    return query_llm(prompt)
diff --git a/k8s-agent/modules/profile_manager.py b/k8s-agent/modules/profile_manager.py
new file mode 100644
index 0000000..a4ad9cf
--- /dev/null
+++ b/k8s-agent/modules/profile_manager.py
@@ -0,0 +1,119 @@
+"""Cluster Profile Manager — CRUD operations for K8s cluster profiles."""
+
+import json
+import os
+import time
+from dataclasses import asdict, dataclass, field
+from typing import Optional
+
+import config
+
+
+@dataclass
+class NodeInfo:
+    """Represents a node in the cluster."""
+
+    hostname: str
+    ip_address: str
+    role: str  # "control-plane" or "worker"
+    ssh_user: str = "root"
+    ssh_port: int = 22
+    ssh_key_path: str = "~/.ssh/id_rsa"
+
+
+@dataclass
+class ClusterProfile:
+    """Represents a complete cluster profile configuration."""
+
+    name: str
+    description: str = ""
+    kubernetes_version: str = "1.30"
+    crio_version: str = "1.30"
+    cni_plugin: str = "flannel"
+    pod_cidr: str = "10.244.0.0/16"
+    service_cidr: str = "10.96.0.0/12"
+    dns_domain: str = "cluster.local"
+    nodes: list[dict] = field(default_factory=list)
+    created_at: str = ""
+    updated_at: str = ""
+    status: str = "draft"  # draft, provisioning, active, error
+    kubeconfig_path: str = ""
+    monitoring_enabled: bool = False
+    pod_security_standard: str = "restricted"  # privileged, baseline, restricted
+
+    def get_control_plane_nodes(self) -> list[dict]:
+        return [n for n in self.nodes if n.get("role") == "control-plane"]
+
+    def get_worker_nodes(self) -> list[dict]:
+        return [n for n in self.nodes if n.get("role") == "worker"]
+
+
+def _profile_path(name: str) -> str:
+    """Return the file path for a given profile name."""
+    safe_name = name.replace(" ", "_").replace("/", "_").lower()
+    return os.path.join(config.PROFILES_DIR, f"{safe_name}.json")
+
+
+def save_profile(profile: ClusterProfile) -> str:
+    """Save a cluster profile to disk.
+
+    Returns the file path where the profile was saved.
+    """
+    now = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
+    if not profile.created_at:
+        profile.created_at = now
+    profile.updated_at = now
+
+    path = _profile_path(profile.name)
+    with open(path, "w") as f:
+        json.dump(asdict(profile), f, indent=2)
+    return path
+
+
+def load_profile(name: str) -> Optional[ClusterProfile]:
+    """Load a cluster profile from disk by name."""
+    path = _profile_path(name)
+    if not os.path.exists(path):
+        return None
+    with open(path, "r") as f:
+        data = json.load(f)
+    return ClusterProfile(**data)
+
+
+def list_profiles() -> list[ClusterProfile]:
+    """List all saved cluster profiles."""
+    profiles = []
+    if not os.path.exists(config.PROFILES_DIR):
+        return profiles
+    for filename in sorted(os.listdir(config.PROFILES_DIR)):
+        if filename.endswith(".json"):
+            filepath = os.path.join(config.PROFILES_DIR, filename)
+            try:
+                with open(filepath, "r") as f:
+                    data = json.load(f)
+                profiles.append(ClusterProfile(**data))
+            except (json.JSONDecodeError, TypeError):
+                continue
+    return profiles
+
+
+def delete_profile(name: str) -> bool:
+    """Delete a cluster profile by name.
+
+    Returns True if the profile was deleted, False if it didn't exist.
+    """
+    path = _profile_path(name)
+    if os.path.exists(path):
+        os.remove(path)
+        return True
+    return False
+
+
+def update_profile_status(name: str, status: str) -> bool:
+    """Update the status field of an existing profile."""
+    profile = load_profile(name)
+    if profile is None:
+        return False
+    profile.status = status
+    save_profile(profile)
+    return True
diff --git a/k8s-agent/requirements.txt b/k8s-agent/requirements.txt
new file mode 100644
index 0000000..effeada
--- /dev/null
+++ b/k8s-agent/requirements.txt
@@ -0,0 +1,6 @@
+streamlit>=1.32.0
+requests>=2.31.0
+pyyaml>=6.0.1
+pandas>=2.2.0
+plotly>=5.18.0
+streamlit-option-menu>=0.3.12
diff --git a/k8s-agent/templates/.gitkeep b/k8s-agent/templates/.gitkeep
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/k8s-agent/templates/.gitkeep
@@ -0,0 +1 @@
+

From dc1fb13c25429f4e01358a3e5bc683812301f26a Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Mon, 6 Apr 2026 09:49:27 +0000
Subject: [PATCH 02/31] Clean up unused imports and dependencies

- Remove unused NodeInfo import from app.py
- Remove unused pyyaml and pandas from requirements.txt
---
 k8s-agent/app.py           | 1 -
 k8s-agent/requirements.txt | 2 --
 2 files changed, 3 deletions(-)

diff --git a/k8s-agent/app.py b/k8s-agent/app.py
index 4225a81..59ff5aa 100644
--- a/k8s-agent/app.py
+++ b/k8s-agent/app.py
@@ -13,7 +13,6 @@
 import config
 from modules.profile_manager import (
     ClusterProfile,
-    NodeInfo,
     save_profile,
     load_profile,
     list_profiles,
diff --git a/k8s-agent/requirements.txt b/k8s-agent/requirements.txt
index effeada..96fb8cf 100644
--- a/k8s-agent/requirements.txt
+++ b/k8s-agent/requirements.txt
@@ -1,6 +1,4 @@
 streamlit>=1.32.0
 requests>=2.31.0
-pyyaml>=6.0.1
-pandas>=2.2.0
 plotly>=5.18.0
 streamlit-option-menu>=0.3.12

From 5566c5e43aad878bdbbe05b0e9c1eb3b124354c9 Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Mon, 6 Apr 2026 09:59:19 +0000
Subject: [PATCH 03/31] Add CRI-O custom storage paths and proxy settings for
 master node

- Add crio_root, crio_runroot, kubelet_root, log_root fields to ClusterProfile
- Add http_proxy, https_proxy, no_proxy, http_proxy_alt, https_proxy_alt fields
- Update generated scripts to configure CRI-O storage paths via crio.conf.d
- Update control-plane init script to use custom audit log dir and kubelet root
- Add proxy env vars to common setup and control-plane init scripts
- Add Storage Paths and Proxy Settings sections to Profile Manager UI
- Show storage/proxy details in Manage Profiles view and profile summary
---
 k8s-agent/app.py                     | 105 +++++++++++++++++++++++++
 k8s-agent/modules/cluster_creator.py | 111 +++++++++++++++++++++++++--
 k8s-agent/modules/profile_manager.py |  11 +++
 3 files changed, 220 insertions(+), 7 deletions(-)

diff --git a/k8s-agent/app.py b/k8s-agent/app.py
index 59ff5aa..471d1a0 100644
--- a/k8s-agent/app.py
+++ b/k8s-agent/app.py
@@ -269,6 +269,76 @@ def page_profile_manager():
                 service_cidr = st.text_input("Service CIDR", value="10.96.0.0/12")
                 dns_domain = st.text_input("DNS Domain", value="cluster.local")
 
+            st.divider()
+            st.markdown("### Storage Paths")
+            st.markdown(
+                "Configure where CRI-O stores container images, pods, and logs. "
+                "Change these to use a dedicated disk instead of the default `/var/lib`."
+            )
+            scol1, scol2 = st.columns(2)
+            with scol1:
+                crio_root = st.text_input(
+                    "CRI-O Storage Root",
+                    value="/var/lib/containers/storage",
+                    help="Root directory for CRI-O container/image storage (default: /var/lib/containers/storage)",
+                )
+                crio_runroot = st.text_input(
+                    "CRI-O Run Root",
+                    value="/run/containers/storage",
+                    help="Runtime root for CRI-O (default: /run/containers/storage)",
+                )
+            with scol2:
+                kubelet_root = st.text_input(
+                    "Kubelet Data Directory",
+                    value="/var/lib/kubelet",
+                    help="Kubelet data directory for pods, volumes, etc. (default: /var/lib/kubelet)",
+                )
+                log_root = st.text_input(
+                    "Log Root Directory",
+                    value="/var/log",
+                    help="Base directory for all logs — CRI-O pod logs, kubernetes audit logs, etc. (default: /var/log)",
+                )
+
+            st.divider()
+            st.markdown("### Proxy Settings (Master Node)")
+            st.markdown(
+                "Configure HTTP/HTTPS proxy for the master/control-plane node. "
+                "These are used during package installation and cluster initialization."
+            )
+            pcol1, pcol2 = st.columns(2)
+            with pcol1:
+                http_proxy = st.text_input(
+                    "HTTP Proxy",
+                    value="",
+                    placeholder="http://proxy.example.com:8080",
+                    help="Primary HTTP proxy for outbound connections",
+                )
+                https_proxy = st.text_input(
+                    "HTTPS Proxy",
+                    value="",
+                    placeholder="http://proxy.example.com:8443",
+                    help="Primary HTTPS proxy for outbound connections",
+                )
+                no_proxy = st.text_input(
+                    "No Proxy",
+                    value="",
+                    placeholder="localhost,127.0.0.1,10.96.0.0/12,10.244.0.0/16",
+                    help="Comma-separated list of hosts/CIDRs to bypass proxy",
+                )
+            with pcol2:
+                http_proxy_alt = st.text_input(
+                    "Alternate HTTP Proxy",
+                    value="",
+                    placeholder="http://backup-proxy.example.com:8080",
+                    help="Fallback HTTP proxy if the primary is unavailable",
+                )
+                https_proxy_alt = st.text_input(
+                    "Alternate HTTPS Proxy",
+                    value="",
+                    placeholder="http://backup-proxy.example.com:8443",
+                    help="Fallback HTTPS proxy if the primary is unavailable",
+                )
+
             st.divider()
             st.markdown("### Nodes")
             st.markdown("Define your control-plane and worker nodes.")
@@ -322,6 +392,15 @@ def page_profile_manager():
                         dns_domain=dns_domain,
                         nodes=valid_nodes,
                         pod_security_standard=pod_security,
+                        crio_root=crio_root,
+                        crio_runroot=crio_runroot,
+                        kubelet_root=kubelet_root,
+                        log_root=log_root,
+                        http_proxy=http_proxy,
+                        https_proxy=https_proxy,
+                        no_proxy=no_proxy,
+                        http_proxy_alt=http_proxy_alt,
+                        https_proxy_alt=https_proxy_alt,
                     )
                     path = save_profile(profile)
                     st.session_state.active_profile = name
@@ -343,6 +422,12 @@ def page_profile_manager():
                     st.markdown(f"**Kubernetes:** {profile.kubernetes_version} | **CRI-O:** {profile.crio_version}")
                     st.markdown(f"**Pod CIDR:** {profile.pod_cidr} | **Service CIDR:** {profile.service_cidr}")
                     st.markdown(f"**Pod Security:** {profile.pod_security_standard}")
+                    st.markdown(f"**CRI-O Root:** `{profile.crio_root}` | **Kubelet Dir:** `{profile.kubelet_root}`")
+                    st.markdown(f"**Log Root:** `{profile.log_root}`")
+                    if profile.http_proxy or profile.https_proxy:
+                        st.markdown(f"**Proxy:** `{profile.http_proxy or profile.https_proxy}`")
+                    if profile.http_proxy_alt or profile.https_proxy_alt:
+                        st.markdown(f"**Alt Proxy:** `{profile.http_proxy_alt or profile.https_proxy_alt}`")
                 with col2:
                     st.markdown("**Nodes:**")
                     for node in profile.nodes:
@@ -1086,6 +1171,26 @@ def _show_profile_summary(profile: ClusterProfile):
     cols[3].metric("CNI", "Flannel")
     cols[4].metric("Nodes", f"{len(profile.get_control_plane_nodes())} CP + {len(profile.get_worker_nodes())} W")
 
+    with st.expander("Storage & Proxy Details", expanded=False):
+        scol1, scol2, scol3 = st.columns(3)
+        with scol1:
+            st.markdown(f"**CRI-O Root:** `{profile.crio_root}`")
+            st.markdown(f"**CRI-O RunRoot:** `{profile.crio_runroot}`")
+        with scol2:
+            st.markdown(f"**Kubelet Dir:** `{profile.kubelet_root}`")
+            st.markdown(f"**Log Root:** `{profile.log_root}`")
+        with scol3:
+            if profile.http_proxy or profile.https_proxy:
+                st.markdown(f"**HTTP Proxy:** `{profile.http_proxy or 'N/A'}`")
+                st.markdown(f"**HTTPS Proxy:** `{profile.https_proxy or 'N/A'}`")
+                if profile.no_proxy:
+                    st.markdown(f"**No Proxy:** `{profile.no_proxy}`")
+            if profile.http_proxy_alt or profile.https_proxy_alt:
+                st.markdown(f"**Alt HTTP Proxy:** `{profile.http_proxy_alt or 'N/A'}`")
+                st.markdown(f"**Alt HTTPS Proxy:** `{profile.https_proxy_alt or 'N/A'}`")
+            if not (profile.http_proxy or profile.https_proxy or profile.http_proxy_alt or profile.https_proxy_alt):
+                st.markdown("**Proxy:** Not configured")
+
 
 # ── Main Router ───────────────────────────────────────────────────────────
 
diff --git a/k8s-agent/modules/cluster_creator.py b/k8s-agent/modules/cluster_creator.py
index ef89dea..830a56e 100644
--- a/k8s-agent/modules/cluster_creator.py
+++ b/k8s-agent/modules/cluster_creator.py
@@ -101,16 +101,76 @@ def test_ssh_connectivity(node: dict) -> SSHResult:
     )
 
 
+def _proxy_env_block(profile: ClusterProfile) -> str:
+    """Generate shell export lines for proxy environment variables."""
+    lines = []
+    proxy = profile.http_proxy or profile.http_proxy_alt
+    proxys = profile.https_proxy or profile.https_proxy_alt
+    if proxy:
+        lines.append(f'export http_proxy="{proxy}"')
+        lines.append(f'export HTTP_PROXY="{proxy}"')
+    if proxys:
+        lines.append(f'export https_proxy="{proxys}"')
+        lines.append(f'export HTTPS_PROXY="{proxys}"')
+    if profile.no_proxy:
+        lines.append(f'export no_proxy="{profile.no_proxy}"')
+        lines.append(f'export NO_PROXY="{profile.no_proxy}"')
+    return "\n".join(lines)
+
+
 def generate_common_setup_script(profile: ClusterProfile) -> str:
     """Generate the common setup script that runs on ALL nodes (control-plane + workers)."""
+    proxy_block = _proxy_env_block(profile)
+    proxy_section = ""
+    if proxy_block:
+        proxy_section = f"""
+# ── 0. Proxy configuration ───────────────────────────────────────────────
+echo ">> Configuring proxy settings..."
+{proxy_block}
+
+# Persist proxy in /etc/environment for all users
+cat >> /etc/environment <<PROXYEOF
+{proxy_block}
+PROXYEOF
+"""
+
+    crio_storage_section = ""
+    if profile.crio_root != "/var/lib/containers/storage":
+        crio_storage_section = f"""
+# ── Custom CRI-O storage paths ───────────────────────────────────────────
+echo ">> Configuring CRI-O custom storage root: {profile.crio_root}"
+mkdir -p "{profile.crio_root}"
+mkdir -p "{profile.crio_runroot}"
+"""
+
+    kubelet_section = ""
+    if profile.kubelet_root != "/var/lib/kubelet":
+        kubelet_section = f"""
+# ── Custom kubelet data directory ────────────────────────────────────────
+echo ">> Configuring kubelet data directory: {profile.kubelet_root}"
+mkdir -p "{profile.kubelet_root}"
+"""
+
+    log_section = ""
+    if profile.log_root != "/var/log":
+        log_section = f"""
+# ── Custom log directory ─────────────────────────────────────────────────
+echo ">> Configuring custom log root: {profile.log_root}"
+mkdir -p "{profile.log_root}/pods"
+mkdir -p "{profile.log_root}/containers"
+"""
+
     return f"""#!/bin/bash
 set -euo pipefail
 
 echo "=== K8s Node Common Setup ==="
 echo "Kubernetes Version: {profile.kubernetes_version}"
 echo "CRI-O Version: {profile.crio_version}"
+echo "CRI-O Storage Root: {profile.crio_root}"
+echo "Kubelet Data Dir: {profile.kubelet_root}"
+echo "Log Root: {profile.log_root}"
 echo "Timestamp: $(date -u)"
-
+{proxy_section}{crio_storage_section}{kubelet_section}{log_section}
 # ── 1. System prerequisites ──────────────────────────────────────────────
 echo ">> Disabling swap..."
 swapoff -a
@@ -182,8 +242,19 @@ def generate_common_setup_script(profile: ClusterProfile) -> str:
 fi
 
 systemctl daemon-reload
+
+# ── Configure CRI-O storage paths ────────────────────────────────────────
+echo ">> Configuring CRI-O storage to {profile.crio_root}..."
+mkdir -p /etc/crio/crio.conf.d
+cat > /etc/crio/crio.conf.d/01-storage.conf <<CRIOCONF
+[crio]
+  root = "{profile.crio_root}"
+  runroot = "{profile.crio_runroot}"
+  log_dir = "{profile.log_root}/crio/pods"
+CRIOCONF
+
 systemctl enable --now crio
-echo ">> CRI-O installed and running."
+echo ">> CRI-O installed and configured (storage: {profile.crio_root})."
 
 # ── 3. Install kubeadm, kubelet, kubectl ──────────────────────────────────
 echo ">> Installing Kubernetes {profile.kubernetes_version} components..."
@@ -223,12 +294,31 @@ def generate_control_plane_init_script(profile: ClusterProfile) -> str:
     cp_nodes = profile.get_control_plane_nodes()
     cp_ip = cp_nodes[0]["ip_address"] if cp_nodes else "CONTROL_PLANE_IP"
 
+    # Build proxy environment block for the control-plane
+    proxy_block = _proxy_env_block(profile)
+    proxy_section = ""
+    if proxy_block:
+        proxy_section = f"""
+# ── Proxy configuration (master node) ───────────────────────────────────
+echo ">> Setting proxy environment for kubeadm..."
+{proxy_block}
+"""
+
+    # Audit log path respects custom log_root
+    audit_log_dir = f"{profile.log_root}/kubernetes"
+
+    # Extra kubelet args for custom root dir
+    kubelet_extra = '    container-runtime-endpoint: "unix:///var/run/crio/crio.sock"'
+    if profile.kubelet_root != "/var/lib/kubelet":
+        kubelet_extra += f'\n    root-dir: "{profile.kubelet_root}"'
+
     return f"""#!/bin/bash
 set -euo pipefail
 
 echo "=== Initializing Kubernetes Control Plane ==="
-
+{proxy_section}
 # ── kubeadm init ──────────────────────────────────────────────────────────
+mkdir -p "{audit_log_dir}"
 cat > /tmp/kubeadm-config.yaml <<EOF
 apiVersion: kubeadm.k8s.io/v1beta3
 kind: InitConfiguration
@@ -238,7 +328,7 @@ def generate_control_plane_init_script(profile: ClusterProfile) -> str:
 nodeRegistration:
   criSocket: "unix:///var/run/crio/crio.sock"
   kubeletExtraArgs:
-    container-runtime-endpoint: "unix:///var/run/crio/crio.sock"
+{kubelet_extra}
 ---
 apiVersion: kubeadm.k8s.io/v1beta3
 kind: ClusterConfiguration
@@ -252,14 +342,14 @@ def generate_control_plane_init_script(profile: ClusterProfile) -> str:
   extraArgs:
     authorization-mode: "Node,RBAC"
     enable-admission-plugins: "NodeRestriction,PodSecurity"
-    audit-log-path: "/var/log/kubernetes/audit.log"
+    audit-log-path: "{audit_log_dir}/audit.log"
     audit-log-maxage: "30"
     audit-log-maxbackup: "10"
     audit-log-maxsize: "100"
   extraVolumes:
   - name: audit-log
-    hostPath: "/var/log/kubernetes"
-    mountPath: "/var/log/kubernetes"
+    hostPath: "{audit_log_dir}"
+    mountPath: "{audit_log_dir}"
     pathType: DirectoryOrCreate
 controllerManager:
   extraArgs:
@@ -530,6 +620,13 @@ def get_llm_cluster_advice(profile: ClusterProfile, context: str = "") -> str:
 - Pod CIDR: {profile.pod_cidr}
 - Service CIDR: {profile.service_cidr}
 - Pod Security Standard: {profile.pod_security_standard}
+- CRI-O Storage Root: {profile.crio_root}
+- Kubelet Data Dir: {profile.kubelet_root}
+- Log Root: {profile.log_root}
+- HTTP Proxy: {profile.http_proxy or 'none'}
+- HTTPS Proxy: {profile.https_proxy or 'none'}
+- Alternate HTTP Proxy: {profile.http_proxy_alt or 'none'}
+- Alternate HTTPS Proxy: {profile.https_proxy_alt or 'none'}
 
 Nodes:
 {nodes_str}
diff --git a/k8s-agent/modules/profile_manager.py b/k8s-agent/modules/profile_manager.py
index a4ad9cf..9754801 100644
--- a/k8s-agent/modules/profile_manager.py
+++ b/k8s-agent/modules/profile_manager.py
@@ -40,6 +40,17 @@ class ClusterProfile:
     kubeconfig_path: str = ""
     monitoring_enabled: bool = False
     pod_security_standard: str = "restricted"  # privileged, baseline, restricted
+    # CRI-O storage paths (override defaults in /var/lib)
+    crio_root: str = "/var/lib/containers/storage"  # container storage root
+    crio_runroot: str = "/run/containers/storage"  # runtime root
+    kubelet_root: str = "/var/lib/kubelet"  # kubelet data dir
+    log_root: str = "/var/log"  # base log directory
+    # Proxy settings for master node
+    http_proxy: str = ""
+    https_proxy: str = ""
+    no_proxy: str = ""
+    http_proxy_alt: str = ""  # alternate proxy
+    https_proxy_alt: str = ""  # alternate proxy
 
     def get_control_plane_nodes(self) -> list[dict]:
         return [n for n in self.nodes if n.get("role") == "control-plane"]

From b47ee9e8be44d3ff60dbcd560980e4a5ad47fc35 Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Mon, 6 Apr 2026 10:16:25 +0000
Subject: [PATCH 04/31] Add gitignore entries for profile data and pycache

---
 .gitignore | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.gitignore b/.gitignore
index 2a1bb18..dc0bed2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -23,3 +23,5 @@ charts/*/charts/
 *.key
 kubeconfig*
 k8s-agent/__pycache__/
+k8s-agent/data/profiles/*.json
+k8s-agent/modules/__pycache__/

From cb3658f682f1d95f651afc823b7e7673d41baa6b Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Mon, 6 Apr 2026 10:34:30 +0000
Subject: [PATCH 05/31] Add step-by-step SSH provisioning with granular
 per-step progress

---
 k8s-agent/app.py                     | 170 ++++++---
 k8s-agent/modules/cluster_creator.py | 523 ++++++++++++++++++++++++++-
 2 files changed, 637 insertions(+), 56 deletions(-)

diff --git a/k8s-agent/app.py b/k8s-agent/app.py
index 471d1a0..c41ba61 100644
--- a/k8s-agent/app.py
+++ b/k8s-agent/app.py
@@ -32,6 +32,12 @@
     apply_best_practices,
     get_cluster_status,
     get_llm_cluster_advice,
+    ProvisionStep,
+    _run_step,
+    get_common_setup_steps,
+    get_control_plane_steps,
+    get_worker_join_steps,
+    get_best_practices_steps,
 )
 from modules.cluster_debugger import (
     DIAGNOSTIC_COMMANDS,
@@ -522,8 +528,8 @@ def page_cluster_creation():
     with tab_provision:
         st.markdown("### Automated Cluster Provisioning")
         st.warning(
-            "This will SSH into each node and install Kubernetes components. "
-            "Ensure all nodes are accessible and you have root/sudo access."
+            "This will SSH into each node and execute every provisioning step "
+            "automatically. Ensure all nodes are accessible and you have root/sudo access."
         )
 
         cp_nodes = profile.get_control_plane_nodes()
@@ -542,80 +548,136 @@ def page_cluster_creation():
 
         if st.button("Start Provisioning", type="primary", use_container_width=True):
             update_profile_status(profile.name, "provisioning")
+            overall_success = True
 
-            # Step 1: Common setup on all nodes
+            # ── Step 1: Common setup on ALL nodes (granular per-step) ────
             if step1:
                 st.markdown("---")
-                st.markdown("### Step 1: Common Setup")
+                st.markdown("### Step 1: Common Node Setup")
+                common_steps = get_common_setup_steps(profile)
                 for node in profile.nodes:
-                    with st.status(
-                        f"Setting up {node.get('hostname', node['ip_address'])} ({node['role']})...",
-                        expanded=True,
-                    ):
-                        result = provision_node_common(node, profile)
-                        if result.success:
-                            st.success(f"Common setup complete on {node['ip_address']}")
-                        else:
-                            st.error(f"Setup failed on {node['ip_address']}")
-                            st.code(result.stderr, language="text")
+                    node_label = f"{node.get('hostname', node['ip_address'])} ({node['role']})"
+                    st.markdown(f"#### Node: {node_label}")
+                    node_ok = True
+                    progress = st.progress(0, text=f"Starting setup on {node_label}...")
+                    for idx, step in enumerate(common_steps):
+                        pct = int((idx / len(common_steps)) * 100)
+                        progress.progress(pct, text=f"[{idx+1}/{len(common_steps)}] {step.title}")
+                        with st.status(f"{step.title}...", expanded=False) as status:
+                            result = _run_step(node, step)
+                            if result.success:
+                                st.code(result.stdout[-1500:] if result.stdout else "(no output)", language="text")
+                                status.update(label=f"{step.title} — done", state="complete")
+                            else:
+                                st.error(f"FAILED: {step.title}")
+                                st.code(result.stderr or result.stdout, language="text")
+                                status.update(label=f"{step.title} — FAILED", state="error")
+                                node_ok = False
+                                if step.fatal:
+                                    overall_success = False
+                                    break
+                    progress.progress(100, text=f"{'Setup complete' if node_ok else 'Setup FAILED'} on {node_label}")
+                    if node_ok:
+                        st.success(f"Common setup complete on {node['ip_address']}")
+                    else:
+                        st.error(f"Common setup failed on {node['ip_address']}")
 
-            # Step 2: Initialize control plane
-            if step2 and cp_nodes:
+            # ── Step 2: Control plane init (granular per-step) ───────────
+            if step2 and cp_nodes and overall_success:
                 st.markdown("---")
                 st.markdown("### Step 2: Control Plane Initialization")
                 cp_node = cp_nodes[0]
-                with st.status(f"Initializing control plane on {cp_node['ip_address']}...", expanded=True):
-                    result = init_control_plane(cp_node, profile)
-                    if result.success:
-                        st.success("Control plane initialized!")
-                        st.code(result.stdout[-2000:], language="text")
-                    else:
-                        st.error("Control plane initialization failed!")
-                        st.code(result.stderr, language="text")
+                cp_steps = get_control_plane_steps(profile)
+                progress = st.progress(0, text="Starting control plane init...")
+                for idx, step in enumerate(cp_steps):
+                    pct = int((idx / len(cp_steps)) * 100)
+                    progress.progress(pct, text=f"[{idx+1}/{len(cp_steps)}] {step.title}")
+                    with st.status(f"{step.title}...", expanded=False) as status:
+                        result = _run_step(cp_node, step)
+                        if result.success:
+                            st.code(result.stdout[-2000:] if result.stdout else "(no output)", language="text")
+                            status.update(label=f"{step.title} — done", state="complete")
+                        else:
+                            st.error(f"FAILED: {step.title}")
+                            st.code(result.stderr or result.stdout, language="text")
+                            status.update(label=f"{step.title} — FAILED", state="error")
+                            overall_success = False
+                            if step.fatal:
+                                break
+                progress.progress(100, text="Control plane initialization complete" if overall_success else "Control plane init FAILED")
+                if overall_success:
+                    st.success("Control plane initialized!")
+                else:
+                    st.error("Control plane initialization failed!")
 
-            # Step 3: Join worker nodes
-            if step3 and worker_nodes and cp_nodes:
+            # ── Step 3: Join workers (granular per-step) ─────────────────
+            if step3 and worker_nodes and cp_nodes and overall_success:
                 st.markdown("---")
                 st.markdown("### Step 3: Join Worker Nodes")
                 join_cmd = retrieve_join_command(cp_nodes[0])
                 if join_cmd:
+                    worker_join_steps = get_worker_join_steps(join_cmd)
                     for node in worker_nodes:
-                        with st.status(f"Joining {node.get('hostname', node['ip_address'])}...", expanded=True):
-                            result = join_worker_node(node, join_cmd)
-                            if result.success:
-                                st.success(f"Worker {node['ip_address']} joined!")
-                            else:
-                                st.error(f"Failed to join {node['ip_address']}")
-                                st.code(result.stderr, language="text")
+                        node_label = f"{node.get('hostname', node['ip_address'])}"
+                        st.markdown(f"#### Worker: {node_label}")
+                        for step in worker_join_steps:
+                            with st.status(f"{step.title} on {node_label}...", expanded=False) as status:
+                                result = _run_step(node, step)
+                                if result.success:
+                                    st.code(result.stdout[-1500:] if result.stdout else "(no output)", language="text")
+                                    status.update(label=f"{step.title} — done", state="complete")
+                                    st.success(f"Worker {node['ip_address']} joined!")
+                                else:
+                                    st.error(f"FAILED to join {node['ip_address']}")
+                                    st.code(result.stderr or result.stdout, language="text")
+                                    status.update(label=f"{step.title} — FAILED", state="error")
                 else:
                     st.error("Could not retrieve join command from control plane.")
 
-            # Step 4: Best practices
-            if step4 and cp_nodes:
+            # ── Step 4: Best practices (granular per-step) ───────────────
+            if step4 and cp_nodes and overall_success:
                 st.markdown("---")
-                st.markdown("### Step 4: Best Practices")
-                with st.status("Applying security and resource best practices...", expanded=True):
-                    result = apply_best_practices(cp_nodes[0])
+                st.markdown("### Step 4: Apply Best Practices")
+                bp_steps = get_best_practices_steps()
+                progress = st.progress(0, text="Applying best practices...")
+                for idx, step in enumerate(bp_steps):
+                    pct = int((idx / len(bp_steps)) * 100)
+                    progress.progress(pct, text=f"[{idx+1}/{len(bp_steps)}] {step.title}")
+                    with st.status(f"{step.title}...", expanded=False) as status:
+                        result = _run_step(cp_nodes[0], step)
+                        if result.success:
+                            st.code(result.stdout[-1000:] if result.stdout else "(no output)", language="text")
+                            status.update(label=f"{step.title} — done", state="complete")
+                        else:
+                            st.error(f"FAILED: {step.title}")
+                            st.code(result.stderr or result.stdout, language="text")
+                            status.update(label=f"{step.title} — FAILED", state="error")
+                            if step.fatal:
+                                overall_success = False
+                                break
+                progress.progress(100, text="Best practices applied" if overall_success else "Best practices FAILED")
+                if overall_success:
+                    st.success("Best practices applied!")
+
+            # ── Final cluster status ─────────────────────────────────────
+            st.markdown("---")
+            st.markdown("### Cluster Status")
+            if cp_nodes and overall_success:
+                with st.status("Checking cluster status...", expanded=True) as status:
+                    result = get_cluster_status(cp_nodes[0])
                     if result.success:
-                        st.success("Best practices applied!")
+                        update_profile_status(profile.name, "active")
+                        st.success("Cluster is active!")
                         st.code(result.stdout, language="text")
+                        status.update(label="Cluster is active", state="complete")
                     else:
-                        st.error("Failed to apply best practices")
+                        update_profile_status(profile.name, "error")
+                        st.error("Could not verify cluster status")
                         st.code(result.stderr, language="text")
-
-            # Final status
-            st.markdown("---")
-            st.markdown("### Cluster Status")
-            if cp_nodes:
-                result = get_cluster_status(cp_nodes[0])
-                if result.success:
-                    update_profile_status(profile.name, "active")
-                    st.success("Cluster is active!")
-                    st.code(result.stdout, language="text")
-                else:
-                    update_profile_status(profile.name, "error")
-                    st.error("Could not verify cluster status")
-                    st.code(result.stderr, language="text")
+                        status.update(label="Status check failed", state="error")
+            elif not overall_success:
+                update_profile_status(profile.name, "error")
+                st.error("Provisioning did not complete successfully. Check the errors above.")
 
     # ── View Scripts ──────────────────────────────────────────────────────
     with tab_scripts:
diff --git a/k8s-agent/modules/cluster_creator.py b/k8s-agent/modules/cluster_creator.py
index 830a56e..d7935e1 100644
--- a/k8s-agent/modules/cluster_creator.py
+++ b/k8s-agent/modules/cluster_creator.py
@@ -2,8 +2,8 @@
 
 import subprocess
 import time
-from dataclasses import dataclass
-from typing import Optional
+from dataclasses import dataclass, field
+from typing import List, Optional
 
 from modules.llm_client import query_llm
 from modules.profile_manager import ClusterProfile
@@ -526,6 +526,525 @@ def generate_best_practices_script() -> str:
 """
 
 
+# ══════════════════════════════════════════════════════════════════════════
+#  Step-based provisioning — granular SSH execution with per-step progress
+# ══════════════════════════════════════════════════════════════════════════
+
+
+@dataclass
+class ProvisionStep:
+    """A single discrete provisioning step to be executed over SSH."""
+
+    name: str  # short identifier, e.g. "disable_swap"
+    title: str  # human-readable label for the UI
+    script: str  # shell snippet to execute
+    timeout: int = 300  # per-step timeout in seconds
+    fatal: bool = True  # if True, abort provisioning on failure
+
+
+def _run_step(node: dict, step: ProvisionStep) -> SSHResult:
+    """Execute a single ProvisionStep on a node via SSH."""
+    return run_ssh_command(
+        ip_address=node["ip_address"],
+        command=step.script,
+        ssh_user=node.get("ssh_user", "root"),
+        ssh_port=node.get("ssh_port", 22),
+        ssh_key_path=node.get("ssh_key_path", "~/.ssh/id_rsa"),
+        timeout=step.timeout,
+    )
+
+
+def get_common_setup_steps(profile: ClusterProfile) -> List[ProvisionStep]:
+    """Return the ordered list of discrete steps for common node setup."""
+    proxy_block = _proxy_env_block(profile)
+    steps: List[ProvisionStep] = []
+
+    # 0. Proxy (optional)
+    if proxy_block:
+        steps.append(ProvisionStep(
+            name="configure_proxy",
+            title="Configure Proxy Settings",
+            script=f"""set -euo pipefail
+echo '>> Configuring proxy settings...'
+{proxy_block}
+# Persist proxy in /etc/environment for all users
+cat >> /etc/environment <<'PROXYEOF'
+{proxy_block}
+PROXYEOF
+echo 'Proxy configured.'
+""",
+            timeout=30,
+        ))
+
+    # 1. System prerequisites
+    steps.append(ProvisionStep(
+        name="system_prerequisites",
+        title="System Prerequisites (swap, modules, sysctl, firewall)",
+        script="""set -euo pipefail
+echo '>> Disabling swap...'
+swapoff -a
+sed -i '/\\bswap\\b/d' /etc/fstab
+
+echo '>> Loading kernel modules...'
+cat > /etc/modules-load.d/k8s.conf <<EOF
+overlay
+br_netfilter
+EOF
+modprobe overlay
+modprobe br_netfilter
+
+echo '>> Setting sysctl parameters...'
+cat > /etc/sysctl.d/99-kubernetes.conf <<EOF
+net.bridge.bridge-nf-call-iptables  = 1
+net.bridge.bridge-nf-call-ip6tables = 1
+net.ipv4.ip_forward                 = 1
+EOF
+sysctl --system
+
+echo '>> Disabling SELinux (if present)...'
+if command -v setenforce &>/dev/null; then
+    setenforce 0 || true
+    sed -i 's/^SELINUX=enforcing/SELINUX=permissive/' /etc/selinux/config 2>/dev/null || true
+fi
+
+echo '>> Configuring firewalld (if present)...'
+if systemctl is-active --quiet firewalld; then
+    firewall-cmd --permanent --add-port=6443/tcp
+    firewall-cmd --permanent --add-port=2379-2380/tcp
+    firewall-cmd --permanent --add-port=10250/tcp
+    firewall-cmd --permanent --add-port=10259/tcp
+    firewall-cmd --permanent --add-port=10257/tcp
+    firewall-cmd --permanent --add-port=30000-32767/tcp
+    firewall-cmd --permanent --add-port=8472/udp
+    firewall-cmd --reload
+fi
+echo 'System prerequisites configured.'
+""",
+        timeout=120,
+    ))
+
+    # 2. Custom storage directories (optional)
+    dir_cmds = []
+    if profile.crio_root != "/var/lib/containers/storage":
+        dir_cmds.append(f'mkdir -p "{profile.crio_root}"')
+        dir_cmds.append(f'mkdir -p "{profile.crio_runroot}"')
+    if profile.kubelet_root != "/var/lib/kubelet":
+        dir_cmds.append(f'mkdir -p "{profile.kubelet_root}"')
+    if profile.log_root != "/var/log":
+        dir_cmds.append(f'mkdir -p "{profile.log_root}/pods"')
+        dir_cmds.append(f'mkdir -p "{profile.log_root}/containers"')
+    if dir_cmds:
+        steps.append(ProvisionStep(
+            name="create_custom_dirs",
+            title="Create Custom Storage Directories",
+            script="set -euo pipefail\necho '>> Creating custom storage directories...'\n"
+                   + "\n".join(dir_cmds) + "\necho 'Custom directories created.'",
+            timeout=30,
+        ))
+
+    # 3. Install CRI-O
+    steps.append(ProvisionStep(
+        name="install_crio",
+        title=f"Install CRI-O {profile.crio_version}",
+        script=f"""set -euo pipefail
+echo '>> Installing CRI-O {profile.crio_version}...'
+
+OS="$(. /etc/os-release && echo "$ID")"
+VERSION_ID="$(. /etc/os-release && echo "$VERSION_ID")"
+
+if [[ "$OS" == "ubuntu" || "$OS" == "debian" ]]; then
+    apt-get update -y
+    apt-get install -y software-properties-common curl gnupg2
+    CRIO_VERSION="{profile.crio_version}"
+    curl -fsSL "https://pkgs.k8s.io/addons:/cri-o:/stable:/v$CRIO_VERSION/deb/Release.key" | \\
+        gpg --dearmor -o /etc/apt/keyrings/cri-o-apt-keyring.gpg
+    echo "deb [signed-by=/etc/apt/keyrings/cri-o-apt-keyring.gpg] https://pkgs.k8s.io/addons:/cri-o:/stable:/v$CRIO_VERSION/deb/ /" | \\
+        tee /etc/apt/sources.list.d/cri-o.list
+    apt-get update -y
+    apt-get install -y cri-o
+elif [[ "$OS" == "rhel" || "$OS" == "centos" || "$OS" == "rocky" || "$OS" == "almalinux" ]]; then
+    CRIO_VERSION="{profile.crio_version}"
+    cat > /etc/yum.repos.d/cri-o.repo <<REPO
+[cri-o]
+name=CRI-O
+baseurl=https://pkgs.k8s.io/addons:/cri-o:/stable:/v$CRIO_VERSION/rpm/
+enabled=1
+gpgcheck=1
+gpgkey=https://pkgs.k8s.io/addons:/cri-o:/stable:/v$CRIO_VERSION/rpm/repodata/repomd.xml.key
+REPO
+    dnf install -y cri-o
+fi
+echo 'CRI-O installed.'
+""",
+        timeout=300,
+    ))
+
+    # 4. Configure CRI-O storage
+    steps.append(ProvisionStep(
+        name="configure_crio",
+        title="Configure CRI-O Storage & Start Service",
+        script=f"""set -euo pipefail
+echo '>> Configuring CRI-O storage to {profile.crio_root}...'
+systemctl daemon-reload
+mkdir -p /etc/crio/crio.conf.d
+cat > /etc/crio/crio.conf.d/01-storage.conf <<CRIOCONF
+[crio]
+  root = "{profile.crio_root}"
+  runroot = "{profile.crio_runroot}"
+  log_dir = "{profile.log_root}/crio/pods"
+CRIOCONF
+
+systemctl enable --now crio
+echo 'CRI-O configured and running (storage: {profile.crio_root}).'
+""",
+        timeout=60,
+    ))
+
+    # 5. Install kubeadm, kubelet, kubectl
+    steps.append(ProvisionStep(
+        name="install_k8s",
+        title=f"Install Kubernetes {profile.kubernetes_version} Components",
+        script=f"""set -euo pipefail
+echo '>> Installing Kubernetes {profile.kubernetes_version} components...'
+
+OS="$(. /etc/os-release && echo "$ID")"
+K8S_VERSION="{profile.kubernetes_version}"
+
+if [[ "$OS" == "ubuntu" || "$OS" == "debian" ]]; then
+    curl -fsSL "https://pkgs.k8s.io/core:/stable:/v$K8S_VERSION/deb/Release.key" | \\
+        gpg --dearmor -o /etc/apt/keyrings/kubernetes-apt-keyring.gpg
+    echo "deb [signed-by=/etc/apt/keyrings/kubernetes-apt-keyring.gpg] https://pkgs.k8s.io/core:/stable:/v$K8S_VERSION/deb/ /" | \\
+        tee /etc/apt/sources.list.d/kubernetes.list
+    apt-get update -y
+    apt-get install -y kubelet kubeadm kubectl
+    apt-mark hold kubelet kubeadm kubectl
+elif [[ "$OS" == "rhel" || "$OS" == "centos" || "$OS" == "rocky" || "$OS" == "almalinux" ]]; then
+    cat > /etc/yum.repos.d/kubernetes.repo <<REPO
+[kubernetes]
+name=Kubernetes
+baseurl=https://pkgs.k8s.io/core:/stable:/v$K8S_VERSION/rpm/
+enabled=1
+gpgcheck=1
+gpgkey=https://pkgs.k8s.io/core:/stable:/v$K8S_VERSION/rpm/repodata/repomd.xml.key
+REPO
+    dnf install -y kubelet kubeadm kubectl
+fi
+
+systemctl enable --now kubelet
+echo 'Kubernetes components installed.'
+""",
+        timeout=300,
+    ))
+
+    return steps
+
+
+def get_control_plane_steps(profile: ClusterProfile) -> List[ProvisionStep]:
+    """Return the ordered list of discrete steps for control-plane init."""
+    cp_nodes = profile.get_control_plane_nodes()
+    cp_ip = cp_nodes[0]["ip_address"] if cp_nodes else "CONTROL_PLANE_IP"
+
+    proxy_block = _proxy_env_block(profile)
+    audit_log_dir = f"{profile.log_root}/kubernetes"
+
+    kubelet_extra = '    container-runtime-endpoint: "unix:///var/run/crio/crio.sock"'
+    if profile.kubelet_root != "/var/lib/kubelet":
+        kubelet_extra += f'\n    root-dir: "{profile.kubelet_root}"'
+
+    steps: List[ProvisionStep] = []
+
+    # 0. Proxy on CP (optional)
+    if proxy_block:
+        steps.append(ProvisionStep(
+            name="cp_proxy",
+            title="Set Proxy Environment for kubeadm",
+            script=f"""set -euo pipefail
+echo '>> Setting proxy environment for kubeadm...'
+{proxy_block}
+echo 'Proxy environment set.'
+""",
+            timeout=15,
+        ))
+
+    # 1. kubeadm init
+    steps.append(ProvisionStep(
+        name="kubeadm_init",
+        title="Run kubeadm init",
+        script=f"""set -euo pipefail
+echo '>> Preparing kubeadm config...'
+mkdir -p "{audit_log_dir}"
+cat > /tmp/kubeadm-config.yaml <<EOF
+apiVersion: kubeadm.k8s.io/v1beta3
+kind: InitConfiguration
+localAPIEndpoint:
+  advertiseAddress: "{cp_ip}"
+  bindPort: 6443
+nodeRegistration:
+  criSocket: "unix:///var/run/crio/crio.sock"
+  kubeletExtraArgs:
+{kubelet_extra}
+---
+apiVersion: kubeadm.k8s.io/v1beta3
+kind: ClusterConfiguration
+kubernetesVersion: "v{profile.kubernetes_version}.0"
+networking:
+  podSubnet: "{profile.pod_cidr}"
+  serviceSubnet: "{profile.service_cidr}"
+  dnsDomain: "{profile.dns_domain}"
+controlPlaneEndpoint: "{cp_ip}:6443"
+apiServer:
+  extraArgs:
+    authorization-mode: "Node,RBAC"
+    enable-admission-plugins: "NodeRestriction,PodSecurity"
+    audit-log-path: "{audit_log_dir}/audit.log"
+    audit-log-maxage: "30"
+    audit-log-maxbackup: "10"
+    audit-log-maxsize: "100"
+  extraVolumes:
+  - name: audit-log
+    hostPath: "{audit_log_dir}"
+    mountPath: "{audit_log_dir}"
+    pathType: DirectoryOrCreate
+controllerManager:
+  extraArgs:
+    bind-address: "0.0.0.0"
+    terminated-pod-gc-threshold: "100"
+scheduler:
+  extraArgs:
+    bind-address: "0.0.0.0"
+etcd:
+  local:
+    extraArgs:
+      listen-metrics-urls: "http://0.0.0.0:2381"
+---
+apiVersion: kubelet.config.k8s.io/v1beta1
+kind: KubeletConfiguration
+cgroupDriver: systemd
+containerRuntimeEndpoint: "unix:///var/run/crio/crio.sock"
+evictionHard:
+  memory.available: "100Mi"
+  nodefs.available: "10%"
+  imagefs.available: "15%"
+EOF
+
+echo '>> Running kubeadm init (this may take a few minutes)...'
+kubeadm init --config=/tmp/kubeadm-config.yaml --upload-certs | tee /tmp/kubeadm-init.log
+echo 'kubeadm init complete.'
+""",
+        timeout=600,
+    ))
+
+    # 2. Configure kubectl
+    steps.append(ProvisionStep(
+        name="configure_kubectl",
+        title="Configure kubectl for root user",
+        script="""set -euo pipefail
+echo '>> Configuring kubectl...'
+mkdir -p /root/.kube
+cp /etc/kubernetes/admin.conf /root/.kube/config
+chown root:root /root/.kube/config
+kubectl get nodes
+echo 'kubectl configured.'
+""",
+        timeout=30,
+    ))
+
+    # 3. Install Flannel CNI
+    steps.append(ProvisionStep(
+        name="install_flannel",
+        title="Install Flannel CNI",
+        script="""set -euo pipefail
+echo '>> Installing Flannel CNI...'
+kubectl apply -f https://github.com/flannel-io/flannel/releases/latest/download/kube-flannel.yml
+echo '>> Waiting for Flannel pods to be ready...'
+kubectl -n kube-flannel wait --for=condition=ready pod -l app=flannel --timeout=120s || true
+echo 'Flannel CNI installed.'
+""",
+        timeout=180,
+    ))
+
+    # 4. Pod Security Standards
+    steps.append(ProvisionStep(
+        name="pod_security",
+        title=f"Apply Pod Security Standards ({profile.pod_security_standard})",
+        script=f"""set -euo pipefail
+echo '>> Applying Pod Security Standards ({profile.pod_security_standard})...'
+kubectl label namespace default \\
+    pod-security.kubernetes.io/enforce={profile.pod_security_standard} \\
+    pod-security.kubernetes.io/warn={profile.pod_security_standard} \\
+    pod-security.kubernetes.io/audit={profile.pod_security_standard} \\
+    --overwrite
+echo 'Pod Security Standards applied.'
+""",
+        timeout=30,
+    ))
+
+    # 5. Generate join command
+    steps.append(ProvisionStep(
+        name="generate_join_cmd",
+        title="Generate Worker Join Command",
+        script="""set -euo pipefail
+echo '>> Generating worker join command...'
+kubeadm token create --print-join-command > /tmp/kubeadm-join-command.txt
+echo 'Join command:'
+cat /tmp/kubeadm-join-command.txt
+""",
+        timeout=30,
+    ))
+
+    return steps
+
+
+def get_worker_join_steps(join_command: str) -> List[ProvisionStep]:
+    """Return the step(s) to join a worker node to the cluster."""
+    return [
+        ProvisionStep(
+            name="join_cluster",
+            title="Join Cluster",
+            script=f"""set -euo pipefail
+echo '>> Joining cluster...'
+{join_command} --cri-socket unix:///var/run/crio/crio.sock
+echo 'Successfully joined the cluster.'
+""",
+            timeout=300,
+        ),
+    ]
+
+
+def get_best_practices_steps() -> List[ProvisionStep]:
+    """Return the ordered list of best-practices hardening steps."""
+    return [
+        ProvisionStep(
+            name="network_policy",
+            title="Apply Default-Deny NetworkPolicy",
+            script="""set -euo pipefail
+echo '>> Creating default-deny network policy for default namespace...'
+cat <<EOF | kubectl apply -f -
+apiVersion: networking.k8s.io/v1
+kind: NetworkPolicy
+metadata:
+  name: default-deny-all
+  namespace: default
+spec:
+  podSelector: {}
+  policyTypes:
+  - Ingress
+  - Egress
+EOF
+echo 'NetworkPolicy applied.'
+""",
+            timeout=30,
+        ),
+        ProvisionStep(
+            name="resource_quota",
+            title="Set Resource Quotas",
+            script="""set -euo pipefail
+echo '>> Setting resource quotas...'
+cat <<EOF | kubectl apply -f -
+apiVersion: v1
+kind: ResourceQuota
+metadata:
+  name: default-quota
+  namespace: default
+spec:
+  hard:
+    requests.cpu: "4"
+    requests.memory: 8Gi
+    limits.cpu: "8"
+    limits.memory: 16Gi
+    pods: "50"
+    services: "20"
+    persistentvolumeclaims: "10"
+EOF
+echo 'ResourceQuota applied.'
+""",
+            timeout=30,
+        ),
+        ProvisionStep(
+            name="limit_range",
+            title="Set Limit Ranges",
+            script="""set -euo pipefail
+echo '>> Setting limit ranges...'
+cat <<EOF | kubectl apply -f -
+apiVersion: v1
+kind: LimitRange
+metadata:
+  name: default-limits
+  namespace: default
+spec:
+  limits:
+  - default:
+      cpu: "500m"
+      memory: "512Mi"
+    defaultRequest:
+      cpu: "100m"
+      memory: "128Mi"
+    type: Container
+EOF
+echo 'LimitRange applied.'
+""",
+            timeout=30,
+        ),
+        ProvisionStep(
+            name="rbac_reader",
+            title="Create Read-Only ClusterRole",
+            script="""set -euo pipefail
+echo '>> Creating read-only ClusterRole...'
+cat <<EOF | kubectl apply -f -
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: cluster-reader
+rules:
+- apiGroups: [""]
+  resources: ["pods", "services", "namespaces", "nodes", "events", "configmaps"]
+  verbs: ["get", "list", "watch"]
+- apiGroups: ["apps"]
+  resources: ["deployments", "replicasets", "statefulsets", "daemonsets"]
+  verbs: ["get", "list", "watch"]
+- apiGroups: ["networking.k8s.io"]
+  resources: ["networkpolicies", "ingresses"]
+  verbs: ["get", "list", "watch"]
+EOF
+echo 'ClusterRole cluster-reader created.'
+""",
+            timeout=30,
+        ),
+        ProvisionStep(
+            name="audit_log_dir",
+            title="Ensure Audit Log Directory",
+            script="""set -euo pipefail
+echo '>> Ensuring audit log directory exists...'
+mkdir -p /var/log/kubernetes
+echo 'Audit log directory ready.'
+""",
+            timeout=15,
+            fatal=False,
+        ),
+    ]
+
+
+def execute_provision_steps(
+    node: dict,
+    steps: List[ProvisionStep],
+) -> List[tuple]:
+    """Execute a list of provision steps on a node.
+
+    Returns a list of (ProvisionStep, SSHResult) tuples.
+    Stops at the first fatal failure.
+    """
+    results: List[tuple] = []
+    for step in steps:
+        result = _run_step(node, step)
+        results.append((step, result))
+        if not result.success and step.fatal:
+            break
+    return results
+
+
+# ── Legacy wrapper functions (kept for backward compatibility) ────────────
+
+
 def provision_node_common(node: dict, profile: ClusterProfile) -> SSHResult:
     """Run the common setup script on a single node via SSH."""
     script = generate_common_setup_script(profile)

From 3abe2dd5e9ff7346bc49940c1b7a81f5f670b80b Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Mon, 6 Apr 2026 10:48:10 +0000
Subject: [PATCH 06/31] Replace streamlit-option-menu with native st.radio for
 reliable sidebar navigation

---
 k8s-agent/app.py           | 36 +++++++++++++++---------------------
 k8s-agent/requirements.txt |  1 -
 2 files changed, 15 insertions(+), 22 deletions(-)

diff --git a/k8s-agent/app.py b/k8s-agent/app.py
index c41ba61..8c354dd 100644
--- a/k8s-agent/app.py
+++ b/k8s-agent/app.py
@@ -8,7 +8,7 @@
 
 import json
 import streamlit as st
-from streamlit_option_menu import option_menu
+# Navigation uses native st.radio — no third-party component needed.
 
 import config
 from modules.profile_manager import (
@@ -193,26 +193,20 @@ def render_sidebar():
         st.divider()
 
         # ── Navigation ──
-        selected_page = option_menu(
-            menu_title="Navigation",
-            options=[
-                "Profile Manager",
-                "Cluster Creation",
-                "Cluster Debugger",
-                "Monitoring Setup",
-                "Log Analysis",
-                "AI Assistant",
-            ],
-            icons=[
-                "person-gear",
-                "hdd-rack",
-                "bug",
-                "graph-up",
-                "journal-text",
-                "robot",
-            ],
-            menu_icon="list",
-            default_index=0,
+        st.markdown("### Navigation")
+        nav_options = [
+            "Profile Manager",
+            "Cluster Creation",
+            "Cluster Debugger",
+            "Monitoring Setup",
+            "Log Analysis",
+            "AI Assistant",
+        ]
+        selected_page = st.radio(
+            "Go to",
+            options=nav_options,
+            index=0,
+            label_visibility="collapsed",
         )
 
         st.divider()
diff --git a/k8s-agent/requirements.txt b/k8s-agent/requirements.txt
index 96fb8cf..c9c4741 100644
--- a/k8s-agent/requirements.txt
+++ b/k8s-agent/requirements.txt
@@ -1,4 +1,3 @@
 streamlit>=1.32.0
 requests>=2.31.0
 plotly>=5.18.0
-streamlit-option-menu>=0.3.12

From 067861e9f28b902273d51cd7b2908b9be25092c4 Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Mon, 6 Apr 2026 11:04:57 +0000
Subject: [PATCH 07/31] Make LLM fully optional with graceful fallbacks and add
 offline manifest uploads

- Add is_llm_configured() helper to detect when LLM is not set up
- Make all LLM imports lazy to avoid errors when LLM deps missing
- Guard all AI-powered UI features with is_llm_configured() checks
- Show informative fallback messages when LLM is not configured
- Add Offline Manifests tab for uploading Flannel YAML and other files
- Add flannel_manifest_path/prometheus_manifest_path to ClusterProfile
- SCP user-provided Flannel manifest to nodes during provisioning
- Core features (cluster creation, debugging, monitoring, logs) work without LLM
---
 k8s-agent/app.py                      | 294 ++++++++++++++++++--------
 k8s-agent/config.py                   |   8 +
 k8s-agent/modules/cluster_creator.py  |  78 ++++++-
 k8s-agent/modules/cluster_debugger.py |  15 +-
 k8s-agent/modules/llm_client.py       |  19 +-
 k8s-agent/modules/log_analyzer.py     |  15 +-
 k8s-agent/modules/monitoring_setup.py |   8 +-
 k8s-agent/modules/profile_manager.py  |   3 +
 8 files changed, 336 insertions(+), 104 deletions(-)

diff --git a/k8s-agent/app.py b/k8s-agent/app.py
index 8c354dd..18f55fb 100644
--- a/k8s-agent/app.py
+++ b/k8s-agent/app.py
@@ -11,6 +11,7 @@
 # Navigation uses native st.radio — no third-party component needed.
 
 import config
+from config import is_llm_configured
 from modules.profile_manager import (
     ClusterProfile,
     save_profile,
@@ -32,6 +33,7 @@
     apply_best_practices,
     get_cluster_status,
     get_llm_cluster_advice,
+    upload_flannel_manifest_to_node,
     ProvisionStep,
     _run_step,
     get_common_setup_steps,
@@ -495,10 +497,11 @@ def page_cluster_creation():
 
     _show_profile_summary(profile)
 
-    tab_preflight, tab_provision, tab_scripts, tab_advice = st.tabs([
+    tab_preflight, tab_provision, tab_scripts, tab_manifests, tab_advice = st.tabs([
         "Pre-flight Checks",
         "Provision Cluster",
         "View Scripts",
+        "Offline Manifests",
         "AI Advice",
     ])
 
@@ -690,17 +693,83 @@ def page_cluster_creation():
         with st.expander("Best Practices Script", expanded=False):
             st.code(generate_best_practices_script(), language="bash")
 
+    # ── Offline Manifests ───────────────────────────────────────────────────
+    with tab_manifests:
+        st.markdown("### Offline / Custom Manifests")
+        st.markdown(
+            "If your environment cannot download manifests directly (air-gapped / proxy-restricted), "
+            "upload them here. They will be used instead of the default download URLs during provisioning."
+        )
+
+        st.markdown("#### Flannel CNI Manifest")
+        flannel_file = st.file_uploader(
+            "Upload kube-flannel.yml",
+            type=["yml", "yaml"],
+            key="flannel_upload",
+            help="Download from: https://github.com/flannel-io/flannel/releases/latest/download/kube-flannel.yml",
+        )
+        if flannel_file is not None:
+            flannel_path = os.path.join(config.UPLOADS_DIR, "kube-flannel.yml")
+            with open(flannel_path, "wb") as f:
+                f.write(flannel_file.getvalue())
+            profile.flannel_manifest_path = flannel_path
+            save_profile(profile)
+            st.success(f"Flannel manifest saved. It will be SCP'd to nodes during provisioning.")
+
+        if profile.flannel_manifest_path:
+            st.info(f"Current Flannel manifest: `{profile.flannel_manifest_path}`")
+            if st.button("Clear Flannel manifest (use default URL)", key="clear_flannel"):
+                profile.flannel_manifest_path = ""
+                save_profile(profile)
+                st.rerun()
+        else:
+            st.info("No custom manifest — Flannel will be downloaded from the official GitHub release URL.")
+
+        st.markdown("---")
+        st.markdown("#### Other Manifests")
+        st.markdown(
+            "You can also upload any additional YAML manifests. They will be stored "
+            "and can be applied manually via the **Custom Command** feature in the Cluster Debugger."
+        )
+        extra_file = st.file_uploader(
+            "Upload additional manifest (YAML)",
+            type=["yml", "yaml"],
+            key="extra_manifest_upload",
+        )
+        if extra_file is not None:
+            extra_path = os.path.join(config.UPLOADS_DIR, extra_file.name)
+            with open(extra_path, "wb") as f:
+                f.write(extra_file.getvalue())
+            st.success(f"Saved `{extra_file.name}` to uploads.")
+
+        # List existing uploaded files
+        if os.path.exists(config.UPLOADS_DIR):
+            uploaded_files = [
+                f for f in os.listdir(config.UPLOADS_DIR)
+                if f.endswith((".yml", ".yaml"))
+            ]
+            if uploaded_files:
+                st.markdown("**Uploaded manifests:**")
+                for fname in sorted(uploaded_files):
+                    st.markdown(f"- `{fname}`")
+
     # ── AI Advice ─────────────────────────────────────────────────────────
     with tab_advice:
         st.markdown("### AI Cluster Setup Advisor")
-        context = st.text_area(
-            "Additional context or questions",
-            placeholder="e.g., We have 3 nodes with 16GB RAM each. Any special considerations?",
-        )
-        if st.button("Get AI Recommendations", type="primary"):
-            with st.spinner("Analyzing your cluster configuration..."):
-                advice = get_llm_cluster_advice(profile, context)
-                st.markdown(advice)
+        if not is_llm_configured():
+            st.info(
+                "LLM is not configured. Set `LLM_API_URL` and `LLM_API_KEY` "
+                "environment variables to enable AI-powered recommendations."
+            )
+        else:
+            context = st.text_area(
+                "Additional context or questions",
+                placeholder="e.g., We have 3 nodes with 16GB RAM each. Any special considerations?",
+            )
+            if st.button("Get AI Recommendations", type="primary"):
+                with st.spinner("Analyzing your cluster configuration..."):
+                    advice = get_llm_cluster_advice(profile, context)
+                    st.markdown(advice)
 
 
 # ══════════════════════════════════════════════════════════════════════════
@@ -709,7 +778,7 @@ def page_cluster_creation():
 
 def page_cluster_debugger():
     st.markdown("## Cluster Debugger")
-    st.markdown("Diagnose issues and get AI-powered recommendations.")
+    st.markdown("Diagnose issues and get recommendations.")
 
     profile = _get_active_profile()
     if not profile:
@@ -758,13 +827,16 @@ def page_cluster_debugger():
                 with st.expander(f"{'✅' if result.success else '❌'} {name}", expanded=not result.success):
                     st.code(result.stdout if result.success else result.stderr, language="text")
 
-        if st.session_state.debug_results and st.button("Analyze with AI", type="secondary"):
-            with st.spinner("AI is analyzing diagnostics..."):
-                analysis = analyze_diagnostics(
-                    st.session_state.debug_results,
-                    profile=profile,
-                )
-                st.markdown(analysis)
+        if st.session_state.debug_results:
+            if not is_llm_configured():
+                st.info("Enable AI analysis by setting `LLM_API_URL` and `LLM_API_KEY` env vars.")
+            elif st.button("Analyze with AI", type="secondary"):
+                with st.spinner("AI is analyzing diagnostics..."):
+                    analysis = analyze_diagnostics(
+                        st.session_state.debug_results,
+                        profile=profile,
+                    )
+                    st.markdown(analysis)
 
     # ── Category Scan ─────────────────────────────────────────────────────
     with tab_category:
@@ -779,10 +851,11 @@ def page_cluster_debugger():
                 with st.expander(f"{'✅' if result.success else '❌'} {name}"):
                     st.code(result.stdout if result.success else result.stderr, language="text")
 
-            if st.button("Analyze Category with AI", key="cat_ai"):
-                with st.spinner("Analyzing..."):
-                    analysis = analyze_diagnostics(results, profile=profile)
-                    st.markdown(analysis)
+            if is_llm_configured():
+                if st.button("Analyze Category with AI", key="cat_ai"):
+                    with st.spinner("Analyzing..."):
+                        analysis = analyze_diagnostics(results, profile=profile)
+                        st.markdown(analysis)
 
     # ── Custom Command ────────────────────────────────────────────────────
     with tab_custom:
@@ -805,43 +878,53 @@ def page_cluster_debugger():
     # ── AI Debug Assistant ────────────────────────────────────────────────
     with tab_ai:
         st.markdown("### AI Debug Assistant")
-        st.markdown("Describe your issue and get AI-powered debugging help.")
+        if not is_llm_configured():
+            st.info(
+                "LLM is not configured. Set `LLM_API_URL` and `LLM_API_KEY` "
+                "environment variables to enable AI-powered debugging."
+            )
+            st.markdown(
+                "You can still use the **Quick Diagnostics**, **Category Scan**, and "
+                "**Custom Command** tabs to collect diagnostic data without an LLM."
+            )
+        else:
+            st.markdown("Describe your issue and get AI-powered debugging help.")
 
-        issue = st.text_area(
-            "Describe the issue",
-            placeholder="e.g., Pods are stuck in CrashLoopBackOff in the default namespace",
-            height=120,
-        )
+            issue = st.text_area(
+                "Describe the issue",
+                placeholder="e.g., Pods are stuck in CrashLoopBackOff in the default namespace",
+                height=120,
+            )
 
-        col1, col2 = st.columns(2)
-        with col1:
-            auto_collect = st.checkbox("Auto-collect relevant diagnostics", value=True)
-        with col2:
-            check_pods = st.checkbox("Check for problematic pods", value=True)
-
-        if st.button("Debug", type="primary", key="ai_debug") and issue:
-            collected_data = ""
-
-            if check_pods:
-                with st.spinner("Checking pod issues..."):
-                    pod_result = check_pod_issues(cp_node)
-                    if pod_result.success and pod_result.stdout.strip():
-                        collected_data += f"\n\nProblematic Pods:\n{pod_result.stdout}"
-                        with st.expander("Problematic Pods"):
-                            st.code(pod_result.stdout, language="text")
-
-            if auto_collect:
-                with st.spinner("Collecting diagnostics..."):
-                    diag_results = run_category_diagnostics(cp_node, "Cluster Overview")
-                    for name, result in diag_results.items():
-                        if result.success:
-                            collected_data += f"\n\n{name}:\n{result.stdout}"
+            col1, col2 = st.columns(2)
+            with col1:
+                auto_collect = st.checkbox("Auto-collect relevant diagnostics", value=True)
+            with col2:
+                check_pods = st.checkbox("Check for problematic pods", value=True)
+
+            if st.button("Debug", type="primary", key="ai_debug") and issue:
+                collected_data = ""
+
+                if check_pods:
+                    with st.spinner("Checking pod issues..."):
+                        pod_result = check_pod_issues(cp_node)
+                        if pod_result.success and pod_result.stdout.strip():
+                            collected_data += f"\n\nProblematic Pods:\n{pod_result.stdout}"
+                            with st.expander("Problematic Pods"):
+                                st.code(pod_result.stdout, language="text")
+
+                if auto_collect:
+                    with st.spinner("Collecting diagnostics..."):
+                        diag_results = run_category_diagnostics(cp_node, "Cluster Overview")
+                        for name, result in diag_results.items():
+                            if result.success:
+                                collected_data += f"\n\n{name}:\n{result.stdout}"
 
-            with st.spinner("AI is analyzing the issue..."):
-                full_context = f"Issue: {issue}\n\nCollected Data:{collected_data}"
-                suggestion = get_debug_suggestion(issue, collected_data)
-                st.markdown("### AI Recommendation")
-                st.markdown(suggestion)
+                with st.spinner("AI is analyzing the issue..."):
+                    full_context = f"Issue: {issue}\n\nCollected Data:{collected_data}"
+                    suggestion = get_debug_suggestion(issue, collected_data)
+                    st.markdown("### AI Recommendation")
+                    st.markdown(suggestion)
 
 
 # ══════════════════════════════════════════════════════════════════════════
@@ -978,15 +1061,21 @@ def page_monitoring_setup():
     # ── AI Advice ─────────────────────────────────────────────────────────
     with tab_advice:
         st.markdown("### AI Monitoring Advisor")
-        if st.button("Get Monitoring Recommendations", type="primary", key="mon_advice"):
-            current_status = ""
-            status_result = get_monitoring_status(cp_node, namespace)
-            if status_result.success:
-                current_status = status_result.stdout
+        if not is_llm_configured():
+            st.info(
+                "LLM is not configured. Set `LLM_API_URL` and `LLM_API_KEY` "
+                "environment variables to enable AI-powered monitoring advice."
+            )
+        else:
+            if st.button("Get Monitoring Recommendations", type="primary", key="mon_advice"):
+                current_status = ""
+                status_result = get_monitoring_status(cp_node, namespace)
+                if status_result.success:
+                    current_status = status_result.stdout
 
-            with st.spinner("Getting AI recommendations..."):
-                advice = get_monitoring_advice(profile, current_status)
-                st.markdown(advice)
+                with st.spinner("Getting AI recommendations..."):
+                    advice = get_monitoring_advice(profile, current_status)
+                    st.markdown(advice)
 
 
 # ══════════════════════════════════════════════════════════════════════════
@@ -1098,12 +1187,13 @@ def page_log_analysis():
 
                     st.code(result.stdout[-5000:], language="text")
 
-                    if analysis.error_count > 0 and st.button("Analyze with AI", key="pod_ai"):
-                        with st.spinner("AI analyzing pod logs..."):
-                            ai_analysis = llm_analyze_logs(
-                                result.stdout, f"{pod_ns}/{pod_name}"
-                            )
-                            st.markdown(ai_analysis)
+                    if analysis.error_count > 0 and is_llm_configured():
+                        if st.button("Analyze with AI", key="pod_ai"):
+                            with st.spinner("AI analyzing pod logs..."):
+                                ai_analysis = llm_analyze_logs(
+                                    result.stdout, f"{pod_ns}/{pod_name}"
+                                )
+                                st.markdown(ai_analysis)
                 else:
                     st.error("Failed to fetch pod logs")
                     st.code(result.stderr, language="text")
@@ -1142,33 +1232,45 @@ def page_log_analysis():
                 st.info("No correlated errors found across sources.")
 
             # LLM correlation analysis
-            if st.button("Deep AI Correlation Analysis", key="deep_corr"):
-                multi_logs = {
-                    src: res.stdout for src, res in results.items() if res.success
-                }
-                with st.spinner("AI is performing deep correlation analysis..."):
-                    analysis = llm_correlate_analysis(multi_logs)
-                    st.markdown(analysis)
+            if is_llm_configured():
+                if st.button("Deep AI Correlation Analysis", key="deep_corr"):
+                    multi_logs = {
+                        src: res.stdout for src, res in results.items() if res.success
+                    }
+                    with st.spinner("AI is performing deep correlation analysis..."):
+                        analysis = llm_correlate_analysis(multi_logs)
+                        st.markdown(analysis)
 
     # ── AI Log Analysis ───────────────────────────────────────────────────
     with tab_ai:
         st.markdown("### AI-Powered Log Analysis")
-        st.markdown("Paste logs or describe an issue for AI analysis.")
+        if not is_llm_configured():
+            st.info(
+                "LLM is not configured. Set `LLM_API_URL` and `LLM_API_KEY` "
+                "environment variables to enable AI-powered log analysis."
+            )
+            st.markdown(
+                "You can still use the **System Logs**, **Pod Logs**, and "
+                "**Error Correlation** tabs — they work without an LLM and provide "
+                "automated pattern matching and error grouping."
+            )
+        else:
+            st.markdown("Paste logs or describe an issue for AI analysis.")
 
-        log_input = st.text_area(
-            "Paste log output",
-            height=200,
-            placeholder="Paste your Kubernetes logs here...",
-        )
-        context_input = st.text_input(
-            "Additional context",
-            placeholder="e.g., This started happening after we upgraded to K8s 1.30",
-        )
+            log_input = st.text_area(
+                "Paste log output",
+                height=200,
+                placeholder="Paste your Kubernetes logs here...",
+            )
+            context_input = st.text_input(
+                "Additional context",
+                placeholder="e.g., This started happening after we upgraded to K8s 1.30",
+            )
 
-        if st.button("Analyze Logs", type="primary", key="ai_log_analyze") and log_input:
-            with st.spinner("AI is analyzing logs..."):
-                analysis = llm_analyze_logs(log_input, context=context_input)
-                st.markdown(analysis)
+            if st.button("Analyze Logs", type="primary", key="ai_log_analyze") and log_input:
+                with st.spinner("AI is analyzing logs..."):
+                    analysis = llm_analyze_logs(log_input, context=context_input)
+                    st.markdown(analysis)
 
 
 # ══════════════════════════════════════════════════════════════════════════
@@ -1177,6 +1279,18 @@ def page_log_analysis():
 
 def page_ai_assistant():
     st.markdown("## AI Kubernetes Assistant")
+
+    if not is_llm_configured():
+        st.info(
+            "LLM is not configured. Set `LLM_API_URL` and `LLM_API_KEY` "
+            "environment variables to enable the AI chat assistant."
+        )
+        st.markdown(
+            "All other features (Cluster Creation, Debugging, Monitoring, Log Analysis) "
+            "work without an LLM. Only the AI-powered analysis and chat features require it."
+        )
+        return
+
     st.markdown("Chat with the AI about any Kubernetes topic.")
 
     # Chat history
diff --git a/k8s-agent/config.py b/k8s-agent/config.py
index e46dd95..14fb427 100644
--- a/k8s-agent/config.py
+++ b/k8s-agent/config.py
@@ -12,10 +12,18 @@
 LLM_TEMPERATURE = float(os.getenv("LLM_TEMPERATURE", "0.3"))
 LLM_MAX_TOKENS = int(os.getenv("LLM_MAX_TOKENS", "4096"))
 
+
+def is_llm_configured() -> bool:
+    """Return True if the LLM endpoint and API key are both set."""
+    return bool(LLM_API_URL and LLM_API_KEY)
+
+
 # Application paths
 DATA_DIR = os.path.join(os.path.dirname(__file__), "data")
 PROFILES_DIR = os.path.join(DATA_DIR, "profiles")
 TEMPLATES_DIR = os.path.join(os.path.dirname(__file__), "templates")
+UPLOADS_DIR = os.path.join(DATA_DIR, "uploads")
 
 # Ensure directories exist
 os.makedirs(PROFILES_DIR, exist_ok=True)
+os.makedirs(UPLOADS_DIR, exist_ok=True)
diff --git a/k8s-agent/modules/cluster_creator.py b/k8s-agent/modules/cluster_creator.py
index d7935e1..22dcfd9 100644
--- a/k8s-agent/modules/cluster_creator.py
+++ b/k8s-agent/modules/cluster_creator.py
@@ -5,9 +5,11 @@
 from dataclasses import dataclass, field
 from typing import List, Optional
 
-from modules.llm_client import query_llm
 from modules.profile_manager import ClusterProfile
 
+# Default Flannel manifest URL — can be overridden by user-uploaded file
+FLANNEL_MANIFEST_URL = "https://github.com/flannel-io/flannel/releases/latest/download/kube-flannel.yml"
+
 
 @dataclass
 class SSHResult:
@@ -384,7 +386,12 @@ def generate_control_plane_init_script(profile: ClusterProfile) -> str:
 
 # ── Install Flannel CNI ───────────────────────────────────────────────────
 echo ">> Installing Flannel CNI..."
-kubectl apply -f https://github.com/flannel-io/flannel/releases/latest/download/kube-flannel.yml
+if [ -f /tmp/kube-flannel-custom.yml ]; then
+    echo ">> Using user-provided Flannel manifest..."
+    kubectl apply -f /tmp/kube-flannel-custom.yml
+else
+    kubectl apply -f {FLANNEL_MANIFEST_URL}
+fi
 
 # Wait for Flannel to be ready
 echo ">> Waiting for Flannel pods to be ready..."
@@ -850,12 +857,22 @@ def get_control_plane_steps(profile: ClusterProfile) -> List[ProvisionStep]:
     ))
 
     # 3. Install Flannel CNI
+    flannel_manifest = profile.flannel_manifest_path or FLANNEL_MANIFEST_URL
+    # If the user uploaded a local file we SCP it first; otherwise download URL
+    if profile.flannel_manifest_path:
+        flannel_apply = (
+            "echo '>> Using user-provided Flannel manifest...'\n"
+            "kubectl apply -f /tmp/kube-flannel-custom.yml"
+        )
+    else:
+        flannel_apply = f"kubectl apply -f {FLANNEL_MANIFEST_URL}"
+
     steps.append(ProvisionStep(
         name="install_flannel",
         title="Install Flannel CNI",
-        script="""set -euo pipefail
+        script=f"""set -euo pipefail
 echo '>> Installing Flannel CNI...'
-kubectl apply -f https://github.com/flannel-io/flannel/releases/latest/download/kube-flannel.yml
+{flannel_apply}
 echo '>> Waiting for Flannel pods to be ready...'
 kubectl -n kube-flannel wait --for=condition=ready pod -l app=flannel --timeout=120s || true
 echo 'Flannel CNI installed.'
@@ -1125,7 +1142,12 @@ def get_cluster_status(control_plane_node: dict) -> SSHResult:
 
 
 def get_llm_cluster_advice(profile: ClusterProfile, context: str = "") -> str:
-    """Ask the LLM for cluster setup advice based on the profile."""
+    """Ask the LLM for cluster setup advice based on the profile.
+
+    Returns a graceful message when the LLM is not configured.
+    """
+    from modules.llm_client import query_llm  # lazy import — LLM is optional
+
     nodes_desc = []
     for n in profile.nodes:
         nodes_desc.append(f"  - {n.get('hostname', 'unknown')} ({n['ip_address']}) — role: {n['role']}")
@@ -1159,3 +1181,49 @@ def get_llm_cluster_advice(profile: ClusterProfile, context: str = "") -> str:
 4. Network configuration tips for Flannel with CRI-O
 """
     return query_llm(prompt)
+
+
+def upload_flannel_manifest_to_node(node: dict, local_path: str) -> SSHResult:
+    """SCP a user-provided Flannel manifest to a node as /tmp/kube-flannel-custom.yml."""
+    scp_cmd = [
+        "scp",
+        "-o", "StrictHostKeyChecking=no",
+        "-o", "UserKnownHostsFile=/dev/null",
+        "-P", str(node.get("ssh_port", 22)),
+        "-i", node.get("ssh_key_path", "~/.ssh/id_rsa"),
+        local_path,
+        f"{node.get('ssh_user', 'root')}@{node['ip_address']}:/tmp/kube-flannel-custom.yml",
+    ]
+    try:
+        proc = subprocess.run(
+            scp_cmd,
+            capture_output=True,
+            text=True,
+            timeout=60,
+        )
+        return SSHResult(
+            hostname=node["ip_address"],
+            command="scp flannel manifest",
+            return_code=proc.returncode,
+            stdout=proc.stdout,
+            stderr=proc.stderr,
+            success=proc.returncode == 0,
+        )
+    except subprocess.TimeoutExpired:
+        return SSHResult(
+            hostname=node["ip_address"],
+            command="scp flannel manifest",
+            return_code=-1,
+            stdout="",
+            stderr="SCP timed out after 60 seconds",
+            success=False,
+        )
+    except Exception as exc:
+        return SSHResult(
+            hostname=node["ip_address"],
+            command="scp flannel manifest",
+            return_code=-1,
+            stdout="",
+            stderr=str(exc),
+            success=False,
+        )
diff --git a/k8s-agent/modules/cluster_debugger.py b/k8s-agent/modules/cluster_debugger.py
index 9ee9dd1..b6798cc 100644
--- a/k8s-agent/modules/cluster_debugger.py
+++ b/k8s-agent/modules/cluster_debugger.py
@@ -1,7 +1,6 @@
 """Cluster Debugger — Diagnose K8s issues and provide LLM-powered recommendations."""
 
 from modules.cluster_creator import run_ssh_command, SSHResult
-from modules.llm_client import query_llm, stream_llm
 from modules.profile_manager import ClusterProfile
 
 
@@ -151,7 +150,12 @@ def analyze_diagnostics(
     user_description: str = "",
     profile: ClusterProfile | None = None,
 ) -> str:
-    """Send diagnostic results to the LLM for analysis and recommendations."""
+    """Send diagnostic results to the LLM for analysis and recommendations.
+
+    Returns a graceful message when the LLM is not configured.
+    """
+    from modules.llm_client import query_llm  # lazy import — LLM is optional
+
     diag_text = format_diagnostics_for_llm(results)
 
     cluster_info = ""
@@ -190,7 +194,12 @@ def get_debug_suggestion(
     error_message: str,
     context: str = "",
 ) -> str:
-    """Get a quick debugging suggestion from the LLM for a specific error."""
+    """Get a quick debugging suggestion from the LLM for a specific error.
+
+    Returns a graceful message when the LLM is not configured.
+    """
+    from modules.llm_client import query_llm  # lazy import — LLM is optional
+
     prompt = f"""I encountered the following error in my Kubernetes cluster (CRI-O + Flannel):
 
 Error: {error_message}
diff --git a/k8s-agent/modules/llm_client.py b/k8s-agent/modules/llm_client.py
index 2b7eb44..fc77be1 100644
--- a/k8s-agent/modules/llm_client.py
+++ b/k8s-agent/modules/llm_client.py
@@ -1,4 +1,9 @@
-"""LLM client for the Infosys AI Gateway."""
+"""LLM client — optional integration with an OpenAI-compatible endpoint.
+
+All public functions gracefully return a fallback message when the LLM is not
+configured (i.e. ``LLM_API_KEY`` or ``LLM_API_URL`` is empty).  The rest of the
+application works without any LLM dependency.
+"""
 
 import json
 from typing import Generator, Optional
@@ -7,6 +12,11 @@
 
 import config
 
+_NOT_CONFIGURED_MSG = (
+    "LLM is not configured. Set the LLM_API_URL and LLM_API_KEY environment "
+    "variables to enable AI-powered features."
+)
+
 
 SYSTEM_PROMPT = """You are an expert Kubernetes platform engineer specializing in on-premises
 cluster administration. You have deep knowledge of:
@@ -63,6 +73,9 @@ def query_llm(
         "max_tokens": max_tokens if max_tokens is not None else config.LLM_MAX_TOKENS,
     }
 
+    if not config.is_llm_configured():
+        return _NOT_CONFIGURED_MSG
+
     try:
         response = requests.post(
             config.LLM_API_URL,
@@ -117,6 +130,10 @@ def stream_llm(
         "stream": True,
     }
 
+    if not config.is_llm_configured():
+        yield _NOT_CONFIGURED_MSG
+        return
+
     try:
         response = requests.post(
             config.LLM_API_URL,
diff --git a/k8s-agent/modules/log_analyzer.py b/k8s-agent/modules/log_analyzer.py
index 9abc838..fea9438 100644
--- a/k8s-agent/modules/log_analyzer.py
+++ b/k8s-agent/modules/log_analyzer.py
@@ -6,7 +6,6 @@
 from typing import Optional
 
 from modules.cluster_creator import run_ssh_command, SSHResult
-from modules.llm_client import query_llm
 
 
 @dataclass
@@ -274,7 +273,12 @@ def llm_analyze_logs(
     source: str = "",
     context: str = "",
 ) -> str:
-    """Send log output to the LLM for deep analysis."""
+    """Send log output to the LLM for deep analysis.
+
+    Returns a graceful message when the LLM is not configured.
+    """
+    from modules.llm_client import query_llm  # lazy import — LLM is optional
+
     truncated = log_text[-8000:] if len(log_text) > 8000 else log_text
 
     prompt = f"""Analyze the following Kubernetes logs and provide a detailed assessment.
@@ -301,7 +305,12 @@ def llm_correlate_analysis(
     multi_source_logs: dict[str, str],
     issue_description: str = "",
 ) -> str:
-    """Send logs from multiple sources to the LLM for cross-source correlation."""
+    """Send logs from multiple sources to the LLM for cross-source correlation.
+
+    Returns a graceful message when the LLM is not configured.
+    """
+    from modules.llm_client import query_llm  # lazy import — LLM is optional
+
     log_sections = []
     for source, log_text in multi_source_logs.items():
         truncated = log_text[-3000:] if len(log_text) > 3000 else log_text
diff --git a/k8s-agent/modules/monitoring_setup.py b/k8s-agent/modules/monitoring_setup.py
index fdf8b42..a2272d7 100644
--- a/k8s-agent/modules/monitoring_setup.py
+++ b/k8s-agent/modules/monitoring_setup.py
@@ -1,7 +1,6 @@
 """Monitoring Setup — Prometheus, Grafana, and dashboard provisioning via SSH."""
 
 from modules.cluster_creator import run_ssh_command, SSHResult
-from modules.llm_client import query_llm
 from modules.profile_manager import ClusterProfile
 
 
@@ -420,7 +419,12 @@ def get_monitoring_advice(
     profile: ClusterProfile,
     current_status: str = "",
 ) -> str:
-    """Ask the LLM for monitoring setup advice."""
+    """Ask the LLM for monitoring setup advice.
+
+    Returns a graceful message when the LLM is not configured.
+    """
+    from modules.llm_client import query_llm  # lazy import — LLM is optional
+
     prompt = f"""I have a Kubernetes cluster with the following setup:
 - Kubernetes: {profile.kubernetes_version}
 - Runtime: CRI-O {profile.crio_version}
diff --git a/k8s-agent/modules/profile_manager.py b/k8s-agent/modules/profile_manager.py
index 9754801..90cd9bf 100644
--- a/k8s-agent/modules/profile_manager.py
+++ b/k8s-agent/modules/profile_manager.py
@@ -51,6 +51,9 @@ class ClusterProfile:
     no_proxy: str = ""
     http_proxy_alt: str = ""  # alternate proxy
     https_proxy_alt: str = ""  # alternate proxy
+    # Offline manifest paths — user-provided files for air-gapped environments
+    flannel_manifest_path: str = ""  # local path to kube-flannel.yml
+    prometheus_manifest_path: str = ""  # local path to prometheus manifest
 
     def get_control_plane_nodes(self) -> list[dict]:
         return [n for n in self.nodes if n.get("role") == "control-plane"]

From 58342262d21025b5946f31c715f11eeb28396ef9 Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Mon, 6 Apr 2026 12:34:53 +0000
Subject: [PATCH 08/31] Add imported cluster support, upgrade planner, version
 1.35, PSS explanations, graphical events timeline

---
 k8s-agent/app.py                      | 1148 +++++++++++++++++++++++--
 k8s-agent/modules/cluster_creator.py  |   84 ++
 k8s-agent/modules/cluster_debugger.py |  138 ++-
 k8s-agent/modules/log_analyzer.py     |  115 ++-
 k8s-agent/modules/monitoring_setup.py |  117 +--
 k8s-agent/modules/profile_manager.py  |    3 +
 6 files changed, 1441 insertions(+), 164 deletions(-)

diff --git a/k8s-agent/app.py b/k8s-agent/app.py
index 18f55fb..0000017 100644
--- a/k8s-agent/app.py
+++ b/k8s-agent/app.py
@@ -34,6 +34,7 @@
     get_cluster_status,
     get_llm_cluster_advice,
     upload_flannel_manifest_to_node,
+    run_kubectl,
     ProvisionStep,
     _run_step,
     get_common_setup_steps,
@@ -43,7 +44,9 @@
 )
 from modules.cluster_debugger import (
     DIAGNOSTIC_COMMANDS,
+    KUBECTL_DIAGNOSTIC_COMMANDS,
     CATEGORY_MAP,
+    get_available_commands,
     run_diagnostic,
     run_category_diagnostics,
     run_all_diagnostics,
@@ -66,6 +69,7 @@
 )
 from modules.log_analyzer import (
     LOG_SOURCES,
+    get_available_log_sources,
     collect_logs,
     collect_pod_logs,
     collect_multi_source_logs,
@@ -182,11 +186,16 @@ def render_sidebar():
                         f"**Status:** <span class='{status_class}'>{profile.status.upper()}</span>",
                         unsafe_allow_html=True,
                     )
-                    st.caption(
-                        f"K8s {profile.kubernetes_version} | CRI-O {profile.crio_version} | "
-                        f"{len(profile.get_control_plane_nodes())} CP + "
-                        f"{len(profile.get_worker_nodes())} Workers"
-                    )
+                    if profile.cluster_source == "imported":
+                        st.caption(
+                            f"K8s {profile.kubernetes_version} | Imported Cluster"
+                        )
+                    else:
+                        st.caption(
+                            f"K8s {profile.kubernetes_version} | CRI-O {profile.crio_version} | "
+                            f"{len(profile.get_control_plane_nodes())} CP + "
+                            f"{len(profile.get_worker_nodes())} Workers"
+                        )
             else:
                 st.session_state.active_profile = None
         else:
@@ -199,9 +208,11 @@ def render_sidebar():
         nav_options = [
             "Profile Manager",
             "Cluster Creation",
+            "Resource Viewer",
             "Cluster Debugger",
             "Monitoring Setup",
             "Log Analysis",
+            "Upgrade Planner",
             "AI Assistant",
         ]
         selected_page = st.radio(
@@ -247,7 +258,9 @@ def page_profile_manager():
     st.markdown("## Cluster Profile Manager")
     st.markdown("Create, edit, and manage profiles for your on-prem Kubernetes clusters.")
 
-    tab_create, tab_list, tab_import = st.tabs(["Create Profile", "Manage Profiles", "Import / Export"])
+    tab_create, tab_import_cluster, tab_list, tab_import = st.tabs([
+        "Create Profile", "Import Existing Cluster", "Manage Profiles", "Import / Export",
+    ])
 
     # ── Create Profile ────────────────────────────────────────────────────
     with tab_create:
@@ -258,13 +271,46 @@ def page_profile_manager():
             with col1:
                 name = st.text_input("Profile Name *", placeholder="production-cluster")
                 description = st.text_area("Description", placeholder="Production on-prem cluster")
-                k8s_version = st.selectbox("Kubernetes Version", ["1.30", "1.29", "1.28", "1.27"], index=0)
-                crio_version = st.selectbox("CRI-O Version", ["1.30", "1.29", "1.28", "1.27"], index=0)
+                k8s_version = st.selectbox(
+                    "Kubernetes Version",
+                    ["1.35", "1.34", "1.33", "1.32", "1.31", "1.30", "1.29", "1.28", "1.27"],
+                    index=0,
+                )
+                crio_version = st.selectbox(
+                    "CRI-O Version",
+                    ["1.35", "1.34", "1.33", "1.32", "1.31", "1.30", "1.29", "1.28", "1.27"],
+                    index=0,
+                )
                 pod_security = st.selectbox(
                     "Pod Security Standard",
                     ["restricted", "baseline", "privileged"],
                     index=0,
+                    help="Controls what pods are allowed to run in the cluster.",
                 )
+                # Explain each PSS level
+                with st.expander("What do these Pod Security Standards mean?"):
+                    st.markdown(
+                        "**Restricted** (most secure)\n"
+                        "- Heavily restricted policy following Pod hardening best practices.\n"
+                        "- Disallows privilege escalation, host namespaces, host paths, and most Linux capabilities.\n"
+                        "- Containers must run as non-root with a read-only root filesystem.\n"
+                        "- Only allows seccomp profile RuntimeDefault or Localhost.\n"
+                        "- Best for: production workloads, multi-tenant clusters, security-sensitive environments.\n\n"
+                        "**Baseline** (moderate)\n"
+                        "- Minimally restrictive policy that prevents known privilege escalations.\n"
+                        "- Allows most default Kubernetes configurations but blocks hostNetwork, hostPID, hostIPC.\n"
+                        "- Containers can run as root but cannot use privileged mode.\n"
+                        "- Allows all seccomp profiles.\n"
+                        "- Best for: general workloads, development/staging, teams new to PSS.\n\n"
+                        "**Privileged** (unrestricted)\n"
+                        "- Completely unrestricted policy — no security restrictions enforced.\n"
+                        "- Allows privileged containers, host namespaces, host paths, any capabilities.\n"
+                        "- Containers can run as root with full access to the host.\n"
+                        "- Best for: system-level workloads (monitoring agents, CNI plugins, storage drivers), "
+                        "trusted single-tenant clusters.\n\n"
+                        "**Recommendation:** Start with *Restricted* and relax to *Baseline* only for "
+                        "workloads that require it. Avoid *Privileged* unless absolutely necessary."
+                    )
 
             with col2:
                 pod_cidr = st.text_input("Pod CIDR", value="10.244.0.0/16")
@@ -409,6 +455,64 @@ def page_profile_manager():
                     st.success(f"Profile '{name}' created successfully!")
                     st.rerun()
 
+    # ── Import Existing Cluster ──────────────────────────────────────────
+    with tab_import_cluster:
+        st.markdown("### Import Existing Kubernetes Cluster")
+        st.markdown(
+            "Connect to an existing K8s cluster by uploading its **kubeconfig** file. "
+            "This lets you use the Debugger, Monitoring, Log Analysis, and Resource Viewer "
+            "without provisioning a new cluster."
+        )
+
+        with st.form("import_cluster_form"):
+            import_name = st.text_input(
+                "Profile Name *",
+                placeholder="my-existing-cluster",
+            )
+            import_desc = st.text_area(
+                "Description",
+                placeholder="Production cluster running in datacenter A",
+            )
+            kubeconfig_file = st.file_uploader(
+                "Upload kubeconfig file",
+                type=["yaml", "yml", "conf", "config"],
+                key="kubeconfig_upload",
+                help="Usually found at ~/.kube/config on your cluster's control-plane node.",
+            )
+            k8s_ver = st.text_input(
+                "Kubernetes Version (optional)",
+                placeholder="1.30",
+                value="1.30",
+            )
+
+            submitted_import = st.form_submit_button(
+                "Import Cluster", type="primary", use_container_width=True,
+            )
+
+            if submitted_import:
+                if not import_name:
+                    st.error("Profile name is required.")
+                elif not kubeconfig_file:
+                    st.error("Please upload a kubeconfig file.")
+                else:
+                    kubeconfig_content = kubeconfig_file.read().decode("utf-8")
+                    profile = ClusterProfile(
+                        name=import_name,
+                        description=import_desc,
+                        kubernetes_version=k8s_ver or "1.30",
+                        status="active",
+                        cluster_source="imported",
+                        kubeconfig_content=kubeconfig_content,
+                    )
+                    save_profile(profile)
+                    st.session_state.active_profile = import_name
+                    st.success(
+                        f"Cluster '{import_name}' imported! "
+                        "Select it from the sidebar to start using Debugger, Monitoring, "
+                        "Resource Viewer, etc."
+                    )
+                    st.rerun()
+
     # ── Manage Profiles ───────────────────────────────────────────────────
     with tab_list:
         profiles = list_profiles()
@@ -784,11 +888,16 @@ def page_cluster_debugger():
     if not profile:
         return
 
-    cp_nodes = profile.get_control_plane_nodes()
-    if not cp_nodes:
-        st.error("No control-plane node defined in this profile.")
-        return
-    cp_node = cp_nodes[0]
+    # For imported clusters we don't need a CP node — commands run locally via kubeconfig
+    cp_node = None
+    if profile.cluster_source != "imported":
+        cp_nodes = profile.get_control_plane_nodes()
+        if not cp_nodes:
+            st.error("No control-plane node defined in this profile.")
+            return
+        cp_node = cp_nodes[0]
+
+    available_commands = get_available_commands(profile)
 
     tab_quick, tab_category, tab_custom, tab_ai = st.tabs([
         "Quick Diagnostics",
@@ -804,7 +913,7 @@ def page_cluster_debugger():
         with col1:
             selected_checks = st.multiselect(
                 "Select checks to run",
-                options=list(DIAGNOSTIC_COMMANDS.keys()),
+                options=list(available_commands.keys()),
                 default=["Node Status", "Pod Status (All Namespaces)", "Events (Recent)"],
             )
         with col2:
@@ -813,12 +922,12 @@ def page_cluster_debugger():
         if st.button("Run Diagnostics", type="primary"):
             if run_all:
                 with st.spinner("Running all diagnostics..."):
-                    results = run_all_diagnostics(cp_node)
+                    results = run_all_diagnostics(cp_node, profile=profile)
             else:
                 results = {}
                 for check in selected_checks:
                     with st.spinner(f"Running: {check}..."):
-                        results[check] = run_diagnostic(cp_node, check)
+                        results[check] = run_diagnostic(cp_node, check, profile=profile)
 
             st.session_state.debug_results = results
 
@@ -845,7 +954,7 @@ def page_cluster_debugger():
 
         if st.button("Run Category Scan", type="primary", key="cat_scan"):
             with st.spinner(f"Running {category} diagnostics..."):
-                results = run_category_diagnostics(cp_node, category)
+                results = run_category_diagnostics(cp_node, category, profile=profile)
 
             for name, result in results.items():
                 with st.expander(f"{'✅' if result.success else '❌'} {name}"):
@@ -860,7 +969,10 @@ def page_cluster_debugger():
     # ── Custom Command ────────────────────────────────────────────────────
     with tab_custom:
         st.markdown("### Run Custom Command")
-        st.warning("Commands execute on the control-plane node via SSH.")
+        if profile.cluster_source == "imported":
+            st.info("Commands run locally via kubectl using the imported kubeconfig.")
+        else:
+            st.warning("Commands execute on the control-plane node via SSH.")
         custom_cmd = st.text_area(
             "Command",
             placeholder="kubectl get pods -A -o wide",
@@ -868,7 +980,7 @@ def page_cluster_debugger():
         )
         if st.button("Execute", type="primary", key="exec_custom") and custom_cmd:
             with st.spinner("Executing..."):
-                result = run_custom_command(cp_node, custom_cmd)
+                result = run_custom_command(cp_node, custom_cmd, profile=profile)
                 if result.success:
                     st.code(result.stdout, language="text")
                 else:
@@ -907,7 +1019,7 @@ def page_cluster_debugger():
 
                 if check_pods:
                     with st.spinner("Checking pod issues..."):
-                        pod_result = check_pod_issues(cp_node)
+                        pod_result = check_pod_issues(cp_node, profile=profile)
                         if pod_result.success and pod_result.stdout.strip():
                             collected_data += f"\n\nProblematic Pods:\n{pod_result.stdout}"
                             with st.expander("Problematic Pods"):
@@ -915,10 +1027,10 @@ def page_cluster_debugger():
 
                 if auto_collect:
                     with st.spinner("Collecting diagnostics..."):
-                        diag_results = run_category_diagnostics(cp_node, "Cluster Overview")
-                        for name, result in diag_results.items():
-                            if result.success:
-                                collected_data += f"\n\n{name}:\n{result.stdout}"
+                        diag_results = run_category_diagnostics(cp_node, "Cluster Overview", profile=profile)
+                    for name, result in diag_results.items():
+                        if result.success:
+                            collected_data += f"\n\n{name}:\n{result.stdout}"
 
                 with st.spinner("AI is analyzing the issue..."):
                     full_context = f"Issue: {issue}\n\nCollected Data:{collected_data}"
@@ -939,11 +1051,14 @@ def page_monitoring_setup():
     if not profile:
         return
 
-    cp_nodes = profile.get_control_plane_nodes()
-    if not cp_nodes:
-        st.error("No control-plane node defined in this profile.")
-        return
-    cp_node = cp_nodes[0]
+    # For imported clusters we don't need a CP node
+    cp_node = None
+    if profile.cluster_source != "imported":
+        cp_nodes = profile.get_control_plane_nodes()
+        if not cp_nodes:
+            st.error("No control-plane node defined in this profile.")
+            return
+        cp_node = cp_nodes[0]
 
     namespace = st.text_input("Monitoring Namespace", value="monitoring")
 
@@ -970,7 +1085,7 @@ def page_monitoring_setup():
         if st.button("Install Prometheus + Grafana", type="primary", use_container_width=True):
             if install_helm_first:
                 with st.status("Installing Helm...", expanded=True):
-                    result = install_helm(cp_node)
+                    result = install_helm(cp_node, profile=profile)
                     if result.success:
                         st.success("Helm ready!")
                     else:
@@ -978,7 +1093,7 @@ def page_monitoring_setup():
                         st.code(result.stderr, language="text")
 
             with st.status("Installing kube-prometheus-stack (this may take several minutes)...", expanded=True):
-                result = install_prometheus_stack(cp_node, namespace)
+                result = install_prometheus_stack(cp_node, namespace, profile=profile)
                 if result.success:
                     st.success("Prometheus + Grafana installed!")
                     st.code(result.stdout[-2000:], language="text")
@@ -988,7 +1103,7 @@ def page_monitoring_setup():
 
             if install_alerts_too:
                 with st.status("Installing alert rules...", expanded=True):
-                    result = install_alert_rules(cp_node, namespace)
+                    result = install_alert_rules(cp_node, namespace, profile=profile)
                     if result.success:
                         st.success("Alert rules installed!")
                     else:
@@ -1009,7 +1124,7 @@ def page_monitoring_setup():
 
         if st.button("Import Dashboards", type="primary") and selected_dashboards:
             with st.status("Importing dashboards...", expanded=True):
-                result = install_dashboards(cp_node, selected_dashboards, namespace)
+                result = install_dashboards(cp_node, selected_dashboards, namespace, profile=profile)
                 if result.success:
                     st.success(f"Imported {len(selected_dashboards)} dashboards!")
                     st.code(result.stdout, language="text")
@@ -1027,7 +1142,7 @@ def page_monitoring_setup():
 
         if st.button("Install Alert Rules", type="primary", key="install_alerts"):
             with st.spinner("Installing alert rules..."):
-                result = install_alert_rules(cp_node, namespace)
+                result = install_alert_rules(cp_node, namespace, profile=profile)
                 if result.success:
                     st.success("Alert rules installed!")
                     st.code(result.stdout, language="text")
@@ -1040,7 +1155,7 @@ def page_monitoring_setup():
         st.markdown("### Monitoring Stack Status")
         if st.button("Check Status", type="primary", key="mon_status"):
             with st.spinner("Checking monitoring stack..."):
-                result = get_monitoring_status(cp_node, namespace)
+                result = get_monitoring_status(cp_node, namespace, profile=profile)
                 if result.success:
                     st.code(result.stdout, language="text")
                 else:
@@ -1069,7 +1184,7 @@ def page_monitoring_setup():
         else:
             if st.button("Get Monitoring Recommendations", type="primary", key="mon_advice"):
                 current_status = ""
-                status_result = get_monitoring_status(cp_node, namespace)
+                status_result = get_monitoring_status(cp_node, namespace, profile=profile)
                 if status_result.success:
                     current_status = status_result.stdout
 
@@ -1090,11 +1205,16 @@ def page_log_analysis():
     if not profile:
         return
 
-    cp_nodes = profile.get_control_plane_nodes()
-    if not cp_nodes:
-        st.error("No control-plane node defined in this profile.")
-        return
-    cp_node = cp_nodes[0]
+    # For imported clusters we don't need a CP node
+    cp_node = None
+    if profile.cluster_source != "imported":
+        cp_nodes = profile.get_control_plane_nodes()
+        if not cp_nodes:
+            st.error("No control-plane node defined in this profile.")
+            return
+        cp_node = cp_nodes[0]
+
+    available_log_sources = get_available_log_sources(profile)
 
     tab_system, tab_pod, tab_correlation, tab_ai = st.tabs([
         "System Logs",
@@ -1108,10 +1228,13 @@ def page_log_analysis():
         st.markdown("### System Component Logs")
         col1, col2, col3 = st.columns(3)
         with col1:
+            default_sources = [s for s in ["Kubelet", "CRI-O", "Events"] if s in available_log_sources]
+            if not default_sources:
+                default_sources = available_log_sources[:3] if available_log_sources else []
             sources = st.multiselect(
                 "Log Sources",
-                options=list(LOG_SOURCES.keys()),
-                default=["Kubelet", "CRI-O", "Events"],
+                options=available_log_sources,
+                default=default_sources,
             )
         with col2:
             log_lines = st.number_input("Lines to fetch", min_value=50, max_value=1000, value=200)
@@ -1127,7 +1250,7 @@ def page_log_analysis():
             log_data = {}
             for source in sources:
                 with st.spinner(f"Collecting {source} logs..."):
-                    result = collect_logs(cp_node, source, log_lines, since, since_k8s)
+                    result = collect_logs(cp_node, source, log_lines, since, since_k8s, profile=profile)
                     if result.success:
                         log_data[source] = result.stdout
                         analysis = analyze_logs(result.stdout, source)
@@ -1171,7 +1294,7 @@ def page_log_analysis():
             with st.spinner(f"Fetching logs for {pod_ns}/{pod_name}..."):
                 result = collect_pod_logs(
                     cp_node, pod_ns, pod_name, container, pod_lines,
-                    "1h", pod_previous,
+                    "1h", pod_previous, profile=profile,
                 )
                 if result.success:
                     analysis = analyze_logs(result.stdout, f"{pod_ns}/{pod_name}")
@@ -1203,16 +1326,19 @@ def page_log_analysis():
         st.markdown("### Cross-Source Error Correlation")
         st.markdown("Collect logs from multiple sources and correlate errors across them.")
 
+        default_corr = [s for s in ["Kubelet", "CRI-O", "API Server", "Events"] if s in available_log_sources]
+        if not default_corr:
+            default_corr = available_log_sources[:4] if available_log_sources else []
         corr_sources = st.multiselect(
             "Sources to correlate",
-            options=list(LOG_SOURCES.keys()),
-            default=["Kubelet", "CRI-O", "API Server", "Events"],
+            options=available_log_sources,
+            default=default_corr,
             key="corr_sources",
         )
 
         if st.button("Collect & Correlate", type="primary", key="correlate"):
             with st.spinner("Collecting logs from multiple sources..."):
-                results = collect_multi_source_logs(cp_node, corr_sources, lines=150)
+                results = collect_multi_source_logs(cp_node, corr_sources, lines=150, profile=profile)
 
             correlated = correlate_errors(results)
 
@@ -1277,6 +1403,866 @@ def page_log_analysis():
 #  PAGE: AI Assistant
 # ══════════════════════════════════════════════════════════════════════════
 
+# ══════════════════════════════════════════════════════════════════════════
+#  PAGE: Resource Viewer
+# ══════════════════════════════════════════════════════════════════════════
+
+# Resource definitions: (display_name, kubectl_command, supports_namespace)
+_RESOURCE_TYPES = {
+    "Pods": ("get pods", True),
+    "Deployments": ("get deployments", True),
+    "Services": ("get services", True),
+    "ConfigMaps": ("get configmaps", True),
+    "Secrets": ("get secrets", True),
+    "StatefulSets": ("get statefulsets", True),
+    "DaemonSets": ("get daemonsets", True),
+    "ReplicaSets": ("get replicasets", True),
+    "Jobs": ("get jobs", True),
+    "CronJobs": ("get cronjobs", True),
+    "Ingresses": ("get ingress", True),
+    "NetworkPolicies": ("get networkpolicies", True),
+    "PersistentVolumeClaims": ("get pvc", True),
+    "PersistentVolumes": ("get pv", False),
+    "StorageClasses": ("get storageclasses", False),
+    "Namespaces": ("get namespaces", False),
+    "Nodes": ("get nodes", False),
+    "ServiceAccounts": ("get serviceaccounts", True),
+    "DestinationRules": ("get destinationrules", True),
+    "VirtualServices": ("get virtualservices", True),
+    "HorizontalPodAutoscalers": ("get hpa", True),
+    "PodDisruptionBudgets": ("get pdb", True),
+    "Endpoints": ("get endpoints", True),
+}
+
+
+def page_resource_viewer():
+    st.markdown("## Resource Viewer")
+    st.markdown("Browse live Kubernetes resources from your cluster.")
+
+    profile = _get_active_profile()
+    if not profile:
+        return
+
+    if not profile.kubeconfig_content and not profile.get_control_plane_nodes():
+        st.error(
+            "This profile has no kubeconfig and no control-plane node. "
+            "Import a kubeconfig or add nodes in the Profile Manager."
+        )
+        return
+
+    tab_resources, tab_node_health, tab_rbac, tab_helm, tab_events = st.tabs([
+        "Cluster Resources",
+        "Node Health",
+        "RBAC Viewer",
+        "Helm Releases",
+        "Events Timeline",
+    ])
+
+    # ── Cluster Resources ────────────────────────────────────────────────
+    with tab_resources:
+        st.markdown("### Browse Cluster Resources")
+
+        col1, col2, col3 = st.columns([2, 2, 1])
+        with col1:
+            resource_type = st.selectbox(
+                "Resource Type",
+                options=list(_RESOURCE_TYPES.keys()),
+                index=0,
+            )
+        with col2:
+            cmd_base, ns_supported = _RESOURCE_TYPES[resource_type]
+            if ns_supported:
+                ns_choice = st.radio(
+                    "Namespace",
+                    ["All Namespaces", "Specific"],
+                    horizontal=True,
+                    key="res_ns_choice",
+                )
+                if ns_choice == "Specific":
+                    namespace = st.text_input("Namespace", value="default", key="res_ns")
+                else:
+                    namespace = ""
+            else:
+                namespace = ""
+                st.info(f"{resource_type} is a cluster-scoped resource.")
+        with col3:
+            output_format = st.selectbox(
+                "Output",
+                ["wide", "yaml", "json", "name"],
+                index=0,
+                key="res_output",
+            )
+
+        if st.button("Fetch Resources", type="primary", key="fetch_res"):
+            kubectl_cmd = cmd_base
+            if ns_supported and not namespace:
+                kubectl_cmd += " -A"
+            elif ns_supported and namespace:
+                kubectl_cmd += f" -n {namespace}"
+            kubectl_cmd += f" -o {output_format}"
+
+            with st.spinner(f"Fetching {resource_type}..."):
+                result = run_kubectl(profile, kubectl_cmd, timeout=30)
+                if result.success:
+                    st.code(result.stdout or "(no resources found)", language="text")
+                else:
+                    if "the server doesn't have a resource type" in result.stderr:
+                        st.warning(
+                            f"{resource_type} is not available on this cluster "
+                            "(CRD may not be installed)."
+                        )
+                    else:
+                        st.error("Failed to fetch resources")
+                    st.code(result.stderr, language="text")
+
+        # Describe a specific resource
+        st.markdown("---")
+        st.markdown("#### Describe a Resource")
+        desc_col1, desc_col2 = st.columns(2)
+        with desc_col1:
+            desc_name = st.text_input(
+                "Resource name",
+                placeholder="e.g., my-pod-xyz",
+                key="desc_name",
+            )
+        with desc_col2:
+            desc_ns = st.text_input(
+                "Namespace (if applicable)",
+                value="default",
+                key="desc_ns",
+            )
+
+        if st.button("Describe", key="describe_res") and desc_name:
+            # Determine the singular resource type for describe
+            res_singular = resource_type.rstrip("s")
+            if resource_type == "Ingresses":
+                res_singular = "ingress"
+            elif resource_type == "Namespaces":
+                res_singular = "namespace"
+            elif resource_type == "StorageClasses":
+                res_singular = "storageclass"
+            elif resource_type == "Endpoints":
+                res_singular = "endpoints"
+
+            desc_cmd = f"describe {res_singular.lower()} {desc_name}"
+            if ns_supported and desc_ns:
+                desc_cmd += f" -n {desc_ns}"
+
+            with st.spinner(f"Describing {desc_name}..."):
+                result = run_kubectl(profile, desc_cmd, timeout=30)
+                if result.success:
+                    st.code(result.stdout, language="yaml")
+                else:
+                    st.error("Describe failed")
+                    st.code(result.stderr, language="text")
+
+    # ── Node Health ──────────────────────────────────────────────────────
+    with tab_node_health:
+        st.markdown("### Node Health Overview")
+        st.markdown("View node status, resource usage, and conditions.")
+
+        if st.button("Refresh Node Health", type="primary", key="node_health"):
+            col_status, col_top = st.columns(2)
+
+            with col_status:
+                st.markdown("#### Node Status")
+                with st.spinner("Fetching nodes..."):
+                    result = run_kubectl(profile, "get nodes -o wide", timeout=15)
+                    if result.success:
+                        st.code(result.stdout, language="text")
+                    else:
+                        st.error("Failed to get nodes")
+                        st.code(result.stderr, language="text")
+
+            with col_top:
+                st.markdown("#### Resource Usage")
+                with st.spinner("Fetching node metrics..."):
+                    result = run_kubectl(profile, "top nodes", timeout=15)
+                    if result.success:
+                        st.code(result.stdout, language="text")
+                    else:
+                        st.warning("kubectl top requires metrics-server to be installed.")
+                        st.code(result.stderr, language="text")
+
+            st.markdown("---")
+            st.markdown("#### Node Conditions")
+            with st.spinner("Checking node conditions..."):
+                result = run_kubectl(
+                    profile,
+                    'get nodes -o custom-columns='
+                    '"NAME:.metadata.name,'
+                    'READY:.status.conditions[?(@.type==\\"Ready\\")].status,'
+                    'DISK:.status.conditions[?(@.type==\\"DiskPressure\\")].status,'
+                    'MEMORY:.status.conditions[?(@.type==\\"MemoryPressure\\")].status,'
+                    'PID:.status.conditions[?(@.type==\\"PIDPressure\\")].status"',
+                    timeout=15,
+                )
+                if result.success:
+                    st.code(result.stdout, language="text")
+                else:
+                    st.code(result.stderr, language="text")
+
+            st.markdown("#### Pod Distribution per Node")
+            with st.spinner("Fetching pod distribution..."):
+                result = run_kubectl(
+                    profile,
+                    'get pods -A -o custom-columns='
+                    '"NODE:.spec.nodeName,NAMESPACE:.metadata.namespace,'
+                    'POD:.metadata.name,STATUS:.status.phase" '
+                    '--sort-by=.spec.nodeName',
+                    timeout=15,
+                )
+                if result.success:
+                    st.code(result.stdout, language="text")
+                else:
+                    st.code(result.stderr, language="text")
+
+    # ── RBAC Viewer ──────────────────────────────────────────────────────
+    with tab_rbac:
+        st.markdown("### RBAC Viewer")
+        st.markdown("Browse Roles, ClusterRoles, Bindings, and ServiceAccounts.")
+
+        rbac_type = st.selectbox(
+            "RBAC Resource",
+            [
+                "ClusterRoles",
+                "ClusterRoleBindings",
+                "Roles (namespaced)",
+                "RoleBindings (namespaced)",
+                "ServiceAccounts",
+            ],
+            key="rbac_type",
+        )
+
+        rbac_ns = ""
+        if "(namespaced)" in rbac_type or rbac_type == "ServiceAccounts":
+            rbac_ns = st.text_input(
+                "Namespace",
+                value="default",
+                key="rbac_ns",
+                help="Leave blank for all namespaces",
+            )
+
+        if st.button("Fetch RBAC Resources", type="primary", key="fetch_rbac"):
+            cmd_map = {
+                "ClusterRoles": "get clusterroles",
+                "ClusterRoleBindings": "get clusterrolebindings",
+                "Roles (namespaced)": "get roles",
+                "RoleBindings (namespaced)": "get rolebindings",
+                "ServiceAccounts": "get serviceaccounts",
+            }
+            rbac_cmd = cmd_map[rbac_type]
+            if rbac_ns:
+                rbac_cmd += f" -n {rbac_ns}"
+            elif "(namespaced)" in rbac_type or rbac_type == "ServiceAccounts":
+                rbac_cmd += " -A"
+
+            with st.spinner(f"Fetching {rbac_type}..."):
+                result = run_kubectl(profile, rbac_cmd, timeout=15)
+                if result.success:
+                    st.code(result.stdout or "(none found)", language="text")
+                else:
+                    st.error("Failed to fetch RBAC resources")
+                    st.code(result.stderr, language="text")
+
+        # Describe a specific RBAC resource
+        st.markdown("---")
+        st.markdown("#### Inspect RBAC Resource")
+        rbac_name = st.text_input(
+            "Resource name to describe",
+            placeholder="e.g., cluster-admin",
+            key="rbac_desc_name",
+        )
+        if st.button("Describe RBAC", key="desc_rbac") and rbac_name:
+            type_map = {
+                "ClusterRoles": "clusterrole",
+                "ClusterRoleBindings": "clusterrolebinding",
+                "Roles (namespaced)": "role",
+                "RoleBindings (namespaced)": "rolebinding",
+                "ServiceAccounts": "serviceaccount",
+            }
+            desc_cmd = f"describe {type_map[rbac_type]} {rbac_name}"
+            if rbac_ns:
+                desc_cmd += f" -n {rbac_ns}"
+
+            with st.spinner(f"Describing {rbac_name}..."):
+                result = run_kubectl(profile, desc_cmd, timeout=15)
+                if result.success:
+                    st.code(result.stdout, language="yaml")
+                else:
+                    st.error("Describe failed")
+                    st.code(result.stderr, language="text")
+
+    # ── Helm Releases ────────────────────────────────────────────────────
+    with tab_helm:
+        st.markdown("### Helm Release Manager")
+        st.markdown("List, inspect, and manage Helm releases on your cluster.")
+
+        helm_tab_list, helm_tab_install, helm_tab_history = st.tabs([
+            "List Releases", "Install Chart", "Release History",
+        ])
+
+        with helm_tab_list:
+            helm_ns_all = st.checkbox("All namespaces", value=True, key="helm_ns_all")
+            helm_ns = ""
+            if not helm_ns_all:
+                helm_ns = st.text_input("Namespace", value="default", key="helm_ns")
+
+            if st.button("List Helm Releases", type="primary", key="helm_list"):
+                helm_cmd = "helm list"
+                if helm_ns_all:
+                    helm_cmd += " -A"
+                elif helm_ns:
+                    helm_cmd += f" -n {helm_ns}"
+                helm_cmd += " -o table"
+
+                with st.spinner("Fetching Helm releases..."):
+                    result = run_kubectl(profile, helm_cmd.replace("kubectl ", ""), timeout=15)
+                    if result.success:
+                        st.code(result.stdout or "(no releases found)", language="text")
+                    else:
+                        st.warning("Helm may not be installed on this cluster.")
+                        st.code(result.stderr, language="text")
+
+        with helm_tab_install:
+            st.markdown("#### Install a Helm Chart")
+            hcol1, hcol2 = st.columns(2)
+            with hcol1:
+                helm_release_name = st.text_input("Release Name", placeholder="my-release", key="helm_rel")
+                helm_chart = st.text_input("Chart", placeholder="prometheus-community/kube-prometheus-stack", key="helm_chart")
+            with hcol2:
+                helm_install_ns = st.text_input("Namespace", value="default", key="helm_install_ns")
+                helm_create_ns = st.checkbox("Create namespace if not exists", value=True, key="helm_create_ns")
+            helm_values = st.text_area(
+                "Values (YAML, optional)",
+                placeholder="# Custom values.yaml content here",
+                height=150,
+                key="helm_values",
+            )
+
+            if st.button("Install Chart", type="primary", key="helm_install") and helm_release_name and helm_chart:
+                install_cmd = f"helm install {helm_release_name} {helm_chart} -n {helm_install_ns}"
+                if helm_create_ns:
+                    install_cmd += " --create-namespace"
+                # If user provided values, write to temp file
+                if helm_values.strip():
+                    values_path = os.path.join(config.UPLOADS_DIR, f"helm-values-{helm_release_name}.yaml")
+                    with open(values_path, "w") as vf:
+                        vf.write(helm_values)
+                    install_cmd += f" -f {values_path}"
+
+                with st.spinner(f"Installing {helm_chart}..."):
+                    result = run_kubectl(profile, install_cmd.replace("kubectl ", ""), timeout=120)
+                    if result.success:
+                        st.success(f"Release '{helm_release_name}' installed!")
+                        st.code(result.stdout, language="text")
+                    else:
+                        st.error("Helm install failed")
+                        st.code(result.stderr, language="text")
+
+        with helm_tab_history:
+            st.markdown("#### Release History")
+            hist_name = st.text_input("Release name", placeholder="my-release", key="helm_hist_name")
+            hist_ns = st.text_input("Namespace", value="default", key="helm_hist_ns")
+
+            if st.button("Get History", key="helm_hist") and hist_name:
+                hist_cmd = f"helm history {hist_name} -n {hist_ns}"
+                with st.spinner("Fetching history..."):
+                    result = run_kubectl(profile, hist_cmd.replace("kubectl ", ""), timeout=15)
+                    if result.success:
+                        st.code(result.stdout, language="text")
+                    else:
+                        st.error("Could not get release history")
+                        st.code(result.stderr, language="text")
+
+            st.markdown("---")
+            st.markdown("#### Rollback Release")
+            rb_name = st.text_input("Release name", placeholder="my-release", key="helm_rb_name")
+            rb_ns = st.text_input("Namespace", value="default", key="helm_rb_ns")
+            rb_rev = st.number_input("Revision number", min_value=1, value=1, key="helm_rb_rev")
+
+            if st.button("Rollback", key="helm_rollback") and rb_name:
+                rb_cmd = f"helm rollback {rb_name} {rb_rev} -n {rb_ns}"
+                with st.spinner(f"Rolling back {rb_name} to revision {rb_rev}..."):
+                    result = run_kubectl(profile, rb_cmd.replace("kubectl ", ""), timeout=60)
+                    if result.success:
+                        st.success(f"Rolled back '{rb_name}' to revision {rb_rev}")
+                        st.code(result.stdout, language="text")
+                    else:
+                        st.error("Rollback failed")
+                        st.code(result.stderr, language="text")
+
+    # ── Events Timeline ──────────────────────────────────────────────────
+    with tab_events:
+        st.markdown("### Cluster Events Timeline")
+        st.markdown("View recent Kubernetes events with graphical analysis.")
+
+        ev_col1, ev_col2, ev_col3 = st.columns(3)
+        with ev_col1:
+            ev_ns_all = st.checkbox("All namespaces", value=True, key="ev_ns_all")
+            ev_ns = ""
+            if not ev_ns_all:
+                ev_ns = st.text_input("Namespace", value="default", key="ev_ns")
+        with ev_col2:
+            ev_type = st.selectbox(
+                "Event Type",
+                ["All", "Normal", "Warning"],
+                key="ev_type",
+            )
+        with ev_col3:
+            ev_sort = st.selectbox(
+                "Sort by",
+                ["Last Timestamp", "First Timestamp", "Count"],
+                key="ev_sort",
+            )
+
+        if st.button("Fetch Events", type="primary", key="fetch_events"):
+            # Fetch events in JSON for graphical display
+            ev_json_cmd = "get events"
+            if ev_ns_all:
+                ev_json_cmd += " -A"
+            elif ev_ns:
+                ev_json_cmd += f" -n {ev_ns}"
+            if ev_type != "All":
+                ev_json_cmd += f" --field-selector type={ev_type}"
+            ev_json_cmd += " -o json"
+
+            with st.spinner("Fetching events..."):
+                result = run_kubectl(profile, ev_json_cmd, timeout=15)
+
+            if result.success and result.stdout.strip():
+                try:
+                    events_data = json.loads(result.stdout)
+                    items = events_data.get("items", [])
+
+                    if not items:
+                        st.info("No events found.")
+                    else:
+                        # Parse events into structured data
+                        ev_records = []
+                        for item in items:
+                            ev_records.append({
+                                "Namespace": item.get("metadata", {}).get("namespace", ""),
+                                "Type": item.get("type", ""),
+                                "Reason": item.get("reason", ""),
+                                "Object": item.get("involvedObject", {}).get("name", ""),
+                                "Kind": item.get("involvedObject", {}).get("kind", ""),
+                                "Message": (item.get("message", "") or "")[:120],
+                                "Count": item.get("count", 1),
+                                "Last Seen": item.get("lastTimestamp", item.get("eventTime", "")),
+                            })
+
+                        import pandas as pd
+
+                        df = pd.DataFrame(ev_records)
+
+                        # ── Graphical Summary ────────────────────────
+                        st.markdown("#### Event Summary Charts")
+
+                        chart_col1, chart_col2 = st.columns(2)
+
+                        with chart_col1:
+                            st.markdown("**Events by Type**")
+                            type_counts = df["Type"].value_counts().reset_index()
+                            type_counts.columns = ["Type", "Count"]
+                            st.bar_chart(type_counts.set_index("Type"))
+
+                        with chart_col2:
+                            st.markdown("**Events by Reason (Top 10)**")
+                            reason_counts = df["Reason"].value_counts().head(10).reset_index()
+                            reason_counts.columns = ["Reason", "Count"]
+                            st.bar_chart(reason_counts.set_index("Reason"))
+
+                        chart_col3, chart_col4 = st.columns(2)
+
+                        with chart_col3:
+                            st.markdown("**Events by Namespace (Top 10)**")
+                            ns_counts = df["Namespace"].value_counts().head(10).reset_index()
+                            ns_counts.columns = ["Namespace", "Count"]
+                            st.bar_chart(ns_counts.set_index("Namespace"))
+
+                        with chart_col4:
+                            st.markdown("**Events by Object Kind**")
+                            kind_counts = df["Kind"].value_counts().reset_index()
+                            kind_counts.columns = ["Kind", "Count"]
+                            st.bar_chart(kind_counts.set_index("Kind"))
+
+                        # ── Timeline Chart ────────────────────────────
+                        st.markdown("---")
+                        st.markdown("#### Event Timeline")
+                        if df["Last Seen"].notna().any() and df["Last Seen"].str.strip().any():
+                            try:
+                                df["Timestamp"] = pd.to_datetime(
+                                    df["Last Seen"], errors="coerce", utc=True,
+                                )
+                                ts_df = df.dropna(subset=["Timestamp"])
+                                if not ts_df.empty:
+                                    ts_df = ts_df.set_index("Timestamp")
+                                    # Events over time grouped by type
+                                    timeline = ts_df.groupby(
+                                        [pd.Grouper(freq="1min"), "Type"]
+                                    ).size().unstack(fill_value=0)
+                                    if not timeline.empty:
+                                        st.line_chart(timeline)
+                                    else:
+                                        st.info("Not enough timestamp data for timeline chart.")
+                                else:
+                                    st.info("Could not parse event timestamps for timeline.")
+                            except Exception:
+                                st.info("Could not render timeline chart from event data.")
+                        else:
+                            st.info("No timestamp data available for timeline chart.")
+
+                        # ── High-Count Events ─────────────────────────
+                        st.markdown("---")
+                        st.markdown("#### High-Frequency Events")
+                        high_count = df[df["Count"] > 1].sort_values("Count", ascending=False).head(20)
+                        if not high_count.empty:
+                            st.dataframe(
+                                high_count[["Namespace", "Type", "Reason", "Object", "Count", "Message"]],
+                                use_container_width=True,
+                                hide_index=True,
+                            )
+                        else:
+                            st.info("No repeated events found.")
+
+                        # ── Full Events Table ─────────────────────────
+                        st.markdown("---")
+                        st.markdown("#### All Events")
+                        st.dataframe(df, use_container_width=True, hide_index=True)
+
+                except (json.JSONDecodeError, KeyError):
+                    # Fallback to text display
+                    st.code(result.stdout, language="text")
+            elif result.success:
+                st.info("No events found.")
+            else:
+                st.error("Failed to fetch events")
+                st.code(result.stderr, language="text")
+
+        # Warning events summary
+        st.markdown("---")
+        st.markdown("#### Warning Events Summary")
+        if st.button("Show Warning Events", key="warn_events"):
+            warn_cmd = (
+                "get events -A --field-selector type=Warning "
+                "-o custom-columns="
+                "'NAMESPACE:.metadata.namespace,"
+                "LAST_SEEN:.lastTimestamp,"
+                "COUNT:.count,"
+                "REASON:.reason,"
+                "OBJECT:.involvedObject.name,"
+                "MESSAGE:.message' "
+                "--sort-by=.lastTimestamp"
+            )
+            with st.spinner("Fetching warning events..."):
+                result = run_kubectl(profile, warn_cmd, timeout=15)
+                if result.success:
+                    st.code(result.stdout or "(no warning events)", language="text")
+                else:
+                    st.code(result.stderr, language="text")
+
+
+# ══════════════════════════════════════════════════════════════════════════
+#  PAGE: Upgrade Planner
+# ══════════════════════════════════════════════════════════════════════════
+
+_K8S_VERSIONS_DETAIL = [
+    {
+        "version": "1.35",
+        "release": "2026-04",
+        "end_of_life": "2027-08",
+        "highlights": "Sidecar containers GA, improved pod lifecycle management, dynamic resource allocation enhancements.",
+    },
+    {
+        "version": "1.34",
+        "release": "2025-12",
+        "end_of_life": "2027-04",
+        "highlights": "Structured authorization config GA, recursive read-only mounts, traffic distribution improvements.",
+    },
+    {
+        "version": "1.33",
+        "release": "2025-08",
+        "end_of_life": "2027-01",
+        "highlights": "In-place pod resize beta, multi-network pods alpha, nftables kube-proxy backend.",
+    },
+    {
+        "version": "1.32",
+        "release": "2025-04",
+        "end_of_life": "2026-08",
+        "highlights": "Dynamic resource allocation (DRA) beta, auto-remove PV claims, job success policy GA.",
+    },
+    {
+        "version": "1.31",
+        "release": "2024-12",
+        "end_of_life": "2026-04",
+        "highlights": "AppArmor GA, nftables proxy GA, improved ingress connectivity reliability, cgroup v2 enhancements.",
+    },
+    {
+        "version": "1.30",
+        "release": "2024-04",
+        "end_of_life": "2025-08",
+        "highlights": "Contextual logging GA, CEL admission improvements, pod scheduling readiness.",
+    },
+    {
+        "version": "1.29",
+        "release": "2023-12",
+        "end_of_life": "2025-02",
+        "highlights": "KMS v2 GA, ReadWriteOncePod GA, networking improvements, node memory manager.",
+    },
+    {
+        "version": "1.28",
+        "release": "2023-08",
+        "end_of_life": "2024-10",
+        "highlights": "Sidecar containers alpha, recovery from non-graceful node shutdown, mixed version proxy.",
+    },
+    {
+        "version": "1.27",
+        "release": "2023-04",
+        "end_of_life": "2024-06",
+        "highlights": "In-place pod resize alpha, VPA improvements, SeccompDefault GA.",
+    },
+]
+
+
+def page_upgrade_planner():
+    st.markdown("## Upgrade Planner")
+    st.markdown("Plan and prepare Kubernetes version upgrades for your cluster.")
+
+    profile = _get_active_profile()
+    if not profile:
+        return
+
+    current_ver = profile.kubernetes_version
+
+    tab_overview, tab_preflight, tab_plan, tab_changelog = st.tabs([
+        "Version Overview",
+        "Pre-flight Checks",
+        "Upgrade Steps",
+        "Changelog & Compatibility",
+    ])
+
+    # ── Version Overview ─────────────────────────────────────────────────
+    with tab_overview:
+        st.markdown("### Kubernetes Version Matrix")
+        st.info(f"Your current cluster version: **{current_ver}**")
+
+        # Build a table
+        rows = []
+        for v in _K8S_VERSIONS_DETAIL:
+            status = ""
+            if v["version"] == current_ver:
+                status = "CURRENT"
+            elif v["version"] > current_ver:
+                status = "UPGRADE AVAILABLE"
+            else:
+                status = "OLDER"
+            rows.append({
+                "Version": v["version"],
+                "Status": status,
+                "Release Date": v["release"],
+                "End of Life": v["end_of_life"],
+                "Highlights": v["highlights"],
+            })
+
+        st.dataframe(rows, use_container_width=True, hide_index=True)
+
+        # Upgrade target selection
+        st.markdown("---")
+        available_upgrades = [
+            v["version"] for v in _K8S_VERSIONS_DETAIL if v["version"] > current_ver
+        ]
+        if available_upgrades:
+            target_version = st.selectbox(
+                "Select target upgrade version",
+                available_upgrades,
+                key="upgrade_target",
+            )
+            skipped = [
+                v for v in _K8S_VERSIONS_DETAIL
+                if current_ver < v["version"] <= target_version
+            ]
+            if len(skipped) > 1:
+                st.warning(
+                    f"You are skipping {len(skipped) - 1} minor version(s). "
+                    "Kubernetes supports upgrading one minor version at a time. "
+                    "Plan incremental upgrades for production clusters."
+                )
+            st.markdown("#### Upgrade Path")
+            path_versions = [current_ver] + [v["version"] for v in reversed(skipped)]
+            st.markdown(" → ".join([f"**{v}**" for v in path_versions]))
+        else:
+            st.success("You are running the latest version!")
+
+    # ── Pre-flight Checks ────────────────────────────────────────────────
+    with tab_preflight:
+        st.markdown("### Pre-Upgrade Checks")
+        st.markdown("Run these checks before starting the upgrade process.")
+
+        checks = [
+            ("Cluster Health", "get nodes -o wide"),
+            ("All Pods Running", "get pods -A --field-selector 'status.phase!=Running,status.phase!=Succeeded'"),
+            ("etcd Health", "get --raw=/healthz"),
+            ("API Server Version", "version"),
+            ("PodDisruptionBudgets", "get pdb -A"),
+            ("Deprecated APIs", "api-resources --api-group=extensions"),
+            ("Persistent Volumes", "get pv"),
+            ("Component Statuses", "get cs 2>/dev/null || echo 'Deprecated in newer versions'"),
+        ]
+
+        if st.button("Run All Pre-flight Checks", type="primary", key="preflight"):
+            all_ok = True
+            for name, cmd in checks:
+                with st.status(f"Checking: {name}...", expanded=False) as status:
+                    result = run_kubectl(profile, cmd, timeout=15)
+                    if result.success:
+                        st.code(result.stdout or "(no output)", language="text")
+                        status.update(label=f"{name} — OK", state="complete")
+                    else:
+                        st.code(result.stderr, language="text")
+                        status.update(label=f"{name} — ISSUE", state="error")
+                        all_ok = False
+
+            if all_ok:
+                st.success("All pre-flight checks passed! The cluster looks ready for upgrade.")
+            else:
+                st.warning(
+                    "Some checks reported issues. Review the output above before proceeding."
+                )
+
+        st.markdown("---")
+        st.markdown("#### Backup Checklist")
+        st.markdown(
+            "Before upgrading, ensure you have:\n\n"
+            "- [ ] **etcd snapshot backup**: `ETCDCTL_API=3 etcdctl snapshot save /backup/etcd-snapshot.db`\n"
+            "- [ ] **Cluster state export**: `kubectl get all -A -o yaml > cluster-backup.yaml`\n"
+            "- [ ] **PV/PVC data backed up** (if applicable)\n"
+            "- [ ] **CNI configuration backed up**: `/etc/cni/net.d/`\n"
+            "- [ ] **kubeadm config backed up**: `kubeadm config view > kubeadm-config.yaml`\n"
+            "- [ ] **VM/node snapshots taken** (if running on VMs)\n"
+        )
+
+    # ── Upgrade Steps ────────────────────────────────────────────────────
+    with tab_plan:
+        st.markdown("### Step-by-Step Upgrade Plan")
+
+        target = st.selectbox(
+            "Target Version",
+            [v["version"] for v in _K8S_VERSIONS_DETAIL if v["version"] > current_ver] or [current_ver],
+            key="upgrade_plan_target",
+        )
+
+        st.markdown(f"#### Upgrading from {current_ver} → {target}")
+
+        st.markdown(
+            f"""
+**Phase 1: Prepare (Control Plane)**
+```bash
+# 1. Update package repositories
+sudo apt-get update
+
+# 2. Check available kubeadm versions
+apt-cache madison kubeadm | grep {target}
+
+# 3. Upgrade kubeadm
+sudo apt-mark unhold kubeadm
+sudo apt-get install -y kubeadm={target}.*
+sudo apt-mark hold kubeadm
+
+# 4. Verify kubeadm version
+kubeadm version
+
+# 5. Check upgrade plan
+sudo kubeadm upgrade plan
+```
+
+**Phase 2: Upgrade Control Plane**
+```bash
+# 1. Drain the control-plane node
+kubectl drain <cp-node> --ignore-daemonsets --delete-emptydir-data
+
+# 2. Apply the upgrade
+sudo kubeadm upgrade apply v{target}.0
+
+# 3. Upgrade kubelet & kubectl
+sudo apt-mark unhold kubelet kubectl
+sudo apt-get install -y kubelet={target}.* kubectl={target}.*
+sudo apt-mark hold kubelet kubectl
+
+# 4. Restart kubelet
+sudo systemctl daemon-reload
+sudo systemctl restart kubelet
+
+# 5. Uncordon the node
+kubectl uncordon <cp-node>
+```
+
+**Phase 3: Upgrade Worker Nodes** (repeat for each worker)
+```bash
+# On each worker node:
+# 1. Drain the worker
+kubectl drain <worker-node> --ignore-daemonsets --delete-emptydir-data
+
+# 2. Upgrade kubeadm, kubelet, kubectl
+sudo apt-mark unhold kubeadm kubelet kubectl
+sudo apt-get install -y kubeadm={target}.* kubelet={target}.* kubectl={target}.*
+sudo apt-mark hold kubeadm kubelet kubectl
+
+# 3. Upgrade node config
+sudo kubeadm upgrade node
+
+# 4. Restart kubelet
+sudo systemctl daemon-reload
+sudo systemctl restart kubelet
+
+# 5. Uncordon
+kubectl uncordon <worker-node>
+```
+
+**Phase 4: Upgrade CRI-O** (on each node)
+```bash
+# Update CRI-O to match the K8s version
+sudo apt-get install -y cri-o={target}.*
+sudo systemctl restart crio
+sudo systemctl restart kubelet
+```
+
+**Phase 5: Verify**
+```bash
+kubectl get nodes -o wide
+kubectl get pods -A
+kubectl version
+```
+"""
+        )
+
+    # ── Changelog & Compatibility ────────────────────────────────────────
+    with tab_changelog:
+        st.markdown("### Version Changelog & Compatibility Notes")
+
+        for v in _K8S_VERSIONS_DETAIL:
+            marker = " ← CURRENT" if v["version"] == current_ver else ""
+            with st.expander(f"Kubernetes {v['version']}{marker}", expanded=(v["version"] == current_ver)):
+                st.markdown(f"**Release Date:** {v['release']}")
+                st.markdown(f"**End of Life:** {v['end_of_life']}")
+                st.markdown(f"**Key Highlights:** {v['highlights']}")
+                st.markdown("---")
+                st.markdown(
+                    f"**Compatibility:**\n"
+                    f"- CRI-O: {v['version']}.x\n"
+                    f"- Flannel: Compatible (check release notes for CNI spec changes)\n"
+                    f"- etcd: 3.5.x+ recommended\n"
+                    f"- CoreDNS: 1.11.x+ recommended\n"
+                )
+                st.markdown(
+                    f"**Upgrade Notes:**\n"
+                    f"- Always upgrade one minor version at a time\n"
+                    f"- Check deprecated API versions before upgrading\n"
+                    f"- Run `kubeadm upgrade plan` to verify compatibility\n"
+                    f"- Back up etcd before starting\n"
+                )
+
+
 def page_ai_assistant():
     st.markdown("## AI Kubernetes Assistant")
 
@@ -1334,32 +2320,42 @@ def _get_active_profile() -> ClusterProfile | None:
 
 def _show_profile_summary(profile: ClusterProfile):
     """Display a compact profile summary."""
-    cols = st.columns(5)
-    cols[0].metric("Profile", profile.name)
-    cols[1].metric("K8s Version", profile.kubernetes_version)
-    cols[2].metric("Runtime", f"CRI-O {profile.crio_version}")
-    cols[3].metric("CNI", "Flannel")
-    cols[4].metric("Nodes", f"{len(profile.get_control_plane_nodes())} CP + {len(profile.get_worker_nodes())} W")
-
-    with st.expander("Storage & Proxy Details", expanded=False):
-        scol1, scol2, scol3 = st.columns(3)
-        with scol1:
-            st.markdown(f"**CRI-O Root:** `{profile.crio_root}`")
-            st.markdown(f"**CRI-O RunRoot:** `{profile.crio_runroot}`")
-        with scol2:
-            st.markdown(f"**Kubelet Dir:** `{profile.kubelet_root}`")
-            st.markdown(f"**Log Root:** `{profile.log_root}`")
-        with scol3:
-            if profile.http_proxy or profile.https_proxy:
-                st.markdown(f"**HTTP Proxy:** `{profile.http_proxy or 'N/A'}`")
-                st.markdown(f"**HTTPS Proxy:** `{profile.https_proxy or 'N/A'}`")
-                if profile.no_proxy:
-                    st.markdown(f"**No Proxy:** `{profile.no_proxy}`")
-            if profile.http_proxy_alt or profile.https_proxy_alt:
-                st.markdown(f"**Alt HTTP Proxy:** `{profile.http_proxy_alt or 'N/A'}`")
-                st.markdown(f"**Alt HTTPS Proxy:** `{profile.https_proxy_alt or 'N/A'}`")
-            if not (profile.http_proxy or profile.https_proxy or profile.http_proxy_alt or profile.https_proxy_alt):
-                st.markdown("**Proxy:** Not configured")
+    if profile.cluster_source == "imported":
+        cols = st.columns(4)
+        cols[0].metric("Profile", profile.name)
+        cols[1].metric("K8s Version", profile.kubernetes_version)
+        cols[2].metric("Source", "Imported (kubeconfig)")
+        cols[3].metric("Status", profile.status.upper())
+        with st.expander("Cluster Details", expanded=False):
+            st.markdown(f"**Description:** {profile.description or 'N/A'}")
+            st.markdown(f"**Kubeconfig:** {'Loaded' if profile.kubeconfig_content else 'Not loaded'}")
+    else:
+        cols = st.columns(5)
+        cols[0].metric("Profile", profile.name)
+        cols[1].metric("K8s Version", profile.kubernetes_version)
+        cols[2].metric("Runtime", f"CRI-O {profile.crio_version}")
+        cols[3].metric("CNI", "Flannel")
+        cols[4].metric("Nodes", f"{len(profile.get_control_plane_nodes())} CP + {len(profile.get_worker_nodes())} W")
+
+        with st.expander("Storage & Proxy Details", expanded=False):
+            scol1, scol2, scol3 = st.columns(3)
+            with scol1:
+                st.markdown(f"**CRI-O Root:** `{profile.crio_root}`")
+                st.markdown(f"**CRI-O RunRoot:** `{profile.crio_runroot}`")
+            with scol2:
+                st.markdown(f"**Kubelet Dir:** `{profile.kubelet_root}`")
+                st.markdown(f"**Log Root:** `{profile.log_root}`")
+            with scol3:
+                if profile.http_proxy or profile.https_proxy:
+                    st.markdown(f"**HTTP Proxy:** `{profile.http_proxy or 'N/A'}`")
+                    st.markdown(f"**HTTPS Proxy:** `{profile.https_proxy or 'N/A'}`")
+                    if profile.no_proxy:
+                        st.markdown(f"**No Proxy:** `{profile.no_proxy}`")
+                if profile.http_proxy_alt or profile.https_proxy_alt:
+                    st.markdown(f"**Alt HTTP Proxy:** `{profile.http_proxy_alt or 'N/A'}`")
+                    st.markdown(f"**Alt HTTPS Proxy:** `{profile.https_proxy_alt or 'N/A'}`")
+                if not (profile.http_proxy or profile.https_proxy or profile.http_proxy_alt or profile.https_proxy_alt):
+                    st.markdown("**Proxy:** Not configured")
 
 
 # ── Main Router ───────────────────────────────────────────────────────────
@@ -1371,12 +2367,16 @@ def main():
         page_profile_manager()
     elif page == "Cluster Creation":
         page_cluster_creation()
+    elif page == "Resource Viewer":
+        page_resource_viewer()
     elif page == "Cluster Debugger":
         page_cluster_debugger()
     elif page == "Monitoring Setup":
         page_monitoring_setup()
     elif page == "Log Analysis":
         page_log_analysis()
+    elif page == "Upgrade Planner":
+        page_upgrade_planner()
     elif page == "AI Assistant":
         page_ai_assistant()
 
diff --git a/k8s-agent/modules/cluster_creator.py b/k8s-agent/modules/cluster_creator.py
index 22dcfd9..79c94cd 100644
--- a/k8s-agent/modules/cluster_creator.py
+++ b/k8s-agent/modules/cluster_creator.py
@@ -1,10 +1,12 @@
 """Cluster Creator — SSH-based K8s cluster provisioning with CRI-O + Flannel."""
 
+import os
 import subprocess
 import time
 from dataclasses import dataclass, field
 from typing import List, Optional
 
+import config
 from modules.profile_manager import ClusterProfile
 
 # Default Flannel manifest URL — can be overridden by user-uploaded file
@@ -1183,6 +1185,88 @@ def get_llm_cluster_advice(profile: ClusterProfile, context: str = "") -> str:
     return query_llm(prompt)
 
 
+def run_kubectl(profile: ClusterProfile, command: str, timeout: int = 30) -> SSHResult:
+    """Run a kubectl (or helm) command against the cluster.
+
+    For imported clusters (with kubeconfig), commands run locally.
+    For provisioned clusters, commands run via SSH on the control-plane node.
+
+    If `command` starts with 'helm ', it is treated as a helm command and
+    the KUBECONFIG env var is set instead of prefixing with 'kubectl'.
+    """
+    is_helm = command.strip().startswith("helm ")
+
+    if profile.kubeconfig_content:
+        # Write kubeconfig to a file and run locally
+        kubeconfig_path = os.path.join(
+            config.DATA_DIR, "kubeconfigs", f"{profile.name}.kubeconfig"
+        )
+        os.makedirs(os.path.dirname(kubeconfig_path), exist_ok=True)
+        with open(kubeconfig_path, "w") as f:
+            f.write(profile.kubeconfig_content)
+
+        if is_helm:
+            full_cmd = f"KUBECONFIG={kubeconfig_path} {command}"
+        else:
+            full_cmd = f"kubectl --kubeconfig={kubeconfig_path} {command}"
+        try:
+            proc = subprocess.run(
+                full_cmd,
+                shell=True,
+                capture_output=True,
+                text=True,
+                timeout=timeout,
+            )
+            return SSHResult(
+                hostname="local (kubeconfig)",
+                command=full_cmd,
+                return_code=proc.returncode,
+                stdout=proc.stdout,
+                stderr=proc.stderr,
+                success=proc.returncode == 0,
+            )
+        except subprocess.TimeoutExpired:
+            return SSHResult(
+                hostname="local (kubeconfig)",
+                command=full_cmd,
+                return_code=-1,
+                stdout="",
+                stderr=f"Command timed out after {timeout}s",
+                success=False,
+            )
+        except Exception as exc:
+            return SSHResult(
+                hostname="local (kubeconfig)",
+                command=full_cmd,
+                return_code=-1,
+                stdout="",
+                stderr=str(exc),
+                success=False,
+            )
+    else:
+        # Provisioned cluster — SSH to control-plane
+        cp_nodes = profile.get_control_plane_nodes()
+        if not cp_nodes:
+            return SSHResult(
+                hostname="N/A",
+                command=command,
+                return_code=-1,
+                stdout="",
+                stderr="No control-plane node defined",
+                success=False,
+            )
+        cp = cp_nodes[0]
+        remote_cmd = command if is_helm else f"kubectl {command}"
+        return run_ssh_command(
+            ip_address=cp["ip_address"],
+            command=remote_cmd,
+            ssh_user=cp.get("ssh_user", "root"),
+            ssh_port=cp.get("ssh_port", 22),
+            ssh_key_path=cp.get("ssh_key_path", "~/.ssh/id_rsa"),
+            timeout=timeout,
+        )
+
+
 def upload_flannel_manifest_to_node(node: dict, local_path: str) -> SSHResult:
     """SCP a user-provided Flannel manifest to a node as /tmp/kube-flannel-custom.yml."""
     scp_cmd = [
diff --git a/k8s-agent/modules/cluster_debugger.py b/k8s-agent/modules/cluster_debugger.py
index b6798cc..390e6ed 100644
--- a/k8s-agent/modules/cluster_debugger.py
+++ b/k8s-agent/modules/cluster_debugger.py
@@ -1,11 +1,36 @@
-"""Cluster Debugger — Diagnose K8s issues and provide LLM-powered recommendations."""
+"""Cluster Debugger — Diagnose K8s issues and provide LLM-powered recommendations.
+
+Supports both provisioned clusters (SSH-based) and imported clusters (kubeconfig-based).
+"""
+
+import os
+import subprocess
 
 from modules.cluster_creator import run_ssh_command, SSHResult
 from modules.profile_manager import ClusterProfile
+import config
 
 
 # ── Diagnostic command definitions ────────────────────────────────────────
 
+# kubectl-only commands (work for both imported and provisioned clusters)
+KUBECTL_DIAGNOSTIC_COMMANDS = {
+    "Node Status": "get nodes -o wide",
+    "Pod Status (All Namespaces)": "get pods -A -o wide",
+    "Events (Recent)": "get events -A --sort-by=.lastTimestamp",
+    "Component Status": "get componentstatuses",
+    "System Pods": "-n kube-system get pods -o wide",
+    "Node Resources": "top nodes",
+    "Pod Resources": "top pods -A",
+    "Cluster Info": "cluster-info",
+    "Flannel Status": "-n kube-flannel get pods -o wide",
+    "Network Policies": "get networkpolicies -A",
+    "Services": "get svc -A",
+    "PVCs": "get pvc -A",
+    "Ingresses": "get ingress -A",
+}
+
+# Full SSH commands (backward-compat for provisioned clusters)
 DIAGNOSTIC_COMMANDS = {
     "Node Status": "kubectl get nodes -o wide",
     "Pod Status (All Namespaces)": "kubectl get pods -A -o wide",
@@ -30,6 +55,39 @@
     "Certificate Expiry": "kubeadm certs check-expiration 2>/dev/null || echo 'Not a kubeadm node or kubeadm not found'",
 }
 
+
+def _run_local_kubectl(kubeconfig_content: str, kubectl_args: str, timeout: int = 60) -> SSHResult:
+    """Run a kubectl command locally using the given kubeconfig content."""
+    kubeconfig_path = os.path.join(config.DATA_DIR, "kubeconfigs", "_debug_temp.kubeconfig")
+    os.makedirs(os.path.dirname(kubeconfig_path), exist_ok=True)
+    with open(kubeconfig_path, "w") as f:
+        f.write(kubeconfig_content)
+    full_cmd = f"kubectl --kubeconfig={kubeconfig_path} {kubectl_args}"
+    try:
+        proc = subprocess.run(full_cmd, shell=True, capture_output=True, text=True, timeout=timeout)
+        return SSHResult(
+            hostname="local", command=full_cmd, return_code=proc.returncode,
+            stdout=proc.stdout, stderr=proc.stderr, success=proc.returncode == 0,
+        )
+    except subprocess.TimeoutExpired:
+        return SSHResult(
+            hostname="local", command=full_cmd, return_code=-1,
+            stdout="", stderr=f"Command timed out after {timeout}s", success=False,
+        )
+    except Exception as e:
+        return SSHResult(
+            hostname="local", command=full_cmd, return_code=-1,
+            stdout="", stderr=str(e), success=False,
+        )
+
+
+def get_available_commands(profile: ClusterProfile) -> dict[str, str]:
+    """Return available diagnostic commands based on cluster source."""
+    if profile.cluster_source == "imported":
+        return dict(KUBECTL_DIAGNOSTIC_COMMANDS)
+    return dict(DIAGNOSTIC_COMMANDS)
+
+
 CATEGORY_MAP = {
     "Cluster Overview": [
         "Node Status",
@@ -73,10 +131,33 @@
 
 
 def run_diagnostic(
-    control_plane_node: dict,
+    control_plane_node: dict | None,
     command_name: str,
+    profile: ClusterProfile | None = None,
 ) -> SSHResult:
-    """Run a single diagnostic command on the control-plane node."""
+    """Run a single diagnostic command.
+
+    For imported clusters, uses kubectl locally with kubeconfig.
+    For provisioned clusters, uses SSH to the control-plane node.
+    """
+    # Imported cluster path
+    if profile and profile.cluster_source == "imported" and profile.kubeconfig_content:
+        kubectl_args = KUBECTL_DIAGNOSTIC_COMMANDS.get(command_name)
+        if kubectl_args is None:
+            return SSHResult(
+                hostname="local", command=command_name, return_code=1,
+                stdout="",
+                stderr=f"Command '{command_name}' requires SSH (not available for imported clusters).",
+                success=False,
+            )
+        return _run_local_kubectl(profile.kubeconfig_content, kubectl_args, timeout=60)
+
+    # Provisioned cluster path (SSH)
+    if not control_plane_node:
+        return SSHResult(
+            hostname="unknown", command=command_name, return_code=1,
+            stdout="", stderr="No control-plane node available.", success=False,
+        )
     command = DIAGNOSTIC_COMMANDS.get(command_name)
     if not command:
         return SSHResult(
@@ -98,30 +179,47 @@ def run_diagnostic(
 
 
 def run_category_diagnostics(
-    control_plane_node: dict,
+    control_plane_node: dict | None,
     category: str,
+    profile: ClusterProfile | None = None,
 ) -> dict[str, SSHResult]:
     """Run all diagnostic commands for a given category."""
     results = {}
     command_names = CATEGORY_MAP.get(category, [])
     for name in command_names:
-        results[name] = run_diagnostic(control_plane_node, name)
+        results[name] = run_diagnostic(control_plane_node, name, profile=profile)
     return results
 
 
-def run_all_diagnostics(control_plane_node: dict) -> dict[str, SSHResult]:
+def run_all_diagnostics(
+    control_plane_node: dict | None,
+    profile: ClusterProfile | None = None,
+) -> dict[str, SSHResult]:
     """Run every diagnostic command."""
+    commands = get_available_commands(profile) if profile else DIAGNOSTIC_COMMANDS
     results = {}
-    for name in DIAGNOSTIC_COMMANDS:
-        results[name] = run_diagnostic(control_plane_node, name)
+    for name in commands:
+        results[name] = run_diagnostic(control_plane_node, name, profile=profile)
     return results
 
 
 def run_custom_command(
-    control_plane_node: dict,
+    control_plane_node: dict | None,
     command: str,
+    profile: ClusterProfile | None = None,
 ) -> SSHResult:
-    """Run a custom command on the control-plane node."""
+    """Run a custom command. For imported clusters, runs kubectl locally."""
+    if profile and profile.cluster_source == "imported" and profile.kubeconfig_content:
+        cmd = command.strip()
+        if cmd.startswith("kubectl "):
+            cmd = cmd[len("kubectl "):]
+        return _run_local_kubectl(profile.kubeconfig_content, cmd, timeout=60)
+
+    if not control_plane_node:
+        return SSHResult(
+            hostname="unknown", command=command, return_code=1,
+            stdout="", stderr="No control-plane node available.", success=False,
+        )
     return run_ssh_command(
         ip_address=control_plane_node["ip_address"],
         command=command,
@@ -211,9 +309,27 @@ def get_debug_suggestion(
     return query_llm(prompt)
 
 
-def check_pod_issues(control_plane_node: dict, namespace: str = "") -> SSHResult:
+def check_pod_issues(
+    control_plane_node: dict | None,
+    namespace: str = "",
+    profile: ClusterProfile | None = None,
+) -> SSHResult:
     """Check for pods in non-running states."""
     ns_flag = f"-n {namespace}" if namespace else "-A"
+
+    if profile and profile.cluster_source == "imported" and profile.kubeconfig_content:
+        kubectl_args = (
+            f"get pods {ns_flag} "
+            "--field-selector=status.phase!=Running,status.phase!=Succeeded -o wide"
+        )
+        return _run_local_kubectl(profile.kubeconfig_content, kubectl_args, timeout=60)
+
+    if not control_plane_node:
+        return SSHResult(
+            hostname="unknown", command="check_pod_issues", return_code=1,
+            stdout="", stderr="No control-plane node available.", success=False,
+        )
+
     command = (
         f"kubectl get pods {ns_flag} --field-selector="
         "'status.phase!=Running,status.phase!=Succeeded' -o wide 2>/dev/null; "
diff --git a/k8s-agent/modules/log_analyzer.py b/k8s-agent/modules/log_analyzer.py
index fea9438..c58c86d 100644
--- a/k8s-agent/modules/log_analyzer.py
+++ b/k8s-agent/modules/log_analyzer.py
@@ -1,11 +1,62 @@
-"""Log Analyzer — Kubernetes log collection, parsing, error correlation, and analysis."""
+"""Log Analyzer — Kubernetes log collection, parsing, error correlation, and analysis.
 
+Supports both provisioned clusters (SSH-based) and imported clusters (kubeconfig-based).
+"""
+
+import os
 import re
+import subprocess
 from collections import Counter
 from dataclasses import dataclass, field
 from typing import Optional
 
 from modules.cluster_creator import run_ssh_command, SSHResult
+from modules.profile_manager import ClusterProfile
+import config
+
+
+def _run_local_shell(kubeconfig_content: str, command: str, timeout: int = 60) -> SSHResult:
+    """Run a shell command locally with KUBECONFIG set from profile content."""
+    kubeconfig_path = os.path.join(config.DATA_DIR, "kubeconfigs", "_log_temp.kubeconfig")
+    os.makedirs(os.path.dirname(kubeconfig_path), exist_ok=True)
+    with open(kubeconfig_path, "w") as f:
+        f.write(kubeconfig_content)
+    env = dict(os.environ, KUBECONFIG=kubeconfig_path)
+    try:
+        proc = subprocess.run(command, shell=True, capture_output=True, text=True, timeout=timeout, env=env)
+        return SSHResult(
+            hostname="local", command=command, return_code=proc.returncode,
+            stdout=proc.stdout, stderr=proc.stderr, success=proc.returncode == 0,
+        )
+    except subprocess.TimeoutExpired:
+        return SSHResult(
+            hostname="local", command=command, return_code=-1,
+            stdout="", stderr=f"Command timed out after {timeout}s", success=False,
+        )
+    except Exception as e:
+        return SSHResult(
+            hostname="local", command=command, return_code=-1,
+            stdout="", stderr=str(e), success=False,
+        )
+
+
+def _run_on_cluster(control_plane_node: dict | None, command: str, profile: ClusterProfile | None = None, timeout: int = 60) -> SSHResult:
+    """Route command to local shell or SSH based on cluster source."""
+    if profile and profile.cluster_source == "imported" and profile.kubeconfig_content:
+        return _run_local_shell(profile.kubeconfig_content, command, timeout=timeout)
+    if not control_plane_node:
+        return SSHResult(
+            hostname="unknown", command=command, return_code=1,
+            stdout="", stderr="No control-plane node available.", success=False,
+        )
+    return run_ssh_command(
+        ip_address=control_plane_node["ip_address"],
+        command=command,
+        ssh_user=control_plane_node.get("ssh_user", "root"),
+        ssh_port=control_plane_node.get("ssh_port", 22),
+        ssh_key_path=control_plane_node.get("ssh_key_path", "~/.ssh/id_rsa"),
+        timeout=timeout,
+    )
 
 
 @dataclass
@@ -34,6 +85,9 @@ class LogAnalysisResult:
 
 # ── Log collection commands ───────────────────────────────────────────────
 
+# SSH-only sources (journalctl requires node access)
+SSH_ONLY_LOG_SOURCES = {"Kubelet", "CRI-O"}
+
 LOG_SOURCES = {
     "Kubelet": "journalctl -u kubelet --no-pager -n {lines} --since '{since}'",
     "CRI-O": "journalctl -u crio --no-pager -n {lines} --since '{since}'",
@@ -46,22 +100,39 @@ class LogAnalysisResult:
     "Events": "kubectl get events -A --sort-by='.lastTimestamp' | tail -{lines}",
 }
 
+def get_available_log_sources(profile: ClusterProfile | None = None) -> list[str]:
+    """Return log sources available for the given cluster type."""
+    if profile and profile.cluster_source == "imported":
+        return [s for s in LOG_SOURCES if s not in SSH_ONLY_LOG_SOURCES]
+    return list(LOG_SOURCES.keys())
+
+
 POD_LOG_COMMAND = "kubectl logs {pod_ref} --tail={lines} --since={since_k8s} {container_flag}"
 POD_PREVIOUS_LOG_COMMAND = "kubectl logs {pod_ref} --previous --tail={lines} {container_flag} 2>/dev/null || echo 'No previous logs available'"
 
 
 def collect_logs(
-    control_plane_node: dict,
+    control_plane_node: dict | None,
     source: str,
     lines: int = 200,
     since: str = "1 hour ago",
     since_k8s: str = "1h",
+    profile: ClusterProfile | None = None,
 ) -> SSHResult:
     """Collect logs from a specific source on the cluster."""
+    # Block SSH-only sources for imported clusters
+    if profile and profile.cluster_source == "imported" and source in SSH_ONLY_LOG_SOURCES:
+        return SSHResult(
+            hostname="local", command=source, return_code=1,
+            stdout="",
+            stderr=f"'{source}' logs require SSH access (not available for imported clusters).",
+            success=False,
+        )
+
     cmd_template = LOG_SOURCES.get(source)
     if not cmd_template:
         return SSHResult(
-            hostname=control_plane_node["ip_address"],
+            hostname=control_plane_node["ip_address"] if control_plane_node else "local",
             command=source,
             return_code=1,
             stdout="",
@@ -75,24 +146,18 @@ def collect_logs(
         since_k8s=since_k8s,
     )
 
-    return run_ssh_command(
-        ip_address=control_plane_node["ip_address"],
-        command=command,
-        ssh_user=control_plane_node.get("ssh_user", "root"),
-        ssh_port=control_plane_node.get("ssh_port", 22),
-        ssh_key_path=control_plane_node.get("ssh_key_path", "~/.ssh/id_rsa"),
-        timeout=60,
-    )
+    return _run_on_cluster(control_plane_node, command, profile=profile, timeout=60)
 
 
 def collect_pod_logs(
-    control_plane_node: dict,
+    control_plane_node: dict | None,
     namespace: str,
     pod_name: str,
     container: str = "",
     lines: int = 200,
     since_k8s: str = "1h",
     previous: bool = False,
+    profile: ClusterProfile | None = None,
 ) -> SSHResult:
     """Collect logs from a specific pod."""
     pod_ref = f"-n {namespace} {pod_name}"
@@ -112,28 +177,22 @@ def collect_pod_logs(
             container_flag=container_flag,
         )
 
-    return run_ssh_command(
-        ip_address=control_plane_node["ip_address"],
-        command=command,
-        ssh_user=control_plane_node.get("ssh_user", "root"),
-        ssh_port=control_plane_node.get("ssh_port", 22),
-        ssh_key_path=control_plane_node.get("ssh_key_path", "~/.ssh/id_rsa"),
-        timeout=60,
-    )
+    return _run_on_cluster(control_plane_node, command, profile=profile, timeout=60)
 
 
 def collect_multi_source_logs(
-    control_plane_node: dict,
+    control_plane_node: dict | None,
     sources: list[str],
     lines: int = 100,
     since: str = "1 hour ago",
     since_k8s: str = "1h",
+    profile: ClusterProfile | None = None,
 ) -> dict[str, SSHResult]:
     """Collect logs from multiple sources."""
     results = {}
     for source in sources:
         results[source] = collect_logs(
-            control_plane_node, source, lines, since, since_k8s
+            control_plane_node, source, lines, since, since_k8s, profile=profile
         )
     return results
 
@@ -338,17 +397,11 @@ def llm_correlate_analysis(
 
 
 def get_pod_list(
-    control_plane_node: dict,
+    control_plane_node: dict | None,
     namespace: str = "",
+    profile: ClusterProfile | None = None,
 ) -> SSHResult:
     """Get list of pods for the log analysis UI."""
     ns_flag = f"-n {namespace}" if namespace else "-A"
     command = f"kubectl get pods {ns_flag} -o custom-columns='NAMESPACE:.metadata.namespace,NAME:.metadata.name,STATUS:.status.phase,CONTAINERS:.spec.containers[*].name' --no-headers"
-    return run_ssh_command(
-        ip_address=control_plane_node["ip_address"],
-        command=command,
-        ssh_user=control_plane_node.get("ssh_user", "root"),
-        ssh_port=control_plane_node.get("ssh_port", 22),
-        ssh_key_path=control_plane_node.get("ssh_key_path", "~/.ssh/id_rsa"),
-        timeout=30,
-    )
+    return _run_on_cluster(control_plane_node, command, profile=profile, timeout=30)
diff --git a/k8s-agent/modules/monitoring_setup.py b/k8s-agent/modules/monitoring_setup.py
index a2272d7..3df9ba4 100644
--- a/k8s-agent/modules/monitoring_setup.py
+++ b/k8s-agent/modules/monitoring_setup.py
@@ -1,7 +1,58 @@
-"""Monitoring Setup — Prometheus, Grafana, and dashboard provisioning via SSH."""
+"""Monitoring Setup — Prometheus, Grafana, and dashboard provisioning.
+
+Supports both provisioned clusters (SSH-based) and imported clusters (kubeconfig-based).
+"""
+
+import os
+import subprocess
 
 from modules.cluster_creator import run_ssh_command, SSHResult
 from modules.profile_manager import ClusterProfile
+import config
+
+
+def _run_local_shell(kubeconfig_content: str, command: str, timeout: int = 120) -> SSHResult:
+    """Run a shell command locally with KUBECONFIG set from profile content."""
+    kubeconfig_path = os.path.join(config.DATA_DIR, "kubeconfigs", "_monitor_temp.kubeconfig")
+    os.makedirs(os.path.dirname(kubeconfig_path), exist_ok=True)
+    with open(kubeconfig_path, "w") as f:
+        f.write(kubeconfig_content)
+    env = dict(os.environ, KUBECONFIG=kubeconfig_path)
+    try:
+        proc = subprocess.run(command, shell=True, capture_output=True, text=True, timeout=timeout, env=env)
+        return SSHResult(
+            hostname="local", command=command, return_code=proc.returncode,
+            stdout=proc.stdout, stderr=proc.stderr, success=proc.returncode == 0,
+        )
+    except subprocess.TimeoutExpired:
+        return SSHResult(
+            hostname="local", command=command, return_code=-1,
+            stdout="", stderr=f"Command timed out after {timeout}s", success=False,
+        )
+    except Exception as e:
+        return SSHResult(
+            hostname="local", command=command, return_code=-1,
+            stdout="", stderr=str(e), success=False,
+        )
+
+
+def _run_on_cluster(control_plane_node: dict | None, command: str, profile: ClusterProfile | None = None, timeout: int = 120) -> SSHResult:
+    """Route command to local kubectl or SSH based on cluster source."""
+    if profile and profile.cluster_source == "imported" and profile.kubeconfig_content:
+        return _run_local_shell(profile.kubeconfig_content, command, timeout=timeout)
+    if not control_plane_node:
+        return SSHResult(
+            hostname="unknown", command=command, return_code=1,
+            stdout="", stderr="No control-plane node available.", success=False,
+        )
+    return run_ssh_command(
+        ip_address=control_plane_node["ip_address"],
+        command=command,
+        ssh_user=control_plane_node.get("ssh_user", "root"),
+        ssh_port=control_plane_node.get("ssh_port", 22),
+        ssh_key_path=control_plane_node.get("ssh_key_path", "~/.ssh/id_rsa"),
+        timeout=timeout,
+    )
 
 
 def generate_helm_install_script() -> str:
@@ -324,67 +375,44 @@ def generate_alerting_rules_script(namespace: str = "monitoring") -> str:
 """
 
 
-def install_helm(control_plane_node: dict) -> SSHResult:
-    """Install Helm on the control-plane node."""
-    return run_ssh_command(
-        ip_address=control_plane_node["ip_address"],
-        command=generate_helm_install_script(),
-        ssh_user=control_plane_node.get("ssh_user", "root"),
-        ssh_port=control_plane_node.get("ssh_port", 22),
-        ssh_key_path=control_plane_node.get("ssh_key_path", "~/.ssh/id_rsa"),
-        timeout=120,
-    )
+def install_helm(control_plane_node: dict | None = None, profile: ClusterProfile | None = None) -> SSHResult:
+    """Install Helm on the control-plane node or locally for imported clusters."""
+    return _run_on_cluster(control_plane_node, generate_helm_install_script(), profile=profile, timeout=120)
 
 
 def install_prometheus_stack(
-    control_plane_node: dict,
+    control_plane_node: dict | None = None,
     namespace: str = "monitoring",
+    profile: ClusterProfile | None = None,
 ) -> SSHResult:
     """Install the full kube-prometheus-stack."""
-    return run_ssh_command(
-        ip_address=control_plane_node["ip_address"],
-        command=generate_prometheus_install_script(namespace),
-        ssh_user=control_plane_node.get("ssh_user", "root"),
-        ssh_port=control_plane_node.get("ssh_port", 22),
-        ssh_key_path=control_plane_node.get("ssh_key_path", "~/.ssh/id_rsa"),
-        timeout=900,
-    )
+    return _run_on_cluster(control_plane_node, generate_prometheus_install_script(namespace), profile=profile, timeout=900)
 
 
 def install_dashboards(
-    control_plane_node: dict,
-    dashboard_keys: list[str],
+    control_plane_node: dict | None = None,
+    dashboard_keys: list[str] | None = None,
     namespace: str = "monitoring",
+    profile: ClusterProfile | None = None,
 ) -> SSHResult:
     """Import selected Grafana dashboards."""
-    return run_ssh_command(
-        ip_address=control_plane_node["ip_address"],
-        command=generate_dashboard_import_script(dashboard_keys, namespace),
-        ssh_user=control_plane_node.get("ssh_user", "root"),
-        ssh_port=control_plane_node.get("ssh_port", 22),
-        ssh_key_path=control_plane_node.get("ssh_key_path", "~/.ssh/id_rsa"),
-        timeout=300,
-    )
+    dashboard_keys = dashboard_keys or []
+    return _run_on_cluster(control_plane_node, generate_dashboard_import_script(dashboard_keys, namespace), profile=profile, timeout=300)
 
 
 def install_alert_rules(
-    control_plane_node: dict,
+    control_plane_node: dict | None = None,
     namespace: str = "monitoring",
+    profile: ClusterProfile | None = None,
 ) -> SSHResult:
     """Install Prometheus alerting rules."""
-    return run_ssh_command(
-        ip_address=control_plane_node["ip_address"],
-        command=generate_alerting_rules_script(namespace),
-        ssh_user=control_plane_node.get("ssh_user", "root"),
-        ssh_port=control_plane_node.get("ssh_port", 22),
-        ssh_key_path=control_plane_node.get("ssh_key_path", "~/.ssh/id_rsa"),
-        timeout=60,
-    )
+    return _run_on_cluster(control_plane_node, generate_alerting_rules_script(namespace), profile=profile, timeout=60)
 
 
 def get_monitoring_status(
-    control_plane_node: dict,
+    control_plane_node: dict | None = None,
     namespace: str = "monitoring",
+    profile: ClusterProfile | None = None,
 ) -> SSHResult:
     """Check the status of the monitoring stack."""
     command = f"""
@@ -405,14 +433,7 @@ def get_monitoring_status(
 echo ">> ServiceMonitors:"
 kubectl -n {namespace} get servicemonitors 2>/dev/null || echo "No ServiceMonitors found"
 """
-    return run_ssh_command(
-        ip_address=control_plane_node["ip_address"],
-        command=command,
-        ssh_user=control_plane_node.get("ssh_user", "root"),
-        ssh_port=control_plane_node.get("ssh_port", 22),
-        ssh_key_path=control_plane_node.get("ssh_key_path", "~/.ssh/id_rsa"),
-        timeout=30,
-    )
+    return _run_on_cluster(control_plane_node, command, profile=profile, timeout=30)
 
 
 def get_monitoring_advice(
diff --git a/k8s-agent/modules/profile_manager.py b/k8s-agent/modules/profile_manager.py
index 90cd9bf..d5707cd 100644
--- a/k8s-agent/modules/profile_manager.py
+++ b/k8s-agent/modules/profile_manager.py
@@ -54,6 +54,9 @@ class ClusterProfile:
     # Offline manifest paths — user-provided files for air-gapped environments
     flannel_manifest_path: str = ""  # local path to kube-flannel.yml
     prometheus_manifest_path: str = ""  # local path to prometheus manifest
+    # Kubeconfig for existing clusters (imported, not provisioned)
+    kubeconfig_content: str = ""  # raw kubeconfig YAML content
+    cluster_source: str = "provisioned"  # "provisioned" or "imported"
 
     def get_control_plane_nodes(self) -> list[dict]:
         return [n for n in self.nodes if n.get("role") == "control-plane"]

From fc3b900ceccbbec2004e5683341fe8dc2d2d5558 Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Mon, 6 Apr 2026 12:57:16 +0000
Subject: [PATCH 09/31] Fix kubeconfig import: move file_uploader outside
 st.form to prevent silent reset on submit

---
 k8s-agent/app.py | 96 ++++++++++++++++++++++++------------------------
 1 file changed, 49 insertions(+), 47 deletions(-)

diff --git a/k8s-agent/app.py b/k8s-agent/app.py
index 0000017..d2d0bd0 100644
--- a/k8s-agent/app.py
+++ b/k8s-agent/app.py
@@ -464,54 +464,56 @@ def page_profile_manager():
             "without provisioning a new cluster."
         )
 
-        with st.form("import_cluster_form"):
-            import_name = st.text_input(
-                "Profile Name *",
-                placeholder="my-existing-cluster",
-            )
-            import_desc = st.text_area(
-                "Description",
-                placeholder="Production cluster running in datacenter A",
-            )
-            kubeconfig_file = st.file_uploader(
-                "Upload kubeconfig file",
-                type=["yaml", "yml", "conf", "config"],
-                key="kubeconfig_upload",
-                help="Usually found at ~/.kube/config on your cluster's control-plane node.",
-            )
-            k8s_ver = st.text_input(
-                "Kubernetes Version (optional)",
-                placeholder="1.30",
-                value="1.30",
-            )
-
-            submitted_import = st.form_submit_button(
-                "Import Cluster", type="primary", use_container_width=True,
-            )
+        # NOTE: file_uploader is kept OUTSIDE st.form because Streamlit
+        # resets the uploaded file on form submission, causing the import
+        # to silently do nothing.
+        import_name = st.text_input(
+            "Profile Name *",
+            placeholder="my-existing-cluster",
+            key="import_cluster_name",
+        )
+        import_desc = st.text_area(
+            "Description",
+            placeholder="Production cluster running in datacenter A",
+            key="import_cluster_desc",
+        )
+        kubeconfig_file = st.file_uploader(
+            "Upload kubeconfig file",
+            type=["yaml", "yml", "conf", "config", "txt"],
+            key="kubeconfig_upload",
+            help="Usually found at ~/.kube/config on your cluster's control-plane node. "
+                 "If your file has no extension, rename it to config.yaml or config.txt before uploading.",
+        )
+        k8s_ver = st.text_input(
+            "Kubernetes Version (optional)",
+            placeholder="1.30",
+            value="1.30",
+            key="import_cluster_k8s_ver",
+        )
 
-            if submitted_import:
-                if not import_name:
-                    st.error("Profile name is required.")
-                elif not kubeconfig_file:
-                    st.error("Please upload a kubeconfig file.")
-                else:
-                    kubeconfig_content = kubeconfig_file.read().decode("utf-8")
-                    profile = ClusterProfile(
-                        name=import_name,
-                        description=import_desc,
-                        kubernetes_version=k8s_ver or "1.30",
-                        status="active",
-                        cluster_source="imported",
-                        kubeconfig_content=kubeconfig_content,
-                    )
-                    save_profile(profile)
-                    st.session_state.active_profile = import_name
-                    st.success(
-                        f"Cluster '{import_name}' imported! "
-                        "Select it from the sidebar to start using Debugger, Monitoring, "
-                        "Resource Viewer, etc."
-                    )
-                    st.rerun()
+        if st.button("Import Cluster", type="primary", use_container_width=True, key="import_cluster_btn"):
+            if not import_name:
+                st.error("Profile name is required.")
+            elif not kubeconfig_file:
+                st.error("Please upload a kubeconfig file.")
+            else:
+                kubeconfig_content = kubeconfig_file.read().decode("utf-8")
+                profile = ClusterProfile(
+                    name=import_name,
+                    description=import_desc,
+                    kubernetes_version=k8s_ver or "1.30",
+                    status="imported",
+                    cluster_source="imported",
+                    kubeconfig_content=kubeconfig_content,
+                )
+                save_profile(profile)
+                st.session_state.active_profile = import_name
+                st.success(
+                    f"Cluster '{import_name}' imported! "
+                    "Select it from the sidebar to start using Debugger, Monitoring, "
+                    "Resource Viewer, etc."
+                )
+                st.rerun()
 
     # ── Manage Profiles ───────────────────────────────────────────────────
     with tab_list:

From d68e2699bb16ddda8a838f7ac9fbf243c237518e Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Mon, 6 Apr 2026 13:04:09 +0000
Subject: [PATCH 10/31] Add kubectl path detection and namespace auto-fetch for
 imported clusters

---
 k8s-agent/app.py                      | 138 ++++++++++++++++++++++----
 k8s-agent/config.py                   |  77 ++++++++++++++
 k8s-agent/modules/cluster_debugger.py |  20 +++-
 k8s-agent/modules/log_analyzer.py     |  29 +++++-
 k8s-agent/modules/monitoring_setup.py |  29 +++++-
 5 files changed, 265 insertions(+), 28 deletions(-)

diff --git a/k8s-agent/app.py b/k8s-agent/app.py
index d2d0bd0..b2342e6 100644
--- a/k8s-agent/app.py
+++ b/k8s-agent/app.py
@@ -11,7 +11,7 @@
 # Navigation uses native st.radio — no third-party component needed.
 
 import config
-from config import is_llm_configured
+from config import is_llm_configured, get_kubectl_path, fetch_namespaces, get_kubeconfig_path
 from modules.profile_manager import (
     ClusterProfile,
     save_profile,
@@ -899,6 +899,17 @@ def page_cluster_debugger():
             return
         cp_node = cp_nodes[0]
 
+    # kubectl availability check for imported clusters
+    if profile.cluster_source == "imported" and not get_kubectl_path():
+        st.error(
+            "**kubectl not found** on this machine.\n\n"
+            "Install it with:\n```\n"
+            "curl -LO https://dl.k8s.io/release/$(curl -Ls https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl\n"
+            "chmod +x kubectl && sudo mv kubectl /usr/local/bin/\n```\n"
+            "Or see: https://kubernetes.io/docs/tasks/tools/"
+        )
+        return
+
     available_commands = get_available_commands(profile)
 
     tab_quick, tab_category, tab_custom, tab_ai = st.tabs([
@@ -1062,7 +1073,29 @@ def page_monitoring_setup():
             return
         cp_node = cp_nodes[0]
 
-    namespace = st.text_input("Monitoring Namespace", value="monitoring")
+    # kubectl availability check for imported clusters
+    if profile.cluster_source == "imported" and not get_kubectl_path():
+        st.error(
+            "**kubectl not found** on this machine.\n\n"
+            "Install it with:\n```\n"
+            "curl -LO https://dl.k8s.io/release/$(curl -Ls https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl\n"
+            "chmod +x kubectl && sudo mv kubectl /usr/local/bin/\n```\n"
+            "Or see: https://kubernetes.io/docs/tasks/tools/"
+        )
+        return
+
+    # Namespace selection — auto-fetch from cluster for imported clusters
+    if profile.cluster_source == "imported" and profile.kubeconfig_content:
+        cluster_ns = fetch_namespaces(profile.kubeconfig_content)
+        if cluster_ns:
+            # Ensure "monitoring" is an option even if it doesn't exist yet
+            ns_options = cluster_ns if "monitoring" in cluster_ns else cluster_ns + ["monitoring"]
+            default_idx = ns_options.index("monitoring") if "monitoring" in ns_options else 0
+            namespace = st.selectbox("Monitoring Namespace", options=ns_options, index=default_idx, key="mon_ns")
+        else:
+            namespace = st.text_input("Monitoring Namespace", value="monitoring", key="mon_ns_txt")
+    else:
+        namespace = st.text_input("Monitoring Namespace", value="monitoring", key="mon_ns_txt")
 
     tab_install, tab_dashboards, tab_alerts, tab_status, tab_scripts, tab_advice = st.tabs([
         "Install Stack",
@@ -1216,6 +1249,22 @@ def page_log_analysis():
             return
         cp_node = cp_nodes[0]
 
+    # kubectl availability check for imported clusters
+    if profile.cluster_source == "imported" and not get_kubectl_path():
+        st.error(
+            "**kubectl not found** on this machine.\n\n"
+            "Install it with:\n```\n"
+            "curl -LO https://dl.k8s.io/release/$(curl -Ls https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl\n"
+            "chmod +x kubectl && sudo mv kubectl /usr/local/bin/\n```\n"
+            "Or see: https://kubernetes.io/docs/tasks/tools/"
+        )
+        return
+
+    # Pre-fetch namespaces for imported clusters (used by Pod Logs tab)
+    _cluster_namespaces: list[str] = []
+    if profile.cluster_source == "imported" and profile.kubeconfig_content:
+        _cluster_namespaces = fetch_namespaces(profile.kubeconfig_content)
+
     available_log_sources = get_available_log_sources(profile)
 
     tab_system, tab_pod, tab_correlation, tab_ai = st.tabs([
@@ -1285,7 +1334,12 @@ def page_log_analysis():
         st.markdown("### Pod Logs")
         col1, col2 = st.columns(2)
         with col1:
-            pod_ns = st.text_input("Namespace", value="default", key="pod_ns")
+            if _cluster_namespaces:
+                pod_ns = st.selectbox("Namespace", options=_cluster_namespaces,
+                                      index=_cluster_namespaces.index("default") if "default" in _cluster_namespaces else 0,
+                                      key="pod_ns")
+            else:
+                pod_ns = st.text_input("Namespace", value="default", key="pod_ns")
             pod_name = st.text_input("Pod Name", placeholder="my-pod-xyz", key="pod_name_input")
         with col2:
             container = st.text_input("Container (optional)", key="pod_container")
@@ -1452,6 +1506,22 @@ def page_resource_viewer():
         )
         return
 
+    # kubectl availability check for imported clusters
+    if profile.cluster_source == "imported" and not get_kubectl_path():
+        st.error(
+            "**kubectl not found** on this machine.\n\n"
+            "Install it with:\n```\n"
+            "curl -LO https://dl.k8s.io/release/$(curl -Ls https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl\n"
+            "chmod +x kubectl && sudo mv kubectl /usr/local/bin/\n```\n"
+            "Or see: https://kubernetes.io/docs/tasks/tools/"
+        )
+        return
+
+    # Pre-fetch namespaces for imported clusters
+    _rv_namespaces: list[str] = []
+    if profile.cluster_source == "imported" and profile.kubeconfig_content:
+        _rv_namespaces = fetch_namespaces(profile.kubeconfig_content)
+
     tab_resources, tab_node_health, tab_rbac, tab_helm, tab_events = st.tabs([
         "Cluster Resources",
         "Node Health",
@@ -1481,7 +1551,14 @@ def page_resource_viewer():
                     key="res_ns_choice",
                 )
                 if ns_choice == "Specific":
-                    namespace = st.text_input("Namespace", value="default", key="res_ns")
+                    if _rv_namespaces:
+                        namespace = st.selectbox(
+                            "Namespace", options=_rv_namespaces,
+                            index=_rv_namespaces.index("default") if "default" in _rv_namespaces else 0,
+                            key="res_ns",
+                        )
+                    else:
+                        namespace = st.text_input("Namespace", value="default", key="res_ns")
                 else:
                     namespace = ""
             else:
@@ -1528,11 +1605,19 @@ def page_resource_viewer():
                 key="desc_name",
             )
         with desc_col2:
-            desc_ns = st.text_input(
-                "Namespace (if applicable)",
-                value="default",
-                key="desc_ns",
-            )
+            if _rv_namespaces:
+                desc_ns = st.selectbox(
+                    "Namespace (if applicable)",
+                    options=_rv_namespaces,
+                    index=_rv_namespaces.index("default") if "default" in _rv_namespaces else 0,
+                    key="desc_ns",
+                )
+            else:
+                desc_ns = st.text_input(
+                    "Namespace (if applicable)",
+                    value="default",
+                    key="desc_ns",
+                )
 
         if st.button("Describe", key="describe_res") and desc_name:
             # Determine the singular resource type for describe
@@ -1638,12 +1723,21 @@ def page_resource_viewer():
 
         rbac_ns = ""
         if "(namespaced)" in rbac_type or rbac_type == "ServiceAccounts":
-            rbac_ns = st.text_input(
-                "Namespace",
-                value="default",
-                key="rbac_ns",
-                help="Leave blank for all namespaces",
-            )
+            if _rv_namespaces:
+                rbac_ns = st.selectbox(
+                    "Namespace (blank = all)",
+                    options=[""] + _rv_namespaces,
+                    index=0,
+                    key="rbac_ns",
+                    format_func=lambda x: "All Namespaces" if x == "" else x,
+                )
+            else:
+                rbac_ns = st.text_input(
+                    "Namespace",
+                    value="default",
+                    key="rbac_ns",
+                    help="Leave blank for all namespaces",
+                )
 
         if st.button("Fetch RBAC Resources", type="primary", key="fetch_rbac"):
             cmd_map = {
@@ -1708,7 +1802,12 @@ def page_resource_viewer():
             helm_ns_all = st.checkbox("All namespaces", value=True, key="helm_ns_all")
             helm_ns = ""
             if not helm_ns_all:
-                helm_ns = st.text_input("Namespace", value="default", key="helm_ns")
+                if _rv_namespaces:
+                    helm_ns = st.selectbox("Namespace", options=_rv_namespaces,
+                                           index=_rv_namespaces.index("default") if "default" in _rv_namespaces else 0,
+                                           key="helm_ns")
+                else:
+                    helm_ns = st.text_input("Namespace", value="default", key="helm_ns")
 
             if st.button("List Helm Releases", type="primary", key="helm_list"):
                 helm_cmd = "helm list"
@@ -1733,7 +1832,12 @@ def page_resource_viewer():
                 helm_release_name = st.text_input("Release Name", placeholder="my-release", key="helm_rel")
                 helm_chart = st.text_input("Chart", placeholder="prometheus-community/kube-prometheus-stack", key="helm_chart")
             with hcol2:
-                helm_install_ns = st.text_input("Namespace", value="default", key="helm_install_ns")
+                if _rv_namespaces:
+                    helm_install_ns = st.selectbox("Namespace", options=_rv_namespaces,
+                                                   index=_rv_namespaces.index("default") if "default" in _rv_namespaces else 0,
+                                                   key="helm_install_ns")
+                else:
+                    helm_install_ns = st.text_input("Namespace", value="default", key="helm_install_ns")
                 helm_create_ns = st.checkbox("Create namespace if not exists", value=True, key="helm_create_ns")
             helm_values = st.text_area(
                 "Values (YAML, optional)",
diff --git a/k8s-agent/config.py b/k8s-agent/config.py
index 14fb427..2317be1 100644
--- a/k8s-agent/config.py
+++ b/k8s-agent/config.py
@@ -1,6 +1,9 @@
 """Configuration for the K8s Agent application."""
 
 import os
+import shutil
+import subprocess
+
 
 # LLM Configuration
 LLM_API_URL = os.getenv(
@@ -27,3 +30,77 @@ def is_llm_configured() -> bool:
 # Ensure directories exist
 os.makedirs(PROFILES_DIR, exist_ok=True)
 os.makedirs(UPLOADS_DIR, exist_ok=True)
+
+
+# ── kubectl / helm path detection ─────────────────────────────────────────
+
+# Common install locations to check when kubectl/helm are not in PATH
+_KUBECTL_SEARCH_PATHS = [
+    "/usr/local/bin/kubectl",
+    "/usr/bin/kubectl",
+    "/snap/bin/kubectl",
+    os.path.expanduser("~/.local/bin/kubectl"),
+    os.path.expanduser("~/bin/kubectl"),
+    "/opt/bin/kubectl",
+]
+
+_HELM_SEARCH_PATHS = [
+    "/usr/local/bin/helm",
+    "/usr/bin/helm",
+    "/snap/bin/helm",
+    os.path.expanduser("~/.local/bin/helm"),
+    os.path.expanduser("~/bin/helm"),
+    "/opt/bin/helm",
+]
+
+
+def _find_binary(name: str, search_paths: list[str]) -> str:
+    """Find a binary by name, checking PATH first then common locations."""
+    found = shutil.which(name)
+    if found:
+        return found
+    for path in search_paths:
+        if os.path.isfile(path) and os.access(path, os.X_OK):
+            return path
+    return ""
+
+
+def get_kubectl_path() -> str:
+    """Return the full path to kubectl, or empty string if not found."""
+    return _find_binary("kubectl", _KUBECTL_SEARCH_PATHS)
+
+
+def get_helm_path() -> str:
+    """Return the full path to helm, or empty string if not found."""
+    return _find_binary("helm", _HELM_SEARCH_PATHS)
+
+
+def get_kubeconfig_path(profile_name: str = "_temp") -> str:
+    """Return the path where a kubeconfig file should be written for local commands."""
+    kc_dir = os.path.join(DATA_DIR, "kubeconfigs")
+    os.makedirs(kc_dir, exist_ok=True)
+    return os.path.join(kc_dir, f"{profile_name}.kubeconfig")
+
+
+def fetch_namespaces(kubeconfig_content: str) -> list[str]:
+    """Fetch all namespaces from a cluster using kubectl with the given kubeconfig.
+
+    Returns a list of namespace names, or an empty list on failure.
+    """
+    kubectl = get_kubectl_path()
+    if not kubectl:
+        return []
+    kc_path = get_kubeconfig_path("_ns_fetch")
+    os.makedirs(os.path.dirname(kc_path), exist_ok=True)
+    with open(kc_path, "w") as f:
+        f.write(kubeconfig_content)
+    try:
+        proc = subprocess.run(
+            f"{kubectl} --kubeconfig={kc_path} get namespaces -o jsonpath='{{.items[*].metadata.name}}'",
+            shell=True, capture_output=True, text=True, timeout=15,
+        )
+        if proc.returncode == 0 and proc.stdout.strip():
+            return sorted(proc.stdout.strip().split())
+        return []
+    except Exception:
+        return []
diff --git a/k8s-agent/modules/cluster_debugger.py b/k8s-agent/modules/cluster_debugger.py
index 390e6ed..45feb91 100644
--- a/k8s-agent/modules/cluster_debugger.py
+++ b/k8s-agent/modules/cluster_debugger.py
@@ -58,11 +58,25 @@
 
 def _run_local_kubectl(kubeconfig_content: str, kubectl_args: str, timeout: int = 60) -> SSHResult:
     """Run a kubectl command locally using the given kubeconfig content."""
-    kubeconfig_path = os.path.join(config.DATA_DIR, "kubeconfigs", "_debug_temp.kubeconfig")
-    os.makedirs(os.path.dirname(kubeconfig_path), exist_ok=True)
+    kubectl = config.get_kubectl_path()
+    if not kubectl:
+        return SSHResult(
+            hostname="local", command="kubectl " + kubectl_args, return_code=1,
+            stdout="",
+            stderr=(
+                "kubectl not found on this machine.\n\n"
+                "Install kubectl:\n"
+                "  curl -LO https://dl.k8s.io/release/$(curl -Ls https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl\n"
+                "  chmod +x kubectl && sudo mv kubectl /usr/local/bin/\n\n"
+                "Or on macOS: brew install kubectl\n"
+                "Or see: https://kubernetes.io/docs/tasks/tools/"
+            ),
+            success=False,
+        )
+    kubeconfig_path = config.get_kubeconfig_path("_debug_temp")
     with open(kubeconfig_path, "w") as f:
         f.write(kubeconfig_content)
-    full_cmd = f"kubectl --kubeconfig={kubeconfig_path} {kubectl_args}"
+    full_cmd = f"{kubectl} --kubeconfig={kubeconfig_path} {kubectl_args}"
     try:
         proc = subprocess.run(full_cmd, shell=True, capture_output=True, text=True, timeout=timeout)
         return SSHResult(
diff --git a/k8s-agent/modules/log_analyzer.py b/k8s-agent/modules/log_analyzer.py
index c58c86d..08a25e1 100644
--- a/k8s-agent/modules/log_analyzer.py
+++ b/k8s-agent/modules/log_analyzer.py
@@ -16,14 +16,35 @@
 
 
 def _run_local_shell(kubeconfig_content: str, command: str, timeout: int = 60) -> SSHResult:
-    """Run a shell command locally with KUBECONFIG set from profile content."""
-    kubeconfig_path = os.path.join(config.DATA_DIR, "kubeconfigs", "_log_temp.kubeconfig")
-    os.makedirs(os.path.dirname(kubeconfig_path), exist_ok=True)
+    """Run a shell command locally with KUBECONFIG set from profile content.
+
+    Replaces bare ``kubectl`` and ``helm`` references with their full paths
+    so the command works even when these binaries are not in $PATH.
+    """
+    kubectl = config.get_kubectl_path()
+    helm = config.get_helm_path()
+    if not kubectl:
+        return SSHResult(
+            hostname="local", command=command, return_code=1,
+            stdout="",
+            stderr=(
+                "kubectl not found on this machine.\n\n"
+                "Install kubectl:\n"
+                "  curl -LO https://dl.k8s.io/release/$(curl -Ls https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl\n"
+                "  chmod +x kubectl && sudo mv kubectl /usr/local/bin/\n\n"
+                "Or on macOS: brew install kubectl\n"
+                "Or see: https://kubernetes.io/docs/tasks/tools/"
+            ),
+            success=False,
+        )
+    # Replace bare kubectl/helm with full paths
+    resolved_cmd = command.replace("kubectl ", f"{kubectl} ").replace("helm ", f"{helm} " if helm else "helm ")
+    kubeconfig_path = config.get_kubeconfig_path("_log_temp")
     with open(kubeconfig_path, "w") as f:
         f.write(kubeconfig_content)
     env = dict(os.environ, KUBECONFIG=kubeconfig_path)
     try:
-        proc = subprocess.run(command, shell=True, capture_output=True, text=True, timeout=timeout, env=env)
+        proc = subprocess.run(resolved_cmd, shell=True, capture_output=True, text=True, timeout=timeout, env=env)
         return SSHResult(
             hostname="local", command=command, return_code=proc.returncode,
             stdout=proc.stdout, stderr=proc.stderr, success=proc.returncode == 0,
diff --git a/k8s-agent/modules/monitoring_setup.py b/k8s-agent/modules/monitoring_setup.py
index 3df9ba4..c62be44 100644
--- a/k8s-agent/modules/monitoring_setup.py
+++ b/k8s-agent/modules/monitoring_setup.py
@@ -12,14 +12,35 @@
 
 
 def _run_local_shell(kubeconfig_content: str, command: str, timeout: int = 120) -> SSHResult:
-    """Run a shell command locally with KUBECONFIG set from profile content."""
-    kubeconfig_path = os.path.join(config.DATA_DIR, "kubeconfigs", "_monitor_temp.kubeconfig")
-    os.makedirs(os.path.dirname(kubeconfig_path), exist_ok=True)
+    """Run a shell command locally with KUBECONFIG set from profile content.
+
+    Replaces bare ``kubectl`` and ``helm`` references with their full paths
+    so the command works even when these binaries are not in $PATH.
+    """
+    kubectl = config.get_kubectl_path()
+    helm = config.get_helm_path()
+    if not kubectl:
+        return SSHResult(
+            hostname="local", command=command, return_code=1,
+            stdout="",
+            stderr=(
+                "kubectl not found on this machine.\n\n"
+                "Install kubectl:\n"
+                "  curl -LO https://dl.k8s.io/release/$(curl -Ls https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl\n"
+                "  chmod +x kubectl && sudo mv kubectl /usr/local/bin/\n\n"
+                "Or on macOS: brew install kubectl\n"
+                "Or see: https://kubernetes.io/docs/tasks/tools/"
+            ),
+            success=False,
+        )
+    # Replace bare kubectl/helm with full paths
+    resolved_cmd = command.replace("kubectl ", f"{kubectl} ").replace("helm ", f"{helm} " if helm else "helm ")
+    kubeconfig_path = config.get_kubeconfig_path("_monitor_temp")
     with open(kubeconfig_path, "w") as f:
         f.write(kubeconfig_content)
     env = dict(os.environ, KUBECONFIG=kubeconfig_path)
     try:
-        proc = subprocess.run(command, shell=True, capture_output=True, text=True, timeout=timeout, env=env)
+        proc = subprocess.run(resolved_cmd, shell=True, capture_output=True, text=True, timeout=timeout, env=env)
         return SSHResult(
             hostname="local", command=command, return_code=proc.returncode,
             stdout=proc.stdout, stderr=proc.stderr, success=proc.returncode == 0,

From 07a1b725053b95b746009bcfac7249cb70fa45a3 Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Mon, 6 Apr 2026 13:22:00 +0000
Subject: [PATCH 11/31] Improve kubectl detection: drop os.access check, add
 subprocess which fallback

---
 k8s-agent/config.py | 28 ++++++++++++++++++++++++++--
 1 file changed, 26 insertions(+), 2 deletions(-)

diff --git a/k8s-agent/config.py b/k8s-agent/config.py
index 2317be1..90eb95b 100644
--- a/k8s-agent/config.py
+++ b/k8s-agent/config.py
@@ -55,13 +55,37 @@ def is_llm_configured() -> bool:
 
 
 def _find_binary(name: str, search_paths: list[str]) -> str:
-    """Find a binary by name, checking PATH first then common locations."""
+    """Find a binary by name, checking PATH first then common locations.
+
+    Strategy:
+      1. ``shutil.which`` — honours $PATH as seen by the Python process.
+      2. Probe well-known install directories with ``os.path.isfile``.
+         (Skip the ``os.access`` X_OK check because some SELinux / mount
+         configurations report False even though the file *is* executable.)
+      3. Last resort: ask the OS via ``/usr/bin/which`` in a subprocess,
+         which may see a different PATH than the Python process (e.g. when
+         Streamlit is started through systemd or a virtualenv wrapper).
+    """
+    # 1. shutil.which
     found = shutil.which(name)
     if found:
         return found
+    # 2. well-known paths — only check existence (skip os.access)
     for path in search_paths:
-        if os.path.isfile(path) and os.access(path, os.X_OK):
+        if os.path.isfile(path):
             return path
+    # 3. subprocess fallback — works when shell PATH differs from Python PATH
+    for which_cmd in ("which", "/usr/bin/which", "/bin/which"):
+        try:
+            proc = subprocess.run(
+                f"{which_cmd} {name}",
+                shell=True, capture_output=True, text=True, timeout=5,
+            )
+            result = proc.stdout.strip()
+            if proc.returncode == 0 and result and os.path.isfile(result):
+                return result
+        except Exception:
+            continue
     return ""
 
 

From b239963413cf2a84bdd9790b17eba3c73dd9f68a Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Tue, 7 Apr 2026 04:47:46 +0000
Subject: [PATCH 12/31] Add metrics install, deployment scaling, pod shell,
 resource dropdown, fix disk usage for imported clusters

---
 k8s-agent/app.py                      | 494 +++++++++++++++++++++++---
 k8s-agent/modules/cluster_creator.py  |  28 +-
 k8s-agent/modules/cluster_debugger.py |   1 +
 3 files changed, 469 insertions(+), 54 deletions(-)

diff --git a/k8s-agent/app.py b/k8s-agent/app.py
index b2342e6..6ac3fe4 100644
--- a/k8s-agent/app.py
+++ b/k8s-agent/app.py
@@ -899,16 +899,12 @@ def page_cluster_debugger():
             return
         cp_node = cp_nodes[0]
 
-    # kubectl availability check for imported clusters
+    # kubectl availability warning for imported clusters
     if profile.cluster_source == "imported" and not get_kubectl_path():
-        st.error(
-            "**kubectl not found** on this machine.\n\n"
-            "Install it with:\n```\n"
-            "curl -LO https://dl.k8s.io/release/$(curl -Ls https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl\n"
-            "chmod +x kubectl && sudo mv kubectl /usr/local/bin/\n```\n"
-            "Or see: https://kubernetes.io/docs/tasks/tools/"
+        st.warning(
+            "kubectl not found on this machine. Commands will fail until kubectl is installed.\n\n"
+            "Install: `curl -LO https://dl.k8s.io/release/$(curl -Ls https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl && chmod +x kubectl && mv kubectl ~/.local/bin/`"
         )
-        return
 
     available_commands = get_available_commands(profile)
 
@@ -1073,17 +1069,6 @@ def page_monitoring_setup():
             return
         cp_node = cp_nodes[0]
 
-    # kubectl availability check for imported clusters
-    if profile.cluster_source == "imported" and not get_kubectl_path():
-        st.error(
-            "**kubectl not found** on this machine.\n\n"
-            "Install it with:\n```\n"
-            "curl -LO https://dl.k8s.io/release/$(curl -Ls https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl\n"
-            "chmod +x kubectl && sudo mv kubectl /usr/local/bin/\n```\n"
-            "Or see: https://kubernetes.io/docs/tasks/tools/"
-        )
-        return
-
     # Namespace selection — auto-fetch from cluster for imported clusters
     if profile.cluster_source == "imported" and profile.kubeconfig_content:
         cluster_ns = fetch_namespaces(profile.kubeconfig_content)
@@ -1097,8 +1082,9 @@ def page_monitoring_setup():
     else:
         namespace = st.text_input("Monitoring Namespace", value="monitoring", key="mon_ns_txt")
 
-    tab_install, tab_dashboards, tab_alerts, tab_status, tab_scripts, tab_advice = st.tabs([
+    tab_install, tab_metrics, tab_dashboards, tab_alerts, tab_status, tab_scripts, tab_advice = st.tabs([
         "Install Stack",
+        "Metrics Components",
         "Dashboards",
         "Alert Rules",
         "Status",
@@ -1145,6 +1131,142 @@ def page_monitoring_setup():
                         st.error("Alert rules installation failed")
                         st.code(result.stderr, language="text")
 
+    # ── Metrics Components ────────────────────────────────────────────────
+    with tab_metrics:
+        st.markdown("### Metrics Components")
+        st.markdown(
+            "Install **metrics-server** (enables `kubectl top`) and/or "
+            "**kube-state-metrics** (exposes workload/object-level metrics to Prometheus)."
+        )
+
+        met_col1, met_col2 = st.columns(2)
+
+        with met_col1:
+            st.markdown("#### metrics-server")
+            st.markdown(
+                "Provides CPU/memory usage for pods and nodes. "
+                "Required for `kubectl top` and HPA autoscaling."
+            )
+            ms_insecure = st.checkbox(
+                "Add `--kubelet-insecure-tls` flag (self-signed certs)",
+                value=True,
+                key="ms_insecure",
+            )
+            if st.button("Install metrics-server", type="primary", key="install_ms"):
+                ms_url = (
+                    "https://github.com/kubernetes-sigs/metrics-server"
+                    "/releases/latest/download/components.yaml"
+                )
+                with st.status("Installing metrics-server...", expanded=True):
+                    # Apply the manifest
+                    apply_result = run_kubectl(
+                        profile,
+                        f"apply -f {ms_url}",
+                        timeout=60,
+                    )
+                    if apply_result.success:
+                        st.write("Manifest applied successfully.")
+                        st.code(apply_result.stdout, language="text")
+                        # Patch for insecure TLS if requested
+                        if ms_insecure:
+                            patch_cmd = (
+                                "patch deployment metrics-server -n kube-system "
+                                "--type=json -p="
+                                "'[{\"op\":\"add\",\"path\":\"/spec/template/spec/containers/0/args/-\","
+                                "\"value\":\"--kubelet-insecure-tls\"}]'"
+                            )
+                            patch_result = run_kubectl(profile, patch_cmd, timeout=30)
+                            if patch_result.success:
+                                st.success("metrics-server installed with --kubelet-insecure-tls!")
+                            else:
+                                st.warning("Installed but TLS patch may have failed (already applied?).")
+                                st.code(patch_result.stderr, language="text")
+                        else:
+                            st.success("metrics-server installed!")
+                    else:
+                        st.error("metrics-server installation failed")
+                        st.code(apply_result.stderr, language="text")
+
+            # Check status
+            if st.button("Check metrics-server status", key="ms_status"):
+                with st.spinner("Checking..."):
+                    result = run_kubectl(
+                        profile,
+                        "get deployment metrics-server -n kube-system -o wide",
+                        timeout=15,
+                    )
+                    if result.success:
+                        st.code(result.stdout, language="text")
+                    else:
+                        st.warning("metrics-server not found or not ready.")
+                        st.code(result.stderr, language="text")
+
+        with met_col2:
+            st.markdown("#### kube-state-metrics")
+            st.markdown(
+                "Exposes object-level metrics (Deployments, Pods, Nodes, etc.) "
+                "to Prometheus for dashboards and alerting."
+            )
+            ksm_ns = namespace  # reuse the monitoring namespace
+            if st.button("Install kube-state-metrics", type="primary", key="install_ksm"):
+                with st.status("Installing kube-state-metrics...", expanded=True):
+                    # Use helm if available, otherwise apply raw manifest
+                    helm_cmd = (
+                        f"helm install kube-state-metrics "
+                        f"oci://registry-1.docker.io/bitnamicharts/kube-state-metrics "
+                        f"-n {ksm_ns} --create-namespace"
+                    )
+                    result = run_kubectl(profile, helm_cmd, timeout=120)
+                    if result.success:
+                        st.success("kube-state-metrics installed via Helm!")
+                        st.code(result.stdout, language="text")
+                    else:
+                        st.warning("Helm install failed, trying kubectl apply...")
+                        st.code(result.stderr, language="text")
+                        # Fallback: direct manifest from GitHub
+                        ksm_url = (
+                            "https://raw.githubusercontent.com/kubernetes/"
+                            "kube-state-metrics/main/examples/standard/service.yaml"
+                        )
+                        apply_result = run_kubectl(
+                            profile,
+                            f"apply -f https://raw.githubusercontent.com/kubernetes/kube-state-metrics/main/examples/standard/ 2>/dev/null || echo 'Manual install required'",
+                            timeout=60,
+                        )
+                        if apply_result.success:
+                            st.success("kube-state-metrics applied!")
+                            st.code(apply_result.stdout, language="text")
+                        else:
+                            st.error(
+                                "Could not install kube-state-metrics automatically.\n\n"
+                                "Manual install:\n"
+                                "```\nhelm repo add prometheus-community "
+                                "https://prometheus-community.github.io/helm-charts\n"
+                                "helm install kube-state-metrics "
+                                f"prometheus-community/kube-state-metrics -n {ksm_ns}\n```"
+                            )
+
+            if st.button("Check kube-state-metrics status", key="ksm_status"):
+                with st.spinner("Checking..."):
+                    result = run_kubectl(
+                        profile,
+                        f"get pods -n {ksm_ns} -l app.kubernetes.io/name=kube-state-metrics -o wide",
+                        timeout=15,
+                    )
+                    if result.success and result.stdout.strip():
+                        st.code(result.stdout, language="text")
+                    else:
+                        # Try broader search
+                        result2 = run_kubectl(
+                            profile,
+                            "get pods -A -l app.kubernetes.io/name=kube-state-metrics -o wide",
+                            timeout=15,
+                        )
+                        if result2.success and result2.stdout.strip():
+                            st.code(result2.stdout, language="text")
+                        else:
+                            st.warning("kube-state-metrics not found on the cluster.")
+
     # ── Dashboards ────────────────────────────────────────────────────────
     with tab_dashboards:
         st.markdown("### Grafana Dashboards")
@@ -1249,17 +1371,6 @@ def page_log_analysis():
             return
         cp_node = cp_nodes[0]
 
-    # kubectl availability check for imported clusters
-    if profile.cluster_source == "imported" and not get_kubectl_path():
-        st.error(
-            "**kubectl not found** on this machine.\n\n"
-            "Install it with:\n```\n"
-            "curl -LO https://dl.k8s.io/release/$(curl -Ls https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl\n"
-            "chmod +x kubectl && sudo mv kubectl /usr/local/bin/\n```\n"
-            "Or see: https://kubernetes.io/docs/tasks/tools/"
-        )
-        return
-
     # Pre-fetch namespaces for imported clusters (used by Pod Logs tab)
     _cluster_namespaces: list[str] = []
     if profile.cluster_source == "imported" and profile.kubeconfig_content:
@@ -1506,24 +1617,15 @@ def page_resource_viewer():
         )
         return
 
-    # kubectl availability check for imported clusters
-    if profile.cluster_source == "imported" and not get_kubectl_path():
-        st.error(
-            "**kubectl not found** on this machine.\n\n"
-            "Install it with:\n```\n"
-            "curl -LO https://dl.k8s.io/release/$(curl -Ls https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl\n"
-            "chmod +x kubectl && sudo mv kubectl /usr/local/bin/\n```\n"
-            "Or see: https://kubernetes.io/docs/tasks/tools/"
-        )
-        return
-
     # Pre-fetch namespaces for imported clusters
     _rv_namespaces: list[str] = []
     if profile.cluster_source == "imported" and profile.kubeconfig_content:
         _rv_namespaces = fetch_namespaces(profile.kubeconfig_content)
 
-    tab_resources, tab_node_health, tab_rbac, tab_helm, tab_events = st.tabs([
+    tab_resources, tab_scaling, tab_shell, tab_node_health, tab_rbac, tab_helm, tab_events = st.tabs([
         "Cluster Resources",
+        "Scaling",
+        "Pod Shell",
         "Node Health",
         "RBAC Viewer",
         "Helm Releases",
@@ -1597,27 +1699,66 @@ def page_resource_viewer():
         # Describe a specific resource
         st.markdown("---")
         st.markdown("#### Describe a Resource")
-        desc_col1, desc_col2 = st.columns(2)
-        with desc_col1:
-            desc_name = st.text_input(
-                "Resource name",
-                placeholder="e.g., my-pod-xyz",
-                key="desc_name",
-            )
+        desc_col1, desc_col2, desc_col3 = st.columns([2, 2, 1])
         with desc_col2:
             if _rv_namespaces:
                 desc_ns = st.selectbox(
-                    "Namespace (if applicable)",
+                    "Namespace",
                     options=_rv_namespaces,
                     index=_rv_namespaces.index("default") if "default" in _rv_namespaces else 0,
                     key="desc_ns",
                 )
             else:
                 desc_ns = st.text_input(
-                    "Namespace (if applicable)",
+                    "Namespace",
                     value="default",
                     key="desc_ns",
                 )
+        with desc_col3:
+            desc_refresh = st.button("Load names", key="desc_load_names")
+
+        # Fetch resource names for the dropdown
+        _desc_resource_names: list[str] = []
+        if desc_refresh or st.session_state.get("_desc_cached_names"):
+            if desc_refresh:
+                # Determine the kubectl get command for names
+                _desc_cmd_base, _desc_ns_supported = _RESOURCE_TYPES[resource_type]
+                _names_cmd = f"{_desc_cmd_base} -o name"
+                if _desc_ns_supported and desc_ns:
+                    _names_cmd += f" -n {desc_ns}"
+                elif _desc_ns_supported:
+                    _names_cmd += " -A"
+                _names_result = run_kubectl(profile, _names_cmd, timeout=15)
+                if _names_result.success and _names_result.stdout.strip():
+                    raw_names = _names_result.stdout.strip().split("\n")
+                    # Strip resource type prefix (e.g. "pod/my-pod" -> "my-pod")
+                    _desc_resource_names = [
+                        n.split("/", 1)[-1] if "/" in n else n
+                        for n in raw_names if n.strip()
+                    ]
+                    st.session_state["_desc_cached_names"] = _desc_resource_names
+                    st.session_state["_desc_cached_type"] = resource_type
+                else:
+                    _desc_resource_names = []
+                    st.session_state["_desc_cached_names"] = []
+            else:
+                # Use cached names if resource type matches
+                if st.session_state.get("_desc_cached_type") == resource_type:
+                    _desc_resource_names = st.session_state.get("_desc_cached_names", [])
+
+        with desc_col1:
+            if _desc_resource_names:
+                desc_name = st.selectbox(
+                    "Resource name",
+                    options=_desc_resource_names,
+                    key="desc_name_select",
+                )
+            else:
+                desc_name = st.text_input(
+                    "Resource name",
+                    placeholder="Click 'Load names' or type a name",
+                    key="desc_name",
+                )
 
         if st.button("Describe", key="describe_res") and desc_name:
             # Determine the singular resource type for describe
@@ -1643,6 +1784,255 @@ def page_resource_viewer():
                     st.error("Describe failed")
                     st.code(result.stderr, language="text")
 
+    # ── Scaling ──────────────────────────────────────────────────────────
+    with tab_scaling:
+        st.markdown("### Deployment Scaling")
+        st.markdown("Scale deployment replicas up or down.")
+
+        sc_col1, sc_col2 = st.columns(2)
+        with sc_col1:
+            if _rv_namespaces:
+                sc_ns = st.selectbox(
+                    "Namespace",
+                    options=_rv_namespaces,
+                    index=_rv_namespaces.index("default") if "default" in _rv_namespaces else 0,
+                    key="sc_ns",
+                )
+            else:
+                sc_ns = st.text_input("Namespace", value="default", key="sc_ns")
+        with sc_col2:
+            sc_load = st.button("Load Deployments", key="sc_load")
+
+        # Fetch deployments for the dropdown
+        _sc_deployments: list[str] = []
+        _sc_dep_info: dict[str, str] = {}
+        if sc_load or st.session_state.get("_sc_cached_deps"):
+            if sc_load:
+                dep_result = run_kubectl(
+                    profile,
+                    f"get deployments -n {sc_ns} -o custom-columns=NAME:.metadata.name,REPLICAS:.spec.replicas,AVAILABLE:.status.availableReplicas --no-headers",
+                    timeout=15,
+                )
+                if dep_result.success and dep_result.stdout.strip():
+                    for line in dep_result.stdout.strip().split("\n"):
+                        parts = line.split()
+                        if parts:
+                            dep_name = parts[0]
+                            _sc_deployments.append(dep_name)
+                            replicas = parts[1] if len(parts) > 1 else "?"
+                            available = parts[2] if len(parts) > 2 else "?"
+                            _sc_dep_info[dep_name] = f"{replicas} replicas ({available} available)"
+                    st.session_state["_sc_cached_deps"] = _sc_deployments
+                    st.session_state["_sc_cached_dep_info"] = _sc_dep_info
+                    st.session_state["_sc_cached_ns"] = sc_ns
+                else:
+                    st.session_state["_sc_cached_deps"] = []
+                    st.session_state["_sc_cached_dep_info"] = {}
+                    if dep_result.success:
+                        st.info(f"No deployments found in namespace '{sc_ns}'.")
+                    else:
+                        st.error("Failed to list deployments")
+                        st.code(dep_result.stderr, language="text")
+            else:
+                if st.session_state.get("_sc_cached_ns") == sc_ns:
+                    _sc_deployments = st.session_state.get("_sc_cached_deps", [])
+                    _sc_dep_info = st.session_state.get("_sc_cached_dep_info", {})
+
+        if _sc_deployments:
+            sc_dep_col1, sc_dep_col2, sc_dep_col3 = st.columns([3, 1, 1])
+            with sc_dep_col1:
+                sc_selected = st.selectbox(
+                    "Deployment",
+                    options=_sc_deployments,
+                    format_func=lambda d: f"{d}  ({_sc_dep_info.get(d, '')})",
+                    key="sc_selected",
+                )
+            with sc_dep_col2:
+                sc_replicas = st.number_input(
+                    "Target replicas",
+                    min_value=0,
+                    max_value=100,
+                    value=1,
+                    key="sc_replicas",
+                )
+            with sc_dep_col3:
+                st.markdown("<br>", unsafe_allow_html=True)
+                if st.button("Scale", type="primary", key="sc_apply"):
+                    scale_cmd = f"scale deployment {sc_selected} --replicas={sc_replicas} -n {sc_ns}"
+                    with st.spinner(f"Scaling {sc_selected} to {sc_replicas} replicas..."):
+                        result = run_kubectl(profile, scale_cmd, timeout=30)
+                        if result.success:
+                            st.success(f"Scaled **{sc_selected}** to **{sc_replicas}** replicas!")
+                            st.code(result.stdout, language="text")
+                            # Refresh to show updated state
+                            verify = run_kubectl(
+                                profile,
+                                f"get deployment {sc_selected} -n {sc_ns} -o wide",
+                                timeout=15,
+                            )
+                            if verify.success:
+                                st.code(verify.stdout, language="text")
+                        else:
+                            st.error("Scaling failed")
+                            st.code(result.stderr, language="text")
+
+            # Quick scale buttons
+            st.markdown("---")
+            st.markdown("#### Quick Actions")
+            qa_col1, qa_col2, qa_col3, qa_col4 = st.columns(4)
+            with qa_col1:
+                if st.button("Scale to 0 (stop)", key="sc_0"):
+                    with st.spinner("Scaling to 0..."):
+                        result = run_kubectl(profile, f"scale deployment {sc_selected} --replicas=0 -n {sc_ns}", timeout=30)
+                        st.success("Scaled to 0") if result.success else st.error(result.stderr)
+            with qa_col2:
+                if st.button("Scale to 1", key="sc_1"):
+                    with st.spinner("Scaling to 1..."):
+                        result = run_kubectl(profile, f"scale deployment {sc_selected} --replicas=1 -n {sc_ns}", timeout=30)
+                        st.success("Scaled to 1") if result.success else st.error(result.stderr)
+            with qa_col3:
+                if st.button("Scale to 3", key="sc_3"):
+                    with st.spinner("Scaling to 3..."):
+                        result = run_kubectl(profile, f"scale deployment {sc_selected} --replicas=3 -n {sc_ns}", timeout=30)
+                        st.success("Scaled to 3") if result.success else st.error(result.stderr)
+            with qa_col4:
+                if st.button("Scale to 5", key="sc_5"):
+                    with st.spinner("Scaling to 5..."):
+                        result = run_kubectl(profile, f"scale deployment {sc_selected} --replicas=5 -n {sc_ns}", timeout=30)
+                        st.success("Scaled to 5") if result.success else st.error(result.stderr)
+        elif not sc_load:
+            st.info("Click **Load Deployments** to see deployments in the selected namespace.")
+
+    # ── Pod Shell ────────────────────────────────────────────────────────
+    with tab_shell:
+        st.markdown("### Pod Shell (Exec)")
+        st.markdown("Execute commands inside a running pod/container.")
+
+        sh_col1, sh_col2 = st.columns(2)
+        with sh_col1:
+            if _rv_namespaces:
+                sh_ns = st.selectbox(
+                    "Namespace",
+                    options=_rv_namespaces,
+                    index=_rv_namespaces.index("default") if "default" in _rv_namespaces else 0,
+                    key="sh_ns",
+                )
+            else:
+                sh_ns = st.text_input("Namespace", value="default", key="sh_ns")
+        with sh_col2:
+            sh_load = st.button("Load Pods", key="sh_load")
+
+        # Fetch running pods
+        _sh_pods: list[str] = []
+        _sh_containers: dict[str, list[str]] = {}
+        if sh_load or st.session_state.get("_sh_cached_pods"):
+            if sh_load:
+                pod_result = run_kubectl(
+                    profile,
+                    f"get pods -n {sh_ns} --field-selector=status.phase=Running -o jsonpath="
+                    "'{range .items[*]}{.metadata.name}{\"\\n\"}{end}'",
+                    timeout=15,
+                )
+                if pod_result.success and pod_result.stdout.strip():
+                    _sh_pods = [p.strip() for p in pod_result.stdout.strip().split("\n") if p.strip()]
+                    st.session_state["_sh_cached_pods"] = _sh_pods
+                    st.session_state["_sh_cached_ns"] = sh_ns
+                    # Fetch container names for each pod
+                    _sh_containers = {}
+                    for pod_name in _sh_pods[:20]:  # limit to first 20 for perf
+                        ctr_result = run_kubectl(
+                            profile,
+                            f"get pod {pod_name} -n {sh_ns} -o jsonpath="
+                            "'{range .spec.containers[*]}{.name}{\"\\n\"}{end}'",
+                            timeout=10,
+                        )
+                        if ctr_result.success and ctr_result.stdout.strip():
+                            _sh_containers[pod_name] = [
+                                c.strip() for c in ctr_result.stdout.strip().split("\n") if c.strip()
+                            ]
+                        else:
+                            _sh_containers[pod_name] = []
+                    st.session_state["_sh_cached_containers"] = _sh_containers
+                else:
+                    st.session_state["_sh_cached_pods"] = []
+                    st.session_state["_sh_cached_containers"] = {}
+                    if pod_result.success:
+                        st.info(f"No running pods found in namespace '{sh_ns}'.")
+                    else:
+                        st.error("Failed to list pods")
+                        st.code(pod_result.stderr, language="text")
+            else:
+                if st.session_state.get("_sh_cached_ns") == sh_ns:
+                    _sh_pods = st.session_state.get("_sh_cached_pods", [])
+                    _sh_containers = st.session_state.get("_sh_cached_containers", {})
+
+        if _sh_pods:
+            sh_pod_col1, sh_pod_col2 = st.columns(2)
+            with sh_pod_col1:
+                sh_selected_pod = st.selectbox("Pod", options=_sh_pods, key="sh_pod")
+            with sh_pod_col2:
+                containers = _sh_containers.get(sh_selected_pod, [])
+                if containers:
+                    sh_selected_ctr = st.selectbox("Container", options=containers, key="sh_ctr")
+                else:
+                    sh_selected_ctr = st.text_input("Container (optional)", key="sh_ctr")
+
+            st.info(
+                "**Note:** This runs non-interactive commands via `kubectl exec`. "
+                "For a fully interactive shell, use your terminal:\n\n"
+                f"`kubectl exec -it {sh_selected_pod} -n {sh_ns}"
+                f"{' -c ' + sh_selected_ctr if sh_selected_ctr else ''} -- /bin/sh`"
+            )
+
+            sh_cmd = st.text_input(
+                "Command to execute",
+                value="sh -c 'hostname && cat /etc/os-release && df -h'",
+                key="sh_cmd",
+                help="Enter the command to run inside the container",
+            )
+
+            sh_preset_col1, sh_preset_col2, sh_preset_col3, sh_preset_col4 = st.columns(4)
+            with sh_preset_col1:
+                if st.button("env", key="sh_p_env"):
+                    sh_cmd = "env"
+            with sh_preset_col2:
+                if st.button("ps aux", key="sh_p_ps"):
+                    sh_cmd = "ps aux"
+            with sh_preset_col3:
+                if st.button("df -h", key="sh_p_df"):
+                    sh_cmd = "df -h"
+            with sh_preset_col4:
+                if st.button("cat /etc/resolv.conf", key="sh_p_dns"):
+                    sh_cmd = "cat /etc/resolv.conf"
+
+            if st.button("Execute", type="primary", key="sh_exec") and sh_cmd:
+                ctr_flag = f" -c {sh_selected_ctr}" if sh_selected_ctr else ""
+                exec_cmd = f"exec {sh_selected_pod} -n {sh_ns}{ctr_flag} -- {sh_cmd}"
+                with st.spinner(f"Executing in {sh_selected_pod}..."):
+                    result = run_kubectl(profile, exec_cmd, timeout=30)
+                    if result.success:
+                        st.code(result.stdout or "(no output)", language="text")
+                    else:
+                        st.error("Exec failed")
+                        st.code(result.stderr, language="text")
+
+            # Pod logs quick access
+            st.markdown("---")
+            st.markdown("#### Quick Pod Logs")
+            log_lines = st.number_input("Tail lines", min_value=10, max_value=500, value=50, key="sh_log_lines")
+            if st.button("View Logs", key="sh_logs"):
+                ctr_flag = f" -c {sh_selected_ctr}" if sh_selected_ctr else ""
+                log_cmd = f"logs {sh_selected_pod} -n {sh_ns}{ctr_flag} --tail={log_lines}"
+                with st.spinner("Fetching logs..."):
+                    result = run_kubectl(profile, log_cmd, timeout=30)
+                    if result.success:
+                        st.code(result.stdout or "(no logs)", language="text")
+                    else:
+                        st.error("Failed to fetch logs")
+                        st.code(result.stderr, language="text")
+        elif not sh_load:
+            st.info("Click **Load Pods** to see running pods in the selected namespace.")
+
     # ── Node Health ──────────────────────────────────────────────────────
     with tab_node_health:
         st.markdown("### Node Health Overview")
diff --git a/k8s-agent/modules/cluster_creator.py b/k8s-agent/modules/cluster_creator.py
index 79c94cd..644bf78 100644
--- a/k8s-agent/modules/cluster_creator.py
+++ b/k8s-agent/modules/cluster_creator.py
@@ -1205,10 +1205,34 @@ def run_kubectl(profile: ClusterProfile, command: str, timeout: int = 30) -> SSH
         with open(kubeconfig_path, "w") as f:
             f.write(profile.kubeconfig_content)
 
+        kubectl = config.get_kubectl_path()
+        helm = config.get_helm_path()
+
         if is_helm:
-            full_cmd = f"KUBECONFIG={kubeconfig_path} {command}"
+            bin_path = helm or "helm"
+            resolved = command.strip()
+            if resolved.startswith("helm "):
+                resolved = bin_path + resolved[4:]
+            full_cmd = f"KUBECONFIG={kubeconfig_path} {resolved}"
         else:
-            full_cmd = f"kubectl --kubeconfig={kubeconfig_path} {command}"
+            if not kubectl:
+                return SSHResult(
+                    hostname="local (kubeconfig)",
+                    command=command,
+                    return_code=1,
+                    stdout="",
+                    stderr=(
+                        "kubectl not found on this machine.\n\n"
+                        "Install kubectl:\n"
+                        "  curl -LO https://dl.k8s.io/release/"
+                        "$(curl -Ls https://dl.k8s.io/release/stable.txt)"
+                        "/bin/linux/amd64/kubectl\n"
+                        "  chmod +x kubectl && mv kubectl ~/.local/bin/\n\n"
+                        "Or see: https://kubernetes.io/docs/tasks/tools/"
+                    ),
+                    success=False,
+                )
+            full_cmd = f"{kubectl} --kubeconfig={kubeconfig_path} {command}"
         try:
             proc = subprocess.run(
                 full_cmd,
diff --git a/k8s-agent/modules/cluster_debugger.py b/k8s-agent/modules/cluster_debugger.py
index 45feb91..c5216f1 100644
--- a/k8s-agent/modules/cluster_debugger.py
+++ b/k8s-agent/modules/cluster_debugger.py
@@ -28,6 +28,7 @@
     "Services": "get svc -A",
     "PVCs": "get pvc -A",
     "Ingresses": "get ingress -A",
+    "Disk Usage (Nodes)": "top nodes",
 }
 
 # Full SSH commands (backward-compat for provisioned clusters)

From 2285ea5904e02e04f714855233976318d5c8f773 Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Tue, 7 Apr 2026 05:47:49 +0000
Subject: [PATCH 13/31] Add Node Containers (crictl) tab: view containers per
 node via SSH or kubectl

---
 k8s-agent/app.py | 118 ++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 117 insertions(+), 1 deletion(-)

diff --git a/k8s-agent/app.py b/k8s-agent/app.py
index 6ac3fe4..19bf862 100644
--- a/k8s-agent/app.py
+++ b/k8s-agent/app.py
@@ -22,6 +22,7 @@
 )
 from modules.cluster_creator import (
     test_ssh_connectivity,
+    run_ssh_command,
     generate_common_setup_script,
     generate_control_plane_init_script,
     generate_worker_join_script,
@@ -1622,10 +1623,11 @@ def page_resource_viewer():
     if profile.cluster_source == "imported" and profile.kubeconfig_content:
         _rv_namespaces = fetch_namespaces(profile.kubeconfig_content)
 
-    tab_resources, tab_scaling, tab_shell, tab_node_health, tab_rbac, tab_helm, tab_events = st.tabs([
+    tab_resources, tab_scaling, tab_shell, tab_crictl, tab_node_health, tab_rbac, tab_helm, tab_events = st.tabs([
         "Cluster Resources",
         "Scaling",
         "Pod Shell",
+        "Node Containers",
         "Node Health",
         "RBAC Viewer",
         "Helm Releases",
@@ -2033,6 +2035,120 @@ def page_resource_viewer():
         elif not sh_load:
             st.info("Click **Load Pods** to see running pods in the selected namespace.")
 
+    # ── Node Containers (crictl) ────────────────────────────────────────
+    with tab_crictl:
+        st.markdown("### Node Containers (crictl)")
+        st.markdown("View containers running on each node using `crictl ps -a`.")
+
+        if profile.cluster_source == "imported":
+            # Imported clusters — no SSH, but we can still get node list and show
+            # container info via kubectl debug or just list pods per node
+            st.info(
+                "**crictl** requires SSH access to each node and is available for "
+                "provisioned clusters. For imported clusters, container-level "
+                "information is shown via kubectl below."
+            )
+            if st.button("Show containers per node (kubectl)", type="primary", key="crictl_kubectl"):
+                with st.spinner("Fetching node list..."):
+                    node_result = run_kubectl(
+                        profile,
+                        "get nodes -o jsonpath='{range .items[*]}{.metadata.name}{\"\\n\"}{end}'",
+                        timeout=15,
+                    )
+                if node_result.success and node_result.stdout.strip():
+                    node_names = [n.strip() for n in node_result.stdout.strip().split("\n") if n.strip()]
+                    for node_name in node_names:
+                        with st.expander(f"Node: **{node_name}**", expanded=True):
+                            with st.spinner(f"Fetching containers on {node_name}..."):
+                                pod_result = run_kubectl(
+                                    profile,
+                                    f"get pods -A --field-selector spec.nodeName={node_name} "
+                                    "-o custom-columns="
+                                    "'NAMESPACE:.metadata.namespace,"
+                                    "POD:.metadata.name,"
+                                    "CONTAINERS:.spec.containers[*].name,"
+                                    "STATUS:.status.phase,"
+                                    "RESTARTS:.status.containerStatuses[0].restartCount,"
+                                    "NODE:.spec.nodeName'",
+                                    timeout=15,
+                                )
+                                if pod_result.success:
+                                    st.code(pod_result.stdout or "(no pods on this node)", language="text")
+                                else:
+                                    st.error(f"Failed to get pods on {node_name}")
+                                    st.code(pod_result.stderr, language="text")
+                else:
+                    st.error("Failed to list nodes")
+                    if node_result.stderr:
+                        st.code(node_result.stderr, language="text")
+        else:
+            # Provisioned clusters — SSH into each node and run crictl
+            all_nodes = profile.nodes
+            if not all_nodes:
+                st.warning("No nodes defined in this profile.")
+            else:
+                crictl_cmd = st.text_input(
+                    "CRI command",
+                    value="crictl ps -a",
+                    key="crictl_cmd",
+                    help="Command to run on each node (e.g. crictl ps -a, crictl images, crictl stats)",
+                )
+
+                crictl_presets = st.columns(5)
+                with crictl_presets[0]:
+                    if st.button("crictl ps -a", key="cp_ps"):
+                        crictl_cmd = "crictl ps -a"
+                with crictl_presets[1]:
+                    if st.button("crictl images", key="cp_img"):
+                        crictl_cmd = "crictl images"
+                with crictl_presets[2]:
+                    if st.button("crictl stats", key="cp_stats"):
+                        crictl_cmd = "crictl stats"
+                with crictl_presets[3]:
+                    if st.button("crictl pods", key="cp_pods"):
+                        crictl_cmd = "crictl pods"
+                with crictl_presets[4]:
+                    if st.button("crictl info", key="cp_info"):
+                        crictl_cmd = "crictl info"
+
+                # Node selection
+                node_labels = [
+                    f"{n.get('hostname', n.get('ip_address', '?'))} ({n.get('ip_address', '?')}) [{n.get('role', '?')}]"
+                    for n in all_nodes
+                ]
+                cr_select_all = st.checkbox("Run on all nodes", value=True, key="cr_all")
+
+                if not cr_select_all:
+                    selected_nodes_idx = st.multiselect(
+                        "Select nodes",
+                        options=list(range(len(all_nodes))),
+                        format_func=lambda i: node_labels[i],
+                        default=list(range(len(all_nodes))),
+                        key="cr_nodes",
+                    )
+                    selected_nodes = [all_nodes[i] for i in selected_nodes_idx]
+                else:
+                    selected_nodes = all_nodes
+
+                if st.button("Run on selected nodes", type="primary", key="crictl_run"):
+                    for node in selected_nodes:
+                        node_label = f"{node.get('hostname', node.get('ip_address', '?'))} ({node.get('ip_address', '')})"
+                        with st.expander(f"Node: **{node_label}** [{node.get('role', '')}]", expanded=True):
+                            with st.spinner(f"Running `{crictl_cmd}` on {node_label}..."):
+                                result = run_ssh_command(
+                                    ip_address=node["ip_address"],
+                                    command=crictl_cmd,
+                                    ssh_user=node.get("ssh_user", "root"),
+                                    ssh_port=node.get("ssh_port", 22),
+                                    ssh_key_path=node.get("ssh_key_path", "~/.ssh/id_rsa"),
+                                    timeout=30,
+                                )
+                                if result.success:
+                                    st.code(result.stdout or "(no output)", language="text")
+                                else:
+                                    st.error(f"Command failed on {node_label}")
+                                    st.code(result.stderr, language="text")
+
     # ── Node Health ──────────────────────────────────────────────────────
     with tab_node_health:
         st.markdown("### Node Health Overview")

From 252e941add3fe037c5067b2c4c62d026d179e485 Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Tue, 7 Apr 2026 05:52:31 +0000
Subject: [PATCH 14/31] Add cluster reset/teardown feature with re-provision
 option

---
 k8s-agent/app.py                     | 148 ++++++++++++++++++++++++++-
 k8s-agent/modules/cluster_creator.py | 143 ++++++++++++++++++++++++++
 2 files changed, 290 insertions(+), 1 deletion(-)

diff --git a/k8s-agent/app.py b/k8s-agent/app.py
index 19bf862..fcf776d 100644
--- a/k8s-agent/app.py
+++ b/k8s-agent/app.py
@@ -42,6 +42,7 @@
     get_control_plane_steps,
     get_worker_join_steps,
     get_best_practices_steps,
+    get_cluster_reset_steps,
 )
 from modules.cluster_debugger import (
     DIAGNOSTIC_COMMANDS,
@@ -604,9 +605,10 @@ def page_cluster_creation():
 
     _show_profile_summary(profile)
 
-    tab_preflight, tab_provision, tab_scripts, tab_manifests, tab_advice = st.tabs([
+    tab_preflight, tab_provision, tab_reset, tab_scripts, tab_manifests, tab_advice = st.tabs([
         "Pre-flight Checks",
         "Provision Cluster",
+        "Reset Cluster",
         "View Scripts",
         "Offline Manifests",
         "AI Advice",
@@ -783,6 +785,150 @@ def page_cluster_creation():
                 update_profile_status(profile.name, "error")
                 st.error("Provisioning did not complete successfully. Check the errors above.")
 
+    # ── Reset Cluster ────────────────────────────────────────────────────
+    with tab_reset:
+        st.markdown("### Reset / Tear Down Cluster")
+        st.markdown(
+            "Completely reset the Kubernetes cluster on all (or selected) nodes. "
+            "This will run `kubeadm reset`, stop services, remove CRI-O data, "
+            "CNI configs, etcd data, and flush iptables — preparing nodes for a "
+            "fresh cluster installation."
+        )
+
+        if profile.cluster_source == "imported":
+            st.info(
+                "Cluster reset requires SSH access to each node and is only "
+                "available for **provisioned** clusters. For imported clusters, "
+                "run `kubeadm reset` directly on each node."
+            )
+        else:
+            all_nodes = profile.nodes
+            if not all_nodes:
+                st.warning("No nodes defined in this profile.")
+            else:
+                st.error(
+                    "**WARNING:** This is a destructive operation. All Kubernetes data, "
+                    "containers, etcd data, and configuration will be permanently deleted "
+                    "from the selected nodes. This cannot be undone."
+                )
+
+                # Node selection
+                reset_node_labels = [
+                    f"{n.get('hostname', n.get('ip_address', '?'))} ({n.get('ip_address', '?')}) [{n.get('role', '?')}]"
+                    for n in all_nodes
+                ]
+                reset_all = st.checkbox("Reset ALL nodes", value=True, key="reset_all_nodes")
+
+                if not reset_all:
+                    reset_idx = st.multiselect(
+                        "Select nodes to reset",
+                        options=list(range(len(all_nodes))),
+                        format_func=lambda i: reset_node_labels[i],
+                        default=list(range(len(all_nodes))),
+                        key="reset_node_select",
+                    )
+                    reset_nodes = [all_nodes[i] for i in reset_idx]
+                else:
+                    reset_nodes = all_nodes
+
+                # Options
+                col_r1, col_r2 = st.columns(2)
+                with col_r1:
+                    remove_packages = st.checkbox(
+                        "Also remove kubeadm/kubelet/kubectl packages",
+                        value=False,
+                        key="reset_remove_pkgs",
+                    )
+                with col_r2:
+                    auto_reprovision = st.checkbox(
+                        "Re-provision cluster after reset",
+                        value=False,
+                        key="reset_reprovision",
+                        help="After reset completes, automatically start fresh provisioning using the Provision Cluster flow.",
+                    )
+
+                # Confirmation
+                confirm_text = st.text_input(
+                    'Type **RESET** to confirm',
+                    key="reset_confirm",
+                    help="Type RESET (all caps) to enable the reset button.",
+                )
+
+                reset_enabled = confirm_text.strip() == "RESET" and len(reset_nodes) > 0
+                if st.button(
+                    f"Reset {len(reset_nodes)} Node(s)",
+                    type="primary",
+                    disabled=not reset_enabled,
+                    use_container_width=True,
+                    key="reset_go",
+                ):
+                    update_profile_status(profile.name, "provisioning")
+                    reset_steps = get_cluster_reset_steps(profile)
+
+                    # Optionally add package removal step
+                    if remove_packages:
+                        reset_steps.append(
+                            ProvisionStep(
+                                name="remove_packages",
+                                title="Remove kubeadm/kubelet/kubectl packages",
+                                script="""set -uo pipefail
+echo '>> Removing Kubernetes packages...'
+if command -v yum &>/dev/null; then
+    yum remove -y kubeadm kubelet kubectl cri-o 2>/dev/null || true
+elif command -v apt-get &>/dev/null; then
+    apt-get remove -y --purge kubeadm kubelet kubectl cri-o 2>/dev/null || true
+fi
+echo 'Packages removed.'
+""",
+                                timeout=120,
+                                fatal=False,
+                            )
+                        )
+
+                    reset_success = True
+                    for node in reset_nodes:
+                        node_label = f"{node.get('hostname', node.get('ip_address', '?'))} ({node.get('ip_address', '')})"
+                        st.markdown(f"---\n#### Resetting: {node_label} [{node.get('role', '')}]")
+                        progress = st.progress(0, text=f"Starting reset on {node_label}...")
+                        node_ok = True
+                        for idx, step in enumerate(reset_steps):
+                            pct = int((idx / len(reset_steps)) * 100)
+                            progress.progress(pct, text=f"[{idx+1}/{len(reset_steps)}] {step.title}")
+                            with st.status(f"{step.title}...", expanded=False) as status:
+                                result = _run_step(node, step)
+                                if result.success:
+                                    st.code(result.stdout[-1500:] if result.stdout else "(no output)", language="text")
+                                    status.update(label=f"{step.title} — done", state="complete")
+                                else:
+                                    st.warning(f"{step.title} — issue encountered")
+                                    st.code(result.stderr or result.stdout, language="text")
+                                    status.update(label=f"{step.title} — issue", state="error")
+                                    node_ok = False
+                                    if step.fatal:
+                                        reset_success = False
+                                        break
+                        progress.progress(100, text=f"{'Reset complete' if node_ok else 'Reset had issues'} on {node_label}")
+                        if node_ok:
+                            st.success(f"Node {node_label} reset successfully.")
+                        else:
+                            st.warning(f"Node {node_label} reset completed with some issues. Check details above.")
+
+                    if reset_success:
+                        update_profile_status(profile.name, "draft")
+                        st.success("All selected nodes have been reset. The cluster has been torn down.")
+                        st.info("You can now go to the **Provision Cluster** tab to create a new cluster on these nodes.")
+
+                        if auto_reprovision:
+                            st.markdown("---")
+                            st.markdown("### Auto Re-provisioning")
+                            st.info(
+                                "Auto re-provision is enabled. Please switch to the **Provision Cluster** tab "
+                                "and click **Start Provisioning** to set up a fresh cluster with the current profile settings."
+                            )
+                    else:
+                        update_profile_status(profile.name, "error")
+                        st.error("Reset encountered fatal errors on some nodes. Review the output above before re-provisioning.")
+
     # ── View Scripts ──────────────────────────────────────────────────────
     with tab_scripts:
         st.markdown("### Generated Scripts")
diff --git a/k8s-agent/modules/cluster_creator.py b/k8s-agent/modules/cluster_creator.py
index 644bf78..d9f694d 100644
--- a/k8s-agent/modules/cluster_creator.py
+++ b/k8s-agent/modules/cluster_creator.py
@@ -1043,6 +1043,149 @@ def get_best_practices_steps() -> List[ProvisionStep]:
     ]
 
 
+def get_cluster_reset_steps(profile: ClusterProfile) -> List[ProvisionStep]:
+    """Return the ordered list of steps to fully reset/teardown a K8s node.
+
+    This runs kubeadm reset, stops services, removes CRI-O data, CNI configs,
+    and cleans up iptables — preparing the node for a fresh cluster install.
+    """
+    crio_root = profile.crio_root or "/var/lib/containers/storage"
+    kubelet_root = profile.kubelet_root or "/var/lib/kubelet"
+    log_root = profile.log_root or "/var/log"
+
+    return [
+        ProvisionStep(
+            name="drain_node",
+            title="Drain & Cordon Node (best effort)",
+            script="""set -uo pipefail
+echo '>> Attempting to drain this node (best effort)...'
+HOSTNAME=$(hostname)
+kubectl drain "$HOSTNAME" --ignore-daemonsets --delete-emptydir-data --force --timeout=60s 2>/dev/null || true
+kubectl cordon "$HOSTNAME" 2>/dev/null || true
+echo 'Drain/cordon complete (or skipped if kubectl not available).'
+""",
+            timeout=90,
+            fatal=False,
+        ),
+        ProvisionStep(
+            name="kubeadm_reset",
+            title="Run kubeadm reset",
+            script="""set -uo pipefail
+echo '>> Running kubeadm reset...'
+kubeadm reset -f --cri-socket unix:///var/run/crio/crio.sock 2>/dev/null || \
+kubeadm reset -f 2>/dev/null || \
+echo 'kubeadm reset returned non-zero (may already be reset)'
+echo 'kubeadm reset complete.'
+""",
+            timeout=120,
+        ),
+        ProvisionStep(
+            name="stop_services",
+            title="Stop kubelet & CRI-O services",
+            script="""set -uo pipefail
+echo '>> Stopping kubelet...'
+systemctl stop kubelet 2>/dev/null || true
+systemctl disable kubelet 2>/dev/null || true
+echo '>> Stopping CRI-O...'
+systemctl stop crio 2>/dev/null || true
+systemctl disable crio 2>/dev/null || true
+echo 'Services stopped.'
+""",
+            timeout=60,
+        ),
+        ProvisionStep(
+            name="clean_cni",
+            title="Remove CNI configuration & network interfaces",
+            script="""set -uo pipefail
+echo '>> Removing CNI configs...'
+rm -rf /etc/cni/net.d/*
+echo '>> Removing flannel interface...'
+ip link delete flannel.1 2>/dev/null || true
+ip link delete cni0 2>/dev/null || true
+ip link delete flannel-wg 2>/dev/null || true
+echo 'CNI cleanup complete.'
+""",
+            timeout=30,
+        ),
+        ProvisionStep(
+            name="clean_iptables",
+            title="Flush iptables rules",
+            script="""set -uo pipefail
+echo '>> Flushing iptables...'
+iptables -F && iptables -t nat -F && iptables -t mangle -F && iptables -X
+ip6tables -F && ip6tables -t nat -F && ip6tables -t mangle -F && ip6tables -X 2>/dev/null || true
+echo 'iptables flushed.'
+""",
+            timeout=30,
+        ),
+        ProvisionStep(
+            name="clean_kubelet_data",
+            title="Remove kubelet data",
+            script=f"""set -uo pipefail
+echo '>> Removing kubelet data at {kubelet_root}...'
+rm -rf {kubelet_root}/*
+rm -rf /etc/kubernetes/*
+rm -rf /tmp/kubeadm-join-command.txt
+echo 'Kubelet data removed.'
+""",
+            timeout=60,
+        ),
+        ProvisionStep(
+            name="clean_crio_data",
+            title="Remove CRI-O container data",
+            script=f"""set -uo pipefail
+echo '>> Removing CRI-O storage at {crio_root}...'
+rm -rf {crio_root}/*
+echo '>> Removing CRI-O run root...'
+rm -rf /run/containers/storage/*
+echo 'CRI-O data removed.'
+""",
+            timeout=60,
+            fatal=False,
+        ),
+        ProvisionStep(
+            name="clean_etcd",
+            title="Remove etcd data (control-plane only, best effort)",
+            script="""set -uo pipefail
+echo '>> Removing etcd data...'
+rm -rf /var/lib/etcd/*
+echo 'etcd data removed (if present).'
+""",
+            timeout=30,
+            fatal=False,
+        ),
+        ProvisionStep(
+            name="clean_logs",
+            title="Clean K8s-related logs",
+            script=f"""set -uo pipefail
+echo '>> Cleaning K8s logs at {log_root}...'
+rm -rf {log_root}/pods/*
+rm -rf {log_root}/containers/*
+rm -rf /var/log/kubernetes/* 2>/dev/null || true
+echo 'Logs cleaned.'
+""",
+            timeout=30,
+            fatal=False,
+        ),
+        ProvisionStep(
+            name="verify_clean",
+            title="Verify cleanup",
+            script="""set -uo pipefail
+echo '>> Verifying cleanup...'
+echo "kubelet active: $(systemctl is-active kubelet 2>/dev/null || echo 'not found')"
+echo "crio active: $(systemctl is-active crio 2>/dev/null || echo 'not found')"
+echo "kubeadm present: $(which kubeadm 2>/dev/null || echo 'not found')"
+echo "kubectl present: $(which kubectl 2>/dev/null || echo 'not found')"
+echo "CNI configs: $(ls /etc/cni/net.d/ 2>/dev/null || echo 'empty/missing')"
+echo ""
+echo "Node is ready for a fresh cluster installation."
+""",
+            timeout=30,
+            fatal=False,
+        ),
+    ]
+
+
 def execute_provision_steps(
     node: dict,
     steps: List[ProvisionStep],

From 88b1984f0af24aa6f7edc2c28af0eacdc24e3d20 Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Tue, 7 Apr 2026 06:54:52 +0000
Subject: [PATCH 15/31] Add Resource Requests/Limits tab: tabular view of CPU,
 memory, ephemeral-storage per namespace

---
 k8s-agent/app.py | 167 ++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 166 insertions(+), 1 deletion(-)

diff --git a/k8s-agent/app.py b/k8s-agent/app.py
index fcf776d..582f377 100644
--- a/k8s-agent/app.py
+++ b/k8s-agent/app.py
@@ -1769,10 +1769,11 @@ def page_resource_viewer():
     if profile.cluster_source == "imported" and profile.kubeconfig_content:
         _rv_namespaces = fetch_namespaces(profile.kubeconfig_content)
 
-    tab_resources, tab_scaling, tab_shell, tab_crictl, tab_node_health, tab_rbac, tab_helm, tab_events = st.tabs([
+    tab_resources, tab_scaling, tab_shell, tab_res_limits, tab_crictl, tab_node_health, tab_rbac, tab_helm, tab_events = st.tabs([
         "Cluster Resources",
         "Scaling",
         "Pod Shell",
+        "Resource Requests/Limits",
         "Node Containers",
         "Node Health",
         "RBAC Viewer",
@@ -2181,6 +2182,170 @@ def page_resource_viewer():
         elif not sh_load:
             st.info("Click **Load Pods** to see running pods in the selected namespace.")
 
+    # ── Resource Requests / Limits ───────────────────────────────────────
+    with tab_res_limits:
+        st.markdown("### Container Resource Requests & Limits")
+        st.markdown(
+            "View CPU, memory, and ephemeral-storage requests and limits for all "
+            "containers in a namespace (from Deployments, StatefulSets, DaemonSets, and Jobs)."
+        )
+
+        rl_col1, rl_col2 = st.columns(2)
+        with rl_col1:
+            if _rv_namespaces:
+                rl_ns = st.selectbox(
+                    "Namespace",
+                    options=_rv_namespaces,
+                    index=_rv_namespaces.index("default") if "default" in _rv_namespaces else 0,
+                    key="rl_ns",
+                )
+            else:
+                rl_ns = st.text_input("Namespace", value="default", key="rl_ns")
+        with rl_col2:
+            rl_workload = st.selectbox(
+                "Workload Type",
+                options=["Deployments", "StatefulSets", "DaemonSets", "Jobs", "All"],
+                index=0,
+                key="rl_workload",
+            )
+
+        if st.button("Fetch Resource Requests/Limits", type="primary", key="rl_fetch"):
+            import json as _json
+
+            workload_map = {
+                "Deployments": "deploy",
+                "StatefulSets": "statefulsets",
+                "DaemonSets": "daemonsets",
+                "Jobs": "jobs",
+            }
+            if rl_workload == "All":
+                types_to_fetch = list(workload_map.items())
+            else:
+                types_to_fetch = [(rl_workload, workload_map[rl_workload])]
+
+            all_rows: list[dict] = []
+            for wl_label, wl_cmd in types_to_fetch:
+                with st.spinner(f"Fetching {wl_label}..."):
+                    result = run_kubectl(
+                        profile,
+                        f"get {wl_cmd} -n {rl_ns} -o json",
+                        timeout=30,
+                    )
+                    if result.success and result.stdout.strip():
+                        try:
+                            data = _json.loads(result.stdout)
+                            for item in data.get("items", []):
+                                workload_name = item.get("metadata", {}).get("name", "?")
+                                spec = item.get("spec", {})
+                                # For Jobs the pod template is at spec.template,
+                                # for Deployments/StatefulSets/DaemonSets it's spec.template
+                                template = spec.get("template", {})
+                                pod_spec = template.get("spec", {})
+                                containers = pod_spec.get("containers", [])
+                                init_containers = pod_spec.get("initContainers", [])
+                                for ctr in containers:
+                                    res = ctr.get("resources", {})
+                                    req = res.get("requests", {})
+                                    lim = res.get("limits", {})
+                                    all_rows.append({
+                                        "Type": wl_label,
+                                        "Workload": workload_name,
+                                        "Container": ctr.get("name", "?"),
+                                        "Init": "",
+                                        "CPU Req": req.get("cpu", "-"),
+                                        "CPU Lim": lim.get("cpu", "-"),
+                                        "Mem Req": req.get("memory", "-"),
+                                        "Mem Lim": lim.get("memory", "-"),
+                                        "Eph Req": req.get("ephemeral-storage", "-"),
+                                        "Eph Lim": lim.get("ephemeral-storage", "-"),
+                                    })
+                                for ctr in init_containers:
+                                    res = ctr.get("resources", {})
+                                    req = res.get("requests", {})
+                                    lim = res.get("limits", {})
+                                    all_rows.append({
+                                        "Type": wl_label,
+                                        "Workload": workload_name,
+                                        "Container": ctr.get("name", "?"),
+                                        "Init": "init",
+                                        "CPU Req": req.get("cpu", "-"),
+                                        "CPU Lim": lim.get("cpu", "-"),
+                                        "Mem Req": req.get("memory", "-"),
+                                        "Mem Lim": lim.get("memory", "-"),
+                                        "Eph Req": req.get("ephemeral-storage", "-"),
+                                        "Eph Lim": lim.get("ephemeral-storage", "-"),
+                                    })
+                        except _json.JSONDecodeError:
+                            st.warning(f"Could not parse JSON for {wl_label}")
+                    elif not result.success:
+                        st.warning(f"Failed to fetch {wl_label}: {result.stderr}")
+
+            if all_rows:
+                st.markdown(f"**{len(all_rows)} container(s)** found in namespace `{rl_ns}`:")
+                st.dataframe(
+                    all_rows,
+                    use_container_width=True,
+                    column_config={
+                        "Type": st.column_config.TextColumn(width="small"),
+                        "Workload": st.column_config.TextColumn(width="medium"),
+                        "Container": st.column_config.TextColumn(width="medium"),
+                        "Init": st.column_config.TextColumn(width="small"),
+                        "CPU Req": st.column_config.TextColumn(width="small"),
+                        "CPU Lim": st.column_config.TextColumn(width="small"),
+                        "Mem Req": st.column_config.TextColumn(width="small"),
+                        "Mem Lim": st.column_config.TextColumn(width="small"),
+                        "Eph Req": st.column_config.TextColumn(width="small"),
+                        "Eph Lim": st.column_config.TextColumn(width="small"),
+                    },
+                )
+
+                # Summary stats
+                st.markdown("---")
+                st.markdown("#### Summary")
+                no_cpu_req = sum(1 for r in all_rows if r["CPU Req"] == "-" and r["Init"] == "")
+                no_mem_req = sum(1 for r in all_rows if r["Mem Req"] == "-" and r["Init"] == "")
+                no_cpu_lim = sum(1 for r in all_rows if r["CPU Lim"] == "-" and r["Init"] == "")
+                no_mem_lim = sum(1 for r in all_rows if r["Mem Lim"] == "-" and r["Init"] == "")
+                non_init = sum(1 for r in all_rows if r["Init"] == "")
+                sc1, sc2, sc3, sc4 = st.columns(4)
+                with sc1:
+                    st.metric("No CPU Request", f"{no_cpu_req}/{non_init}")
+                with sc2:
+                    st.metric("No CPU Limit", f"{no_cpu_lim}/{non_init}")
+                with sc3:
+                    st.metric("No Mem Request", f"{no_mem_req}/{non_init}")
+                with sc4:
+                    st.metric("No Mem Limit", f"{no_mem_lim}/{non_init}")
+
+                if no_cpu_req > 0 or no_mem_req > 0:
+                    st.warning(
+                        f"{no_cpu_req + no_mem_req} container(s) are missing resource requests. "
+                        "This can affect scheduling and QoS class assignment."
+                    )
+                if no_cpu_lim > 0 or no_mem_lim > 0:
+                    st.info(
+                        f"{no_cpu_lim + no_mem_lim} container(s) are missing resource limits. "
+                        "Consider setting limits to prevent resource contention."
+                    )
+
+                # Download as TSV
+                tsv_lines = ["Type\tWorkload\tContainer\tInit\tCPU Req\tCPU Lim\tMem Req\tMem Lim\tEph Req\tEph Lim"]
+                for r in all_rows:
+                    tsv_lines.append(
+                        f"{r['Type']}\t{r['Workload']}\t{r['Container']}\t{r['Init']}\t"
+                        f"{r['CPU Req']}\t{r['CPU Lim']}\t{r['Mem Req']}\t{r['Mem Lim']}\t"
+                        f"{r['Eph Req']}\t{r['Eph Lim']}"
+                    )
+                st.download_button(
+                    "Download as TSV",
+                    data="\n".join(tsv_lines),
+                    file_name=f"resource_limits_{rl_ns}.tsv",
+                    mime="text/tab-separated-values",
+                    key="rl_download",
+                )
+            else:
+                st.info(f"No containers found in namespace `{rl_ns}` for the selected workload type(s).")
+
     # ── Node Containers (crictl) ────────────────────────────────────────
     with tab_crictl:
         st.markdown("### Node Containers (crictl)")

From b63824a69cca3aab33483432d6db3d136bae523a Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Tue, 7 Apr 2026 07:02:40 +0000
Subject: [PATCH 16/31] Fix Devin Review: Disk Usage key mismatch in
 CATEGORY_MAP, add timestamp window to correlate_errors

---
 k8s-agent/modules/cluster_debugger.py |  2 +-
 k8s-agent/modules/log_analyzer.py     | 35 ++++++++++++++++++++++++---
 2 files changed, 32 insertions(+), 5 deletions(-)

diff --git a/k8s-agent/modules/cluster_debugger.py b/k8s-agent/modules/cluster_debugger.py
index c5216f1..6815a42 100644
--- a/k8s-agent/modules/cluster_debugger.py
+++ b/k8s-agent/modules/cluster_debugger.py
@@ -28,7 +28,7 @@
     "Services": "get svc -A",
     "PVCs": "get pvc -A",
     "Ingresses": "get ingress -A",
-    "Disk Usage (Nodes)": "top nodes",
+    "Disk Usage": "top nodes",
 }
 
 # Full SSH commands (backward-compat for provisioned clusters)
diff --git a/k8s-agent/modules/log_analyzer.py b/k8s-agent/modules/log_analyzer.py
index 08a25e1..917bc3e 100644
--- a/k8s-agent/modules/log_analyzer.py
+++ b/k8s-agent/modules/log_analyzer.py
@@ -319,22 +319,49 @@ def correlate_errors(
 
     all_errors.sort(key=lambda e: e.get("timestamp", ""))
 
+    def _parse_ts(ts_str: str):
+        """Try to parse a timestamp string into a datetime object."""
+        from datetime import datetime
+        for fmt in (
+            "%Y-%m-%dT%H:%M:%S",
+            "%Y-%m-%dT%H:%M:%S.%f",
+            "%Y-%m-%dT%H:%M:%SZ",
+            "%Y-%m-%dT%H:%M:%S.%fZ",
+            "%b %d %H:%M:%S",
+            "%Y-%m-%d %H:%M:%S",
+            "%Y-%m-%d %H:%M:%S.%f",
+        ):
+            try:
+                return datetime.strptime(ts_str.strip(), fmt)
+            except (ValueError, AttributeError):
+                continue
+        return None
+
     correlated = []
     window_seconds = 30
-    used = set()
+    used: set[int] = set()
 
     for i, err in enumerate(all_errors):
         if i in used:
             continue
         group = [err]
         used.add(i)
+        err_ts = _parse_ts(err.get("timestamp", ""))
 
         for j in range(i + 1, len(all_errors)):
             if j in used:
                 continue
-            if all_errors[j].get("source") != err.get("source"):
-                group.append(all_errors[j])
-                used.add(j)
+            other = all_errors[j]
+            if other.get("source") == err.get("source"):
+                continue
+            # If both timestamps are parseable, enforce the time window
+            other_ts = _parse_ts(other.get("timestamp", ""))
+            if err_ts and other_ts:
+                diff = abs((other_ts - err_ts).total_seconds())
+                if diff > window_seconds:
+                    continue
+            group.append(other)
+            used.add(j)
 
         if len(group) > 1:
             correlated.append({

From 23324fd75b136d231f95ded532c72ac88af840a2 Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Tue, 7 Apr 2026 07:05:59 +0000
Subject: [PATCH 17/31] Add proper feedback messages for buttons: imported
 cluster guards, sudo for crictl, test summary

---
 k8s-agent/app.py | 147 +++++++++++++++++++++++++++++++++--------------
 1 file changed, 103 insertions(+), 44 deletions(-)

diff --git a/k8s-agent/app.py b/k8s-agent/app.py
index 582f377..53e2b30 100644
--- a/k8s-agent/app.py
+++ b/k8s-agent/app.py
@@ -619,40 +619,71 @@ def page_cluster_creation():
         st.markdown("### SSH Connectivity Test")
         st.markdown("Test SSH access to all nodes before provisioning.")
 
-        if st.button("Test All Nodes", type="primary"):
-            for node in profile.nodes:
-                with st.status(f"Testing {node.get('hostname', node['ip_address'])}...", expanded=True):
-                    result = test_ssh_connectivity(node)
-                    if result.success:
-                        st.success(f"Connected to {node['ip_address']}")
-                        st.code(result.stdout, language="text")
-                    else:
-                        st.error(f"Failed to connect to {node['ip_address']}")
-                        st.code(result.stderr, language="text")
+        if profile.cluster_source == "imported":
+            st.info(
+                "SSH connectivity tests are not applicable for imported clusters. "
+                "Imported clusters connect via kubeconfig — no SSH access is needed. "
+                "Use the **Cluster Debugger** or **Resource Viewer** to verify connectivity."
+            )
+        elif not profile.nodes:
+            st.warning("No nodes defined in this profile. Add nodes in the Profile Manager first.")
+        else:
+            if st.button("Test All Nodes", type="primary"):
+                all_ok = True
+                for node in profile.nodes:
+                    with st.status(f"Testing {node.get('hostname', node['ip_address'])}...", expanded=True):
+                        result = test_ssh_connectivity(node)
+                        if result.success:
+                            st.success(f"Connected to {node['ip_address']}")
+                            st.code(result.stdout, language="text")
+                        else:
+                            all_ok = False
+                            st.error(f"Failed to connect to {node['ip_address']}")
+                            st.code(result.stderr, language="text")
+                if all_ok:
+                    st.success("All nodes are reachable via SSH. You can proceed to provisioning.")
+                else:
+                    st.error("Some nodes failed SSH connectivity. Fix the issues above before provisioning.")
 
     # ── Provision ─────────────────────────────────────────────────────────
     with tab_provision:
         st.markdown("### Automated Cluster Provisioning")
-        st.warning(
-            "This will SSH into each node and execute every provisioning step "
-            "automatically. Ensure all nodes are accessible and you have root/sudo access."
-        )
+
+        if profile.cluster_source == "imported":
+            st.info(
+                "Provisioning is not available for imported clusters. "
+                "This cluster was imported via kubeconfig and is managed externally. "
+                "Use the **Resource Viewer**, **Cluster Debugger**, or **Monitoring Setup** "
+                "pages to work with your cluster."
+            )
+        elif not profile.nodes:
+            st.warning("No nodes defined in this profile. Add nodes in the Profile Manager first.")
+        else:
+            st.warning(
+                "This will SSH into each node and execute every provisioning step "
+                "automatically. Ensure all nodes are accessible and you have root/sudo access."
+            )
 
         cp_nodes = profile.get_control_plane_nodes()
         worker_nodes = profile.get_worker_nodes()
 
-        st.markdown(f"**Control Plane:** {len(cp_nodes)} node(s) | **Workers:** {len(worker_nodes)} node(s)")
+        if profile.cluster_source != "imported" and profile.nodes:
+            st.markdown(f"**Control Plane:** {len(cp_nodes)} node(s) | **Workers:** {len(worker_nodes)} node(s)")
 
-        col1, col2, col3 = st.columns(3)
-        with col1:
-            step1 = st.checkbox("Step 1: Common Setup (all nodes)", value=True)
-        with col2:
-            step2 = st.checkbox("Step 2: Init Control Plane", value=True)
-        with col3:
-            step3 = st.checkbox("Step 3: Join Workers", value=True)
-        step4 = st.checkbox("Step 4: Apply Best Practices", value=True)
+            col1, col2, col3 = st.columns(3)
+            with col1:
+                step1 = st.checkbox("Step 1: Common Setup (all nodes)", value=True)
+            with col2:
+                step2 = st.checkbox("Step 2: Init Control Plane", value=True)
+            with col3:
+                step3 = st.checkbox("Step 3: Join Workers", value=True)
+            step4 = st.checkbox("Step 4: Apply Best Practices", value=True)
+        else:
+            step1 = step2 = step3 = step4 = False
 
-        if st.button("Start Provisioning", type="primary", use_container_width=True):
+        if profile.cluster_source == "imported" or not profile.nodes:
+            pass  # messages shown above
+        elif st.button("Start Provisioning", type="primary", use_container_width=True):
             update_profile_status(profile.name, "provisioning")
             overall_success = True
 
@@ -2355,9 +2386,15 @@ def page_resource_viewer():
             # Imported clusters — no SSH, but we can still get node list and show
             # container info via kubectl debug or just list pods per node
             st.info(
-                "**crictl** requires SSH access to each node and is available for "
-                "provisioned clusters. For imported clusters, container-level "
-                "information is shown via kubectl below."
+                "**crictl** requires SSH access to each node and is only available for "
+                "provisioned clusters. For imported clusters, pod and container "
+                "information per node is shown via `kubectl` below."
+            )
+            st.markdown(
+                "This view uses `kubectl get pods --field-selector spec.nodeName=<node>` "
+                "to list pods/containers on each node. For full container-level details "
+                "(container IDs, image digests, runtime state), SSH into the node and run "
+                "`sudo crictl ps -a` directly."
             )
             if st.button("Show containers per node (kubectl)", type="primary", key="crictl_kubectl"):
                 with st.spinner("Fetching node list..."):
@@ -2398,6 +2435,16 @@ def page_resource_viewer():
             if not all_nodes:
                 st.warning("No nodes defined in this profile.")
             else:
+                st.markdown(
+                    "> **Note:** `crictl` typically requires **root/sudo** access. "
+                    "If your SSH user is not root, the command will be prefixed with `sudo`."
+                )
+                use_sudo = st.checkbox(
+                    "Run with sudo (required if SSH user is not root)",
+                    value=True,
+                    key="crictl_sudo",
+                    help="Prefix the command with 'sudo' for non-root SSH users.",
+                )
                 crictl_cmd = st.text_input(
                     "CRI command",
                     value="crictl ps -a",
@@ -2442,23 +2489,35 @@ def page_resource_viewer():
                     selected_nodes = all_nodes
 
                 if st.button("Run on selected nodes", type="primary", key="crictl_run"):
-                    for node in selected_nodes:
-                        node_label = f"{node.get('hostname', node.get('ip_address', '?'))} ({node.get('ip_address', '')})"
-                        with st.expander(f"Node: **{node_label}** [{node.get('role', '')}]", expanded=True):
-                            with st.spinner(f"Running `{crictl_cmd}` on {node_label}..."):
-                                result = run_ssh_command(
-                                    ip_address=node["ip_address"],
-                                    command=crictl_cmd,
-                                    ssh_user=node.get("ssh_user", "root"),
-                                    ssh_port=node.get("ssh_port", 22),
-                                    ssh_key_path=node.get("ssh_key_path", "~/.ssh/id_rsa"),
-                                    timeout=30,
-                                )
-                                if result.success:
-                                    st.code(result.stdout or "(no output)", language="text")
-                                else:
-                                    st.error(f"Command failed on {node_label}")
-                                    st.code(result.stderr, language="text")
+                    if not selected_nodes:
+                        st.warning("No nodes selected. Please select at least one node.")
+                    else:
+                        actual_cmd = f"sudo {crictl_cmd}" if use_sudo and not crictl_cmd.strip().startswith("sudo") else crictl_cmd
+                        all_success = True
+                        for node in selected_nodes:
+                            node_label = f"{node.get('hostname', node.get('ip_address', '?'))} ({node.get('ip_address', '')})"
+                            with st.expander(f"Node: **{node_label}** [{node.get('role', '')}]", expanded=True):
+                                with st.spinner(f"Running `{actual_cmd}` on {node_label}..."):
+                                    result = run_ssh_command(
+                                        ip_address=node["ip_address"],
+                                        command=actual_cmd,
+                                        ssh_user=node.get("ssh_user", "root"),
+                                        ssh_port=node.get("ssh_port", 22),
+                                        ssh_key_path=node.get("ssh_key_path", "~/.ssh/id_rsa"),
+                                        timeout=30,
+                                    )
+                                    if result.success:
+                                        st.code(result.stdout or "(no output)", language="text")
+                                    else:
+                                        all_success = False
+                                        st.error(f"Command failed on {node_label}")
+                                        st.code(result.stderr, language="text")
+                                        if "permission denied" in (result.stderr or "").lower():
+                                            st.info("Tip: Enable the 'Run with sudo' checkbox above if your SSH user needs elevated privileges.")
+                        if all_success:
+                            st.success(f"Command completed successfully on {len(selected_nodes)} node(s).")
+                        else:
+                            st.warning("Command failed on some nodes. Check the details above.")
 
     # ── Node Health ──────────────────────────────────────────────────────
     with tab_node_health:

From 6a43a3a3fccea33ad617b1033fa0ad33f8a21ef1 Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Tue, 7 Apr 2026 07:08:15 +0000
Subject: [PATCH 18/31] Add flash messages for Import Cluster and Create
 Profile so success/error persists after rerun

---
 k8s-agent/app.py | 34 +++++++++++++++++++++++++---------
 1 file changed, 25 insertions(+), 9 deletions(-)

diff --git a/k8s-agent/app.py b/k8s-agent/app.py
index 53e2b30..4df7f83 100644
--- a/k8s-agent/app.py
+++ b/k8s-agent/app.py
@@ -145,6 +145,7 @@ def init_session_state():
         "provisioning_log": [],
         "debug_results": {},
         "log_analysis_results": {},
+        "_flash_message": None,
     }
     for key, value in defaults.items():
         if key not in st.session_state:
@@ -260,6 +261,17 @@ def page_profile_manager():
     st.markdown("## Cluster Profile Manager")
     st.markdown("Create, edit, and manage profiles for your on-prem Kubernetes clusters.")
 
+    # Show any flash message from a previous action (e.g. after st.rerun)
+    if st.session_state.get("_flash_message"):
+        _flash = st.session_state._flash_message
+        if _flash[0] == "success":
+            st.success(_flash[1])
+        elif _flash[0] == "error":
+            st.error(_flash[1])
+        elif _flash[0] == "info":
+            st.info(_flash[1])
+        st.session_state._flash_message = None
+
     tab_create, tab_import_cluster, tab_list, tab_import = st.tabs([
         "Create Profile", "Import Existing Cluster", "Manage Profiles", "Import / Export",
     ])
@@ -454,7 +466,7 @@ def page_profile_manager():
                     )
                     path = save_profile(profile)
                     st.session_state.active_profile = name
-                    st.success(f"Profile '{name}' created successfully!")
+                    st.session_state._flash_message = ("success", f"Profile '{name}' created successfully! Select it from the sidebar to get started.")
                     st.rerun()
 
     # ── Import Existing Cluster ──────────────────────────────────────────
@@ -508,14 +520,18 @@ def page_profile_manager():
                     cluster_source="imported",
                     kubeconfig_content=kubeconfig_content,
                 )
-                save_profile(profile)
-                st.session_state.active_profile = import_name
-                st.success(
-                    f"Cluster '{import_name}' imported! "
-                    "Select it from the sidebar to start using Debugger, Monitoring, "
-                    "Resource Viewer, etc."
-                )
-                st.rerun()
+                try:
+                    save_profile(profile)
+                    st.session_state.active_profile = import_name
+                    st.session_state._flash_message = (
+                        "success",
+                        f"Cluster '{import_name}' imported successfully! "
+                        "It is now the active profile. Use the sidebar navigation to go to "
+                        "Cluster Debugger, Resource Viewer, Monitoring Setup, etc."
+                    )
+                    st.rerun()
+                except Exception as e:
+                    st.error(f"Failed to import cluster: {e}")
 
     # ── Manage Profiles ───────────────────────────────────────────────────
     with tab_list:

From ad0ad398dcadc355db7d2dbb09eb15a66a06d7d4 Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Tue, 7 Apr 2026 07:11:40 +0000
Subject: [PATCH 19/31] Enrich Cluster Details for imported clusters: show node
 IPs, roles, kubelet version, OS, container runtime, cluster-info

---
 k8s-agent/app.py | 61 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 61 insertions(+)

diff --git a/k8s-agent/app.py b/k8s-agent/app.py
index 4df7f83..2e4e13a 100644
--- a/k8s-agent/app.py
+++ b/k8s-agent/app.py
@@ -3327,6 +3327,67 @@ def _show_profile_summary(profile: ClusterProfile):
         with st.expander("Cluster Details", expanded=False):
             st.markdown(f"**Description:** {profile.description or 'N/A'}")
             st.markdown(f"**Kubeconfig:** {'Loaded' if profile.kubeconfig_content else 'Not loaded'}")
+
+            # Fetch live cluster info from kubeconfig
+            if profile.kubeconfig_content:
+                node_result = run_kubectl(
+                    profile,
+                    "get nodes -o wide --no-headers",
+                    timeout=10,
+                )
+                if node_result.success and node_result.stdout.strip():
+                    st.markdown("---")
+                    st.markdown("**Cluster Nodes:**")
+                    node_lines = [l for l in node_result.stdout.strip().split("\n") if l.strip()]
+                    node_data = []
+                    for line in node_lines:
+                        parts = line.split()
+                        if len(parts) >= 5:
+                            node_data.append({
+                                "Name": parts[0],
+                                "Status": parts[1],
+                                "Roles": parts[2] if parts[2] != "<none>" else "worker",
+                                "Age": parts[3],
+                                "Kubelet Version": parts[4],
+                                "Internal IP": parts[5] if len(parts) > 5 else "N/A",
+                                "OS Image": " ".join(parts[7:9]) if len(parts) > 8 else (parts[7] if len(parts) > 7 else "N/A"),
+                                "Container Runtime": parts[-1] if len(parts) > 9 else "N/A",
+                            })
+                    if node_data:
+                        import pandas as pd
+                        st.dataframe(
+                            pd.DataFrame(node_data),
+                            use_container_width=True,
+                            hide_index=True,
+                        )
+                        # Summary
+                        cp_count = sum(1 for n in node_data if "control-plane" in n["Roles"] or "master" in n["Roles"])
+                        worker_count = len(node_data) - cp_count
+                        ready_count = sum(1 for n in node_data if "Ready" in n["Status"])
+                        st.markdown(
+                            f"**Total:** {len(node_data)} node(s) — "
+                            f"{cp_count} control-plane, {worker_count} worker | "
+                            f"**Ready:** {ready_count}/{len(node_data)}"
+                        )
+                    else:
+                        st.code(node_result.stdout, language="text")
+
+                    # Cluster info (API server endpoint)
+                    info_result = run_kubectl(profile, "cluster-info", timeout=10)
+                    if info_result.success and info_result.stdout.strip():
+                        st.markdown("---")
+                        st.markdown("**Cluster Info:**")
+                        # Strip ANSI color codes for clean display
+                        import re
+                        clean_info = re.sub(r'\x1b\[[0-9;]*m', '', info_result.stdout)
+                        st.code(clean_info.strip(), language="text")
+                elif node_result.success:
+                    st.info("Connected to cluster but no nodes found.")
+                else:
+                    st.warning(
+                        f"Could not fetch cluster details: {node_result.stderr or 'kubectl command failed'}. "
+                        "Verify that kubectl is installed and the kubeconfig is valid."
+                    )
     else:
         cols = st.columns(5)
         cols[0].metric("Profile", profile.name)

From 937bc798a5320836d596b56f74b5741ce21cdd5c Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Tue, 7 Apr 2026 07:23:05 +0000
Subject: [PATCH 20/31] Add Multi-Cluster Dashboard, Certificate Manager, Cost
 Optimizer, Pod Restart Tracker, Network Policy Visualizer, PVC/Storage
 Dashboard

---
 k8s-agent/app.py | 1085 +++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 1083 insertions(+), 2 deletions(-)

diff --git a/k8s-agent/app.py b/k8s-agent/app.py
index 2e4e13a..aadb5ba 100644
--- a/k8s-agent/app.py
+++ b/k8s-agent/app.py
@@ -209,6 +209,7 @@ def render_sidebar():
         # ── Navigation ──
         st.markdown("### Navigation")
         nav_options = [
+            "Multi-Cluster Dashboard",
             "Profile Manager",
             "Cluster Creation",
             "Resource Viewer",
@@ -216,6 +217,8 @@ def render_sidebar():
             "Monitoring Setup",
             "Log Analysis",
             "Upgrade Planner",
+            "Certificate Manager",
+            "Cost Optimizer",
             "AI Assistant",
         ]
         selected_page = st.radio(
@@ -1816,7 +1819,9 @@ def page_resource_viewer():
     if profile.cluster_source == "imported" and profile.kubeconfig_content:
         _rv_namespaces = fetch_namespaces(profile.kubeconfig_content)
 
-    tab_resources, tab_scaling, tab_shell, tab_res_limits, tab_crictl, tab_node_health, tab_rbac, tab_helm, tab_events = st.tabs([
+    (tab_resources, tab_scaling, tab_shell, tab_res_limits, tab_crictl,
+     tab_node_health, tab_rbac, tab_helm, tab_events,
+     tab_restart_tracker, tab_netpol, tab_pvc) = st.tabs([
         "Cluster Resources",
         "Scaling",
         "Pod Shell",
@@ -1826,6 +1831,9 @@ def page_resource_viewer():
         "RBAC Viewer",
         "Helm Releases",
         "Events Timeline",
+        "Pod Restart Tracker",
+        "Network Policies",
+        "PVC / Storage",
     ])
 
     # ── Cluster Resources ────────────────────────────────────────────────
@@ -2960,6 +2968,429 @@ def page_resource_viewer():
                 else:
                     st.code(result.stderr, language="text")
 
+    # ── Pod Restart Tracker ───────────────────────────────────────────────
+    with tab_restart_tracker:
+        st.markdown("### Pod Restart Tracker")
+        st.markdown("Identify pods with frequent restarts, OOMKilled containers, and CrashLoopBackOff issues.")
+
+        rcol1, rcol2 = st.columns([2, 1])
+        with rcol1:
+            if _rv_namespaces:
+                restart_ns = st.selectbox("Namespace", ["All Namespaces"] + _rv_namespaces, key="restart_ns")
+            else:
+                restart_ns = st.text_input("Namespace (blank = all)", value="", key="restart_ns_text")
+                if not restart_ns:
+                    restart_ns = "All Namespaces"
+        with rcol2:
+            min_restarts = st.number_input("Min restarts to show", min_value=0, value=1, key="min_restarts")
+
+        if st.button("Load Pod Restarts", type="primary", key="load_restarts"):
+            ns_flag = "-A" if restart_ns == "All Namespaces" else f"-n {restart_ns}"
+            cmd = f"get pods {ns_flag} -o json"
+            with st.spinner("Fetching pod data..."):
+                result = run_kubectl(profile, cmd, timeout=30)
+            if result.success and result.stdout.strip():
+                try:
+                    import pandas as pd
+                    pods_json = json.loads(result.stdout)
+                    restart_data = []
+                    for pod in pods_json.get("items", []):
+                        pod_name = pod.get("metadata", {}).get("name", "?")
+                        pod_ns = pod.get("metadata", {}).get("namespace", "?")
+                        for cs in pod.get("status", {}).get("containerStatuses", []):
+                            restarts = cs.get("restartCount", 0)
+                            if restarts < min_restarts:
+                                continue
+                            container_name = cs.get("name", "?")
+                            ready = cs.get("ready", False)
+                            # Detect OOMKilled
+                            last_state = cs.get("lastState", {})
+                            terminated = last_state.get("terminated", {})
+                            reason = terminated.get("reason", "")
+                            exit_code = terminated.get("exitCode", "")
+                            # Current state
+                            state = cs.get("state", {})
+                            if "running" in state:
+                                current_state = "Running"
+                            elif "waiting" in state:
+                                current_state = state["waiting"].get("reason", "Waiting")
+                            elif "terminated" in state:
+                                current_state = state["terminated"].get("reason", "Terminated")
+                            else:
+                                current_state = "Unknown"
+                            restart_data.append({
+                                "Namespace": pod_ns,
+                                "Pod": pod_name,
+                                "Container": container_name,
+                                "Restarts": restarts,
+                                "Ready": ready,
+                                "State": current_state,
+                                "Last Termination": reason or "N/A",
+                                "Exit Code": str(exit_code) if exit_code != "" else "N/A",
+                            })
+                    if restart_data:
+                        df = pd.DataFrame(restart_data).sort_values("Restarts", ascending=False)
+                        # Summary metrics
+                        total_restarts = df["Restarts"].sum()
+                        oom_count = len(df[df["Last Termination"] == "OOMKilled"])
+                        crash_count = len(df[df["State"] == "CrashLoopBackOff"])
+                        mcol1, mcol2, mcol3, mcol4 = st.columns(4)
+                        mcol1.metric("Containers with Restarts", len(df))
+                        mcol2.metric("Total Restarts", int(total_restarts))
+                        mcol3.metric("OOMKilled", oom_count)
+                        mcol4.metric("CrashLoopBackOff", crash_count)
+                        st.dataframe(df, use_container_width=True, hide_index=True)
+                        # Highlight problematic pods
+                        if oom_count > 0:
+                            st.warning(
+                                f"{oom_count} container(s) were terminated due to **OOMKilled** — "
+                                "consider increasing memory limits for those workloads."
+                            )
+                        if crash_count > 0:
+                            st.error(
+                                f"{crash_count} container(s) are in **CrashLoopBackOff** — "
+                                "check logs with `kubectl logs <pod> -c <container> --previous`."
+                            )
+                    else:
+                        st.success(f"No containers found with {min_restarts}+ restarts. Cluster looks healthy!")
+                except (json.JSONDecodeError, KeyError) as e:
+                    st.error(f"Failed to parse pod data: {e}")
+                    st.code(result.stdout[:2000], language="text")
+            elif result.success:
+                st.info("No pods found.")
+            else:
+                st.error("Failed to fetch pods")
+                st.code(result.stderr, language="text")
+
+    # ── Network Policy Visualizer ─────────────────────────────────────────
+    with tab_netpol:
+        st.markdown("### Network Policy Visualizer")
+        st.markdown("View and analyze NetworkPolicies to understand pod-to-pod communication rules.")
+
+        npcol1, npcol2 = st.columns([2, 1])
+        with npcol1:
+            if _rv_namespaces:
+                np_ns = st.selectbox("Namespace", ["All Namespaces"] + _rv_namespaces, key="netpol_ns")
+            else:
+                np_ns = st.text_input("Namespace (blank = all)", value="", key="netpol_ns_text")
+                if not np_ns:
+                    np_ns = "All Namespaces"
+
+        if st.button("Load Network Policies", type="primary", key="load_netpol"):
+            ns_flag = "-A" if np_ns == "All Namespaces" else f"-n {np_ns}"
+            cmd = f"get networkpolicies {ns_flag} -o json"
+            with st.spinner("Fetching network policies..."):
+                result = run_kubectl(profile, cmd, timeout=15)
+            if result.success and result.stdout.strip():
+                try:
+                    import pandas as pd
+                    np_json = json.loads(result.stdout)
+                    policies = np_json.get("items", [])
+                    if not policies:
+                        st.info("No NetworkPolicies found. All pod-to-pod traffic is allowed by default.")
+                    else:
+                        st.markdown(f"**Found {len(policies)} NetworkPolicies**")
+
+                        policy_summary = []
+                        for pol in policies:
+                            meta = pol.get("metadata", {})
+                            spec = pol.get("spec", {})
+                            pol_name = meta.get("name", "?")
+                            pol_ns = meta.get("namespace", "?")
+                            # Pod selector
+                            pod_sel = spec.get("podSelector", {})
+                            match_labels = pod_sel.get("matchLabels", {})
+                            selector_str = ", ".join(f"{k}={v}" for k, v in match_labels.items()) if match_labels else "(all pods)"
+                            # Policy types
+                            policy_types = spec.get("policyTypes", [])
+                            # Ingress rules count
+                            ingress_rules = spec.get("ingress", [])
+                            egress_rules = spec.get("egress", [])
+
+                            policy_summary.append({
+                                "Namespace": pol_ns,
+                                "Policy": pol_name,
+                                "Pod Selector": selector_str,
+                                "Types": ", ".join(policy_types) if policy_types else "N/A",
+                                "Ingress Rules": len(ingress_rules),
+                                "Egress Rules": len(egress_rules),
+                            })
+
+                        st.dataframe(pd.DataFrame(policy_summary), use_container_width=True, hide_index=True)
+
+                        # Detailed view per policy
+                        for pol in policies:
+                            meta = pol.get("metadata", {})
+                            spec = pol.get("spec", {})
+                            pol_name = meta.get("name", "?")
+                            pol_ns = meta.get("namespace", "?")
+                            with st.expander(f"{pol_ns}/{pol_name}", expanded=False):
+                                # Pod selector
+                                pod_sel = spec.get("podSelector", {})
+                                match_labels = pod_sel.get("matchLabels", {})
+                                if match_labels:
+                                    st.markdown("**Applies to pods matching:** " + ", ".join(f"`{k}={v}`" for k, v in match_labels.items()))
+                                else:
+                                    st.markdown("**Applies to:** All pods in namespace")
+
+                                # Ingress
+                                ingress_rules = spec.get("ingress", [])
+                                if ingress_rules:
+                                    st.markdown("**Ingress Rules:**")
+                                    for i, rule in enumerate(ingress_rules):
+                                        sources = []
+                                        for fr in rule.get("from", []):
+                                            if "podSelector" in fr:
+                                                labels = fr["podSelector"].get("matchLabels", {})
+                                                sources.append("Pods: " + (", ".join(f"{k}={v}" for k, v in labels.items()) if labels else "all"))
+                                            if "namespaceSelector" in fr:
+                                                labels = fr["namespaceSelector"].get("matchLabels", {})
+                                                sources.append("Namespaces: " + (", ".join(f"{k}={v}" for k, v in labels.items()) if labels else "all"))
+                                            if "ipBlock" in fr:
+                                                sources.append(f"CIDR: {fr['ipBlock'].get('cidr', '?')}")
+                                        ports = []
+                                        for p in rule.get("ports", []):
+                                            ports.append(f"{p.get('protocol', 'TCP')}/{p.get('port', '*')}")
+                                        src_str = ", ".join(sources) if sources else "any"
+                                        port_str = ", ".join(ports) if ports else "all ports"
+                                        st.markdown(f"  - Rule {i+1}: Allow from **{src_str}** on **{port_str}**")
+                                elif "Ingress" in spec.get("policyTypes", []):
+                                    st.warning("Ingress type declared but no rules — all ingress traffic is **denied**.")
+
+                                # Egress
+                                egress_rules = spec.get("egress", [])
+                                if egress_rules:
+                                    st.markdown("**Egress Rules:**")
+                                    for i, rule in enumerate(egress_rules):
+                                        destinations = []
+                                        for to in rule.get("to", []):
+                                            if "podSelector" in to:
+                                                labels = to["podSelector"].get("matchLabels", {})
+                                                destinations.append("Pods: " + (", ".join(f"{k}={v}" for k, v in labels.items()) if labels else "all"))
+                                            if "namespaceSelector" in to:
+                                                labels = to["namespaceSelector"].get("matchLabels", {})
+                                                destinations.append("Namespaces: " + (", ".join(f"{k}={v}" for k, v in labels.items()) if labels else "all"))
+                                            if "ipBlock" in to:
+                                                destinations.append(f"CIDR: {to['ipBlock'].get('cidr', '?')}")
+                                        ports = []
+                                        for p in rule.get("ports", []):
+                                            ports.append(f"{p.get('protocol', 'TCP')}/{p.get('port', '*')}")
+                                        dest_str = ", ".join(destinations) if destinations else "any"
+                                        port_str = ", ".join(ports) if ports else "all ports"
+                                        st.markdown(f"  - Rule {i+1}: Allow to **{dest_str}** on **{port_str}**")
+                                elif "Egress" in spec.get("policyTypes", []):
+                                    st.warning("Egress type declared but no rules — all egress traffic is **denied**.")
+
+                                st.markdown("---")
+                                st.markdown("**Raw YAML:**")
+                                import yaml
+                                st.code(yaml.dump(pol, default_flow_style=False), language="yaml")
+
+                        # Coverage check
+                        st.markdown("---")
+                        st.markdown("#### Coverage Analysis")
+                        if st.button("Check Unprotected Pods", key="netpol_coverage"):
+                            # Get all pods and check which are selected by a policy
+                            pod_ns_flag = f"-n {np_ns}" if np_ns != "All Namespaces" else "-A"
+                            pod_cmd = f"get pods {pod_ns_flag} -o json"
+                            with st.spinner("Analyzing coverage..."):
+                                pod_result = run_kubectl(profile, pod_cmd, timeout=15)
+                            if pod_result.success and pod_result.stdout.strip():
+                                try:
+                                    all_pods = json.loads(pod_result.stdout).get("items", [])
+                                    protected_pods = set()
+                                    for pol in policies:
+                                        pol_ns_name = pol.get("metadata", {}).get("namespace", "")
+                                        pod_sel = pol.get("spec", {}).get("podSelector", {})
+                                        match_labels = pod_sel.get("matchLabels", {})
+                                        for p in all_pods:
+                                            p_ns = p.get("metadata", {}).get("namespace", "")
+                                            p_name = p.get("metadata", {}).get("name", "")
+                                            p_labels = p.get("metadata", {}).get("labels", {})
+                                            if p_ns != pol_ns_name:
+                                                continue
+                                            if not match_labels or all(p_labels.get(k) == v for k, v in match_labels.items()):
+                                                protected_pods.add(f"{p_ns}/{p_name}")
+                                    unprotected = []
+                                    for p in all_pods:
+                                        p_ns = p.get("metadata", {}).get("namespace", "")
+                                        p_name = p.get("metadata", {}).get("name", "")
+                                        if f"{p_ns}/{p_name}" not in protected_pods:
+                                            unprotected.append({"Namespace": p_ns, "Pod": p_name})
+                                    if unprotected:
+                                        st.warning(f"{len(unprotected)} pod(s) are **not covered** by any NetworkPolicy (all traffic allowed):")
+                                        st.dataframe(pd.DataFrame(unprotected), use_container_width=True, hide_index=True)
+                                    else:
+                                        st.success("All pods are covered by at least one NetworkPolicy.")
+                                except (json.JSONDecodeError, KeyError):
+                                    st.error("Failed to parse pod data for coverage analysis.")
+
+                except (json.JSONDecodeError, KeyError) as e:
+                    st.error(f"Failed to parse network policy data: {e}")
+            elif result.success:
+                st.info("No NetworkPolicies found. All pod-to-pod traffic is allowed by default.")
+            else:
+                st.error("Failed to fetch network policies")
+                st.code(result.stderr, language="text")
+
+    # ── PVC / Storage Dashboard ───────────────────────────────────────────
+    with tab_pvc:
+        st.markdown("### PVC / Storage Dashboard")
+        st.markdown("View PersistentVolumeClaims, PersistentVolumes, and StorageClasses.")
+
+        pvc_sub = st.radio(
+            "View",
+            ["PVCs", "PersistentVolumes", "StorageClasses"],
+            horizontal=True,
+            key="pvc_view",
+        )
+
+        if pvc_sub == "PVCs":
+            pcol1, pcol2 = st.columns([2, 1])
+            with pcol1:
+                if _rv_namespaces:
+                    pvc_ns = st.selectbox("Namespace", ["All Namespaces"] + _rv_namespaces, key="pvc_ns")
+                else:
+                    pvc_ns = st.text_input("Namespace (blank = all)", value="", key="pvc_ns_text")
+                    if not pvc_ns:
+                        pvc_ns = "All Namespaces"
+
+            if st.button("Load PVCs", type="primary", key="load_pvcs"):
+                ns_flag = "-A" if pvc_ns == "All Namespaces" else f"-n {pvc_ns}"
+                cmd = f"get pvc {ns_flag} -o json"
+                with st.spinner("Fetching PVCs..."):
+                    result = run_kubectl(profile, cmd, timeout=15)
+                if result.success and result.stdout.strip():
+                    try:
+                        import pandas as pd
+                        pvc_json = json.loads(result.stdout)
+                        pvcs = pvc_json.get("items", [])
+                        if not pvcs:
+                            st.info("No PVCs found.")
+                        else:
+                            pvc_data = []
+                            for pvc in pvcs:
+                                meta = pvc.get("metadata", {})
+                                spec = pvc.get("spec", {})
+                                status = pvc.get("status", {})
+                                capacity = status.get("capacity", {}).get("storage", "N/A")
+                                requested = spec.get("resources", {}).get("requests", {}).get("storage", "N/A")
+                                pvc_data.append({
+                                    "Namespace": meta.get("namespace", "?"),
+                                    "Name": meta.get("name", "?"),
+                                    "Status": status.get("phase", "?"),
+                                    "Volume": spec.get("volumeName", "N/A"),
+                                    "Capacity": capacity,
+                                    "Requested": requested,
+                                    "Access Modes": ", ".join(spec.get("accessModes", [])),
+                                    "Storage Class": spec.get("storageClassName", "N/A"),
+                                })
+                            df = pd.DataFrame(pvc_data)
+                            # Summary
+                            bound = len(df[df["Status"] == "Bound"])
+                            pending = len(df[df["Status"] == "Pending"])
+                            lost = len(df[df["Status"] == "Lost"])
+                            scol1, scol2, scol3, scol4 = st.columns(4)
+                            scol1.metric("Total PVCs", len(df))
+                            scol2.metric("Bound", bound)
+                            scol3.metric("Pending", pending)
+                            scol4.metric("Lost", lost)
+                            if pending > 0:
+                                st.warning(f"{pending} PVC(s) are **Pending** — check StorageClass availability and provisioner status.")
+                            if lost > 0:
+                                st.error(f"{lost} PVC(s) are **Lost** — the bound PV has been deleted. Data may be lost.")
+                            st.dataframe(df, use_container_width=True, hide_index=True)
+                    except (json.JSONDecodeError, KeyError) as e:
+                        st.error(f"Failed to parse PVC data: {e}")
+                elif result.success:
+                    st.info("No PVCs found.")
+                else:
+                    st.error("Failed to fetch PVCs")
+                    st.code(result.stderr, language="text")
+
+        elif pvc_sub == "PersistentVolumes":
+            if st.button("Load PVs", type="primary", key="load_pvs"):
+                cmd = "get pv -o json"
+                with st.spinner("Fetching PersistentVolumes..."):
+                    result = run_kubectl(profile, cmd, timeout=15)
+                if result.success and result.stdout.strip():
+                    try:
+                        import pandas as pd
+                        pv_json = json.loads(result.stdout)
+                        pvs = pv_json.get("items", [])
+                        if not pvs:
+                            st.info("No PersistentVolumes found.")
+                        else:
+                            pv_data = []
+                            for pv in pvs:
+                                meta = pv.get("metadata", {})
+                                spec = pv.get("spec", {})
+                                status = pv.get("status", {})
+                                claim_ref = spec.get("claimRef", {})
+                                claim = f"{claim_ref.get('namespace', '')}/{claim_ref.get('name', '')}" if claim_ref else "Unbound"
+                                pv_data.append({
+                                    "Name": meta.get("name", "?"),
+                                    "Capacity": spec.get("capacity", {}).get("storage", "N/A"),
+                                    "Access Modes": ", ".join(spec.get("accessModes", [])),
+                                    "Reclaim Policy": spec.get("persistentVolumeReclaimPolicy", "N/A"),
+                                    "Status": status.get("phase", "?"),
+                                    "Claim": claim,
+                                    "Storage Class": spec.get("storageClassName", "N/A"),
+                                    "Volume Mode": spec.get("volumeMode", "N/A"),
+                                })
+                            df = pd.DataFrame(pv_data)
+                            avail = len(df[df["Status"] == "Available"])
+                            bound = len(df[df["Status"] == "Bound"])
+                            released = len(df[df["Status"] == "Released"])
+                            scol1, scol2, scol3, scol4 = st.columns(4)
+                            scol1.metric("Total PVs", len(df))
+                            scol2.metric("Bound", bound)
+                            scol3.metric("Available", avail)
+                            scol4.metric("Released", released)
+                            st.dataframe(df, use_container_width=True, hide_index=True)
+                    except (json.JSONDecodeError, KeyError) as e:
+                        st.error(f"Failed to parse PV data: {e}")
+                elif result.success:
+                    st.info("No PersistentVolumes found.")
+                else:
+                    st.error("Failed to fetch PVs")
+                    st.code(result.stderr, language="text")
+
+        elif pvc_sub == "StorageClasses":
+            if st.button("Load Storage Classes", type="primary", key="load_sc"):
+                cmd = "get storageclasses -o json"
+                with st.spinner("Fetching StorageClasses..."):
+                    result = run_kubectl(profile, cmd, timeout=15)
+                if result.success and result.stdout.strip():
+                    try:
+                        import pandas as pd
+                        sc_json = json.loads(result.stdout)
+                        scs = sc_json.get("items", [])
+                        if not scs:
+                            st.info("No StorageClasses found.")
+                        else:
+                            sc_data = []
+                            for sc in scs:
+                                meta = sc.get("metadata", {})
+                                annotations = meta.get("annotations", {})
+                                is_default = annotations.get("storageclass.kubernetes.io/is-default-class", "false") == "true"
+                                sc_data.append({
+                                    "Name": meta.get("name", "?"),
+                                    "Provisioner": sc.get("provisioner", "N/A"),
+                                    "Reclaim Policy": sc.get("reclaimPolicy", "N/A"),
+                                    "Volume Binding": sc.get("volumeBindingMode", "N/A"),
+                                    "Allow Expansion": sc.get("allowVolumeExpansion", False),
+                                    "Default": is_default,
+                                })
+                            st.dataframe(pd.DataFrame(sc_data), use_container_width=True, hide_index=True)
+                    except (json.JSONDecodeError, KeyError) as e:
+                        st.error(f"Failed to parse StorageClass data: {e}")
+                elif result.success:
+                    st.info("No StorageClasses found.")
+                else:
+                    st.error("Failed to fetch StorageClasses")
+                    st.code(result.stderr, language="text")
+
 
 # ══════════════════════════════════════════════════════════════════════════
 #  PAGE: Upgrade Planner
@@ -3302,6 +3733,650 @@ def page_ai_assistant():
         st.session_state.chat_history.append({"role": "assistant", "content": full_response})
 
 
+# ══════════════════════════════════════════════════════════════════════════
+#  PAGE: Multi-Cluster Dashboard
+# ══════════════════════════════════════════════════════════════════════════
+
+def page_multi_cluster_dashboard():
+    st.markdown("## Multi-Cluster Dashboard")
+    st.markdown("Overview of all registered cluster profiles at a glance.")
+
+    profiles = list_profiles()
+    if not profiles:
+        st.info("No cluster profiles yet. Create one in the **Profile Manager** or import a cluster via kubeconfig.")
+        return
+
+    # Summary metrics
+    total = len(profiles)
+    imported = sum(1 for p in profiles if p.cluster_source == "imported")
+    provisioned = total - imported
+    active_count = sum(1 for p in profiles if p.status == "active")
+    draft_count = sum(1 for p in profiles if p.status == "draft")
+    error_count = sum(1 for p in profiles if p.status == "error")
+
+    mcol1, mcol2, mcol3, mcol4, mcol5 = st.columns(5)
+    mcol1.metric("Total Clusters", total)
+    mcol2.metric("Provisioned", provisioned)
+    mcol3.metric("Imported", imported)
+    mcol4.metric("Active", active_count)
+    mcol5.metric("Errors", error_count)
+
+    st.markdown("---")
+
+    # Cluster cards
+    for profile in profiles:
+        status_icon = {"active": "🟢", "error": "🔴", "draft": "⚪", "provisioning": "🟡"}.get(profile.status, "⚪")
+        source_label = "Imported" if profile.cluster_source == "imported" else "Provisioned"
+
+        with st.expander(
+            f"{status_icon} **{profile.name}** — {source_label} | {profile.status.upper()}",
+            expanded=(profile.status == "error"),
+        ):
+            col1, col2, col3 = st.columns(3)
+            with col1:
+                st.markdown(f"**K8s Version:** {profile.kubernetes_version}")
+                st.markdown(f"**Source:** {source_label}")
+                st.markdown(f"**Status:** {profile.status.upper()}")
+            with col2:
+                if profile.cluster_source == "imported":
+                    st.markdown(f"**Kubeconfig:** {'Loaded' if profile.kubeconfig_content else 'Not loaded'}")
+                else:
+                    cp = len(profile.get_control_plane_nodes())
+                    wk = len(profile.get_worker_nodes())
+                    st.markdown(f"**Nodes:** {cp} control-plane + {wk} worker")
+                    st.markdown(f"**CRI-O:** {profile.crio_version}")
+                    st.markdown(f"**CNI:** Flannel")
+            with col3:
+                if profile.description:
+                    st.markdown(f"**Description:** {profile.description}")
+
+            # Live cluster health check for imported clusters
+            if profile.cluster_source == "imported" and profile.kubeconfig_content:
+                if st.button(f"Check Health", key=f"health_{profile.name}"):
+                    with st.spinner("Checking cluster health..."):
+                        node_result = run_kubectl(profile, "get nodes --no-headers", timeout=10)
+                        if node_result.success and node_result.stdout.strip():
+                            lines = [l for l in node_result.stdout.strip().split("\n") if l.strip()]
+                            total_nodes = len(lines)
+                            ready_nodes = sum(1 for l in lines if "Ready" in l.split()[1] if len(l.split()) > 1)
+                            not_ready = total_nodes - ready_nodes
+                            hcol1, hcol2, hcol3 = st.columns(3)
+                            hcol1.metric("Nodes", total_nodes)
+                            hcol2.metric("Ready", ready_nodes)
+                            hcol3.metric("Not Ready", not_ready)
+                            if not_ready > 0:
+                                st.warning(f"{not_ready} node(s) are not Ready.")
+                            else:
+                                st.success("All nodes are Ready.")
+                            # Pod summary
+                            pod_result = run_kubectl(profile, "get pods -A --no-headers", timeout=15)
+                            if pod_result.success and pod_result.stdout.strip():
+                                pod_lines = [l for l in pod_result.stdout.strip().split("\n") if l.strip()]
+                                total_pods = len(pod_lines)
+                                running_pods = sum(1 for l in pod_lines if "Running" in l)
+                                failed_pods = sum(1 for l in pod_lines if any(s in l for s in ["Error", "CrashLoopBackOff", "ImagePullBackOff"]))
+                                pcol1, pcol2, pcol3 = st.columns(3)
+                                pcol1.metric("Total Pods", total_pods)
+                                pcol2.metric("Running", running_pods)
+                                pcol3.metric("Failed/Error", failed_pods)
+                        elif node_result.success:
+                            st.info("Connected but no nodes found.")
+                        else:
+                            st.error(f"Could not connect: {node_result.stderr or 'kubectl failed'}")
+
+            # Quick actions
+            if profile.cluster_source == "imported" and profile.kubeconfig_content:
+                qcol1, qcol2, qcol3 = st.columns(3)
+                with qcol1:
+                    if st.button("View Nodes", key=f"qnodes_{profile.name}"):
+                        result = run_kubectl(profile, "get nodes -o wide", timeout=10)
+                        if result.success:
+                            st.code(result.stdout or "(no output)", language="text")
+                        else:
+                            st.error(result.stderr or "Failed")
+                with qcol2:
+                    if st.button("View Namespaces", key=f"qns_{profile.name}"):
+                        result = run_kubectl(profile, "get namespaces", timeout=10)
+                        if result.success:
+                            st.code(result.stdout or "(no output)", language="text")
+                        else:
+                            st.error(result.stderr or "Failed")
+                with qcol3:
+                    if st.button("Warning Events", key=f"qevents_{profile.name}"):
+                        result = run_kubectl(
+                            profile,
+                            "get events -A --field-selector type=Warning --sort-by=.lastTimestamp",
+                            timeout=15,
+                        )
+                        if result.success:
+                            st.code(result.stdout or "(no warning events)", language="text")
+                        else:
+                            st.error(result.stderr or "Failed")
+
+
+# ══════════════════════════════════════════════════════════════════════════
+#  PAGE: Certificate Manager
+# ══════════════════════════════════════════════════════════════════════════
+
+def page_certificate_manager():
+    st.markdown("## Certificate Manager")
+    st.markdown("View cluster certificate expiration dates, TLS secrets, and plan renewals.")
+
+    profile = _get_active_profile()
+    if not profile:
+        return
+
+    _show_profile_summary(profile)
+
+    tab_certs, tab_tls, tab_renew = st.tabs([
+        "Cluster Certificates",
+        "TLS Secrets",
+        "Renewal Guide",
+    ])
+
+    # ── Cluster Certificates (kubeadm) ────────────────────────────────────
+    with tab_certs:
+        st.markdown("### Cluster Certificates (kubeadm)")
+
+        if profile.cluster_source == "imported":
+            st.info(
+                "Certificate inspection via `kubeadm certs check-expiration` requires SSH access "
+                "to control-plane nodes. For imported clusters, use the **TLS Secrets** tab to view "
+                "TLS certificates stored in the cluster."
+            )
+            # Still try to get API server cert info
+            if st.button("Check API Server Certificate", key="api_cert_check"):
+                with st.spinner("Checking API server certificate..."):
+                    cmd = (
+                        "get --raw /healthz -v=6 2>&1 || true"
+                    )
+                    result = run_kubectl(profile, "version --short", timeout=10)
+                    if result.success:
+                        st.success("API server is reachable and serving valid TLS.")
+                        st.code(result.stdout, language="text")
+                    else:
+                        if "certificate" in (result.stderr or "").lower():
+                            st.error("Certificate issue detected:")
+                            st.code(result.stderr, language="text")
+                        else:
+                            st.warning(f"Could not check: {result.stderr}")
+        else:
+            cp_nodes = profile.get_control_plane_nodes()
+            if not cp_nodes:
+                st.warning("No control-plane nodes defined.")
+            else:
+                st.markdown(
+                    "Runs `kubeadm certs check-expiration` on control-plane nodes via SSH "
+                    "to show certificate validity and expiration dates."
+                )
+                if st.button("Check Certificate Expiration", type="primary", key="check_certs"):
+                    for node in cp_nodes:
+                        node_label = f"{node.get('hostname', node.get('ip_address', '?'))} ({node.get('ip_address', '')})"
+                        with st.expander(f"Node: {node_label}", expanded=True):
+                            with st.spinner(f"Checking certificates on {node_label}..."):
+                                result = run_ssh_command(
+                                    ip_address=node["ip_address"],
+                                    command="sudo kubeadm certs check-expiration 2>/dev/null || echo 'kubeadm certs command not available'",
+                                    ssh_user=node.get("ssh_user", "root"),
+                                    ssh_port=node.get("ssh_port", 22),
+                                    ssh_key_path=node.get("ssh_key_path", "~/.ssh/id_rsa"),
+                                    timeout=30,
+                                )
+                                if result.success and result.stdout.strip():
+                                    st.code(result.stdout, language="text")
+                                    # Parse for expiring soon
+                                    if "RESIDUAL TIME" in result.stdout:
+                                        for line in result.stdout.split("\n"):
+                                            if any(warn in line.lower() for warn in ["invalid", "expired"]):
+                                                st.error(f"Certificate issue: {line.strip()}")
+                                else:
+                                    st.error(f"Failed: {result.stderr or 'No output'}")
+
+    # ── TLS Secrets ───────────────────────────────────────────────────────
+    with tab_tls:
+        st.markdown("### TLS Secrets")
+        st.markdown("View Kubernetes TLS secrets and their certificate details.")
+
+        if st.button("Load TLS Secrets", type="primary", key="load_tls"):
+            cmd = "get secrets -A -o json"
+            with st.spinner("Fetching secrets..."):
+                result = run_kubectl(profile, cmd, timeout=20)
+            if result.success and result.stdout.strip():
+                try:
+                    import pandas as pd
+                    secrets_json = json.loads(result.stdout)
+                    tls_secrets = []
+                    for secret in secrets_json.get("items", []):
+                        if secret.get("type") == "kubernetes.io/tls":
+                            meta = secret.get("metadata", {})
+                            annotations = meta.get("annotations", {})
+                            tls_secrets.append({
+                                "Namespace": meta.get("namespace", "?"),
+                                "Name": meta.get("name", "?"),
+                                "Type": "kubernetes.io/tls",
+                                "Created": meta.get("creationTimestamp", "N/A"),
+                                "Issuer": annotations.get("cert-manager.io/issuer-name", annotations.get("cert-manager.io/cluster-issuer", "N/A")),
+                                "Has cert": "tls.crt" in secret.get("data", {}),
+                                "Has key": "tls.key" in secret.get("data", {}),
+                            })
+                    if tls_secrets:
+                        st.markdown(f"**Found {len(tls_secrets)} TLS secret(s)**")
+                        st.dataframe(pd.DataFrame(tls_secrets), use_container_width=True, hide_index=True)
+                    else:
+                        st.info("No TLS secrets found in the cluster.")
+                except (json.JSONDecodeError, KeyError) as e:
+                    st.error(f"Failed to parse secrets: {e}")
+            elif result.success:
+                st.info("No secrets found.")
+            else:
+                st.error("Failed to fetch secrets")
+                st.code(result.stderr, language="text")
+
+        # cert-manager status
+        st.markdown("---")
+        st.markdown("#### cert-manager Status")
+        if st.button("Check cert-manager", key="check_certmanager"):
+            with st.spinner("Checking cert-manager..."):
+                result = run_kubectl(profile, "get pods -n cert-manager --no-headers", timeout=10)
+                if result.success and result.stdout.strip():
+                    st.success("cert-manager is installed:")
+                    st.code(result.stdout, language="text")
+                    # Check certificates
+                    cert_result = run_kubectl(profile, "get certificates -A --no-headers", timeout=10)
+                    if cert_result.success and cert_result.stdout.strip():
+                        st.markdown("**Managed Certificates:**")
+                        st.code(cert_result.stdout, language="text")
+                elif result.success:
+                    st.info("cert-manager namespace exists but no pods found.")
+                else:
+                    st.info("cert-manager does not appear to be installed.")
+
+    # ── Renewal Guide ─────────────────────────────────────────────────────
+    with tab_renew:
+        st.markdown("### Certificate Renewal Guide")
+
+        st.markdown("""
+#### Automatic Renewal (kubeadm)
+
+kubeadm automatically renews certificates during `kubeadm upgrade`. For manual renewal:
+
+```bash
+# Renew all certificates
+sudo kubeadm certs renew all
+
+# Renew specific certificate
+sudo kubeadm certs renew apiserver
+sudo kubeadm certs renew apiserver-kubelet-client
+sudo kubeadm certs renew front-proxy-client
+sudo kubeadm certs renew etcd-server
+sudo kubeadm certs renew etcd-peer
+sudo kubeadm certs renew etcd-healthcheck-client
+
+# After renewal, restart control plane components
+sudo systemctl restart kubelet
+```
+
+#### Certificate Authority (CA) Rotation
+
+CA rotation is more complex and requires:
+1. Generate new CA certificate and key
+2. Distribute to all nodes
+3. Re-sign all component certificates
+4. Rolling restart of all components
+
+#### cert-manager Renewal
+
+If using cert-manager, certificates are automatically renewed before expiration.
+Check cert-manager logs for renewal status:
+
+```bash
+kubectl logs -n cert-manager deploy/cert-manager -f
+```
+
+#### Best Practices
+- Monitor certificate expiration dates regularly
+- Set up alerts for certificates expiring within 30 days
+- Keep kubeadm version aligned with cluster version for smooth renewals
+- Back up `/etc/kubernetes/pki/` before any certificate operations
+- Test renewal in a staging environment first
+        """)
+
+
+# ══════════════════════════════════════════════════════════════════════════
+#  PAGE: Cost Optimizer
+# ══════════════════════════════════════════════════════════════════════════
+
+def page_cost_optimizer():
+    st.markdown("## Cost Estimator / Resource Optimizer")
+    st.markdown("Analyze resource usage vs requests/limits and identify optimization opportunities.")
+
+    profile = _get_active_profile()
+    if not profile:
+        return
+
+    _show_profile_summary(profile)
+
+    tab_usage, tab_right_size, tab_idle = st.tabs([
+        "Resource Usage",
+        "Right-Sizing",
+        "Idle Resources",
+    ])
+
+    # ── Resource Usage ────────────────────────────────────────────────────
+    with tab_usage:
+        st.markdown("### Actual Resource Usage vs Requests")
+        st.markdown("Compare real CPU/memory usage (from metrics-server) against configured requests and limits.")
+
+        usage_sub = st.radio("View", ["Node Usage", "Pod Usage"], horizontal=True, key="usage_view")
+
+        if usage_sub == "Node Usage":
+            if st.button("Load Node Usage", type="primary", key="load_node_usage"):
+                with st.spinner("Fetching node metrics..."):
+                    result = run_kubectl(profile, "top nodes --no-headers", timeout=15)
+                if result.success and result.stdout.strip():
+                    import pandas as pd
+                    lines = [l for l in result.stdout.strip().split("\n") if l.strip()]
+                    node_usage = []
+                    for line in lines:
+                        parts = line.split()
+                        if len(parts) >= 5:
+                            node_usage.append({
+                                "Node": parts[0],
+                                "CPU (cores)": parts[1],
+                                "CPU %": parts[2],
+                                "Memory": parts[3],
+                                "Memory %": parts[4],
+                            })
+                    if node_usage:
+                        st.dataframe(pd.DataFrame(node_usage), use_container_width=True, hide_index=True)
+                        # Chart
+                        try:
+                            import plotly.graph_objects as go
+                            fig = go.Figure()
+                            names = [n["Node"] for n in node_usage]
+                            cpu_pcts = [int(n["CPU %"].replace("%", "")) for n in node_usage]
+                            mem_pcts = [int(n["Memory %"].replace("%", "")) for n in node_usage]
+                            fig.add_trace(go.Bar(name="CPU %", x=names, y=cpu_pcts, marker_color="#326CE5"))
+                            fig.add_trace(go.Bar(name="Memory %", x=names, y=mem_pcts, marker_color="#764ba2"))
+                            fig.update_layout(
+                                title="Node Resource Utilization",
+                                yaxis_title="Utilization %",
+                                barmode="group",
+                                height=400,
+                            )
+                            fig.add_hline(y=80, line_dash="dash", line_color="red", annotation_text="80% threshold")
+                            st.plotly_chart(fig, use_container_width=True)
+                        except ImportError:
+                            pass
+                    else:
+                        st.code(result.stdout, language="text")
+                elif result.success:
+                    st.info("No node metrics available. Is metrics-server installed?")
+                else:
+                    st.error("Failed to fetch node metrics. Ensure metrics-server is installed.")
+                    st.code(result.stderr, language="text")
+                    st.info("Install metrics-server via **Monitoring Setup** > **Metrics Components**.")
+
+        elif usage_sub == "Pod Usage":
+            pcol1, pcol2 = st.columns([2, 1])
+            with pcol1:
+                _co_namespaces: list[str] = []
+                if profile.cluster_source == "imported" and profile.kubeconfig_content:
+                    _co_namespaces = fetch_namespaces(profile.kubeconfig_content)
+                if _co_namespaces:
+                    pod_usage_ns = st.selectbox("Namespace", ["All Namespaces"] + _co_namespaces, key="pod_usage_ns")
+                else:
+                    pod_usage_ns = st.text_input("Namespace (blank = all)", value="", key="pod_usage_ns_text")
+                    if not pod_usage_ns:
+                        pod_usage_ns = "All Namespaces"
+
+            if st.button("Load Pod Usage", type="primary", key="load_pod_usage"):
+                ns_flag = "-A" if pod_usage_ns == "All Namespaces" else f"-n {pod_usage_ns}"
+                with st.spinner("Fetching pod metrics..."):
+                    result = run_kubectl(profile, f"top pods {ns_flag} --no-headers", timeout=20)
+                if result.success and result.stdout.strip():
+                    import pandas as pd
+                    lines = [l for l in result.stdout.strip().split("\n") if l.strip()]
+                    pod_usage = []
+                    for line in lines:
+                        parts = line.split()
+                        if pod_usage_ns == "All Namespaces" and len(parts) >= 4:
+                            pod_usage.append({
+                                "Namespace": parts[0],
+                                "Pod": parts[1],
+                                "CPU": parts[2],
+                                "Memory": parts[3],
+                            })
+                        elif len(parts) >= 3:
+                            pod_usage.append({
+                                "Pod": parts[0],
+                                "CPU": parts[1],
+                                "Memory": parts[2],
+                            })
+                    if pod_usage:
+                        df = pd.DataFrame(pod_usage)
+                        st.dataframe(df, use_container_width=True, hide_index=True)
+                        st.markdown(f"**Total pods:** {len(df)}")
+                elif result.success:
+                    st.info("No pod metrics available.")
+                else:
+                    st.error("Failed to fetch pod metrics.")
+                    st.code(result.stderr, language="text")
+
+    # ── Right-Sizing ──────────────────────────────────────────────────────
+    with tab_right_size:
+        st.markdown("### Right-Sizing Recommendations")
+        st.markdown(
+            "Compare actual pod usage against configured requests/limits to find "
+            "over-provisioned or under-provisioned workloads."
+        )
+
+        rs_col1, rs_col2 = st.columns([2, 1])
+        with rs_col1:
+            _rs_namespaces: list[str] = []
+            if profile.cluster_source == "imported" and profile.kubeconfig_content:
+                _rs_namespaces = fetch_namespaces(profile.kubeconfig_content)
+            if _rs_namespaces:
+                rs_ns = st.selectbox("Namespace", _rs_namespaces, key="rs_ns")
+            else:
+                rs_ns = st.text_input("Namespace", value="default", key="rs_ns_text")
+
+        if st.button("Analyze Right-Sizing", type="primary", key="analyze_rs"):
+            if not rs_ns:
+                st.warning("Please specify a namespace.")
+            else:
+                with st.spinner("Fetching usage and resource specs..."):
+                    # Get actual usage
+                    usage_result = run_kubectl(
+                        profile,
+                        f"top pods -n {rs_ns} --no-headers --containers",
+                        timeout=20,
+                    )
+                    # Get resource specs
+                    spec_result = run_kubectl(
+                        profile,
+                        f"get pods -n {rs_ns} -o json",
+                        timeout=20,
+                    )
+
+                if usage_result.success and spec_result.success:
+                    try:
+                        import pandas as pd
+                        # Parse usage: POD CONTAINER CPU MEM
+                        usage_map = {}
+                        for line in (usage_result.stdout or "").strip().split("\n"):
+                            parts = line.split()
+                            if len(parts) >= 4:
+                                key = f"{parts[0]}/{parts[1]}"
+                                usage_map[key] = {"cpu_usage": parts[2], "mem_usage": parts[3]}
+
+                        # Parse specs
+                        pods_json = json.loads(spec_result.stdout)
+                        rows = []
+                        for pod in pods_json.get("items", []):
+                            pod_name = pod.get("metadata", {}).get("name", "?")
+                            for container in pod.get("spec", {}).get("containers", []):
+                                c_name = container.get("name", "?")
+                                res = container.get("resources", {})
+                                req_cpu = res.get("requests", {}).get("cpu", "none")
+                                req_mem = res.get("requests", {}).get("memory", "none")
+                                lim_cpu = res.get("limits", {}).get("cpu", "none")
+                                lim_mem = res.get("limits", {}).get("memory", "none")
+                                key = f"{pod_name}/{c_name}"
+                                usage = usage_map.get(key, {})
+                                rows.append({
+                                    "Pod": pod_name,
+                                    "Container": c_name,
+                                    "CPU Usage": usage.get("cpu_usage", "N/A"),
+                                    "CPU Request": req_cpu,
+                                    "CPU Limit": lim_cpu,
+                                    "Mem Usage": usage.get("mem_usage", "N/A"),
+                                    "Mem Request": req_mem,
+                                    "Mem Limit": lim_mem,
+                                })
+                        if rows:
+                            df = pd.DataFrame(rows)
+                            st.dataframe(df, use_container_width=True, hide_index=True)
+
+                            # Recommendations
+                            no_req_cpu = sum(1 for r in rows if r["CPU Request"] == "none")
+                            no_req_mem = sum(1 for r in rows if r["Mem Request"] == "none")
+                            no_lim_cpu = sum(1 for r in rows if r["CPU Limit"] == "none")
+                            no_lim_mem = sum(1 for r in rows if r["Mem Limit"] == "none")
+
+                            st.markdown("---")
+                            st.markdown("#### Recommendations")
+                            if no_req_cpu > 0:
+                                st.warning(f"{no_req_cpu} container(s) have **no CPU request** — scheduler cannot make optimal placement decisions.")
+                            if no_req_mem > 0:
+                                st.warning(f"{no_req_mem} container(s) have **no memory request** — pods may be evicted under pressure.")
+                            if no_lim_cpu > 0:
+                                st.info(f"{no_lim_cpu} container(s) have **no CPU limit** — they can consume all available CPU on the node.")
+                            if no_lim_mem > 0:
+                                st.warning(f"{no_lim_mem} container(s) have **no memory limit** — they may be OOMKilled or cause node instability.")
+                            if no_req_cpu == 0 and no_req_mem == 0 and no_lim_cpu == 0 and no_lim_mem == 0:
+                                st.success("All containers have CPU and memory requests and limits set.")
+                        else:
+                            st.info("No containers found in this namespace.")
+                    except (json.JSONDecodeError, KeyError) as e:
+                        st.error(f"Failed to parse data: {e}")
+                else:
+                    if not usage_result.success:
+                        st.error("Failed to fetch pod usage metrics. Is metrics-server installed?")
+                        st.code(usage_result.stderr, language="text")
+                    if not spec_result.success:
+                        st.error("Failed to fetch pod specs.")
+                        st.code(spec_result.stderr, language="text")
+
+    # ── Idle Resources ────────────────────────────────────────────────────
+    with tab_idle:
+        st.markdown("### Idle / Unused Resources")
+        st.markdown("Find resources that may be wasting cluster capacity.")
+
+        idle_checks = st.multiselect(
+            "Check for",
+            [
+                "Completed/Failed Jobs",
+                "Deployments scaled to 0",
+                "Orphaned ConfigMaps",
+                "Unbound PVCs",
+                "Empty Namespaces",
+            ],
+            default=["Completed/Failed Jobs", "Deployments scaled to 0", "Unbound PVCs"],
+            key="idle_checks",
+        )
+
+        if st.button("Scan for Idle Resources", type="primary", key="scan_idle"):
+            findings = []
+
+            if "Completed/Failed Jobs" in idle_checks:
+                with st.spinner("Checking completed/failed jobs..."):
+                    result = run_kubectl(profile, "get jobs -A -o json", timeout=15)
+                if result.success and result.stdout.strip():
+                    try:
+                        jobs = json.loads(result.stdout).get("items", [])
+                        old_jobs = []
+                        for job in jobs:
+                            status = job.get("status", {})
+                            conditions = status.get("conditions", [])
+                            for cond in conditions:
+                                if cond.get("type") in ("Complete", "Failed") and cond.get("status") == "True":
+                                    meta = job.get("metadata", {})
+                                    old_jobs.append(f"  - {meta.get('namespace', '?')}/{meta.get('name', '?')} ({cond['type']})")
+                        if old_jobs:
+                            findings.append(("warning", f"**{len(old_jobs)} completed/failed job(s)** can be cleaned up:\n" + "\n".join(old_jobs[:20])))
+                        else:
+                            findings.append(("success", "No completed/failed jobs found."))
+                    except (json.JSONDecodeError, KeyError):
+                        findings.append(("error", "Failed to parse jobs data."))
+
+            if "Deployments scaled to 0" in idle_checks:
+                with st.spinner("Checking zero-replica deployments..."):
+                    result = run_kubectl(profile, "get deployments -A -o json", timeout=15)
+                if result.success and result.stdout.strip():
+                    try:
+                        deploys = json.loads(result.stdout).get("items", [])
+                        zero_deploys = []
+                        for dep in deploys:
+                            replicas = dep.get("spec", {}).get("replicas", 1)
+                            if replicas == 0:
+                                meta = dep.get("metadata", {})
+                                zero_deploys.append(f"  - {meta.get('namespace', '?')}/{meta.get('name', '?')}")
+                        if zero_deploys:
+                            findings.append(("warning", f"**{len(zero_deploys)} deployment(s) scaled to 0 replicas:**\n" + "\n".join(zero_deploys[:20])))
+                        else:
+                            findings.append(("success", "No zero-replica deployments found."))
+                    except (json.JSONDecodeError, KeyError):
+                        findings.append(("error", "Failed to parse deployment data."))
+
+            if "Unbound PVCs" in idle_checks:
+                with st.spinner("Checking unbound PVCs..."):
+                    result = run_kubectl(profile, "get pvc -A --no-headers", timeout=15)
+                if result.success and result.stdout.strip():
+                    lines = [l for l in result.stdout.strip().split("\n") if l.strip()]
+                    pending_pvcs = [l for l in lines if "Pending" in l]
+                    if pending_pvcs:
+                        findings.append(("warning", f"**{len(pending_pvcs)} PVC(s) in Pending state** (not bound to a PV):\n```\n" + "\n".join(pending_pvcs[:10]) + "\n```"))
+                    else:
+                        findings.append(("success", "All PVCs are bound."))
+                elif result.success:
+                    findings.append(("info", "No PVCs found."))
+
+            if "Empty Namespaces" in idle_checks:
+                with st.spinner("Checking empty namespaces..."):
+                    ns_result = run_kubectl(profile, "get namespaces --no-headers", timeout=10)
+                if ns_result.success and ns_result.stdout.strip():
+                    ns_lines = [l.split()[0] for l in ns_result.stdout.strip().split("\n") if l.strip()]
+                    system_ns = {"kube-system", "kube-public", "kube-node-lease", "default"}
+                    empty_ns = []
+                    for ns in ns_lines:
+                        if ns in system_ns:
+                            continue
+                        pod_r = run_kubectl(profile, f"get pods -n {ns} --no-headers", timeout=10)
+                        if pod_r.success and not pod_r.stdout.strip():
+                            empty_ns.append(ns)
+                    if empty_ns:
+                        findings.append(("info", f"**{len(empty_ns)} namespace(s) with no pods:**\n  - " + "\n  - ".join(empty_ns[:15])))
+                    else:
+                        findings.append(("success", "No empty non-system namespaces found."))
+
+            if "Orphaned ConfigMaps" in idle_checks:
+                findings.append(("info", "Orphaned ConfigMap detection requires cross-referencing all pod specs — use the **Resource Viewer** to manually inspect ConfigMaps per namespace."))
+
+            # Display findings
+            st.markdown("---")
+            st.markdown("#### Findings")
+            for level, msg in findings:
+                if level == "warning":
+                    st.warning(msg)
+                elif level == "error":
+                    st.error(msg)
+                elif level == "success":
+                    st.success(msg)
+                else:
+                    st.info(msg)
+
+
 # ── Helper functions ──────────────────────────────────────────────────────
 
 def _get_active_profile() -> ClusterProfile | None:
@@ -3422,7 +4497,9 @@ def _show_profile_summary(profile: ClusterProfile):
 def main():
     page = render_sidebar()
 
-    if page == "Profile Manager":
+    if page == "Multi-Cluster Dashboard":
+        page_multi_cluster_dashboard()
+    elif page == "Profile Manager":
         page_profile_manager()
     elif page == "Cluster Creation":
         page_cluster_creation()
@@ -3436,6 +4513,10 @@ def main():
         page_log_analysis()
     elif page == "Upgrade Planner":
         page_upgrade_planner()
+    elif page == "Certificate Manager":
+        page_certificate_manager()
+    elif page == "Cost Optimizer":
+        page_cost_optimizer()
     elif page == "AI Assistant":
         page_ai_assistant()
 

From 2df2413ebfaaf7e18deadb6b5b795e8c3cae391e Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Tue, 7 Apr 2026 07:28:18 +0000
Subject: [PATCH 21/31] Add Smart Log Analysis (LogAI-inspired): clustering,
 anomaly detection, pattern mining, summarization

---
 k8s-agent/app.py                  | 260 +++++++++++++++++++-
 k8s-agent/modules/log_analyzer.py | 383 ++++++++++++++++++++++++++++++
 k8s-agent/requirements.txt        |   3 +
 3 files changed, 645 insertions(+), 1 deletion(-)

diff --git a/k8s-agent/app.py b/k8s-agent/app.py
index aadb5ba..85e3874 100644
--- a/k8s-agent/app.py
+++ b/k8s-agent/app.py
@@ -80,6 +80,11 @@
     llm_analyze_logs,
     llm_correlate_analysis,
     get_pod_list,
+    smart_analyze,
+    cluster_logs,
+    detect_anomalies,
+    mine_log_patterns,
+    summarize_logs,
 )
 from modules.llm_client import query_llm, stream_llm
 
@@ -1575,10 +1580,11 @@ def page_log_analysis():
 
     available_log_sources = get_available_log_sources(profile)
 
-    tab_system, tab_pod, tab_correlation, tab_ai = st.tabs([
+    tab_system, tab_pod, tab_correlation, tab_smart, tab_ai = st.tabs([
         "System Logs",
         "Pod Logs",
         "Error Correlation",
+        "Smart Log Analysis",
         "AI Log Analysis",
     ])
 
@@ -1731,6 +1737,258 @@ def page_log_analysis():
                         analysis = llm_correlate_analysis(multi_logs)
                         st.markdown(analysis)
 
+    # ── Smart Log Analysis (LogAI-inspired) ─────────────────────────────
+    with tab_smart:
+        st.markdown("### Smart Log Analysis (LogAI-inspired)")
+        st.markdown(
+            "ML-powered log analysis using techniques from "
+            "[Salesforce LogAI](https://github.com/salesforce/logai): "
+            "**log clustering** (TF-IDF + DBSCAN), **anomaly detection**, "
+            "**pattern mining** (Drain-style), and **auto-summarization**."
+        )
+
+        smart_mode = st.radio(
+            "Analysis mode",
+            ["Collect from cluster", "Paste logs"],
+            horizontal=True,
+            key="smart_mode",
+        )
+
+        smart_log_text = ""
+
+        if smart_mode == "Collect from cluster":
+            scol1, scol2, scol3 = st.columns(3)
+            with scol1:
+                smart_source = st.selectbox(
+                    "Log Source", available_log_sources, key="smart_source",
+                )
+            with scol2:
+                smart_lines = st.number_input(
+                    "Lines to fetch", min_value=100, max_value=5000, value=500, key="smart_lines",
+                )
+            with scol3:
+                smart_since_opts = {
+                    "Last 15 min": ("15 minutes ago", "15m"),
+                    "Last 1 hour": ("1 hour ago", "1h"),
+                    "Last 6 hours": ("6 hours ago", "6h"),
+                    "Last 24 hours": ("24 hours ago", "24h"),
+                }
+                smart_since_label = st.selectbox(
+                    "Time Range", list(smart_since_opts.keys()), index=1, key="smart_since",
+                )
+                smart_since, smart_since_k8s = smart_since_opts[smart_since_label]
+
+            if st.button("Collect & Analyze", type="primary", key="smart_collect"):
+                with st.spinner(f"Collecting {smart_source} logs..."):
+                    result = collect_logs(
+                        cp_node, smart_source, smart_lines, smart_since, smart_since_k8s, profile=profile,
+                    )
+                if result.success and result.stdout.strip():
+                    smart_log_text = result.stdout
+                    st.session_state["_smart_log_text"] = smart_log_text
+                    st.session_state["_smart_source"] = smart_source
+                elif result.success:
+                    st.info("No logs returned for the selected source and time range.")
+                else:
+                    st.error(f"Failed to collect logs: {result.stderr}")
+
+            # Persist across reruns
+            if "_smart_log_text" in st.session_state and not smart_log_text:
+                smart_log_text = st.session_state["_smart_log_text"]
+
+        else:
+            smart_log_text = st.text_area(
+                "Paste log output",
+                height=200,
+                placeholder="Paste your Kubernetes logs here for smart analysis...",
+                key="smart_paste",
+            )
+            if smart_log_text:
+                st.session_state["_smart_log_text"] = smart_log_text
+                st.session_state["_smart_source"] = "pasted"
+
+        # Run analysis if we have log text
+        if smart_log_text:
+            src_label = st.session_state.get("_smart_source", "")
+            with st.spinner("Running LogAI-inspired analysis pipeline..."):
+                sa_result = smart_analyze(smart_log_text, source=src_label)
+
+            # ── Summary / Health Score ────────────────────────────────
+            st.markdown("---")
+            st.markdown("#### Log Summary & Health Score")
+            summary = sa_result.summary
+            health = summary.get("health_score", 100)
+            health_color = "green" if health >= 80 else ("orange" if health >= 50 else "red")
+            scol1, scol2, scol3, scol4, scol5 = st.columns(5)
+            scol1.metric("Total Lines", summary.get("total_lines", 0))
+            scol2.metric("Errors", summary.get("error_count", 0))
+            scol3.metric("Warnings", summary.get("warning_count", 0))
+            scol4.metric("Unique Templates", summary.get("unique_templates", 0))
+            scol5.metric("Health Score", f"{health}/100")
+
+            if health < 50:
+                st.error(f"Health score is **{health}/100** — significant issues detected in logs.")
+            elif health < 80:
+                st.warning(f"Health score is **{health}/100** — some issues detected.")
+            else:
+                st.success(f"Health score is **{health}/100** — logs look healthy.")
+
+            st.markdown(
+                f"**Time span:** {summary.get('first_timestamp', 'N/A')} → {summary.get('last_timestamp', 'N/A')} | "
+                f"**Template diversity:** {summary.get('template_diversity', 0)}%"
+            )
+
+            # Top errors
+            top_errors = summary.get("top_errors", [])
+            if top_errors:
+                with st.expander(f"Top {len(top_errors)} Error Patterns", expanded=True):
+                    for pattern, count in top_errors:
+                        st.markdown(f"- **x{count}** — `{pattern[:200]}`")
+
+            # ── Log Clustering ────────────────────────────────────────
+            st.markdown("---")
+            st.markdown("#### Log Clustering (TF-IDF + DBSCAN)")
+            st.markdown(
+                "Groups similar log messages together to reduce noise and highlight distinct message types. "
+                "Uses TF-IDF vectorization and DBSCAN density-based clustering."
+            )
+            if sa_result.clusters:
+                import pandas as pd
+                cluster_data = []
+                for c in sa_result.clusters:
+                    label = f"Cluster {c.cluster_id}" if c.cluster_id >= 0 else "Noise (unique)"
+                    cluster_data.append({
+                        "Cluster": label,
+                        "Count": c.count,
+                        "Level": c.level,
+                        "Template": c.template[:120],
+                        "First Seen": c.first_seen or "N/A",
+                        "Last Seen": c.last_seen or "N/A",
+                    })
+                df_clusters = pd.DataFrame(cluster_data)
+                st.dataframe(df_clusters, use_container_width=True, hide_index=True)
+
+                # Cluster distribution chart
+                try:
+                    import plotly.express as px
+                    fig = px.pie(
+                        df_clusters, names="Cluster", values="Count",
+                        title="Log Message Distribution by Cluster",
+                        color_discrete_sequence=px.colors.qualitative.Set3,
+                    )
+                    fig.update_layout(height=400)
+                    st.plotly_chart(fig, use_container_width=True)
+                except ImportError:
+                    pass
+
+                # Show sample messages per cluster
+                error_clusters = [c for c in sa_result.clusters if c.level == "ERROR"]
+                if error_clusters:
+                    with st.expander(f"Error Clusters ({len(error_clusters)})", expanded=True):
+                        for c in error_clusters:
+                            label = f"Cluster {c.cluster_id}" if c.cluster_id >= 0 else "Noise"
+                            st.markdown(f"**{label}** — {c.count} messages")
+                            for sample in c.sample_messages[:2]:
+                                st.code(sample, language="text")
+            else:
+                st.info("Not enough log lines for clustering (need 3+ lines).")
+
+            # ── Anomaly Detection ─────────────────────────────────────
+            st.markdown("---")
+            st.markdown("#### Anomaly Detection")
+            st.markdown(
+                "Detects unusual log lines using TF-IDF distance from centroid (outlier scoring) "
+                "and frequency-based rare template detection."
+            )
+            if sa_result.anomalies:
+                st.markdown(f"**{len(sa_result.anomalies)} anomalous log line(s) detected**")
+                import pandas as pd
+                anomaly_data = []
+                for a in sa_result.anomalies[:30]:
+                    anomaly_data.append({
+                        "Score": round(a.score, 2),
+                        "Reason": a.reason,
+                        "Timestamp": a.timestamp or "N/A",
+                        "Message": a.message[:150],
+                    })
+                df_anomalies = pd.DataFrame(anomaly_data)
+                st.dataframe(df_anomalies, use_container_width=True, hide_index=True)
+
+                # Show full messages for top anomalies
+                with st.expander("Top Anomaly Details", expanded=False):
+                    for i, a in enumerate(sa_result.anomalies[:10]):
+                        st.markdown(f"**#{i+1}** (score: {a.score:.2f}) — {a.reason}")
+                        st.code(a.message, language="text")
+            else:
+                st.success("No anomalous log lines detected — all messages follow expected patterns.")
+
+            # ── Pattern Mining ────────────────────────────────────────
+            st.markdown("---")
+            st.markdown("#### Pattern Mining (Drain-style)")
+            st.markdown(
+                "Extracts frequent log templates by replacing variable tokens (IPs, IDs, numbers, paths) "
+                "with placeholders — similar to LogAI's Drain parser."
+            )
+            if sa_result.patterns:
+                import pandas as pd
+                pattern_data = []
+                for p in sa_result.patterns[:20]:
+                    pattern_data.append({
+                        "Template": p["template"][:120],
+                        "Count": p["count"],
+                        "% of Logs": p["percentage"],
+                        "Level": p["level"],
+                    })
+                df_patterns = pd.DataFrame(pattern_data)
+                st.dataframe(df_patterns, use_container_width=True, hide_index=True)
+
+                # Bar chart of top patterns
+                try:
+                    import plotly.express as px
+                    top_10 = sa_result.patterns[:10]
+                    fig = px.bar(
+                        x=[p["template"][:60] for p in top_10],
+                        y=[p["count"] for p in top_10],
+                        labels={"x": "Template", "y": "Count"},
+                        title="Top 10 Log Templates",
+                        color=[p["level"] for p in top_10],
+                        color_discrete_map={"ERROR": "#FF4B4B", "WARNING": "#FFA500", "INFO": "#326CE5"},
+                    )
+                    fig.update_layout(height=400, xaxis_tickangle=-45)
+                    st.plotly_chart(fig, use_container_width=True)
+                except ImportError:
+                    pass
+            else:
+                st.info("No patterns extracted.")
+
+            # ── Timeline ──────────────────────────────────────────────
+            if sa_result.timeline_buckets and len(sa_result.timeline_buckets) > 1:
+                st.markdown("---")
+                st.markdown("#### Log Volume Timeline")
+                try:
+                    import plotly.graph_objects as go
+                    import pandas as pd
+                    ts_labels = [b["timestamp"] for b in sa_result.timeline_buckets if b["timestamp"] != "unknown"]
+                    ts_totals = [b["total"] for b in sa_result.timeline_buckets if b["timestamp"] != "unknown"]
+                    ts_errors = [b["errors"] for b in sa_result.timeline_buckets if b["timestamp"] != "unknown"]
+                    ts_warnings = [b["warnings"] for b in sa_result.timeline_buckets if b["timestamp"] != "unknown"]
+
+                    if ts_labels:
+                        fig = go.Figure()
+                        fig.add_trace(go.Scatter(x=ts_labels, y=ts_totals, name="Total", mode="lines+markers", line=dict(color="#326CE5")))
+                        fig.add_trace(go.Bar(x=ts_labels, y=ts_errors, name="Errors", marker_color="#FF4B4B"))
+                        fig.add_trace(go.Bar(x=ts_labels, y=ts_warnings, name="Warnings", marker_color="#FFA500"))
+                        fig.update_layout(
+                            title="Log Volume Over Time",
+                            yaxis_title="Count",
+                            xaxis_title="Time",
+                            barmode="stack",
+                            height=400,
+                        )
+                        st.plotly_chart(fig, use_container_width=True)
+                except ImportError:
+                    pass
+
     # ── AI Log Analysis ───────────────────────────────────────────────────
     with tab_ai:
         st.markdown("### AI-Powered Log Analysis")
diff --git a/k8s-agent/modules/log_analyzer.py b/k8s-agent/modules/log_analyzer.py
index 917bc3e..3f6c6df 100644
--- a/k8s-agent/modules/log_analyzer.py
+++ b/k8s-agent/modules/log_analyzer.py
@@ -453,3 +453,386 @@ def get_pod_list(
     ns_flag = f"-n {namespace}" if namespace else "-A"
     command = f"kubectl get pods {ns_flag} -o custom-columns='NAMESPACE:.metadata.namespace,NAME:.metadata.name,STATUS:.status.phase,CONTAINERS:.spec.containers[*].name' --no-headers"
     return _run_on_cluster(control_plane_node, command, profile=profile, timeout=30)
+
+
+# ══════════════════════════════════════════════════════════════════════════
+#  LogAI-inspired Smart Log Analysis
+#  Provides: Log Clustering, Anomaly Detection, Pattern Mining, Summarization
+#  Uses scikit-learn (TF-IDF + DBSCAN) instead of LogAI directly due to
+#  Python 3.12 compatibility issues with the logai package.
+# ══════════════════════════════════════════════════════════════════════════
+
+@dataclass
+class LogCluster:
+    """A cluster of similar log messages."""
+    cluster_id: int
+    template: str
+    count: int
+    level: str  # predominant level: ERROR, WARNING, INFO
+    sample_messages: list[str] = field(default_factory=list)
+    first_seen: str = ""
+    last_seen: str = ""
+
+
+@dataclass
+class LogAnomaly:
+    """An anomalous log line or pattern."""
+    message: str
+    score: float  # anomaly score (higher = more anomalous)
+    reason: str
+    timestamp: str = ""
+    source: str = ""
+
+
+@dataclass
+class SmartAnalysisResult:
+    """Full result from smart log analysis."""
+    total_lines: int = 0
+    clusters: list[LogCluster] = field(default_factory=list)
+    anomalies: list[LogAnomaly] = field(default_factory=list)
+    patterns: list[dict] = field(default_factory=list)
+    summary: dict = field(default_factory=dict)
+    timeline_buckets: list[dict] = field(default_factory=list)
+
+
+def _tokenize_log(message: str) -> str:
+    """Tokenize a log message by replacing variable parts with placeholders.
+
+    This mimics LogAI's Drain-style log parsing — variable tokens (IPs,
+    hex IDs, numbers, paths, UUIDs) are replaced so that messages with the
+    same *template* look identical after tokenization.
+    """
+    # Remove leading timestamp (various formats)
+    msg = re.sub(r"^\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}:\d{2}[^\s]*\s*", "", message)
+    msg = re.sub(r"^[A-Z][a-z]{2}\s+\d{1,2}\s+\d{2}:\d{2}:\d{2}\s*", "", msg)
+    # Replace UUIDs
+    msg = re.sub(r"[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}", "<UUID>", msg, flags=re.IGNORECASE)
+    # Replace hex IDs (8+ chars)
+    msg = re.sub(r"\b[0-9a-f]{8,}\b", "<HEX>", msg)
+    # Replace IPs
+    msg = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}(:\d+)?", "<IP>", msg)
+    # Replace pure numbers
+    msg = re.sub(r"\b\d+\b", "<NUM>", msg)
+    # Replace file paths
+    msg = re.sub(r"/[\w./-]+", "<PATH>", msg)
+    # Replace pod/container names with common suffixes
+    msg = re.sub(r"\b[\w]+-[0-9a-f]{5,10}\b", "<POD>", msg)
+    return msg.strip()
+
+
+def cluster_logs(log_text: str, source: str = "", max_clusters: int = 50, eps: float = 0.5) -> list[LogCluster]:
+    """Cluster log messages using TF-IDF vectorization + DBSCAN.
+
+    Inspired by LogAI's log clustering pipeline:
+    1. Parse each log line
+    2. Tokenize to extract log templates
+    3. Vectorize with TF-IDF
+    4. Cluster with DBSCAN (density-based — no need to specify k)
+    5. Return clusters sorted by size
+    """
+    try:
+        from sklearn.feature_extraction.text import TfidfVectorizer
+        from sklearn.cluster import DBSCAN
+        import numpy as np
+    except ImportError:
+        return []
+
+    lines = [l.strip() for l in log_text.strip().split("\n") if l.strip()]
+    if len(lines) < 3:
+        return []
+
+    # Parse and tokenize
+    entries = [parse_log_line(l, source) for l in lines]
+    tokenized = [_tokenize_log(e.message) for e in entries]
+
+    # Filter out empty tokenized lines
+    valid_indices = [i for i, t in enumerate(tokenized) if t.strip()]
+    if len(valid_indices) < 3:
+        return []
+
+    valid_tokenized = [tokenized[i] for i in valid_indices]
+    valid_entries = [entries[i] for i in valid_indices]
+
+    # TF-IDF vectorization
+    try:
+        vectorizer = TfidfVectorizer(max_features=1000, stop_words=None, token_pattern=r"(?u)\b\w+\b")
+        tfidf_matrix = vectorizer.fit_transform(valid_tokenized)
+    except ValueError:
+        return []
+
+    # DBSCAN clustering
+    clustering = DBSCAN(eps=eps, min_samples=2, metric="cosine")
+    labels = clustering.fit_predict(tfidf_matrix)
+
+    # Build clusters
+    cluster_map: dict[int, list[int]] = {}
+    for idx, label in enumerate(labels):
+        cluster_map.setdefault(label, []).append(idx)
+
+    result_clusters = []
+    for cluster_id, member_indices in sorted(cluster_map.items(), key=lambda x: -len(x[1])):
+        members = [valid_entries[i] for i in member_indices]
+        levels = [m.level for m in members]
+        level_counter = Counter(levels)
+        predominant_level = level_counter.most_common(1)[0][0]
+
+        # Use the most common tokenized form as the template
+        templates = [valid_tokenized[i] for i in member_indices]
+        template = Counter(templates).most_common(1)[0][0]
+
+        # Timestamps
+        timestamps = [m.timestamp for m in members if m.timestamp]
+        first_seen = min(timestamps) if timestamps else ""
+        last_seen = max(timestamps) if timestamps else ""
+
+        samples = [members[i].raw for i in range(min(3, len(members)))]
+
+        label_str = "noise" if cluster_id == -1 else str(cluster_id)
+        result_clusters.append(LogCluster(
+            cluster_id=cluster_id,
+            template=template if cluster_id != -1 else "(unclustered / unique messages)",
+            count=len(members),
+            level=predominant_level,
+            sample_messages=samples,
+            first_seen=first_seen,
+            last_seen=last_seen,
+        ))
+
+    # Sort by count descending, but put noise cluster (-1) last
+    result_clusters.sort(key=lambda c: (c.cluster_id == -1, -c.count))
+    return result_clusters[:max_clusters]
+
+
+def detect_anomalies(log_text: str, source: str = "", threshold: float = 2.0) -> list[LogAnomaly]:
+    """Detect anomalous log lines using frequency-based and TF-IDF outlier detection.
+
+    Inspired by LogAI's anomaly detection pipeline:
+    1. Tokenize messages to get templates
+    2. Count template frequencies
+    3. Rare templates (below frequency threshold) are flagged
+    4. Additionally, use TF-IDF distance from centroid for outlier scoring
+    """
+    try:
+        from sklearn.feature_extraction.text import TfidfVectorizer
+        import numpy as np
+    except ImportError:
+        return []
+
+    lines = [l.strip() for l in log_text.strip().split("\n") if l.strip()]
+    if len(lines) < 5:
+        return []
+
+    entries = [parse_log_line(l, source) for l in lines]
+    tokenized = [_tokenize_log(e.message) for e in entries]
+
+    # Frequency-based anomaly detection
+    template_counts = Counter(tokenized)
+    total = len(tokenized)
+    freq_threshold = max(1, total * 0.01)  # templates appearing in < 1% of lines
+
+    anomalies = []
+
+    # TF-IDF outlier detection
+    try:
+        vectorizer = TfidfVectorizer(max_features=500, token_pattern=r"(?u)\b\w+\b")
+        tfidf_matrix = vectorizer.fit_transform(tokenized)
+        centroid = tfidf_matrix.mean(axis=0)
+        centroid = np.asarray(centroid).flatten()
+
+        distances = []
+        for i in range(tfidf_matrix.shape[0]):
+            vec = np.asarray(tfidf_matrix[i].todense()).flatten()
+            dist = np.linalg.norm(vec - centroid)
+            distances.append(dist)
+
+        distances = np.array(distances)
+        mean_dist = distances.mean()
+        std_dist = distances.std() if distances.std() > 0 else 1.0
+
+        for i, (entry, dist) in enumerate(zip(entries, distances)):
+            z_score = (dist - mean_dist) / std_dist
+            reasons = []
+
+            # TF-IDF outlier
+            if z_score > threshold:
+                reasons.append(f"TF-IDF outlier (z-score: {z_score:.2f})")
+
+            # Frequency anomaly
+            if template_counts[tokenized[i]] <= freq_threshold:
+                reasons.append(f"Rare template (seen {template_counts[tokenized[i]]}x out of {total})")
+
+            # Error/critical level
+            if entry.level == "ERROR":
+                reasons.append("Error-level message")
+
+            if reasons:
+                anomalies.append(LogAnomaly(
+                    message=entry.raw,
+                    score=float(z_score),
+                    reason="; ".join(reasons),
+                    timestamp=entry.timestamp,
+                    source=source,
+                ))
+    except ValueError:
+        # Fallback to frequency-only if TF-IDF fails
+        for i, entry in enumerate(entries):
+            if template_counts[tokenized[i]] <= freq_threshold:
+                anomalies.append(LogAnomaly(
+                    message=entry.raw,
+                    score=1.0,
+                    reason=f"Rare template (seen {template_counts[tokenized[i]]}x out of {total})",
+                    timestamp=entry.timestamp,
+                    source=source,
+                ))
+
+    # Sort by score descending
+    anomalies.sort(key=lambda a: -a.score)
+    return anomalies[:100]  # cap at 100
+
+
+def mine_log_patterns(log_text: str, source: str = "", top_n: int = 30) -> list[dict]:
+    """Mine frequent log patterns/templates from log text.
+
+    Inspired by LogAI's Drain log parser — extracts common templates by
+    tokenizing variable parts and counting occurrences.
+    """
+    lines = [l.strip() for l in log_text.strip().split("\n") if l.strip()]
+    if not lines:
+        return []
+
+    entries = [parse_log_line(l, source) for l in lines]
+    tokenized = [_tokenize_log(e.message) for e in entries]
+
+    # Count templates
+    template_counts = Counter(tokenized)
+
+    # Group by template
+    template_levels: dict[str, Counter] = {}
+    template_samples: dict[str, str] = {}
+    for entry, template in zip(entries, tokenized):
+        if template not in template_levels:
+            template_levels[template] = Counter()
+            template_samples[template] = entry.raw
+        template_levels[template][entry.level] += 1
+
+    patterns = []
+    for template, count in template_counts.most_common(top_n):
+        level_dist = dict(template_levels.get(template, {}))
+        predominant = max(level_dist, key=level_dist.get) if level_dist else "INFO"
+        patterns.append({
+            "template": template,
+            "count": count,
+            "percentage": round(count / len(lines) * 100, 1),
+            "level": predominant,
+            "level_distribution": level_dist,
+            "sample": template_samples.get(template, ""),
+        })
+
+    return patterns
+
+
+def summarize_logs(log_text: str, source: str = "") -> dict:
+    """Generate a comprehensive summary of log data.
+
+    Inspired by LogAI's summarization — provides:
+    - Level distribution (INFO/WARNING/ERROR counts)
+    - Time span
+    - Top error messages
+    - Log velocity (lines per minute)
+    - Health score
+    """
+    lines = [l.strip() for l in log_text.strip().split("\n") if l.strip()]
+    if not lines:
+        return {"total_lines": 0, "health_score": 100}
+
+    entries = [parse_log_line(l, source) for l in lines]
+
+    # Level distribution
+    levels = Counter(e.level for e in entries)
+    error_count = levels.get("ERROR", 0)
+    warning_count = levels.get("WARNING", 0)
+    info_count = levels.get("INFO", 0)
+
+    # Time span
+    timestamps = [e.timestamp for e in entries if e.timestamp]
+    first_ts = min(timestamps) if timestamps else "N/A"
+    last_ts = max(timestamps) if timestamps else "N/A"
+
+    # Top errors
+    error_messages = [_normalize_error(e.message) for e in entries if e.level == "ERROR"]
+    top_errors = Counter(error_messages).most_common(10)
+
+    # Top warnings
+    warning_messages = [_normalize_error(e.message) for e in entries if e.level == "WARNING"]
+    top_warnings = Counter(warning_messages).most_common(5)
+
+    # Unique templates
+    tokenized = [_tokenize_log(e.message) for e in entries]
+    unique_templates = len(set(tokenized))
+
+    # Health score (0-100)
+    # High errors = low score, high warnings = moderate reduction
+    error_ratio = error_count / len(lines) if lines else 0
+    warning_ratio = warning_count / len(lines) if lines else 0
+    health_score = max(0, min(100, int(100 - error_ratio * 300 - warning_ratio * 50)))
+
+    return {
+        "total_lines": len(lines),
+        "error_count": error_count,
+        "warning_count": warning_count,
+        "info_count": info_count,
+        "level_distribution": dict(levels),
+        "first_timestamp": first_ts,
+        "last_timestamp": last_ts,
+        "top_errors": top_errors,
+        "top_warnings": top_warnings,
+        "unique_templates": unique_templates,
+        "template_diversity": round(unique_templates / len(lines) * 100, 1) if lines else 0,
+        "health_score": health_score,
+    }
+
+
+def smart_analyze(log_text: str, source: str = "") -> SmartAnalysisResult:
+    """Run the full LogAI-inspired analysis pipeline.
+
+    Combines: clustering, anomaly detection, pattern mining, and summarization.
+    """
+    result = SmartAnalysisResult()
+    lines = [l.strip() for l in log_text.strip().split("\n") if l.strip()]
+    result.total_lines = len(lines)
+
+    if not lines:
+        return result
+
+    # 1. Clustering
+    result.clusters = cluster_logs(log_text, source)
+
+    # 2. Anomaly detection
+    result.anomalies = detect_anomalies(log_text, source)
+
+    # 3. Pattern mining
+    result.patterns = mine_log_patterns(log_text, source)
+
+    # 4. Summarization
+    result.summary = summarize_logs(log_text, source)
+
+    # 5. Timeline buckets (group by timestamp prefix for timeline view)
+    entries = [parse_log_line(l, source) for l in lines]
+    ts_buckets: dict[str, dict] = {}
+    for entry in entries:
+        if entry.timestamp:
+            # Bucket by minute (first 16 chars: YYYY-MM-DDTHH:MM)
+            bucket_key = entry.timestamp[:16] if len(entry.timestamp) >= 16 else entry.timestamp
+        else:
+            bucket_key = "unknown"
+        if bucket_key not in ts_buckets:
+            ts_buckets[bucket_key] = {"timestamp": bucket_key, "total": 0, "errors": 0, "warnings": 0}
+        ts_buckets[bucket_key]["total"] += 1
+        if entry.level == "ERROR":
+            ts_buckets[bucket_key]["errors"] += 1
+        elif entry.level == "WARNING":
+            ts_buckets[bucket_key]["warnings"] += 1
+
+    result.timeline_buckets = sorted(ts_buckets.values(), key=lambda b: b["timestamp"])
+
+    return result
diff --git a/k8s-agent/requirements.txt b/k8s-agent/requirements.txt
index c9c4741..8667fe8 100644
--- a/k8s-agent/requirements.txt
+++ b/k8s-agent/requirements.txt
@@ -1,3 +1,6 @@
 streamlit>=1.32.0
 requests>=2.31.0
 plotly>=5.18.0
+pandas>=2.0.0
+scikit-learn>=1.3.0
+numpy>=1.24.0

From 85f833a9a366666e7a8c0c3bafd7d99c9a6eb92d Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Tue, 7 Apr 2026 07:42:20 +0000
Subject: [PATCH 22/31] Fix proxy /etc/environment format: use KEY=VALUE for
 pam_env, source env in subsequent steps

---
 k8s-agent/modules/cluster_creator.py | 62 +++++++++++++++++++++++++---
 1 file changed, 56 insertions(+), 6 deletions(-)

diff --git a/k8s-agent/modules/cluster_creator.py b/k8s-agent/modules/cluster_creator.py
index d9f694d..5967556 100644
--- a/k8s-agent/modules/cluster_creator.py
+++ b/k8s-agent/modules/cluster_creator.py
@@ -106,7 +106,12 @@ def test_ssh_connectivity(node: dict) -> SSHResult:
 
 
 def _proxy_env_block(profile: ClusterProfile) -> str:
-    """Generate shell export lines for proxy environment variables."""
+    """Generate shell export lines for proxy environment variables.
+
+    These are valid *shell* statements — use inside scripts for the
+    current session.  Do NOT write these to ``/etc/environment``;
+    use :func:`_proxy_env_file_block` for that.
+    """
     lines = []
     proxy = profile.http_proxy or profile.http_proxy_alt
     proxys = profile.https_proxy or profile.https_proxy_alt
@@ -122,9 +127,45 @@ def _proxy_env_block(profile: ClusterProfile) -> str:
     return "\n".join(lines)
 
 
+def _proxy_env_file_block(profile: ClusterProfile) -> str:
+    """Generate KEY=VALUE lines suitable for ``/etc/environment``.
+
+    ``/etc/environment`` is parsed by ``pam_env.so`` which expects plain
+    ``KEY=VALUE`` lines — the ``export`` keyword is **not** valid there.
+    """
+    lines = []
+    proxy = profile.http_proxy or profile.http_proxy_alt
+    proxys = profile.https_proxy or profile.https_proxy_alt
+    if proxy:
+        lines.append(f'http_proxy="{proxy}"')
+        lines.append(f'HTTP_PROXY="{proxy}"')
+    if proxys:
+        lines.append(f'https_proxy="{proxys}"')
+        lines.append(f'HTTPS_PROXY="{proxys}"')
+    if profile.no_proxy:
+        lines.append(f'no_proxy="{profile.no_proxy}"')
+        lines.append(f'NO_PROXY="{profile.no_proxy}"')
+    return "\n".join(lines)
+
+
+def _source_env_preamble() -> str:
+    """Return a shell snippet that sources /etc/environment.
+
+    Each ``ProvisionStep`` runs in its own SSH session, so environment
+    variables set by a previous step (e.g. proxy settings) are lost.
+    Sourcing ``/etc/environment`` at the top of every network-dependent
+    step ensures the variables are available.
+    """
+    return (
+        "# Source /etc/environment so proxy vars (and others) persist across SSH sessions\n"
+        "set -a; . /etc/environment 2>/dev/null || true; set +a\n"
+    )
+
+
 def generate_common_setup_script(profile: ClusterProfile) -> str:
     """Generate the common setup script that runs on ALL nodes (control-plane + workers)."""
     proxy_block = _proxy_env_block(profile)
+    proxy_env_file_block = _proxy_env_file_block(profile)
     proxy_section = ""
     if proxy_block:
         proxy_section = f"""
@@ -132,9 +173,9 @@ def generate_common_setup_script(profile: ClusterProfile) -> str:
 echo ">> Configuring proxy settings..."
 {proxy_block}
 
-# Persist proxy in /etc/environment for all users
-cat >> /etc/environment <<PROXYEOF
-{proxy_block}
+# Persist proxy in /etc/environment for all users (KEY=VALUE format for pam_env)
+cat >> /etc/environment <<'PROXYEOF'
+{proxy_env_file_block}
 PROXYEOF
 """
 
@@ -569,6 +610,7 @@ def get_common_setup_steps(profile: ClusterProfile) -> List[ProvisionStep]:
     steps: List[ProvisionStep] = []
 
     # 0. Proxy (optional)
+    proxy_env_file_block = _proxy_env_file_block(profile)
     if proxy_block:
         steps.append(ProvisionStep(
             name="configure_proxy",
@@ -576,9 +618,9 @@ def get_common_setup_steps(profile: ClusterProfile) -> List[ProvisionStep]:
             script=f"""set -euo pipefail
 echo '>> Configuring proxy settings...'
 {proxy_block}
-# Persist proxy in /etc/environment for all users
+# Persist proxy in /etc/environment for all users (KEY=VALUE format for pam_env)
 cat >> /etc/environment <<'PROXYEOF'
-{proxy_block}
+{proxy_env_file_block}
 PROXYEOF
 echo 'Proxy configured.'
 """,
@@ -652,10 +694,12 @@ def get_common_setup_steps(profile: ClusterProfile) -> List[ProvisionStep]:
         ))
 
     # 3. Install CRI-O
+    _env_preamble = _source_env_preamble()
     steps.append(ProvisionStep(
         name="install_crio",
         title=f"Install CRI-O {profile.crio_version}",
         script=f"""set -euo pipefail
+{_env_preamble}
 echo '>> Installing CRI-O {profile.crio_version}...'
 
 OS="$(. /etc/os-release && echo "$ID")"
@@ -714,6 +758,7 @@ def get_common_setup_steps(profile: ClusterProfile) -> List[ProvisionStep]:
         name="install_k8s",
         title=f"Install Kubernetes {profile.kubernetes_version} Components",
         script=f"""set -euo pipefail
+{_env_preamble}
 echo '>> Installing Kubernetes {profile.kubernetes_version} components...'
 
 OS="$(. /etc/os-release && echo "$ID")"
@@ -762,12 +807,15 @@ def get_control_plane_steps(profile: ClusterProfile) -> List[ProvisionStep]:
 
     steps: List[ProvisionStep] = []
 
+    _env_preamble = _source_env_preamble()
+
     # 0. Proxy on CP (optional)
     if proxy_block:
         steps.append(ProvisionStep(
             name="cp_proxy",
             title="Set Proxy Environment for kubeadm",
             script=f"""set -euo pipefail
+{_env_preamble}
 echo '>> Setting proxy environment for kubeadm...'
 {proxy_block}
 echo 'Proxy environment set.'
@@ -780,6 +828,7 @@ def get_control_plane_steps(profile: ClusterProfile) -> List[ProvisionStep]:
         name="kubeadm_init",
         title="Run kubeadm init",
         script=f"""set -euo pipefail
+{_env_preamble}
 echo '>> Preparing kubeadm config...'
 mkdir -p "{audit_log_dir}"
 cat > /tmp/kubeadm-config.yaml <<EOF
@@ -873,6 +922,7 @@ def get_control_plane_steps(profile: ClusterProfile) -> List[ProvisionStep]:
         name="install_flannel",
         title="Install Flannel CNI",
         script=f"""set -euo pipefail
+{_env_preamble}
 echo '>> Installing Flannel CNI...'
 {flannel_apply}
 echo '>> Waiting for Flannel pods to be ready...'

From 95c61e326635485e923a8ee9c506f660cc16b2c1 Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Tue, 7 Apr 2026 11:37:53 +0000
Subject: [PATCH 23/31] Fix unquoted paths in rm -rf reset commands and
 sanitize profile name in kubeconfig path

---
 k8s-agent/modules/cluster_creator.py | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/k8s-agent/modules/cluster_creator.py b/k8s-agent/modules/cluster_creator.py
index 5967556..3fea3c7 100644
--- a/k8s-agent/modules/cluster_creator.py
+++ b/k8s-agent/modules/cluster_creator.py
@@ -1173,7 +1173,7 @@ def get_cluster_reset_steps(profile: ClusterProfile) -> List[ProvisionStep]:
             title="Remove kubelet data",
             script=f"""set -uo pipefail
 echo '>> Removing kubelet data at {kubelet_root}...'
-rm -rf {kubelet_root}/*
+rm -rf "{kubelet_root}"/*
 rm -rf /etc/kubernetes/*
 rm -rf /tmp/kubeadm-join-command.txt
 echo 'Kubelet data removed.'
@@ -1185,7 +1185,7 @@ def get_cluster_reset_steps(profile: ClusterProfile) -> List[ProvisionStep]:
             title="Remove CRI-O container data",
             script=f"""set -uo pipefail
 echo '>> Removing CRI-O storage at {crio_root}...'
-rm -rf {crio_root}/*
+rm -rf "{crio_root}"/*
 echo '>> Removing CRI-O run root...'
 rm -rf /run/containers/storage/*
 echo 'CRI-O data removed.'
@@ -1209,8 +1209,8 @@ def get_cluster_reset_steps(profile: ClusterProfile) -> List[ProvisionStep]:
             title="Clean K8s-related logs",
             script=f"""set -uo pipefail
 echo '>> Cleaning K8s logs at {log_root}...'
-rm -rf {log_root}/pods/*
-rm -rf {log_root}/containers/*
+rm -rf "{log_root}"/pods/*
+rm -rf "{log_root}"/containers/*
 rm -rf /var/log/kubernetes/* 2>/dev/null || true
 echo 'Logs cleaned.'
 """,
@@ -1390,9 +1390,14 @@ def run_kubectl(profile: ClusterProfile, command: str, timeout: int = 30) -> SSH
     is_helm = command.strip().startswith("helm ")
 
     if profile.kubeconfig_content:
-        # Write kubeconfig to a file and run locally
+        # Write kubeconfig to a file and run locally.
+        # Sanitize the profile name for use as a filename — replace any
+        # non-alphanumeric characters (spaces, shell metacharacters, etc.)
+        # with underscores so the path is always safe for shell interpolation.
+        import re as _re
+        safe_name = _re.sub(r"[^\w.-]", "_", profile.name) or "cluster"
         kubeconfig_path = os.path.join(
-            config.DATA_DIR, "kubeconfigs", f"{profile.name}.kubeconfig"
+            config.DATA_DIR, "kubeconfigs", f"{safe_name}.kubeconfig"
         )
         os.makedirs(os.path.dirname(kubeconfig_path), exist_ok=True)
         with open(kubeconfig_path, "w") as f:
@@ -1406,7 +1411,7 @@ def run_kubectl(profile: ClusterProfile, command: str, timeout: int = 30) -> SSH
             resolved = command.strip()
             if resolved.startswith("helm "):
                 resolved = bin_path + resolved[4:]
-            full_cmd = f"KUBECONFIG={kubeconfig_path} {resolved}"
+            full_cmd = f'KUBECONFIG="{kubeconfig_path}" {resolved}'
         else:
             if not kubectl:
                 return SSHResult(
@@ -1425,7 +1430,7 @@ def run_kubectl(profile: ClusterProfile, command: str, timeout: int = 30) -> SSH
                     ),
                     success=False,
                 )
-            full_cmd = f"{kubectl} --kubeconfig={kubeconfig_path} {command}"
+            full_cmd = f'{kubectl} --kubeconfig="{kubeconfig_path}" {command}'
         try:
             proc = subprocess.run(
                 full_cmd,

From f91b1294a2b63b3ab5c126658883206ac2b53c67 Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Tue, 7 Apr 2026 11:48:57 +0000
Subject: [PATCH 24/31] Add pod/container dropdowns to Pod Logs tab and fix
 fetch button feedback

---
 k8s-agent/app.py | 140 ++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 107 insertions(+), 33 deletions(-)

diff --git a/k8s-agent/app.py b/k8s-agent/app.py
index 85e3874..861c893 100644
--- a/k8s-agent/app.py
+++ b/k8s-agent/app.py
@@ -1646,6 +1646,8 @@ def page_log_analysis():
     # ── Pod Logs ──────────────────────────────────────────────────────────
     with tab_pod:
         st.markdown("### Pod Logs")
+
+        # --- Namespace selection ---
         col1, col2 = st.columns(2)
         with col1:
             if _cluster_namespaces:
@@ -1654,42 +1656,114 @@ def page_log_analysis():
                                       key="pod_ns")
             else:
                 pod_ns = st.text_input("Namespace", value="default", key="pod_ns")
-            pod_name = st.text_input("Pod Name", placeholder="my-pod-xyz", key="pod_name_input")
         with col2:
-            container = st.text_input("Container (optional)", key="pod_container")
-            pod_lines = st.number_input("Lines", min_value=50, max_value=1000, value=200, key="pod_lines")
-            pod_previous = st.checkbox("Previous container logs (crash recovery)")
-
-        if st.button("Fetch Pod Logs", type="primary", key="fetch_pod") and pod_name:
-            with st.spinner(f"Fetching logs for {pod_ns}/{pod_name}..."):
-                result = collect_pod_logs(
-                    cp_node, pod_ns, pod_name, container, pod_lines,
-                    "1h", pod_previous, profile=profile,
+            pod_lines = st.number_input("Lines", min_value=50, max_value=5000, value=200, key="pod_lines")
+
+        # --- Load pods from the cluster ---
+        if st.button("Load Pods", key="load_pods_btn"):
+            with st.spinner(f"Fetching pods in namespace '{pod_ns}'..."):
+                pod_result = get_pod_list(cp_node, namespace=pod_ns, profile=profile)
+                if pod_result.success and pod_result.stdout.strip():
+                    _pods: list[dict] = []
+                    for line in pod_result.stdout.strip().split("\n"):
+                        parts = line.split()
+                        if len(parts) >= 4:
+                            _pods.append({
+                                "namespace": parts[0],
+                                "name": parts[1],
+                                "status": parts[2],
+                                "containers": parts[3],
+                            })
+                        elif len(parts) >= 2:
+                            _pods.append({
+                                "namespace": parts[0],
+                                "name": parts[1],
+                                "status": parts[2] if len(parts) > 2 else "Unknown",
+                                "containers": parts[3] if len(parts) > 3 else "",
+                            })
+                    st.session_state["_pod_list"] = _pods
+                    st.session_state["_pod_list_ns"] = pod_ns
+                    st.success(f"Found {len(_pods)} pod(s) in namespace '{pod_ns}'.")
+                elif pod_result.success:
+                    st.session_state["_pod_list"] = []
+                    st.session_state["_pod_list_ns"] = pod_ns
+                    st.warning(f"No pods found in namespace '{pod_ns}'.")
+                else:
+                    st.error(f"Failed to fetch pods: {pod_result.stderr}")
+
+        # --- Pod & container dropdowns ---
+        _pods_loaded = st.session_state.get("_pod_list", [])
+        _pods_loaded_ns = st.session_state.get("_pod_list_ns", "")
+
+        col_p1, col_p2 = st.columns(2)
+        with col_p1:
+            if _pods_loaded and _pods_loaded_ns == pod_ns:
+                pod_options = [f"{p['name']}  ({p['status']})" for p in _pods_loaded]
+                selected_pod_idx = st.selectbox(
+                    "Pod Name", options=range(len(pod_options)),
+                    format_func=lambda i: pod_options[i],
+                    key="pod_name_select",
                 )
-                if result.success:
-                    analysis = analyze_logs(result.stdout, f"{pod_ns}/{pod_name}")
-                    m1, m2, m3 = st.columns(3)
-                    m1.metric("Total Lines", analysis.total_lines)
-                    m2.metric("Errors", analysis.error_count)
-                    m3.metric("Warnings", analysis.warning_count)
-
-                    if analysis.error_patterns:
-                        st.markdown("**Error Patterns:**")
-                        for pattern, count in list(analysis.error_patterns.items())[:10]:
-                            st.markdown(f"- `{pattern}` (x{count})")
-
-                    st.code(result.stdout[-5000:], language="text")
-
-                    if analysis.error_count > 0 and is_llm_configured():
-                        if st.button("Analyze with AI", key="pod_ai"):
-                            with st.spinner("AI analyzing pod logs..."):
-                                ai_analysis = llm_analyze_logs(
-                                    result.stdout, f"{pod_ns}/{pod_name}"
-                                )
-                                st.markdown(ai_analysis)
+                pod_name = _pods_loaded[selected_pod_idx]["name"] if selected_pod_idx is not None else ""
+            else:
+                pod_name = st.text_input("Pod Name", placeholder="my-pod-xyz (click Load Pods to get dropdown)", key="pod_name_input")
+
+        with col_p2:
+            if _pods_loaded and _pods_loaded_ns == pod_ns and pod_name:
+                # Find the selected pod's containers
+                _selected_pod = next((p for p in _pods_loaded if p["name"] == pod_name), None)
+                _containers: list[str] = []
+                if _selected_pod and _selected_pod.get("containers"):
+                    _containers = [c.strip() for c in _selected_pod["containers"].split(",") if c.strip()]
+                if _containers:
+                    container_options = ["(all / default)"] + _containers
+                    container_sel = st.selectbox("Container", options=container_options, key="pod_container_select")
+                    container = "" if container_sel == "(all / default)" else container_sel
                 else:
-                    st.error("Failed to fetch pod logs")
-                    st.code(result.stderr, language="text")
+                    container = st.text_input("Container (optional)", key="pod_container")
+            else:
+                container = st.text_input("Container (optional)", key="pod_container")
+
+        pod_previous = st.checkbox("Previous container logs (crash recovery)")
+
+        # --- Fetch logs ---
+        if st.button("Fetch Pod Logs", type="primary", key="fetch_pod"):
+            if not pod_name:
+                st.warning("Please enter a pod name or click **Load Pods** to select one.")
+            else:
+                with st.spinner(f"Fetching logs for {pod_ns}/{pod_name}..."):
+                    result = collect_pod_logs(
+                        cp_node, pod_ns, pod_name, container, pod_lines,
+                        "1h", pod_previous, profile=profile,
+                    )
+                    if result.success:
+                        if not result.stdout.strip():
+                            st.info(f"No log output returned for pod `{pod_ns}/{pod_name}`. "
+                                    "The pod may have just started or has no recent logs.")
+                        else:
+                            analysis = analyze_logs(result.stdout, f"{pod_ns}/{pod_name}")
+                            m1, m2, m3 = st.columns(3)
+                            m1.metric("Total Lines", analysis.total_lines)
+                            m2.metric("Errors", analysis.error_count)
+                            m3.metric("Warnings", analysis.warning_count)
+
+                            if analysis.error_patterns:
+                                st.markdown("**Error Patterns:**")
+                                for pattern, count in list(analysis.error_patterns.items())[:10]:
+                                    st.markdown(f"- `{pattern}` (x{count})")
+
+                            st.code(result.stdout[-5000:], language="text")
+
+                            if analysis.error_count > 0 and is_llm_configured():
+                                if st.button("Analyze with AI", key="pod_ai"):
+                                    with st.spinner("AI analyzing pod logs..."):
+                                        ai_analysis = llm_analyze_logs(
+                                            result.stdout, f"{pod_ns}/{pod_name}"
+                                        )
+                                        st.markdown(ai_analysis)
+                    else:
+                        st.error(f"Failed to fetch pod logs for `{pod_ns}/{pod_name}`")
+                        st.code(result.stderr, language="text")
 
     # ── Error Correlation ─────────────────────────────────────────────────
     with tab_correlation:

From c70cd7fe93bc5b031757be05f71d1e9165f460ed Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Tue, 7 Apr 2026 12:04:45 +0000
Subject: [PATCH 25/31] Add Istio/Envoy access log analysis: response time
 analytics, status codes, per-path/upstream breakdowns, slow requests

---
 k8s-agent/app.py                  | 198 ++++++++++++++++++
 k8s-agent/modules/log_analyzer.py | 322 ++++++++++++++++++++++++++++++
 2 files changed, 520 insertions(+)

diff --git a/k8s-agent/app.py b/k8s-agent/app.py
index 861c893..95462c1 100644
--- a/k8s-agent/app.py
+++ b/k8s-agent/app.py
@@ -85,6 +85,7 @@
     detect_anomalies,
     mine_log_patterns,
     summarize_logs,
+    analyze_istio_access_logs,
 )
 from modules.llm_client import query_llm, stream_llm
 
@@ -2063,6 +2064,203 @@ def page_log_analysis():
                 except ImportError:
                     pass
 
+            # ── Istio / Envoy Access Log Analysis ────────────────────────
+            if sa_result.istio:
+                istio = sa_result.istio
+                st.markdown("---")
+                st.markdown("#### Istio / Envoy Access Log Analysis")
+                st.markdown(
+                    "Detected **Istio/Envoy access logs** — showing response time analytics, "
+                    "status code distribution, per-path and per-upstream breakdowns, and slow requests."
+                )
+
+                # ── Overview metrics ──
+                icol1, icol2, icol3, icol4, icol5, icol6 = st.columns(6)
+                icol1.metric("Total Requests", f"{istio.total_requests:,}")
+                icol2.metric("Avg Latency", f"{istio.avg_ms:.0f} ms")
+                icol3.metric("P50", f"{istio.p50_ms:.0f} ms")
+                icol4.metric("P95", f"{istio.p95_ms:.0f} ms")
+                icol5.metric("P99", f"{istio.p99_ms:.0f} ms")
+                icol6.metric("Error Rate", f"{istio.error_rate:.1f}%")
+
+                if istio.error_rate > 10:
+                    st.error(f"High error rate: **{istio.error_rate:.1f}%** of requests returned 4xx/5xx.")
+                elif istio.error_rate > 2:
+                    st.warning(f"Elevated error rate: **{istio.error_rate:.1f}%** of requests returned 4xx/5xx.")
+
+                icol7, icol8, icol9 = st.columns(3)
+                icol7.metric("Min Latency", f"{istio.min_ms:.0f} ms")
+                icol8.metric("Max Latency", f"{istio.max_ms:.0f} ms")
+                icol9.metric("P90", f"{istio.p90_ms:.0f} ms")
+
+                # ── Status Code Distribution ──
+                st.markdown("##### Status Code Distribution")
+                if istio.status_distribution:
+                    import pandas as pd
+                    status_data = [{"Status Code": str(k), "Count": v} for k, v in sorted(istio.status_distribution.items())]
+                    df_status = pd.DataFrame(status_data)
+                    scol_t, scol_c = st.columns([1, 1])
+                    with scol_t:
+                        st.dataframe(df_status, use_container_width=True, hide_index=True)
+                    with scol_c:
+                        try:
+                            import plotly.express as px
+                            fig = px.pie(
+                                df_status, names="Status Code", values="Count",
+                                title="Response Status Codes",
+                                color="Status Code",
+                                color_discrete_map={
+                                    str(k): ("#2ecc71" if k < 300 else "#f39c12" if k < 400 else "#e67e22" if k < 500 else "#e74c3c")
+                                    for k in istio.status_distribution
+                                },
+                            )
+                            fig.update_layout(height=350)
+                            st.plotly_chart(fig, use_container_width=True)
+                        except ImportError:
+                            pass
+
+                # ── Status class summary ──
+                if istio.status_class_distribution:
+                    class_cols = st.columns(len(istio.status_class_distribution))
+                    for idx, (cls, cnt) in enumerate(sorted(istio.status_class_distribution.items())):
+                        class_cols[idx].metric(cls, cnt)
+
+                # ── Response Flags ──
+                if istio.response_flags_dist and len(istio.response_flags_dist) > 1:
+                    with st.expander("Response Flags (Envoy)", expanded=False):
+                        st.markdown(
+                            "Envoy response flags indicate special conditions: "
+                            "`UF`=upstream failure, `UH`=no healthy upstream, "
+                            "`UT`=upstream timeout, `NR`=no route, `DC`=downstream disconnected, etc."
+                        )
+                        import pandas as pd
+                        flags_data = [{"Flag": k, "Count": v} for k, v in istio.response_flags_dist.items()]
+                        st.dataframe(pd.DataFrame(flags_data), use_container_width=True, hide_index=True)
+
+                # ── Latency Distribution Histogram ──
+                st.markdown("##### Latency Distribution")
+                try:
+                    import plotly.express as px
+                    durations = [e.duration_ms for e in istio.parsed_entries]
+                    fig = px.histogram(
+                        x=durations, nbins=50,
+                        labels={"x": "Duration (ms)", "y": "Count"},
+                        title="Request Latency Distribution",
+                    )
+                    fig.add_vline(x=istio.p50_ms, line_dash="dash", line_color="green",
+                                  annotation_text=f"P50: {istio.p50_ms:.0f}ms")
+                    fig.add_vline(x=istio.p95_ms, line_dash="dash", line_color="orange",
+                                  annotation_text=f"P95: {istio.p95_ms:.0f}ms")
+                    fig.add_vline(x=istio.p99_ms, line_dash="dash", line_color="red",
+                                  annotation_text=f"P99: {istio.p99_ms:.0f}ms")
+                    fig.update_layout(height=400)
+                    st.plotly_chart(fig, use_container_width=True)
+                except ImportError:
+                    pass
+
+                # ── Per-Path Response Time ──
+                if istio.path_stats:
+                    st.markdown("##### Per-Path Response Time")
+                    import pandas as pd
+                    path_data = []
+                    for ps in istio.path_stats[:30]:
+                        path_data.append({
+                            "Path": ps["path"][:80],
+                            "Requests": ps["count"],
+                            "Avg (ms)": ps["avg_ms"],
+                            "P50 (ms)": ps["p50_ms"],
+                            "P95 (ms)": ps["p95_ms"],
+                            "P99 (ms)": ps["p99_ms"],
+                            "Max (ms)": ps["max_ms"],
+                            "Errors": ps["error_count"],
+                            "Error %": ps["error_rate"],
+                        })
+                    df_paths = pd.DataFrame(path_data)
+                    st.dataframe(df_paths, use_container_width=True, hide_index=True)
+
+                    # Bar chart of top paths by P95
+                    try:
+                        import plotly.express as px
+                        top_paths = istio.path_stats[:15]
+                        fig = px.bar(
+                            x=[p["path"][:50] for p in top_paths],
+                            y=[p["p95_ms"] for p in top_paths],
+                            labels={"x": "Path", "y": "P95 Latency (ms)"},
+                            title="Top Paths by P95 Latency",
+                            color=[p["error_rate"] for p in top_paths],
+                            color_continuous_scale="RdYlGn_r",
+                        )
+                        fig.update_layout(height=400, xaxis_tickangle=-45,
+                                          coloraxis_colorbar_title="Error %")
+                        st.plotly_chart(fig, use_container_width=True)
+                    except ImportError:
+                        pass
+
+                # ── Per-Upstream Service Stats ──
+                if istio.upstream_stats:
+                    st.markdown("##### Per-Upstream Service Stats")
+                    import pandas as pd
+                    up_data = []
+                    for us in istio.upstream_stats[:20]:
+                        up_data.append({
+                            "Upstream": us["upstream"][:60],
+                            "Requests": us["count"],
+                            "Avg Duration (ms)": us["avg_duration_ms"],
+                            "Avg Upstream (ms)": us["avg_upstream_ms"],
+                            "P95 Duration (ms)": us["p95_duration_ms"],
+                            "P95 Upstream (ms)": us["p95_upstream_ms"],
+                            "Errors": us["error_count"],
+                            "Error %": us["error_rate"],
+                        })
+                    st.dataframe(pd.DataFrame(up_data), use_container_width=True, hide_index=True)
+
+                # ── Slow Requests ──
+                if istio.slow_requests:
+                    with st.expander(f"Slow Requests (>{istio.p95_ms:.0f}ms — top {len(istio.slow_requests)})", expanded=True):
+                        import pandas as pd
+                        slow_data = []
+                        for sr in istio.slow_requests[:30]:
+                            slow_data.append({
+                                "Duration (ms)": sr.duration_ms,
+                                "Upstream (ms)": sr.upstream_service_time_ms,
+                                "Method": sr.method,
+                                "Path": sr.path[:80],
+                                "Status": sr.response_code,
+                                "Flags": sr.response_flags,
+                                "Upstream Host": sr.upstream_host[:40],
+                                "Timestamp": sr.timestamp[:25] if sr.timestamp else "",
+                            })
+                        st.dataframe(pd.DataFrame(slow_data), use_container_width=True, hide_index=True)
+
+                # ── Istio Request Timeline ──
+                if istio.timeline_buckets and len(istio.timeline_buckets) > 1:
+                    st.markdown("##### Request Timeline")
+                    try:
+                        import plotly.graph_objects as go
+                        ts_labels = [b["timestamp"] for b in istio.timeline_buckets if b["timestamp"] != "unknown"]
+                        ts_totals = [b["total"] for b in istio.timeline_buckets if b["timestamp"] != "unknown"]
+                        ts_errors = [b["errors"] for b in istio.timeline_buckets if b["timestamp"] != "unknown"]
+                        ts_avg_dur = [b["avg_duration"] for b in istio.timeline_buckets if b["timestamp"] != "unknown"]
+
+                        if ts_labels:
+                            fig = go.Figure()
+                            fig.add_trace(go.Bar(x=ts_labels, y=ts_totals, name="Requests", marker_color="#326CE5"))
+                            fig.add_trace(go.Bar(x=ts_labels, y=ts_errors, name="Errors (4xx+5xx)", marker_color="#FF4B4B"))
+                            fig.add_trace(go.Scatter(
+                                x=ts_labels, y=ts_avg_dur, name="Avg Latency (ms)",
+                                mode="lines+markers", yaxis="y2", line=dict(color="orange"),
+                            ))
+                            fig.update_layout(
+                                title="Requests & Latency Over Time",
+                                yaxis_title="Request Count",
+                                yaxis2=dict(title="Avg Latency (ms)", overlaying="y", side="right"),
+                                barmode="overlay",
+                                height=400,
+                            )
+                            st.plotly_chart(fig, use_container_width=True)
+                    except ImportError:
+                        pass
+
     # ── AI Log Analysis ───────────────────────────────────────────────────
     with tab_ai:
         st.markdown("### AI-Powered Log Analysis")
diff --git a/k8s-agent/modules/log_analyzer.py b/k8s-agent/modules/log_analyzer.py
index 3f6c6df..69c4e9d 100644
--- a/k8s-agent/modules/log_analyzer.py
+++ b/k8s-agent/modules/log_analyzer.py
@@ -484,6 +484,59 @@ class LogAnomaly:
     source: str = ""
 
 
+@dataclass
+class IstioAccessEntry:
+    """A parsed Istio/Envoy access log entry."""
+    timestamp: str = ""
+    method: str = ""
+    path: str = ""
+    protocol: str = ""
+    response_code: int = 0
+    response_flags: str = ""
+    bytes_received: int = 0
+    bytes_sent: int = 0
+    duration_ms: float = 0.0          # total request duration
+    upstream_service_time_ms: float = 0.0  # time spent in upstream
+    upstream_cluster: str = ""
+    upstream_host: str = ""
+    downstream_remote: str = ""
+    downstream_local: str = ""
+    requested_server_name: str = ""
+    authority: str = ""               # Host header
+    user_agent: str = ""
+    raw_line: str = ""
+
+
+@dataclass
+class IstioAnalysisResult:
+    """Result from Istio access log analysis."""
+    total_requests: int = 0
+    parsed_entries: list[IstioAccessEntry] = field(default_factory=list)
+    # Latency percentiles
+    p50_ms: float = 0.0
+    p90_ms: float = 0.0
+    p95_ms: float = 0.0
+    p99_ms: float = 0.0
+    avg_ms: float = 0.0
+    max_ms: float = 0.0
+    min_ms: float = 0.0
+    # Status code distribution
+    status_distribution: dict = field(default_factory=dict)   # code -> count
+    status_class_distribution: dict = field(default_factory=dict)  # "2xx"->count
+    # Error rate
+    error_rate: float = 0.0  # percentage of 4xx+5xx
+    # Slow requests (above p95)
+    slow_requests: list[IstioAccessEntry] = field(default_factory=list)
+    # Per-path stats
+    path_stats: list[dict] = field(default_factory=list)
+    # Per-upstream stats
+    upstream_stats: list[dict] = field(default_factory=list)
+    # Response flags distribution
+    response_flags_dist: dict = field(default_factory=dict)
+    # Timeline buckets (per-minute)
+    timeline_buckets: list[dict] = field(default_factory=list)
+
+
 @dataclass
 class SmartAnalysisResult:
     """Full result from smart log analysis."""
@@ -493,6 +546,7 @@ class SmartAnalysisResult:
     patterns: list[dict] = field(default_factory=list)
     summary: dict = field(default_factory=dict)
     timeline_buckets: list[dict] = field(default_factory=list)
+    istio: IstioAnalysisResult | None = None  # populated when Istio logs detected
 
 
 def _tokenize_log(message: str) -> str:
@@ -835,4 +889,272 @@ def smart_analyze(log_text: str, source: str = "") -> SmartAnalysisResult:
 
     result.timeline_buckets = sorted(ts_buckets.values(), key=lambda b: b["timestamp"])
 
+    # 6. Istio / Envoy access log analysis (auto-detected)
+    istio_result = analyze_istio_access_logs(log_text)
+    if istio_result and istio_result.total_requests > 0:
+        result.istio = istio_result
+
+    return result
+
+
+# ══════════════════════════════════════════════════════════════════════════
+#  Istio / Envoy Access Log Analysis
+#  Parses Envoy access log format used by Istio sidecars and provides
+#  response-time analytics, status code distributions, per-path and
+#  per-upstream breakdowns, and slow-request detection.
+# ══════════════════════════════════════════════════════════════════════════
+
+# Envoy default access log format (as emitted by Istio):
+# [%START_TIME%] "%REQ(:METHOD)% %REQ(X-ENVOY-ORIGINAL-PATH?:PATH)% %PROTOCOL%"
+# %RESPONSE_CODE% %RESPONSE_FLAGS% %BYTES_RECEIVED% %BYTES_SENT%
+# %DURATION% %RESP(X-ENVOY-UPSTREAM-SERVICE-TIME)%
+# "%REQ(X-FORWARDED-FOR)%" "%REQ(USER-AGENT)%" "%REQ(X-REQUEST-ID)%"
+# "%REQ(:AUTHORITY)%" "%UPSTREAM_HOST%" %UPSTREAM_CLUSTER%
+# %UPSTREAM_LOCAL_ADDRESS% %DOWNSTREAM_LOCAL_ADDRESS%
+# %DOWNSTREAM_REMOTE_ADDRESS% %REQUESTED_SERVER_NAME% %ROUTE_NAME%
+
+_ISTIO_LOG_RE = re.compile(
+    r'\[(?P<timestamp>[^\]]+)\]\s+'
+    r'"(?P<method>\S+)\s+(?P<path>\S+)\s+(?P<protocol>[^"]*?)"\s+'
+    r'(?P<response_code>\d+)\s+'
+    r'(?P<response_flags>\S+)\s+'
+    r'(?P<bytes_received>\d+)\s+'
+    r'(?P<bytes_sent>\d+)\s+'
+    r'(?P<duration>\d+)\s+'
+    r'(?P<upstream_service_time>\d+|-)\s+'
+    r'"(?P<xff>[^"]*)"\s+'
+    r'"(?P<user_agent>[^"]*)"\s+'
+    r'"(?P<request_id>[^"]*)"\s+'
+    r'"(?P<authority>[^"]*)"\s+'
+    r'"(?P<upstream_host>[^"]*)"\s*'
+    r'(?P<rest>.*)'
+)
+
+# Simpler fallback: JSON-format Istio access logs (structured logging)
+_ISTIO_JSON_KEYS = {
+    "response_code", "duration", "method", "path", "upstream_service_time",
+    "upstream_cluster", "authority", "bytes_received", "bytes_sent",
+}
+
+
+def _parse_istio_line(line: str) -> IstioAccessEntry | None:
+    """Try to parse a single line as an Istio/Envoy access log entry."""
+    import json as _json
+
+    # Try structured JSON format first
+    stripped = line.strip()
+    if stripped.startswith("{"):
+        try:
+            obj = _json.loads(stripped)
+            # Verify it looks like an Istio access log
+            if "response_code" in obj or "method" in obj or "duration" in obj:
+                duration = obj.get("duration", 0)
+                ust = obj.get("upstream_service_time", 0)
+                # Istio JSON logs may use different field names
+                return IstioAccessEntry(
+                    timestamp=str(obj.get("start_time", obj.get("timestamp", ""))),
+                    method=str(obj.get("method", obj.get("request_method", ""))),
+                    path=str(obj.get("path", obj.get("request_path", ""))),
+                    protocol=str(obj.get("protocol", "")),
+                    response_code=int(obj.get("response_code", 0)),
+                    response_flags=str(obj.get("response_flags", "-")),
+                    bytes_received=int(obj.get("bytes_received", 0)),
+                    bytes_sent=int(obj.get("bytes_sent", 0)),
+                    duration_ms=float(duration) if duration not in ("-", "", None) else 0.0,
+                    upstream_service_time_ms=float(ust) if ust not in ("-", "", None) else 0.0,
+                    upstream_cluster=str(obj.get("upstream_cluster", "")),
+                    upstream_host=str(obj.get("upstream_host", "")),
+                    authority=str(obj.get("authority", obj.get("host", ""))),
+                    user_agent=str(obj.get("user_agent", "")),
+                    downstream_remote=str(obj.get("downstream_remote_address", "")),
+                    downstream_local=str(obj.get("downstream_local_address", "")),
+                    requested_server_name=str(obj.get("requested_server_name", "")),
+                    raw_line=line,
+                )
+        except (_json.JSONDecodeError, ValueError, TypeError):
+            pass
+
+    # Try standard Envoy text format
+    m = _ISTIO_LOG_RE.match(stripped)
+    if m:
+        ust = m.group("upstream_service_time")
+        rest = m.group("rest").strip()
+        # Parse remaining fields from rest (upstream_cluster, etc.)
+        rest_parts = rest.split()
+        upstream_cluster = rest_parts[0] if rest_parts else ""
+        return IstioAccessEntry(
+            timestamp=m.group("timestamp"),
+            method=m.group("method"),
+            path=m.group("path"),
+            protocol=m.group("protocol"),
+            response_code=int(m.group("response_code")),
+            response_flags=m.group("response_flags"),
+            bytes_received=int(m.group("bytes_received")),
+            bytes_sent=int(m.group("bytes_sent")),
+            duration_ms=float(m.group("duration")),
+            upstream_service_time_ms=float(ust) if ust != "-" else 0.0,
+            upstream_cluster=upstream_cluster,
+            upstream_host=m.group("upstream_host"),
+            authority=m.group("authority"),
+            user_agent=m.group("user_agent"),
+            downstream_remote=m.group("xff") or "",
+            raw_line=line,
+        )
+
+    return None
+
+
+def _is_likely_istio_log(lines: list[str], sample_size: int = 20) -> bool:
+    """Heuristic: check if a meaningful fraction of lines look like Istio access logs."""
+    sample = lines[:sample_size]
+    parsed = sum(1 for l in sample if _parse_istio_line(l) is not None)
+    return parsed >= max(1, len(sample) * 0.3)  # at least 30% parse successfully
+
+
+def analyze_istio_access_logs(log_text: str) -> IstioAnalysisResult | None:
+    """Parse and analyze Istio/Envoy access logs.
+
+    Returns None if the logs don't look like Istio access logs.
+    Returns an IstioAnalysisResult with latency stats, status distribution,
+    per-path breakdowns, per-upstream breakdowns, and slow requests.
+    """
+    lines = [l.strip() for l in log_text.strip().split("\n") if l.strip()]
+    if not lines:
+        return None
+
+    # Quick heuristic — bail early if this doesn't look like Istio logs
+    if not _is_likely_istio_log(lines):
+        return None
+
+    entries: list[IstioAccessEntry] = []
+    for line in lines:
+        entry = _parse_istio_line(line)
+        if entry is not None:
+            entries.append(entry)
+
+    if not entries:
+        return None
+
+    result = IstioAnalysisResult(
+        total_requests=len(entries),
+        parsed_entries=entries,
+    )
+
+    # ── Latency percentiles ──────────────────────────────────────────
+    import numpy as np
+    durations = np.array([e.duration_ms for e in entries])
+    if len(durations) > 0:
+        result.avg_ms = float(np.mean(durations))
+        result.min_ms = float(np.min(durations))
+        result.max_ms = float(np.max(durations))
+        result.p50_ms = float(np.percentile(durations, 50))
+        result.p90_ms = float(np.percentile(durations, 90))
+        result.p95_ms = float(np.percentile(durations, 95))
+        result.p99_ms = float(np.percentile(durations, 99))
+
+    # ── Status code distribution ─────────────────────────────────────
+    status_counter: Counter = Counter()
+    class_counter: Counter = Counter()
+    for e in entries:
+        status_counter[e.response_code] += 1
+        class_label = f"{e.response_code // 100}xx"
+        class_counter[class_label] += 1
+
+    result.status_distribution = dict(status_counter.most_common())
+    result.status_class_distribution = dict(class_counter.most_common())
+
+    # Error rate (4xx + 5xx)
+    error_count = sum(1 for e in entries if e.response_code >= 400)
+    result.error_rate = (error_count / len(entries)) * 100 if entries else 0.0
+
+    # ── Slow requests (above p95) ────────────────────────────────────
+    p95_threshold = result.p95_ms
+    slow = [e for e in entries if e.duration_ms > p95_threshold]
+    # Sort by duration descending, limit to top 50
+    slow.sort(key=lambda e: e.duration_ms, reverse=True)
+    result.slow_requests = slow[:50]
+
+    # ── Per-path stats ───────────────────────────────────────────────
+    path_groups: dict[str, list[IstioAccessEntry]] = {}
+    for e in entries:
+        # Normalize path: strip query params for grouping
+        base_path = e.path.split("?")[0] if e.path else "(unknown)"
+        path_groups.setdefault(base_path, []).append(e)
+
+    path_stats = []
+    for path, group in path_groups.items():
+        durations_g = [e.duration_ms for e in group]
+        errors_g = sum(1 for e in group if e.response_code >= 400)
+        path_stats.append({
+            "path": path,
+            "count": len(group),
+            "avg_ms": round(sum(durations_g) / len(durations_g), 1) if durations_g else 0,
+            "p50_ms": round(float(np.percentile(durations_g, 50)), 1) if durations_g else 0,
+            "p95_ms": round(float(np.percentile(durations_g, 95)), 1) if durations_g else 0,
+            "p99_ms": round(float(np.percentile(durations_g, 99)), 1) if durations_g else 0,
+            "max_ms": round(max(durations_g), 1) if durations_g else 0,
+            "error_count": errors_g,
+            "error_rate": round((errors_g / len(group)) * 100, 1) if group else 0,
+        })
+    path_stats.sort(key=lambda p: p["count"], reverse=True)
+    result.path_stats = path_stats[:50]
+
+    # ── Per-upstream stats ───────────────────────────────────────────
+    upstream_groups: dict[str, list[IstioAccessEntry]] = {}
+    for e in entries:
+        key = e.upstream_cluster or e.upstream_host or "(direct/unknown)"
+        upstream_groups.setdefault(key, []).append(e)
+
+    upstream_stats = []
+    for upstream, group in upstream_groups.items():
+        ust_vals = [e.upstream_service_time_ms for e in group if e.upstream_service_time_ms > 0]
+        dur_vals = [e.duration_ms for e in group]
+        errors_g = sum(1 for e in group if e.response_code >= 400)
+        upstream_stats.append({
+            "upstream": upstream,
+            "count": len(group),
+            "avg_duration_ms": round(sum(dur_vals) / len(dur_vals), 1) if dur_vals else 0,
+            "avg_upstream_ms": round(sum(ust_vals) / len(ust_vals), 1) if ust_vals else 0,
+            "p95_duration_ms": round(float(np.percentile(dur_vals, 95)), 1) if dur_vals else 0,
+            "p95_upstream_ms": round(float(np.percentile(ust_vals, 95)), 1) if ust_vals else 0,
+            "error_count": errors_g,
+            "error_rate": round((errors_g / len(group)) * 100, 1) if group else 0,
+        })
+    upstream_stats.sort(key=lambda u: u["count"], reverse=True)
+    result.upstream_stats = upstream_stats[:30]
+
+    # ── Response flags distribution ──────────────────────────────────
+    flags_counter: Counter = Counter()
+    for e in entries:
+        flag = e.response_flags if e.response_flags and e.response_flags != "-" else "(none)"
+        flags_counter[flag] += 1
+    result.response_flags_dist = dict(flags_counter.most_common())
+
+    # ── Timeline buckets (per-minute) ────────────────────────────────
+    ts_buckets: dict[str, dict] = {}
+    for e in entries:
+        # Try to extract minute-level bucket from timestamp
+        ts = e.timestamp
+        if ts:
+            # Envoy format: 2024-01-15T10:30:45.123Z or similar
+            bucket_key = ts[:16] if len(ts) >= 16 else ts[:10]
+        else:
+            bucket_key = "unknown"
+        if bucket_key not in ts_buckets:
+            ts_buckets[bucket_key] = {
+                "timestamp": bucket_key, "total": 0, "errors": 0,
+                "avg_duration": 0.0, "_durations": [],
+            }
+        ts_buckets[bucket_key]["total"] += 1
+        ts_buckets[bucket_key]["_durations"].append(e.duration_ms)
+        if e.response_code >= 400:
+            ts_buckets[bucket_key]["errors"] += 1
+
+    # Compute avg duration per bucket
+    for bucket in ts_buckets.values():
+        durs = bucket.pop("_durations", [])
+        bucket["avg_duration"] = round(sum(durs) / len(durs), 1) if durs else 0
+
+    result.timeline_buckets = sorted(ts_buckets.values(), key=lambda b: b["timestamp"])
+
     return result

From 63ecf080f7b9f2c44bd67c9a188b4799be745b1c Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Tue, 7 Apr 2026 12:11:50 +0000
Subject: [PATCH 26/31] Remove Helm/Network Policy tabs, remove init containers
 from Resource Limits, add pod count per node, fix Set Active profile button

---
 k8s-agent/app.py | 376 ++++++++---------------------------------------
 1 file changed, 61 insertions(+), 315 deletions(-)

diff --git a/k8s-agent/app.py b/k8s-agent/app.py
index 95462c1..39f0be2 100644
--- a/k8s-agent/app.py
+++ b/k8s-agent/app.py
@@ -576,6 +576,7 @@ def page_profile_manager():
                     st.markdown(f"**Updated:** {profile.updated_at[:10] if profile.updated_at else 'N/A'}")
                     if st.button("Set Active", key=f"activate_{profile.name}"):
                         st.session_state.active_profile = profile.name
+                        st.session_state.profile_selector = profile.name
                         st.rerun()
                     if st.button("Delete", key=f"delete_{profile.name}", type="secondary"):
                         delete_profile(profile.name)
@@ -2350,8 +2351,8 @@ def page_resource_viewer():
         _rv_namespaces = fetch_namespaces(profile.kubeconfig_content)
 
     (tab_resources, tab_scaling, tab_shell, tab_res_limits, tab_crictl,
-     tab_node_health, tab_rbac, tab_helm, tab_events,
-     tab_restart_tracker, tab_netpol, tab_pvc) = st.tabs([
+     tab_node_health, tab_rbac, tab_events,
+     tab_restart_tracker, tab_pvc) = st.tabs([
         "Cluster Resources",
         "Scaling",
         "Pod Shell",
@@ -2359,10 +2360,8 @@ def page_resource_viewer():
         "Node Containers",
         "Node Health",
         "RBAC Viewer",
-        "Helm Releases",
         "Events Timeline",
         "Pod Restart Tracker",
-        "Network Policies",
         "PVC / Storage",
     ])
 
@@ -2827,7 +2826,6 @@ def page_resource_viewer():
                                 template = spec.get("template", {})
                                 pod_spec = template.get("spec", {})
                                 containers = pod_spec.get("containers", [])
-                                init_containers = pod_spec.get("initContainers", [])
                                 for ctr in containers:
                                     res = ctr.get("resources", {})
                                     req = res.get("requests", {})
@@ -2836,23 +2834,6 @@ def page_resource_viewer():
                                         "Type": wl_label,
                                         "Workload": workload_name,
                                         "Container": ctr.get("name", "?"),
-                                        "Init": "",
-                                        "CPU Req": req.get("cpu", "-"),
-                                        "CPU Lim": lim.get("cpu", "-"),
-                                        "Mem Req": req.get("memory", "-"),
-                                        "Mem Lim": lim.get("memory", "-"),
-                                        "Eph Req": req.get("ephemeral-storage", "-"),
-                                        "Eph Lim": lim.get("ephemeral-storage", "-"),
-                                    })
-                                for ctr in init_containers:
-                                    res = ctr.get("resources", {})
-                                    req = res.get("requests", {})
-                                    lim = res.get("limits", {})
-                                    all_rows.append({
-                                        "Type": wl_label,
-                                        "Workload": workload_name,
-                                        "Container": ctr.get("name", "?"),
-                                        "Init": "init",
                                         "CPU Req": req.get("cpu", "-"),
                                         "CPU Lim": lim.get("cpu", "-"),
                                         "Mem Req": req.get("memory", "-"),
@@ -2874,7 +2855,6 @@ def page_resource_viewer():
                         "Type": st.column_config.TextColumn(width="small"),
                         "Workload": st.column_config.TextColumn(width="medium"),
                         "Container": st.column_config.TextColumn(width="medium"),
-                        "Init": st.column_config.TextColumn(width="small"),
                         "CPU Req": st.column_config.TextColumn(width="small"),
                         "CPU Lim": st.column_config.TextColumn(width="small"),
                         "Mem Req": st.column_config.TextColumn(width="small"),
@@ -2887,20 +2867,20 @@ def page_resource_viewer():
                 # Summary stats
                 st.markdown("---")
                 st.markdown("#### Summary")
-                no_cpu_req = sum(1 for r in all_rows if r["CPU Req"] == "-" and r["Init"] == "")
-                no_mem_req = sum(1 for r in all_rows if r["Mem Req"] == "-" and r["Init"] == "")
-                no_cpu_lim = sum(1 for r in all_rows if r["CPU Lim"] == "-" and r["Init"] == "")
-                no_mem_lim = sum(1 for r in all_rows if r["Mem Lim"] == "-" and r["Init"] == "")
-                non_init = sum(1 for r in all_rows if r["Init"] == "")
+                total_ctr = len(all_rows)
+                no_cpu_req = sum(1 for r in all_rows if r["CPU Req"] == "-")
+                no_mem_req = sum(1 for r in all_rows if r["Mem Req"] == "-")
+                no_cpu_lim = sum(1 for r in all_rows if r["CPU Lim"] == "-")
+                no_mem_lim = sum(1 for r in all_rows if r["Mem Lim"] == "-")
                 sc1, sc2, sc3, sc4 = st.columns(4)
                 with sc1:
-                    st.metric("No CPU Request", f"{no_cpu_req}/{non_init}")
+                    st.metric("No CPU Request", f"{no_cpu_req}/{total_ctr}")
                 with sc2:
-                    st.metric("No CPU Limit", f"{no_cpu_lim}/{non_init}")
+                    st.metric("No CPU Limit", f"{no_cpu_lim}/{total_ctr}")
                 with sc3:
-                    st.metric("No Mem Request", f"{no_mem_req}/{non_init}")
+                    st.metric("No Mem Request", f"{no_mem_req}/{total_ctr}")
                 with sc4:
-                    st.metric("No Mem Limit", f"{no_mem_lim}/{non_init}")
+                    st.metric("No Mem Limit", f"{no_mem_lim}/{total_ctr}")
 
                 if no_cpu_req > 0 or no_mem_req > 0:
                     st.warning(
@@ -2914,10 +2894,10 @@ def page_resource_viewer():
                     )
 
                 # Download as TSV
-                tsv_lines = ["Type\tWorkload\tContainer\tInit\tCPU Req\tCPU Lim\tMem Req\tMem Lim\tEph Req\tEph Lim"]
+                tsv_lines = ["Type\tWorkload\tContainer\tCPU Req\tCPU Lim\tMem Req\tMem Lim\tEph Req\tEph Lim"]
                 for r in all_rows:
                     tsv_lines.append(
-                        f"{r['Type']}\t{r['Workload']}\t{r['Container']}\t{r['Init']}\t"
+                        f"{r['Type']}\t{r['Workload']}\t{r['Container']}\t"
                         f"{r['CPU Req']}\t{r['CPU Lim']}\t{r['Mem Req']}\t{r['Mem Lim']}\t"
                         f"{r['Eph Req']}\t{r['Eph Lim']}"
                     )
@@ -2959,8 +2939,54 @@ def page_resource_viewer():
                     )
                 if node_result.success and node_result.stdout.strip():
                     node_names = [n.strip() for n in node_result.stdout.strip().split("\n") if n.strip()]
+
+                    # Pod count distribution across nodes
+                    node_pod_counts: dict[str, int] = {}
+                    for node_name in node_names:
+                        with st.spinner(f"Fetching pods on {node_name}..."):
+                            count_result = run_kubectl(
+                                profile,
+                                f"get pods -A --field-selector spec.nodeName={node_name} "
+                                "--no-headers",
+                                timeout=15,
+                            )
+                            if count_result.success:
+                                lines = [l for l in (count_result.stdout or "").strip().split("\n") if l.strip()]
+                                node_pod_counts[node_name] = len(lines)
+                            else:
+                                node_pod_counts[node_name] = 0
+
+                    # Show pod distribution summary
+                    st.markdown("#### Pod Distribution Across Nodes")
+                    dist_cols = st.columns(min(len(node_names), 6))
+                    for idx, node_name in enumerate(node_names):
+                        with dist_cols[idx % min(len(node_names), 6)]:
+                            st.metric(node_name, f"{node_pod_counts.get(node_name, 0)} pods")
+                    total_pods = sum(node_pod_counts.values())
+                    if total_pods > 0 and len(node_names) > 1:
+                        avg_pods = total_pods / len(node_names)
+                        max_pods = max(node_pod_counts.values())
+                        min_pods = min(node_pod_counts.values())
+                        spread = max_pods - min_pods
+                        st.markdown(
+                            f"**Total:** {total_pods} pods across {len(node_names)} nodes | "
+                            f"**Avg:** {avg_pods:.1f} | **Min:** {min_pods} | **Max:** {max_pods} | "
+                            f"**Spread:** {spread}"
+                        )
+                        if spread > avg_pods * 0.5 and avg_pods > 0:
+                            st.warning(
+                                f"Pod distribution is uneven (spread of {spread}). "
+                                "Consider checking node affinity rules or pod topology spread constraints."
+                            )
+                        else:
+                            st.success("Pods are reasonably well-distributed across nodes.")
+
+                    st.markdown("---")
+
+                    # Detailed per-node pod listing
                     for node_name in node_names:
-                        with st.expander(f"Node: **{node_name}**", expanded=True):
+                        pod_count = node_pod_counts.get(node_name, 0)
+                        with st.expander(f"Node: **{node_name}** ({pod_count} pods)", expanded=True):
                             with st.spinner(f"Fetching containers on {node_name}..."):
                                 pod_result = run_kubectl(
                                     profile,
@@ -3219,115 +3245,6 @@ def page_resource_viewer():
                     st.error("Describe failed")
                     st.code(result.stderr, language="text")
 
-    # ── Helm Releases ────────────────────────────────────────────────────
-    with tab_helm:
-        st.markdown("### Helm Release Manager")
-        st.markdown("List, inspect, and manage Helm releases on your cluster.")
-
-        helm_tab_list, helm_tab_install, helm_tab_history = st.tabs([
-            "List Releases", "Install Chart", "Release History",
-        ])
-
-        with helm_tab_list:
-            helm_ns_all = st.checkbox("All namespaces", value=True, key="helm_ns_all")
-            helm_ns = ""
-            if not helm_ns_all:
-                if _rv_namespaces:
-                    helm_ns = st.selectbox("Namespace", options=_rv_namespaces,
-                                           index=_rv_namespaces.index("default") if "default" in _rv_namespaces else 0,
-                                           key="helm_ns")
-                else:
-                    helm_ns = st.text_input("Namespace", value="default", key="helm_ns")
-
-            if st.button("List Helm Releases", type="primary", key="helm_list"):
-                helm_cmd = "helm list"
-                if helm_ns_all:
-                    helm_cmd += " -A"
-                elif helm_ns:
-                    helm_cmd += f" -n {helm_ns}"
-                helm_cmd += " -o table"
-
-                with st.spinner("Fetching Helm releases..."):
-                    result = run_kubectl(profile, helm_cmd.replace("kubectl ", ""), timeout=15)
-                    if result.success:
-                        st.code(result.stdout or "(no releases found)", language="text")
-                    else:
-                        st.warning("Helm may not be installed on this cluster.")
-                        st.code(result.stderr, language="text")
-
-        with helm_tab_install:
-            st.markdown("#### Install a Helm Chart")
-            hcol1, hcol2 = st.columns(2)
-            with hcol1:
-                helm_release_name = st.text_input("Release Name", placeholder="my-release", key="helm_rel")
-                helm_chart = st.text_input("Chart", placeholder="prometheus-community/kube-prometheus-stack", key="helm_chart")
-            with hcol2:
-                if _rv_namespaces:
-                    helm_install_ns = st.selectbox("Namespace", options=_rv_namespaces,
-                                                   index=_rv_namespaces.index("default") if "default" in _rv_namespaces else 0,
-                                                   key="helm_install_ns")
-                else:
-                    helm_install_ns = st.text_input("Namespace", value="default", key="helm_install_ns")
-                helm_create_ns = st.checkbox("Create namespace if not exists", value=True, key="helm_create_ns")
-            helm_values = st.text_area(
-                "Values (YAML, optional)",
-                placeholder="# Custom values.yaml content here",
-                height=150,
-                key="helm_values",
-            )
-
-            if st.button("Install Chart", type="primary", key="helm_install") and helm_release_name and helm_chart:
-                install_cmd = f"helm install {helm_release_name} {helm_chart} -n {helm_install_ns}"
-                if helm_create_ns:
-                    install_cmd += " --create-namespace"
-                # If user provided values, write to temp file
-                if helm_values.strip():
-                    values_path = os.path.join(config.UPLOADS_DIR, f"helm-values-{helm_release_name}.yaml")
-                    with open(values_path, "w") as vf:
-                        vf.write(helm_values)
-                    install_cmd += f" -f {values_path}"
-
-                with st.spinner(f"Installing {helm_chart}..."):
-                    result = run_kubectl(profile, install_cmd.replace("kubectl ", ""), timeout=120)
-                    if result.success:
-                        st.success(f"Release '{helm_release_name}' installed!")
-                        st.code(result.stdout, language="text")
-                    else:
-                        st.error("Helm install failed")
-                        st.code(result.stderr, language="text")
-
-        with helm_tab_history:
-            st.markdown("#### Release History")
-            hist_name = st.text_input("Release name", placeholder="my-release", key="helm_hist_name")
-            hist_ns = st.text_input("Namespace", value="default", key="helm_hist_ns")
-
-            if st.button("Get History", key="helm_hist") and hist_name:
-                hist_cmd = f"helm history {hist_name} -n {hist_ns}"
-                with st.spinner("Fetching history..."):
-                    result = run_kubectl(profile, hist_cmd.replace("kubectl ", ""), timeout=15)
-                    if result.success:
-                        st.code(result.stdout, language="text")
-                    else:
-                        st.error("Could not get release history")
-                        st.code(result.stderr, language="text")
-
-            st.markdown("---")
-            st.markdown("#### Rollback Release")
-            rb_name = st.text_input("Release name", placeholder="my-release", key="helm_rb_name")
-            rb_ns = st.text_input("Namespace", value="default", key="helm_rb_ns")
-            rb_rev = st.number_input("Revision number", min_value=1, value=1, key="helm_rb_rev")
-
-            if st.button("Rollback", key="helm_rollback") and rb_name:
-                rb_cmd = f"helm rollback {rb_name} {rb_rev} -n {rb_ns}"
-                with st.spinner(f"Rolling back {rb_name} to revision {rb_rev}..."):
-                    result = run_kubectl(profile, rb_cmd.replace("kubectl ", ""), timeout=60)
-                    if result.success:
-                        st.success(f"Rolled back '{rb_name}' to revision {rb_rev}")
-                        st.code(result.stdout, language="text")
-                    else:
-                        st.error("Rollback failed")
-                        st.code(result.stderr, language="text")
-
     # ── Events Timeline ──────────────────────────────────────────────────
     with tab_events:
         st.markdown("### Cluster Events Timeline")
@@ -3592,177 +3509,6 @@ def page_resource_viewer():
                 st.error("Failed to fetch pods")
                 st.code(result.stderr, language="text")
 
-    # ── Network Policy Visualizer ─────────────────────────────────────────
-    with tab_netpol:
-        st.markdown("### Network Policy Visualizer")
-        st.markdown("View and analyze NetworkPolicies to understand pod-to-pod communication rules.")
-
-        npcol1, npcol2 = st.columns([2, 1])
-        with npcol1:
-            if _rv_namespaces:
-                np_ns = st.selectbox("Namespace", ["All Namespaces"] + _rv_namespaces, key="netpol_ns")
-            else:
-                np_ns = st.text_input("Namespace (blank = all)", value="", key="netpol_ns_text")
-                if not np_ns:
-                    np_ns = "All Namespaces"
-
-        if st.button("Load Network Policies", type="primary", key="load_netpol"):
-            ns_flag = "-A" if np_ns == "All Namespaces" else f"-n {np_ns}"
-            cmd = f"get networkpolicies {ns_flag} -o json"
-            with st.spinner("Fetching network policies..."):
-                result = run_kubectl(profile, cmd, timeout=15)
-            if result.success and result.stdout.strip():
-                try:
-                    import pandas as pd
-                    np_json = json.loads(result.stdout)
-                    policies = np_json.get("items", [])
-                    if not policies:
-                        st.info("No NetworkPolicies found. All pod-to-pod traffic is allowed by default.")
-                    else:
-                        st.markdown(f"**Found {len(policies)} NetworkPolicies**")
-
-                        policy_summary = []
-                        for pol in policies:
-                            meta = pol.get("metadata", {})
-                            spec = pol.get("spec", {})
-                            pol_name = meta.get("name", "?")
-                            pol_ns = meta.get("namespace", "?")
-                            # Pod selector
-                            pod_sel = spec.get("podSelector", {})
-                            match_labels = pod_sel.get("matchLabels", {})
-                            selector_str = ", ".join(f"{k}={v}" for k, v in match_labels.items()) if match_labels else "(all pods)"
-                            # Policy types
-                            policy_types = spec.get("policyTypes", [])
-                            # Ingress rules count
-                            ingress_rules = spec.get("ingress", [])
-                            egress_rules = spec.get("egress", [])
-
-                            policy_summary.append({
-                                "Namespace": pol_ns,
-                                "Policy": pol_name,
-                                "Pod Selector": selector_str,
-                                "Types": ", ".join(policy_types) if policy_types else "N/A",
-                                "Ingress Rules": len(ingress_rules),
-                                "Egress Rules": len(egress_rules),
-                            })
-
-                        st.dataframe(pd.DataFrame(policy_summary), use_container_width=True, hide_index=True)
-
-                        # Detailed view per policy
-                        for pol in policies:
-                            meta = pol.get("metadata", {})
-                            spec = pol.get("spec", {})
-                            pol_name = meta.get("name", "?")
-                            pol_ns = meta.get("namespace", "?")
-                            with st.expander(f"{pol_ns}/{pol_name}", expanded=False):
-                                # Pod selector
-                                pod_sel = spec.get("podSelector", {})
-                                match_labels = pod_sel.get("matchLabels", {})
-                                if match_labels:
-                                    st.markdown("**Applies to pods matching:** " + ", ".join(f"`{k}={v}`" for k, v in match_labels.items()))
-                                else:
-                                    st.markdown("**Applies to:** All pods in namespace")
-
-                                # Ingress
-                                ingress_rules = spec.get("ingress", [])
-                                if ingress_rules:
-                                    st.markdown("**Ingress Rules:**")
-                                    for i, rule in enumerate(ingress_rules):
-                                        sources = []
-                                        for fr in rule.get("from", []):
-                                            if "podSelector" in fr:
-                                                labels = fr["podSelector"].get("matchLabels", {})
-                                                sources.append("Pods: " + (", ".join(f"{k}={v}" for k, v in labels.items()) if labels else "all"))
-                                            if "namespaceSelector" in fr:
-                                                labels = fr["namespaceSelector"].get("matchLabels", {})
-                                                sources.append("Namespaces: " + (", ".join(f"{k}={v}" for k, v in labels.items()) if labels else "all"))
-                                            if "ipBlock" in fr:
-                                                sources.append(f"CIDR: {fr['ipBlock'].get('cidr', '?')}")
-                                        ports = []
-                                        for p in rule.get("ports", []):
-                                            ports.append(f"{p.get('protocol', 'TCP')}/{p.get('port', '*')}")
-                                        src_str = ", ".join(sources) if sources else "any"
-                                        port_str = ", ".join(ports) if ports else "all ports"
-                                        st.markdown(f"  - Rule {i+1}: Allow from **{src_str}** on **{port_str}**")
-                                elif "Ingress" in spec.get("policyTypes", []):
-                                    st.warning("Ingress type declared but no rules — all ingress traffic is **denied**.")
-
-                                # Egress
-                                egress_rules = spec.get("egress", [])
-                                if egress_rules:
-                                    st.markdown("**Egress Rules:**")
-                                    for i, rule in enumerate(egress_rules):
-                                        destinations = []
-                                        for to in rule.get("to", []):
-                                            if "podSelector" in to:
-                                                labels = to["podSelector"].get("matchLabels", {})
-                                                destinations.append("Pods: " + (", ".join(f"{k}={v}" for k, v in labels.items()) if labels else "all"))
-                                            if "namespaceSelector" in to:
-                                                labels = to["namespaceSelector"].get("matchLabels", {})
-                                                destinations.append("Namespaces: " + (", ".join(f"{k}={v}" for k, v in labels.items()) if labels else "all"))
-                                            if "ipBlock" in to:
-                                                destinations.append(f"CIDR: {to['ipBlock'].get('cidr', '?')}")
-                                        ports = []
-                                        for p in rule.get("ports", []):
-                                            ports.append(f"{p.get('protocol', 'TCP')}/{p.get('port', '*')}")
-                                        dest_str = ", ".join(destinations) if destinations else "any"
-                                        port_str = ", ".join(ports) if ports else "all ports"
-                                        st.markdown(f"  - Rule {i+1}: Allow to **{dest_str}** on **{port_str}**")
-                                elif "Egress" in spec.get("policyTypes", []):
-                                    st.warning("Egress type declared but no rules — all egress traffic is **denied**.")
-
-                                st.markdown("---")
-                                st.markdown("**Raw YAML:**")
-                                import yaml
-                                st.code(yaml.dump(pol, default_flow_style=False), language="yaml")
-
-                        # Coverage check
-                        st.markdown("---")
-                        st.markdown("#### Coverage Analysis")
-                        if st.button("Check Unprotected Pods", key="netpol_coverage"):
-                            # Get all pods and check which are selected by a policy
-                            pod_ns_flag = f"-n {np_ns}" if np_ns != "All Namespaces" else "-A"
-                            pod_cmd = f"get pods {pod_ns_flag} -o json"
-                            with st.spinner("Analyzing coverage..."):
-                                pod_result = run_kubectl(profile, pod_cmd, timeout=15)
-                            if pod_result.success and pod_result.stdout.strip():
-                                try:
-                                    all_pods = json.loads(pod_result.stdout).get("items", [])
-                                    protected_pods = set()
-                                    for pol in policies:
-                                        pol_ns_name = pol.get("metadata", {}).get("namespace", "")
-                                        pod_sel = pol.get("spec", {}).get("podSelector", {})
-                                        match_labels = pod_sel.get("matchLabels", {})
-                                        for p in all_pods:
-                                            p_ns = p.get("metadata", {}).get("namespace", "")
-                                            p_name = p.get("metadata", {}).get("name", "")
-                                            p_labels = p.get("metadata", {}).get("labels", {})
-                                            if p_ns != pol_ns_name:
-                                                continue
-                                            if not match_labels or all(p_labels.get(k) == v for k, v in match_labels.items()):
-                                                protected_pods.add(f"{p_ns}/{p_name}")
-                                    unprotected = []
-                                    for p in all_pods:
-                                        p_ns = p.get("metadata", {}).get("namespace", "")
-                                        p_name = p.get("metadata", {}).get("name", "")
-                                        if f"{p_ns}/{p_name}" not in protected_pods:
-                                            unprotected.append({"Namespace": p_ns, "Pod": p_name})
-                                    if unprotected:
-                                        st.warning(f"{len(unprotected)} pod(s) are **not covered** by any NetworkPolicy (all traffic allowed):")
-                                        st.dataframe(pd.DataFrame(unprotected), use_container_width=True, hide_index=True)
-                                    else:
-                                        st.success("All pods are covered by at least one NetworkPolicy.")
-                                except (json.JSONDecodeError, KeyError):
-                                    st.error("Failed to parse pod data for coverage analysis.")
-
-                except (json.JSONDecodeError, KeyError) as e:
-                    st.error(f"Failed to parse network policy data: {e}")
-            elif result.success:
-                st.info("No NetworkPolicies found. All pod-to-pod traffic is allowed by default.")
-            else:
-                st.error("Failed to fetch network policies")
-                st.code(result.stderr, language="text")
-
     # ── PVC / Storage Dashboard ───────────────────────────────────────────
     with tab_pvc:
         st.markdown("### PVC / Storage Dashboard")

From 368b50dcdd78894704b9ad0209ce43433ca15ca6 Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Wed, 8 Apr 2026 08:17:46 +0000
Subject: [PATCH 27/31] Fix: restrict profile JSON file permissions to 0600 to
 protect kubeconfig credentials

---
 k8s-agent/modules/profile_manager.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/k8s-agent/modules/profile_manager.py b/k8s-agent/modules/profile_manager.py
index d5707cd..0b7eb10 100644
--- a/k8s-agent/modules/profile_manager.py
+++ b/k8s-agent/modules/profile_manager.py
@@ -82,7 +82,8 @@ def save_profile(profile: ClusterProfile) -> str:
     profile.updated_at = now
 
     path = _profile_path(profile.name)
-    with open(path, "w") as f:
+    fd = os.open(path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
+    with os.fdopen(fd, "w") as f:
         json.dump(asdict(profile), f, indent=2)
     return path
 

From b06dbf3e63b77c526ba75ca9d6e7ef45cc52cbf4 Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Fri, 10 Apr 2026 03:00:45 +0000
Subject: [PATCH 28/31] Fix Set Active profile button (delete widget state
 before rerun), quote kubeconfig paths in shell commands

---
 k8s-agent/app.py                      | 3 ++-
 k8s-agent/config.py                   | 2 +-
 k8s-agent/modules/cluster_debugger.py | 2 +-
 3 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/k8s-agent/app.py b/k8s-agent/app.py
index 39f0be2..2f8030b 100644
--- a/k8s-agent/app.py
+++ b/k8s-agent/app.py
@@ -576,7 +576,8 @@ def page_profile_manager():
                     st.markdown(f"**Updated:** {profile.updated_at[:10] if profile.updated_at else 'N/A'}")
                     if st.button("Set Active", key=f"activate_{profile.name}"):
                         st.session_state.active_profile = profile.name
-                        st.session_state.profile_selector = profile.name
+                        if "profile_selector" in st.session_state:
+                            del st.session_state["profile_selector"]
                         st.rerun()
                     if st.button("Delete", key=f"delete_{profile.name}", type="secondary"):
                         delete_profile(profile.name)
diff --git a/k8s-agent/config.py b/k8s-agent/config.py
index 90eb95b..6cfe0cf 100644
--- a/k8s-agent/config.py
+++ b/k8s-agent/config.py
@@ -120,7 +120,7 @@ def fetch_namespaces(kubeconfig_content: str) -> list[str]:
         f.write(kubeconfig_content)
     try:
         proc = subprocess.run(
-            f"{kubectl} --kubeconfig={kc_path} get namespaces -o jsonpath='{{.items[*].metadata.name}}'",
+            f"{kubectl} --kubeconfig=\"{kc_path}\" get namespaces -o jsonpath='{{.items[*].metadata.name}}'",
             shell=True, capture_output=True, text=True, timeout=15,
         )
         if proc.returncode == 0 and proc.stdout.strip():
diff --git a/k8s-agent/modules/cluster_debugger.py b/k8s-agent/modules/cluster_debugger.py
index 6815a42..df80c9d 100644
--- a/k8s-agent/modules/cluster_debugger.py
+++ b/k8s-agent/modules/cluster_debugger.py
@@ -77,7 +77,7 @@ def _run_local_kubectl(kubeconfig_content: str, kubectl_args: str, timeout: int
     kubeconfig_path = config.get_kubeconfig_path("_debug_temp")
     with open(kubeconfig_path, "w") as f:
         f.write(kubeconfig_content)
-    full_cmd = f"{kubectl} --kubeconfig={kubeconfig_path} {kubectl_args}"
+    full_cmd = f"{kubectl} --kubeconfig=\"{kubeconfig_path}\" {kubectl_args}"
     try:
         proc = subprocess.run(full_cmd, shell=True, capture_output=True, text=True, timeout=timeout)
         return SSHResult(

From 0d9734d7b71e8f727c15d4a0ccc85c8ee0ff8c76 Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Fri, 10 Apr 2026 03:05:41 +0000
Subject: [PATCH 29/31] Add 'Collect pod logs' mode to Smart Log Analysis for
 Istio access log analysis

---
 k8s-agent/app.py | 149 ++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 147 insertions(+), 2 deletions(-)

diff --git a/k8s-agent/app.py b/k8s-agent/app.py
index 2f8030b..66d12fb 100644
--- a/k8s-agent/app.py
+++ b/k8s-agent/app.py
@@ -1824,9 +1824,16 @@ def page_log_analysis():
             "**pattern mining** (Drain-style), and **auto-summarization**."
         )
 
+        st.info(
+            "**Istio access log analysis:** Select **Collect pod logs** and choose an "
+            "application pod with an Istio sidecar (e.g. `istio-proxy` container). "
+            "The pipeline auto-detects Envoy/Istio access logs and shows response time "
+            "analytics, status codes, per-path breakdowns, and slow requests."
+        )
+
         smart_mode = st.radio(
             "Analysis mode",
-            ["Collect from cluster", "Paste logs"],
+            ["Collect from cluster", "Collect pod logs", "Paste logs"],
             horizontal=True,
             key="smart_mode",
         )
@@ -1873,11 +1880,149 @@ def page_log_analysis():
             if "_smart_log_text" in st.session_state and not smart_log_text:
                 smart_log_text = st.session_state["_smart_log_text"]
 
+        elif smart_mode == "Collect pod logs":
+            st.markdown(
+                "Fetch logs from a specific pod — ideal for **Istio sidecar** access "
+                "logs (`istio-proxy` container) or any application pod."
+            )
+            spcol1, spcol2 = st.columns(2)
+            with spcol1:
+                if _cluster_namespaces:
+                    smart_pod_ns = st.selectbox(
+                        "Namespace", options=_cluster_namespaces,
+                        index=_cluster_namespaces.index("default") if "default" in _cluster_namespaces else 0,
+                        key="smart_pod_ns",
+                    )
+                else:
+                    smart_pod_ns = st.text_input("Namespace", value="default", key="smart_pod_ns")
+            with spcol2:
+                smart_pod_lines = st.number_input(
+                    "Lines to fetch", min_value=100, max_value=10000, value=1000, key="smart_pod_lines",
+                )
+
+            # Load pods button
+            if st.button("Load Pods", key="smart_load_pods"):
+                with st.spinner(f"Fetching pods in namespace '{smart_pod_ns}'..."):
+                    pod_result = get_pod_list(cp_node, namespace=smart_pod_ns, profile=profile)
+                    if pod_result.success and pod_result.stdout.strip():
+                        _sp_pods: list[dict] = []
+                        for line in pod_result.stdout.strip().split("\n"):
+                            parts = line.split()
+                            if len(parts) >= 2:
+                                _sp_pods.append({
+                                    "namespace": parts[0],
+                                    "name": parts[1],
+                                    "status": parts[2] if len(parts) > 2 else "Unknown",
+                                    "containers": parts[3] if len(parts) > 3 else "",
+                                })
+                        st.session_state["_smart_pod_list"] = _sp_pods
+                        st.session_state["_smart_pod_list_ns"] = smart_pod_ns
+                        st.success(f"Found {len(_sp_pods)} pod(s) in namespace '{smart_pod_ns}'.")
+                    elif pod_result.success:
+                        st.session_state["_smart_pod_list"] = []
+                        st.warning(f"No pods found in namespace '{smart_pod_ns}'.")
+                    else:
+                        st.error(f"Failed to fetch pods: {pod_result.stderr}")
+
+            # Pod & container selection
+            _sp_pods_loaded = st.session_state.get("_smart_pod_list", [])
+            _sp_pods_ns = st.session_state.get("_smart_pod_list_ns", "")
+
+            sp_col1, sp_col2 = st.columns(2)
+            with sp_col1:
+                if _sp_pods_loaded and _sp_pods_ns == smart_pod_ns:
+                    sp_pod_options = [f"{p['name']}  ({p['status']})" for p in _sp_pods_loaded]
+                    sp_selected_idx = st.selectbox(
+                        "Pod Name", options=range(len(sp_pod_options)),
+                        format_func=lambda i: sp_pod_options[i],
+                        key="smart_pod_select",
+                    )
+                    smart_pod_name = _sp_pods_loaded[sp_selected_idx]["name"] if sp_selected_idx is not None else ""
+                else:
+                    smart_pod_name = st.text_input(
+                        "Pod Name", placeholder="Click 'Load Pods' to get dropdown", key="smart_pod_name",
+                    )
+            with sp_col2:
+                # Container selection — show istio-proxy hint
+                if _sp_pods_loaded and _sp_pods_ns == smart_pod_ns and smart_pod_name:
+                    matching = [p for p in _sp_pods_loaded if p["name"] == smart_pod_name]
+                    container_names = []
+                    if matching and matching[0].get("containers"):
+                        container_names = [c.strip() for c in matching[0]["containers"].split(",") if c.strip()]
+                    if container_names:
+                        container_names = ["(all / default)"] + container_names
+                        # Pre-select istio-proxy if available
+                        default_idx = 0
+                        for idx, cn in enumerate(container_names):
+                            if cn == "istio-proxy":
+                                default_idx = idx
+                                break
+                        smart_pod_container = st.selectbox(
+                            "Container (select `istio-proxy` for Istio access logs)",
+                            options=container_names,
+                            index=default_idx,
+                            key="smart_pod_container",
+                        )
+                        if smart_pod_container == "(all / default)":
+                            smart_pod_container = ""
+                    else:
+                        smart_pod_container = st.text_input(
+                            "Container (e.g. istio-proxy)",
+                            value="istio-proxy",
+                            key="smart_pod_container_text",
+                        )
+                else:
+                    smart_pod_container = st.text_input(
+                        "Container (e.g. istio-proxy for Istio access logs)",
+                        value="istio-proxy",
+                        key="smart_pod_container_text2",
+                    )
+
+            # Fetch & analyze
+            smart_pod_since_opts = {
+                "Last 15 min": "15m",
+                "Last 1 hour": "1h",
+                "Last 6 hours": "6h",
+                "Last 24 hours": "24h",
+            }
+            smart_pod_since_label = st.selectbox(
+                "Time Range", list(smart_pod_since_opts.keys()), index=1, key="smart_pod_since",
+            )
+            smart_pod_since_k8s = smart_pod_since_opts[smart_pod_since_label]
+
+            if st.button("Fetch Pod Logs & Analyze", type="primary", key="smart_pod_collect"):
+                if not smart_pod_name:
+                    st.warning("Please select or enter a pod name.")
+                else:
+                    with st.spinner(f"Fetching logs from pod '{smart_pod_name}' (container: {smart_pod_container or 'default'})..."):
+                        pod_log_result = collect_pod_logs(
+                            cp_node,
+                            namespace=smart_pod_ns,
+                            pod_name=smart_pod_name,
+                            container=smart_pod_container,
+                            lines=smart_pod_lines,
+                            since_k8s=smart_pod_since_k8s,
+                            profile=profile,
+                        )
+                    if pod_log_result.success and pod_log_result.stdout.strip():
+                        smart_log_text = pod_log_result.stdout
+                        st.session_state["_smart_log_text"] = smart_log_text
+                        st.session_state["_smart_source"] = f"pod:{smart_pod_name}/{smart_pod_container or 'default'}"
+                        st.success(f"Fetched {len(smart_log_text.splitlines())} log lines from pod '{smart_pod_name}'.")
+                    elif pod_log_result.success:
+                        st.info(f"No logs returned from pod '{smart_pod_name}' for the selected time range.")
+                    else:
+                        st.error(f"Failed to fetch pod logs: {pod_log_result.stderr}")
+
+            # Persist across reruns
+            if "_smart_log_text" in st.session_state and not smart_log_text:
+                smart_log_text = st.session_state["_smart_log_text"]
+
         else:
             smart_log_text = st.text_area(
                 "Paste log output",
                 height=200,
-                placeholder="Paste your Kubernetes logs here for smart analysis...",
+                placeholder="Paste your Kubernetes / Istio access logs here for smart analysis...",
                 key="smart_paste",
             )
             if smart_log_text:

From 45560a6e89f3224d10408f29ffe382820d84d082 Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Fri, 10 Apr 2026 03:16:18 +0000
Subject: [PATCH 30/31] Fix profile switching: sync widget key with
 active_profile before selectbox renders, add on_change callback, delete
 profile_selector on all profile state changes

---
 k8s-agent/app.py | 43 +++++++++++++++++++++++++++++++++++++------
 1 file changed, 37 insertions(+), 6 deletions(-)

diff --git a/k8s-agent/app.py b/k8s-agent/app.py
index 66d12fb..d2ab0e5 100644
--- a/k8s-agent/app.py
+++ b/k8s-agent/app.py
@@ -176,16 +176,41 @@ def render_sidebar():
         profile_names = [p.name for p in profiles]
 
         if profile_names:
+            _selector_options = ["(none)"] + profile_names
+            _current = st.session_state.get("active_profile")
+            _default_idx = (
+                profile_names.index(_current) + 1
+                if _current and _current in profile_names
+                else 0
+            )
+
+            # Sync widget key with active_profile before widget renders.
+            # Streamlit reads the widget value from session_state[key] when
+            # the key already exists, ignoring ``index``.  So we must write
+            # the desired option into session_state["profile_selector"]
+            # *before* the selectbox is instantiated.
+            if "profile_selector" not in st.session_state:
+                # First render or key was deleted — seed from active_profile
+                st.session_state["profile_selector"] = _selector_options[_default_idx]
+            elif st.session_state["profile_selector"] not in _selector_options:
+                # Profile was deleted — reset
+                st.session_state["profile_selector"] = "(none)"
+
+            def _on_profile_change():
+                sel = st.session_state.get("profile_selector", "(none)")
+                if sel != "(none)":
+                    st.session_state.active_profile = sel
+                else:
+                    st.session_state.active_profile = None
+
             selected = st.selectbox(
                 "Active Profile",
-                options=["(none)"] + profile_names,
-                index=(
-                    profile_names.index(st.session_state.active_profile) + 1
-                    if st.session_state.active_profile in profile_names
-                    else 0
-                ),
+                options=_selector_options,
                 key="profile_selector",
+                on_change=_on_profile_change,
             )
+
+            # Also keep active_profile in sync on this run
             if selected != "(none)":
                 st.session_state.active_profile = selected
                 profile = load_profile(selected)
@@ -475,6 +500,8 @@ def page_profile_manager():
                     )
                     path = save_profile(profile)
                     st.session_state.active_profile = name
+                    if "profile_selector" in st.session_state:
+                        del st.session_state["profile_selector"]
                     st.session_state._flash_message = ("success", f"Profile '{name}' created successfully! Select it from the sidebar to get started.")
                     st.rerun()
 
@@ -532,6 +559,8 @@ def page_profile_manager():
                 try:
                     save_profile(profile)
                     st.session_state.active_profile = import_name
+                    if "profile_selector" in st.session_state:
+                        del st.session_state["profile_selector"]
                     st.session_state._flash_message = (
                         "success",
                         f"Cluster '{import_name}' imported successfully! "
@@ -583,6 +612,8 @@ def page_profile_manager():
                         delete_profile(profile.name)
                         if st.session_state.active_profile == profile.name:
                             st.session_state.active_profile = None
+                        if "profile_selector" in st.session_state:
+                            del st.session_state["profile_selector"]
                         st.rerun()
 
     # ── Import / Export ───────────────────────────────────────────────────

From c89f8cc940a919b1c90bcff9c742fd70720fcee2 Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Fri, 10 Apr 2026 15:49:16 +0000
Subject: [PATCH 31/31] Add Ollama LLM support: provider selection
 (OpenAI/Ollama), local Ollama connection, model fetching, streaming support

---
 k8s-agent/app.py                | 107 ++++++++++++----
 k8s-agent/config.py             |  29 ++++-
 k8s-agent/modules/llm_client.py | 211 ++++++++++++++++++++------------
 3 files changed, 241 insertions(+), 106 deletions(-)

diff --git a/k8s-agent/app.py b/k8s-agent/app.py
index d2ab0e5..2e06614 100644
--- a/k8s-agent/app.py
+++ b/k8s-agent/app.py
@@ -87,7 +87,7 @@
     summarize_logs,
     analyze_istio_access_logs,
 )
-from modules.llm_client import query_llm, stream_llm
+from modules.llm_client import query_llm, stream_llm, list_ollama_models
 
 
 # ── Page Configuration ────────────────────────────────────────────────────
@@ -263,27 +263,82 @@ def _on_profile_change():
 
         # ── LLM config ──
         with st.expander("LLM Settings"):
-            st.text_input(
-                "API URL",
-                value=config.LLM_API_URL,
-                key="llm_api_url",
-                help="Endpoint for the LLM API",
-            )
-            st.text_input(
-                "API Key",
-                value=config.LLM_API_KEY[:8] + "..." if config.LLM_API_KEY else "",
-                type="password",
-                key="llm_api_key_display",
-                disabled=True,
-                help="Set via LLM_API_KEY or INFOSYS_CODER_API_KEY env var",
-            )
-            st.selectbox(
-                "Model",
-                options=["gpt-4", "gpt-4o", "gpt-3.5-turbo"],
-                index=0,
-                key="llm_model_select",
+            provider_options = ["openai", "ollama"]
+            _prov_idx = provider_options.index(config.LLM_PROVIDER) if config.LLM_PROVIDER in provider_options else 0
+            llm_provider = st.selectbox(
+                "Provider",
+                options=provider_options,
+                format_func=lambda p: {"openai": "OpenAI-compatible", "ollama": "Ollama (local)"}[p],
+                index=_prov_idx,
+                key="llm_provider_select",
+                help="Select 'Ollama (local)' to connect to a local Ollama instance",
             )
 
+            if llm_provider == "ollama":
+                ollama_url = st.text_input(
+                    "Ollama URL",
+                    value=config.OLLAMA_BASE_URL,
+                    key="ollama_url_input",
+                    help="Base URL for your Ollama instance (e.g. http://10.73.98.113:11434)",
+                )
+                # Fetch models button
+                if st.button("Fetch available models", key="ollama_fetch_models"):
+                    with st.spinner("Connecting to Ollama..."):
+                        models = list_ollama_models(ollama_url)
+                        if models:
+                            st.session_state["_ollama_models"] = models
+                            st.success(f"Found {len(models)} model(s)")
+                        else:
+                            st.error(f"Could not connect to Ollama at {ollama_url}")
+                _cached_models = st.session_state.get("_ollama_models", [])
+                if _cached_models:
+                    st.selectbox(
+                        "Model",
+                        options=_cached_models,
+                        index=0,
+                        key="ollama_model_select",
+                    )
+                else:
+                    st.text_input(
+                        "Model",
+                        value=config.OLLAMA_MODEL,
+                        key="ollama_model_input",
+                        help="Model name (e.g. llama3, mistral, codellama)",
+                    )
+
+                # Apply Ollama settings at runtime
+                config.LLM_PROVIDER = "ollama"
+                config.OLLAMA_BASE_URL = ollama_url
+                _sel_model = st.session_state.get("ollama_model_select") or st.session_state.get("ollama_model_input", config.OLLAMA_MODEL)
+                config.OLLAMA_MODEL = _sel_model
+
+                if config.is_llm_configured():
+                    st.caption(f"✓ Ollama configured → `{config.OLLAMA_BASE_URL}` / `{config.OLLAMA_MODEL}`")
+                else:
+                    st.caption("Enter the Ollama URL above to enable AI features")
+            else:
+                st.text_input(
+                    "API URL",
+                    value=config.LLM_API_URL,
+                    key="llm_api_url",
+                    help="Endpoint for the LLM API",
+                )
+                st.text_input(
+                    "API Key",
+                    value=config.LLM_API_KEY[:8] + "..." if config.LLM_API_KEY else "",
+                    type="password",
+                    key="llm_api_key_display",
+                    disabled=True,
+                    help="Set via LLM_API_KEY or INFOSYS_CODER_API_KEY env var",
+                )
+                st.selectbox(
+                    "Model",
+                    options=["gpt-4", "gpt-4o", "gpt-3.5-turbo"],
+                    index=0,
+                    key="llm_model_select",
+                )
+                config.LLM_PROVIDER = "openai"
+
         return selected_page
 
 
@@ -1100,7 +1155,7 @@ def page_cluster_creation():
         st.markdown("### AI Cluster Setup Advisor")
         if not is_llm_configured():
             st.info(
-                "LLM is not configured. Set `LLM_API_URL` and `LLM_API_KEY` "
+                "LLM is not configured. Select a provider (OpenAI or Ollama) in the sidebar LLM Settings, or set `LLM_API_URL` and `LLM_API_KEY` "
                 "environment variables to enable AI-powered recommendations."
             )
         else:
@@ -1183,7 +1238,7 @@ def page_cluster_debugger():
 
         if st.session_state.debug_results:
             if not is_llm_configured():
-                st.info("Enable AI analysis by setting `LLM_API_URL` and `LLM_API_KEY` env vars.")
+                st.info("Enable AI analysis by selecting a provider (OpenAI or Ollama) in the sidebar LLM Settings.")
             elif st.button("Analyze with AI", type="secondary"):
                 with st.spinner("AI is analyzing diagnostics..."):
                     analysis = analyze_diagnostics(
@@ -1237,7 +1292,7 @@ def page_cluster_debugger():
         st.markdown("### AI Debug Assistant")
         if not is_llm_configured():
             st.info(
-                "LLM is not configured. Set `LLM_API_URL` and `LLM_API_KEY` "
+                "LLM is not configured. Select a provider (OpenAI or Ollama) in the sidebar LLM Settings, or set `LLM_API_URL` and `LLM_API_KEY` "
                 "environment variables to enable AI-powered debugging."
             )
             st.markdown(
@@ -1571,7 +1626,7 @@ def page_monitoring_setup():
         st.markdown("### AI Monitoring Advisor")
         if not is_llm_configured():
             st.info(
-                "LLM is not configured. Set `LLM_API_URL` and `LLM_API_KEY` "
+                "LLM is not configured. Select a provider (OpenAI or Ollama) in the sidebar LLM Settings, or set `LLM_API_URL` and `LLM_API_KEY` "
                 "environment variables to enable AI-powered monitoring advice."
             )
         else:
@@ -2444,7 +2499,7 @@ def page_log_analysis():
         st.markdown("### AI-Powered Log Analysis")
         if not is_llm_configured():
             st.info(
-                "LLM is not configured. Set `LLM_API_URL` and `LLM_API_KEY` "
+                "LLM is not configured. Select a provider (OpenAI or Ollama) in the sidebar LLM Settings, or set `LLM_API_URL` and `LLM_API_KEY` "
                 "environment variables to enable AI-powered log analysis."
             )
             st.markdown(
@@ -4150,7 +4205,7 @@ def page_ai_assistant():
 
     if not is_llm_configured():
         st.info(
-            "LLM is not configured. Set `LLM_API_URL` and `LLM_API_KEY` "
+            "LLM is not configured. Select a provider (OpenAI or Ollama) in the sidebar LLM Settings, or set `LLM_API_URL` and `LLM_API_KEY` "
             "environment variables to enable the AI chat assistant."
         )
         st.markdown(
diff --git a/k8s-agent/config.py b/k8s-agent/config.py
index 6cfe0cf..2951fa1 100644
--- a/k8s-agent/config.py
+++ b/k8s-agent/config.py
@@ -6,6 +6,8 @@
 
 
 # LLM Configuration
+# Provider: "openai" (OpenAI-compatible endpoint) or "ollama" (local Ollama)
+LLM_PROVIDER = os.getenv("LLM_PROVIDER", "openai")
 LLM_API_URL = os.getenv(
     "LLM_API_URL",
     "https://aigateway-intern.ad.infosys.com/aigateway/chat/completions",
@@ -15,12 +17,37 @@
 LLM_TEMPERATURE = float(os.getenv("LLM_TEMPERATURE", "0.3"))
 LLM_MAX_TOKENS = int(os.getenv("LLM_MAX_TOKENS", "4096"))
 
+# Ollama-specific defaults
+OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "http://10.73.98.113:11434")
+OLLAMA_MODEL = os.getenv("OLLAMA_MODEL", "llama3")
+
 
 def is_llm_configured() -> bool:
-    """Return True if the LLM endpoint and API key are both set."""
+    """Return True if the LLM is configured.
+
+    For Ollama, only the base URL is required (no API key).
+    For OpenAI-compatible endpoints, both URL and key are required.
+    """
+    if LLM_PROVIDER == "ollama":
+        return bool(OLLAMA_BASE_URL)
     return bool(LLM_API_URL and LLM_API_KEY)
 
 
+def get_active_llm_url() -> str:
+    """Return the effective chat completions URL based on the active provider."""
+    if LLM_PROVIDER == "ollama":
+        base = OLLAMA_BASE_URL.rstrip("/")
+        return f"{base}/api/chat"
+    return LLM_API_URL
+
+
+def get_active_model() -> str:
+    """Return the effective model name based on the active provider."""
+    if LLM_PROVIDER == "ollama":
+        return OLLAMA_MODEL
+    return LLM_MODEL
+
+
 # Application paths
 DATA_DIR = os.path.join(os.path.dirname(__file__), "data")
 PROFILES_DIR = os.path.join(DATA_DIR, "profiles")
diff --git a/k8s-agent/modules/llm_client.py b/k8s-agent/modules/llm_client.py
index fc77be1..ab8cb18 100644
--- a/k8s-agent/modules/llm_client.py
+++ b/k8s-agent/modules/llm_client.py
@@ -1,8 +1,11 @@
-"""LLM client — optional integration with an OpenAI-compatible endpoint.
+"""LLM client — optional integration with OpenAI-compatible or Ollama endpoints.
+
+Supports two providers:
+  * **openai** — Any OpenAI-compatible chat completions API (default).
+  * **ollama** — Local Ollama instance (no API key required).
 
 All public functions gracefully return a fallback message when the LLM is not
-configured (i.e. ``LLM_API_KEY`` or ``LLM_API_URL`` is empty).  The rest of the
-application works without any LLM dependency.
+configured.  The rest of the application works without any LLM dependency.
 """
 
 import json
@@ -13,8 +16,9 @@
 import config
 
 _NOT_CONFIGURED_MSG = (
-    "LLM is not configured. Set the LLM_API_URL and LLM_API_KEY environment "
-    "variables to enable AI-powered features."
+    "LLM is not configured. Set the LLM provider and connection details in "
+    "the sidebar LLM Settings panel, or via environment variables "
+    "(LLM_PROVIDER, LLM_API_URL / OLLAMA_BASE_URL)."
 )
 
 
@@ -32,64 +36,103 @@
 the provided information is insufficient."""
 
 
-def query_llm(
+def _build_messages(
     user_message: str,
     system_message: Optional[str] = None,
     conversation_history: Optional[list[dict]] = None,
-    temperature: Optional[float] = None,
-    max_tokens: Optional[int] = None,
-) -> str:
-    """Send a query to the LLM and return the response text.
-
-    Args:
-        user_message: The user's message/query.
-        system_message: Optional system prompt override.
-        conversation_history: Optional list of prior messages for context.
-        temperature: Optional temperature override.
-        max_tokens: Optional max tokens override.
-
-    Returns:
-        The assistant's response text.
-    """
+) -> list[dict]:
+    """Assemble the messages list shared by both query and stream."""
     messages = []
-
     sys_msg = system_message or SYSTEM_PROMPT
     messages.append({"role": "system", "content": sys_msg})
-
     if conversation_history:
         messages.extend(conversation_history)
-
     messages.append({"role": "user", "content": user_message})
+    return messages
+
+
+def _build_headers() -> dict:
+    """Return request headers for the active provider."""
+    headers = {"Content-Type": "application/json"}
+    if config.LLM_PROVIDER != "ollama" and config.LLM_API_KEY:
+        headers["Authorization"] = f"Bearer {config.LLM_API_KEY}"
+    return headers
 
-    headers = {
-        "Content-Type": "application/json",
-        "Authorization": f"Bearer {config.LLM_API_KEY}",
-    }
 
+def _build_payload(
+    messages: list[dict],
+    temperature: Optional[float] = None,
+    max_tokens: Optional[int] = None,
+    stream: bool = False,
+) -> dict:
+    """Return the request payload for the active provider."""
+    temp = temperature if temperature is not None else config.LLM_TEMPERATURE
+    model = config.get_active_model()
+
+    if config.LLM_PROVIDER == "ollama":
+        payload: dict = {
+            "model": model,
+            "messages": messages,
+            "stream": stream,
+            "options": {
+                "temperature": temp,
+            },
+        }
+        if max_tokens is not None or config.LLM_MAX_TOKENS:
+            payload["options"]["num_predict"] = (
+                max_tokens if max_tokens is not None else config.LLM_MAX_TOKENS
+            )
+        return payload
+
+    # OpenAI-compatible
     payload = {
-        "model": config.LLM_MODEL,
+        "model": model,
         "messages": messages,
-        "temperature": temperature if temperature is not None else config.LLM_TEMPERATURE,
+        "temperature": temp,
         "max_tokens": max_tokens if max_tokens is not None else config.LLM_MAX_TOKENS,
     }
+    if stream:
+        payload["stream"] = True
+    return payload
+
+
+def query_llm(
+    user_message: str,
+    system_message: Optional[str] = None,
+    conversation_history: Optional[list[dict]] = None,
+    temperature: Optional[float] = None,
+    max_tokens: Optional[int] = None,
+) -> str:
+    """Send a query to the LLM and return the response text.
 
+    Supports both OpenAI-compatible and Ollama endpoints.
+    """
     if not config.is_llm_configured():
         return _NOT_CONFIGURED_MSG
 
+    messages = _build_messages(user_message, system_message, conversation_history)
+    headers = _build_headers()
+    payload = _build_payload(messages, temperature, max_tokens, stream=False)
+    url = config.get_active_llm_url()
+
     try:
-        response = requests.post(
-            config.LLM_API_URL,
-            headers=headers,
-            json=payload,
-            timeout=120,
-        )
+        response = requests.post(url, headers=headers, json=payload, timeout=120)
         response.raise_for_status()
         data = response.json()
+
+        # Ollama returns {"message": {"content": "..."}}
+        if config.LLM_PROVIDER == "ollama":
+            return data.get("message", {}).get("content", "")
+
+        # OpenAI returns {"choices": [{"message": {"content": "..."}}]}
         return data["choices"][0]["message"]["content"]
     except requests.exceptions.Timeout:
         return "Error: LLM request timed out. Please try again."
     except requests.exceptions.ConnectionError:
-        return "Error: Could not connect to the LLM endpoint. Please check your network and LLM_API_URL configuration."
+        return (
+            f"Error: Could not connect to the LLM endpoint at {url}. "
+            "Please check your network and LLM configuration."
+        )
     except requests.exceptions.HTTPError as exc:
         return f"Error: LLM API returned HTTP {exc.response.status_code}: {exc.response.text}"
     except (KeyError, IndexError, json.JSONDecodeError) as exc:
@@ -105,58 +148,68 @@ def stream_llm(
 ) -> Generator[str, None, None]:
     """Stream a response from the LLM token-by-token.
 
+    Supports both OpenAI-compatible and Ollama endpoints.
     Yields chunks of text as they arrive from the API.
     """
-    messages = []
-
-    sys_msg = system_message or SYSTEM_PROMPT
-    messages.append({"role": "system", "content": sys_msg})
-
-    if conversation_history:
-        messages.extend(conversation_history)
-
-    messages.append({"role": "user", "content": user_message})
-
-    headers = {
-        "Content-Type": "application/json",
-        "Authorization": f"Bearer {config.LLM_API_KEY}",
-    }
-
-    payload = {
-        "model": config.LLM_MODEL,
-        "messages": messages,
-        "temperature": temperature if temperature is not None else config.LLM_TEMPERATURE,
-        "max_tokens": max_tokens if max_tokens is not None else config.LLM_MAX_TOKENS,
-        "stream": True,
-    }
-
     if not config.is_llm_configured():
         yield _NOT_CONFIGURED_MSG
         return
 
+    messages = _build_messages(user_message, system_message, conversation_history)
+    headers = _build_headers()
+    payload = _build_payload(messages, temperature, max_tokens, stream=True)
+    url = config.get_active_llm_url()
+
     try:
         response = requests.post(
-            config.LLM_API_URL,
-            headers=headers,
-            json=payload,
-            timeout=120,
-            stream=True,
+            url, headers=headers, json=payload, timeout=120, stream=True,
         )
         response.raise_for_status()
 
-        for line in response.iter_lines(decode_unicode=True):
-            if not line or not line.startswith("data: "):
-                continue
-            data_str = line[len("data: "):]
-            if data_str.strip() == "[DONE]":
-                break
-            try:
-                chunk = json.loads(data_str)
-                delta = chunk.get("choices", [{}])[0].get("delta", {})
-                content = delta.get("content", "")
-                if content:
-                    yield content
-            except (json.JSONDecodeError, KeyError, IndexError):
-                continue
+        if config.LLM_PROVIDER == "ollama":
+            # Ollama streams newline-delimited JSON objects
+            for line in response.iter_lines(decode_unicode=True):
+                if not line:
+                    continue
+                try:
+                    chunk = json.loads(line)
+                    content = chunk.get("message", {}).get("content", "")
+                    if content:
+                        yield content
+                    if chunk.get("done", False):
+                        break
+                except json.JSONDecodeError:
+                    continue
+        else:
+            # OpenAI SSE format: "data: {...}\n"
+            for line in response.iter_lines(decode_unicode=True):
+                if not line or not line.startswith("data: "):
+                    continue
+                data_str = line[len("data: "):]
+                if data_str.strip() == "[DONE]":
+                    break
+                try:
+                    chunk = json.loads(data_str)
+                    delta = chunk.get("choices", [{}])[0].get("delta", {})
+                    content = delta.get("content", "")
+                    if content:
+                        yield content
+                except (json.JSONDecodeError, KeyError, IndexError):
+                    continue
     except requests.exceptions.RequestException as exc:
         yield f"\n\nError during streaming: {exc}"
+
+
+def list_ollama_models(base_url: str = "") -> list[str]:
+    """Fetch available model names from an Ollama instance.
+
+    Returns a list of model name strings, or an empty list on failure.
+    """
+    url = (base_url or config.OLLAMA_BASE_URL).rstrip("/") + "/api/tags"
+    try:
+        resp = requests.get(url, timeout=10)
+        resp.raise_for_status()
+        data = resp.json()
+        return [m["name"] for m in data.get("models", [])]
+    except Exception:
+        return []