From cb21aa03eae67bb096db4ebe17a6111e69f20e2d Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Mon, 6 Apr 2026 09:48:06 +0000
Subject: [PATCH 01/31] Add K8s Agent: Streamlit-based on-prem Kubernetes
cluster management UI
- Profile Manager: CRUD for cluster profiles with node definitions (control-plane/worker), SSH credentials
- Cluster Creation: SSH-based provisioning with CRI-O, Flannel CNI, kubeadm, best practices hardening
- Cluster Debugger: Diagnostic commands with AI-powered root cause analysis and recommendations
- Monitoring Setup: One-click Prometheus + Grafana deployment with dashboards and alerting rules
- Log Analysis: Multi-source log collection, error pattern extraction, cross-source correlation
- AI Assistant: Chat interface powered by LLM for Kubernetes questions
- Integrated with Infosys AI Gateway for LLM capabilities
---
.gitignore | 1 +
k8s-agent/README.md | 87 ++
k8s-agent/app.py | 1111 +++++++++++++++++++++++++
k8s-agent/config.py | 21 +
k8s-agent/data/profiles/.gitkeep | 1 +
k8s-agent/modules/__init__.py | 1 +
k8s-agent/modules/cluster_creator.py | 545 ++++++++++++
k8s-agent/modules/cluster_debugger.py | 228 +++++
k8s-agent/modules/llm_client.py | 145 ++++
k8s-agent/modules/log_analyzer.py | 345 ++++++++
k8s-agent/modules/monitoring_setup.py | 440 ++++++++++
k8s-agent/modules/profile_manager.py | 119 +++
k8s-agent/requirements.txt | 6 +
k8s-agent/templates/.gitkeep | 1 +
14 files changed, 3051 insertions(+)
create mode 100644 k8s-agent/README.md
create mode 100644 k8s-agent/app.py
create mode 100644 k8s-agent/config.py
create mode 100644 k8s-agent/data/profiles/.gitkeep
create mode 100644 k8s-agent/modules/__init__.py
create mode 100644 k8s-agent/modules/cluster_creator.py
create mode 100644 k8s-agent/modules/cluster_debugger.py
create mode 100644 k8s-agent/modules/llm_client.py
create mode 100644 k8s-agent/modules/log_analyzer.py
create mode 100644 k8s-agent/modules/monitoring_setup.py
create mode 100644 k8s-agent/modules/profile_manager.py
create mode 100644 k8s-agent/requirements.txt
create mode 100644 k8s-agent/templates/.gitkeep
diff --git a/.gitignore b/.gitignore
index 9359002..2a1bb18 100644
--- a/.gitignore
+++ b/.gitignore
@@ -22,3 +22,4 @@ charts/*/charts/
*.pem
*.key
kubeconfig*
+k8s-agent/__pycache__/
diff --git a/k8s-agent/README.md b/k8s-agent/README.md
new file mode 100644
index 0000000..17c03e9
--- /dev/null
+++ b/k8s-agent/README.md
@@ -0,0 +1,87 @@
+# K8s Agent — On-Prem Kubernetes Cluster Management
+
+A Streamlit-based UI for managing on-premises Kubernetes clusters with CRI-O container runtime and Flannel CNI.
+
+## Features
+
+1. **Profile Manager** — Create and manage profiles for multiple clusters with node definitions (control-plane / worker), SSH credentials, and K8s configuration.
+
+2. **Cluster Creation** — SSH into nodes and provision a full Kubernetes cluster:
+ - Installs CRI-O container runtime
+ - Installs kubeadm, kubelet, kubectl
+ - Initializes control plane with best-practice kubeadm config
+ - Deploys Flannel CNI
+ - Joins worker nodes automatically
+ - Applies security hardening (NetworkPolicies, RBAC, ResourceQuotas, PodSecurity)
+
+3. **Cluster Debugger** — Run diagnostic commands and get AI-powered analysis:
+ - Pre-built checks for nodes, pods, networking, storage, certificates
+ - Category-based scanning (Cluster Overview, Networking, Security, etc.)
+ - Custom command execution via SSH
+ - AI-powered root cause analysis and remediation recommendations
+
+4. **Monitoring Setup** — Deploy Prometheus + Grafana with production-ready configuration:
+ - One-click kube-prometheus-stack installation
+ - Grafana dashboard imports (cluster overview, node exporter, pods, etcd, API server, etc.)
+ - Alerting rules for node health, pod crashes, disk pressure, etcd latency
+ - AI-powered monitoring recommendations
+
+5. **Log Analysis** — Collect, parse, and correlate logs across cluster components:
+ - System component logs (kubelet, CRI-O, API server, etcd, Flannel, CoreDNS)
+ - Pod-level log collection with previous container support
+ - Automated error pattern extraction and grouping
+ - Cross-source error correlation
+ - AI-powered deep log analysis and root cause identification
+
+6. **AI Assistant** — Chat interface for Kubernetes questions powered by your LLM.
+
+## Quick Start
+
+```bash
+cd k8s-agent
+pip install -r requirements.txt
+
+# Set your LLM API key
+export LLM_API_KEY="your-api-key"
+# Or use the Infosys AI Gateway key
+export INFOSYS_CODER_API_KEY="your-key"
+
+# Run the app
+streamlit run app.py
+```
+
+## Configuration
+
+Environment variables:
+
+| Variable | Description | Default |
+|----------|-------------|---------|
+| `LLM_API_URL` | LLM API endpoint | Infosys AI Gateway |
+| `LLM_API_KEY` | LLM API key | Falls back to `INFOSYS_CODER_API_KEY` |
+| `LLM_MODEL` | Model name | `gpt-4` |
+| `LLM_TEMPERATURE` | Response temperature | `0.3` |
+| `LLM_MAX_TOKENS` | Max response tokens | `4096` |
+
+## Architecture
+
+```
+k8s-agent/
+├── app.py # Main Streamlit application
+├── config.py # Configuration and environment variables
+├── requirements.txt # Python dependencies
+├── modules/
+│ ├── llm_client.py # LLM API integration (query + streaming)
+│ ├── profile_manager.py # Cluster profile CRUD operations
+│ ├── cluster_creator.py # SSH-based cluster provisioning
+│ ├── cluster_debugger.py # Diagnostic commands and AI analysis
+│ ├── monitoring_setup.py # Prometheus/Grafana deployment
+│ └── log_analyzer.py # Log collection, parsing, correlation
+├── templates/ # Configuration templates
+└── data/profiles/ # Stored cluster profiles (JSON)
+```
+
+## Requirements
+
+- Python 3.10+
+- SSH access to target nodes (for cluster operations)
+- LLM API endpoint (Infosys AI Gateway or compatible OpenAI-style API)
diff --git a/k8s-agent/app.py b/k8s-agent/app.py
new file mode 100644
index 0000000..4225a81
--- /dev/null
+++ b/k8s-agent/app.py
@@ -0,0 +1,1111 @@
+"""K8s Agent — Streamlit-based Kubernetes Cluster Management UI."""
+
+import sys
+import os
+
+# Ensure the k8s-agent directory is on the Python path so sibling imports work.
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+
+import json
+import streamlit as st
+from streamlit_option_menu import option_menu
+
+import config
+from modules.profile_manager import (
+ ClusterProfile,
+ NodeInfo,
+ save_profile,
+ load_profile,
+ list_profiles,
+ delete_profile,
+ update_profile_status,
+)
+from modules.cluster_creator import (
+ test_ssh_connectivity,
+ generate_common_setup_script,
+ generate_control_plane_init_script,
+ generate_worker_join_script,
+ generate_best_practices_script,
+ provision_node_common,
+ init_control_plane,
+ retrieve_join_command,
+ join_worker_node,
+ apply_best_practices,
+ get_cluster_status,
+ get_llm_cluster_advice,
+)
+from modules.cluster_debugger import (
+ DIAGNOSTIC_COMMANDS,
+ CATEGORY_MAP,
+ run_diagnostic,
+ run_category_diagnostics,
+ run_all_diagnostics,
+ run_custom_command,
+ analyze_diagnostics,
+ get_debug_suggestion,
+ check_pod_issues,
+)
+from modules.monitoring_setup import (
+ GRAFANA_DASHBOARDS,
+ install_helm,
+ install_prometheus_stack,
+ install_dashboards,
+ install_alert_rules,
+ get_monitoring_status,
+ get_monitoring_advice,
+ generate_prometheus_install_script,
+ generate_dashboard_import_script,
+ generate_alerting_rules_script,
+)
+from modules.log_analyzer import (
+ LOG_SOURCES,
+ collect_logs,
+ collect_pod_logs,
+ collect_multi_source_logs,
+ analyze_logs,
+ correlate_errors,
+ llm_analyze_logs,
+ llm_correlate_analysis,
+ get_pod_list,
+)
+from modules.llm_client import query_llm, stream_llm
+
+
+# ── Page Configuration ────────────────────────────────────────────────────
+
+st.set_page_config(
+ page_title="K8s Agent",
+ page_icon="☸",
+ layout="wide",
+ initial_sidebar_state="expanded",
+)
+
+# ── Custom CSS ────────────────────────────────────────────────────────────
+
+st.markdown("""
+
+""", unsafe_allow_html=True)
+
+
+# ── Session state initialization ──────────────────────────────────────────
+
+def init_session_state():
+ defaults = {
+ "active_profile": None,
+ "chat_history": [],
+ "provisioning_log": [],
+ "debug_results": {},
+ "log_analysis_results": {},
+ }
+ for key, value in defaults.items():
+ if key not in st.session_state:
+ st.session_state[key] = value
+
+
+init_session_state()
+
+
+# ── Sidebar: Profile Manager + Navigation ─────────────────────────────────
+
+def render_sidebar():
+ with st.sidebar:
+ st.markdown('
☸ K8s Agent
', unsafe_allow_html=True)
+ st.markdown('', unsafe_allow_html=True)
+
+ st.divider()
+
+ # ── Profile selector ──
+ st.markdown("### Cluster Profiles")
+ profiles = list_profiles()
+ profile_names = [p.name for p in profiles]
+
+ if profile_names:
+ selected = st.selectbox(
+ "Active Profile",
+ options=["(none)"] + profile_names,
+ index=(
+ profile_names.index(st.session_state.active_profile) + 1
+ if st.session_state.active_profile in profile_names
+ else 0
+ ),
+ key="profile_selector",
+ )
+ if selected != "(none)":
+ st.session_state.active_profile = selected
+ profile = load_profile(selected)
+ if profile:
+ status_class = f"status-{profile.status}"
+ st.markdown(
+ f"**Status:** {profile.status.upper()}",
+ unsafe_allow_html=True,
+ )
+ st.caption(
+ f"K8s {profile.kubernetes_version} | CRI-O {profile.crio_version} | "
+ f"{len(profile.get_control_plane_nodes())} CP + "
+ f"{len(profile.get_worker_nodes())} Workers"
+ )
+ else:
+ st.session_state.active_profile = None
+ else:
+ st.info("No profiles yet. Create one in Profile Manager.")
+
+ st.divider()
+
+ # ── Navigation ──
+ selected_page = option_menu(
+ menu_title="Navigation",
+ options=[
+ "Profile Manager",
+ "Cluster Creation",
+ "Cluster Debugger",
+ "Monitoring Setup",
+ "Log Analysis",
+ "AI Assistant",
+ ],
+ icons=[
+ "person-gear",
+ "hdd-rack",
+ "bug",
+ "graph-up",
+ "journal-text",
+ "robot",
+ ],
+ menu_icon="list",
+ default_index=0,
+ )
+
+ st.divider()
+
+ # ── LLM config ──
+ with st.expander("LLM Settings"):
+ st.text_input(
+ "API URL",
+ value=config.LLM_API_URL,
+ key="llm_api_url",
+ help="Endpoint for the LLM API",
+ )
+ st.text_input(
+ "API Key",
+ value=config.LLM_API_KEY[:8] + "..." if config.LLM_API_KEY else "",
+ type="password",
+ key="llm_api_key_display",
+ disabled=True,
+ help="Set via LLM_API_KEY or INFOSYS_CODER_API_KEY env var",
+ )
+ st.selectbox(
+ "Model",
+ options=["gpt-4", "gpt-4o", "gpt-3.5-turbo"],
+ index=0,
+ key="llm_model_select",
+ )
+
+ return selected_page
+
+
+# ══════════════════════════════════════════════════════════════════════════
+# PAGE: Profile Manager
+# ══════════════════════════════════════════════════════════════════════════
+
+def page_profile_manager():
+ st.markdown("## Cluster Profile Manager")
+ st.markdown("Create, edit, and manage profiles for your on-prem Kubernetes clusters.")
+
+ tab_create, tab_list, tab_import = st.tabs(["Create Profile", "Manage Profiles", "Import / Export"])
+
+ # ── Create Profile ────────────────────────────────────────────────────
+ with tab_create:
+ with st.form("create_profile_form"):
+ st.markdown("### New Cluster Profile")
+ col1, col2 = st.columns(2)
+
+ with col1:
+ name = st.text_input("Profile Name *", placeholder="production-cluster")
+ description = st.text_area("Description", placeholder="Production on-prem cluster")
+ k8s_version = st.selectbox("Kubernetes Version", ["1.30", "1.29", "1.28", "1.27"], index=0)
+ crio_version = st.selectbox("CRI-O Version", ["1.30", "1.29", "1.28", "1.27"], index=0)
+ pod_security = st.selectbox(
+ "Pod Security Standard",
+ ["restricted", "baseline", "privileged"],
+ index=0,
+ )
+
+ with col2:
+ pod_cidr = st.text_input("Pod CIDR", value="10.244.0.0/16")
+ service_cidr = st.text_input("Service CIDR", value="10.96.0.0/12")
+ dns_domain = st.text_input("DNS Domain", value="cluster.local")
+
+ st.divider()
+ st.markdown("### Nodes")
+ st.markdown("Define your control-plane and worker nodes.")
+
+ num_nodes = st.number_input("Number of Nodes", min_value=1, max_value=50, value=3, step=1)
+
+ nodes = []
+ for i in range(int(num_nodes)):
+ st.markdown(f"**Node {i + 1}**")
+ ncol1, ncol2, ncol3, ncol4, ncol5 = st.columns([2, 2, 1.5, 1, 1.5])
+ with ncol1:
+ hostname = st.text_input(f"Hostname", key=f"host_{i}", placeholder=f"node-{i + 1}")
+ with ncol2:
+ ip_addr = st.text_input(f"IP Address", key=f"ip_{i}", placeholder="192.168.1.x")
+ with ncol3:
+ role = st.selectbox(f"Role", ["control-plane", "worker"], key=f"role_{i}",
+ index=0 if i == 0 else 1)
+ with ncol4:
+ ssh_user = st.text_input(f"SSH User", key=f"user_{i}", value="root")
+ with ncol5:
+ ssh_key = st.text_input(f"SSH Key Path", key=f"key_{i}", value="~/.ssh/id_rsa")
+
+ nodes.append({
+ "hostname": hostname,
+ "ip_address": ip_addr,
+ "role": role,
+ "ssh_user": ssh_user,
+ "ssh_port": 22,
+ "ssh_key_path": ssh_key,
+ })
+
+ submitted = st.form_submit_button("Create Profile", type="primary", use_container_width=True)
+
+ if submitted:
+ if not name:
+ st.error("Profile name is required.")
+ elif not any(n["ip_address"] for n in nodes):
+ st.error("At least one node must have an IP address.")
+ elif not any(n["role"] == "control-plane" for n in nodes):
+ st.error("At least one control-plane node is required.")
+ else:
+ valid_nodes = [n for n in nodes if n["ip_address"]]
+ profile = ClusterProfile(
+ name=name,
+ description=description,
+ kubernetes_version=k8s_version,
+ crio_version=crio_version,
+ cni_plugin="flannel",
+ pod_cidr=pod_cidr,
+ service_cidr=service_cidr,
+ dns_domain=dns_domain,
+ nodes=valid_nodes,
+ pod_security_standard=pod_security,
+ )
+ path = save_profile(profile)
+ st.session_state.active_profile = name
+ st.success(f"Profile '{name}' created successfully!")
+ st.rerun()
+
+ # ── Manage Profiles ───────────────────────────────────────────────────
+ with tab_list:
+ profiles = list_profiles()
+ if not profiles:
+ st.info("No profiles created yet.")
+ return
+
+ for profile in profiles:
+ with st.expander(f"**{profile.name}** — {profile.status.upper()}", expanded=False):
+ col1, col2, col3 = st.columns([2, 2, 1])
+ with col1:
+ st.markdown(f"**Description:** {profile.description or 'N/A'}")
+ st.markdown(f"**Kubernetes:** {profile.kubernetes_version} | **CRI-O:** {profile.crio_version}")
+ st.markdown(f"**Pod CIDR:** {profile.pod_cidr} | **Service CIDR:** {profile.service_cidr}")
+ st.markdown(f"**Pod Security:** {profile.pod_security_standard}")
+ with col2:
+ st.markdown("**Nodes:**")
+ for node in profile.nodes:
+ icon = "🔵" if node["role"] == "control-plane" else "🟢"
+ st.markdown(
+ f"{icon} `{node.get('hostname', 'N/A')}` — "
+ f"`{node['ip_address']}` ({node['role']})"
+ )
+ with col3:
+ st.markdown(f"**Created:** {profile.created_at[:10] if profile.created_at else 'N/A'}")
+ st.markdown(f"**Updated:** {profile.updated_at[:10] if profile.updated_at else 'N/A'}")
+ if st.button("Set Active", key=f"activate_{profile.name}"):
+ st.session_state.active_profile = profile.name
+ st.rerun()
+ if st.button("Delete", key=f"delete_{profile.name}", type="secondary"):
+ delete_profile(profile.name)
+ if st.session_state.active_profile == profile.name:
+ st.session_state.active_profile = None
+ st.rerun()
+
+ # ── Import / Export ───────────────────────────────────────────────────
+ with tab_import:
+ col_export, col_import = st.columns(2)
+ with col_export:
+ st.markdown("### Export Profile")
+ profiles = list_profiles()
+ if profiles:
+ export_name = st.selectbox("Select profile to export", [p.name for p in profiles])
+ if st.button("Export as JSON"):
+ profile = load_profile(export_name)
+ if profile:
+ from dataclasses import asdict
+ st.download_button(
+ label="Download JSON",
+ data=json.dumps(asdict(profile), indent=2),
+ file_name=f"{export_name}.json",
+ mime="application/json",
+ )
+
+ with col_import:
+ st.markdown("### Import Profile")
+ uploaded = st.file_uploader("Upload profile JSON", type=["json"])
+ if uploaded:
+ try:
+ data = json.loads(uploaded.read())
+ profile = ClusterProfile(**data)
+ save_profile(profile)
+ st.success(f"Profile '{profile.name}' imported!")
+ st.rerun()
+ except Exception as e:
+ st.error(f"Failed to import: {e}")
+
+
+# ══════════════════════════════════════════════════════════════════════════
+# PAGE: Cluster Creation
+# ══════════════════════════════════════════════════════════════════════════
+
+def page_cluster_creation():
+ st.markdown("## Cluster Creation")
+ st.markdown("Provision an on-prem K8s cluster via SSH with CRI-O, Flannel CNI, and best practices.")
+
+ profile = _get_active_profile()
+ if not profile:
+ return
+
+ _show_profile_summary(profile)
+
+ tab_preflight, tab_provision, tab_scripts, tab_advice = st.tabs([
+ "Pre-flight Checks",
+ "Provision Cluster",
+ "View Scripts",
+ "AI Advice",
+ ])
+
+ # ── Pre-flight: SSH connectivity ──────────────────────────────────────
+ with tab_preflight:
+ st.markdown("### SSH Connectivity Test")
+ st.markdown("Test SSH access to all nodes before provisioning.")
+
+ if st.button("Test All Nodes", type="primary"):
+ for node in profile.nodes:
+ with st.status(f"Testing {node.get('hostname', node['ip_address'])}...", expanded=True):
+ result = test_ssh_connectivity(node)
+ if result.success:
+ st.success(f"Connected to {node['ip_address']}")
+ st.code(result.stdout, language="text")
+ else:
+ st.error(f"Failed to connect to {node['ip_address']}")
+ st.code(result.stderr, language="text")
+
+ # ── Provision ─────────────────────────────────────────────────────────
+ with tab_provision:
+ st.markdown("### Automated Cluster Provisioning")
+ st.warning(
+ "This will SSH into each node and install Kubernetes components. "
+ "Ensure all nodes are accessible and you have root/sudo access."
+ )
+
+ cp_nodes = profile.get_control_plane_nodes()
+ worker_nodes = profile.get_worker_nodes()
+
+ st.markdown(f"**Control Plane:** {len(cp_nodes)} node(s) | **Workers:** {len(worker_nodes)} node(s)")
+
+ col1, col2, col3 = st.columns(3)
+ with col1:
+ step1 = st.checkbox("Step 1: Common Setup (all nodes)", value=True)
+ with col2:
+ step2 = st.checkbox("Step 2: Init Control Plane", value=True)
+ with col3:
+ step3 = st.checkbox("Step 3: Join Workers", value=True)
+ step4 = st.checkbox("Step 4: Apply Best Practices", value=True)
+
+ if st.button("Start Provisioning", type="primary", use_container_width=True):
+ update_profile_status(profile.name, "provisioning")
+
+ # Step 1: Common setup on all nodes
+ if step1:
+ st.markdown("---")
+ st.markdown("### Step 1: Common Setup")
+ for node in profile.nodes:
+ with st.status(
+ f"Setting up {node.get('hostname', node['ip_address'])} ({node['role']})...",
+ expanded=True,
+ ):
+ result = provision_node_common(node, profile)
+ if result.success:
+ st.success(f"Common setup complete on {node['ip_address']}")
+ else:
+ st.error(f"Setup failed on {node['ip_address']}")
+ st.code(result.stderr, language="text")
+
+ # Step 2: Initialize control plane
+ if step2 and cp_nodes:
+ st.markdown("---")
+ st.markdown("### Step 2: Control Plane Initialization")
+ cp_node = cp_nodes[0]
+ with st.status(f"Initializing control plane on {cp_node['ip_address']}...", expanded=True):
+ result = init_control_plane(cp_node, profile)
+ if result.success:
+ st.success("Control plane initialized!")
+ st.code(result.stdout[-2000:], language="text")
+ else:
+ st.error("Control plane initialization failed!")
+ st.code(result.stderr, language="text")
+
+ # Step 3: Join worker nodes
+ if step3 and worker_nodes and cp_nodes:
+ st.markdown("---")
+ st.markdown("### Step 3: Join Worker Nodes")
+ join_cmd = retrieve_join_command(cp_nodes[0])
+ if join_cmd:
+ for node in worker_nodes:
+ with st.status(f"Joining {node.get('hostname', node['ip_address'])}...", expanded=True):
+ result = join_worker_node(node, join_cmd)
+ if result.success:
+ st.success(f"Worker {node['ip_address']} joined!")
+ else:
+ st.error(f"Failed to join {node['ip_address']}")
+ st.code(result.stderr, language="text")
+ else:
+ st.error("Could not retrieve join command from control plane.")
+
+ # Step 4: Best practices
+ if step4 and cp_nodes:
+ st.markdown("---")
+ st.markdown("### Step 4: Best Practices")
+ with st.status("Applying security and resource best practices...", expanded=True):
+ result = apply_best_practices(cp_nodes[0])
+ if result.success:
+ st.success("Best practices applied!")
+ st.code(result.stdout, language="text")
+ else:
+ st.error("Failed to apply best practices")
+ st.code(result.stderr, language="text")
+
+ # Final status
+ st.markdown("---")
+ st.markdown("### Cluster Status")
+ if cp_nodes:
+ result = get_cluster_status(cp_nodes[0])
+ if result.success:
+ update_profile_status(profile.name, "active")
+ st.success("Cluster is active!")
+ st.code(result.stdout, language="text")
+ else:
+ update_profile_status(profile.name, "error")
+ st.error("Could not verify cluster status")
+ st.code(result.stderr, language="text")
+
+ # ── View Scripts ──────────────────────────────────────────────────────
+ with tab_scripts:
+ st.markdown("### Generated Scripts")
+ st.markdown("Review the scripts that will be executed during provisioning.")
+
+ with st.expander("Common Setup Script (all nodes)", expanded=False):
+ st.code(generate_common_setup_script(profile), language="bash")
+
+ with st.expander("Control Plane Init Script", expanded=False):
+ st.code(generate_control_plane_init_script(profile), language="bash")
+
+ with st.expander("Worker Join Script", expanded=False):
+ st.code(generate_worker_join_script(), language="bash")
+
+ with st.expander("Best Practices Script", expanded=False):
+ st.code(generate_best_practices_script(), language="bash")
+
+ # ── AI Advice ─────────────────────────────────────────────────────────
+ with tab_advice:
+ st.markdown("### AI Cluster Setup Advisor")
+ context = st.text_area(
+ "Additional context or questions",
+ placeholder="e.g., We have 3 nodes with 16GB RAM each. Any special considerations?",
+ )
+ if st.button("Get AI Recommendations", type="primary"):
+ with st.spinner("Analyzing your cluster configuration..."):
+ advice = get_llm_cluster_advice(profile, context)
+ st.markdown(advice)
+
+
+# ══════════════════════════════════════════════════════════════════════════
+# PAGE: Cluster Debugger
+# ══════════════════════════════════════════════════════════════════════════
+
+def page_cluster_debugger():
+ st.markdown("## Cluster Debugger")
+ st.markdown("Diagnose issues and get AI-powered recommendations.")
+
+ profile = _get_active_profile()
+ if not profile:
+ return
+
+ cp_nodes = profile.get_control_plane_nodes()
+ if not cp_nodes:
+ st.error("No control-plane node defined in this profile.")
+ return
+ cp_node = cp_nodes[0]
+
+ tab_quick, tab_category, tab_custom, tab_ai = st.tabs([
+ "Quick Diagnostics",
+ "Category Scan",
+ "Custom Command",
+ "AI Debug Assistant",
+ ])
+
+ # ── Quick Diagnostics ─────────────────────────────────────────────────
+ with tab_quick:
+ st.markdown("### Quick Diagnostic Checks")
+ col1, col2 = st.columns(2)
+ with col1:
+ selected_checks = st.multiselect(
+ "Select checks to run",
+ options=list(DIAGNOSTIC_COMMANDS.keys()),
+ default=["Node Status", "Pod Status (All Namespaces)", "Events (Recent)"],
+ )
+ with col2:
+ run_all = st.checkbox("Run ALL diagnostics")
+
+ if st.button("Run Diagnostics", type="primary"):
+ if run_all:
+ with st.spinner("Running all diagnostics..."):
+ results = run_all_diagnostics(cp_node)
+ else:
+ results = {}
+ for check in selected_checks:
+ with st.spinner(f"Running: {check}..."):
+ results[check] = run_diagnostic(cp_node, check)
+
+ st.session_state.debug_results = results
+
+ for name, result in results.items():
+ status_icon = "+" if result.success else "-"
+ with st.expander(f"{'✅' if result.success else '❌'} {name}", expanded=not result.success):
+ st.code(result.stdout if result.success else result.stderr, language="text")
+
+ if st.session_state.debug_results and st.button("Analyze with AI", type="secondary"):
+ with st.spinner("AI is analyzing diagnostics..."):
+ analysis = analyze_diagnostics(
+ st.session_state.debug_results,
+ profile=profile,
+ )
+ st.markdown(analysis)
+
+ # ── Category Scan ─────────────────────────────────────────────────────
+ with tab_category:
+ st.markdown("### Category-Based Diagnostics")
+ category = st.selectbox("Select Category", options=list(CATEGORY_MAP.keys()))
+
+ if st.button("Run Category Scan", type="primary", key="cat_scan"):
+ with st.spinner(f"Running {category} diagnostics..."):
+ results = run_category_diagnostics(cp_node, category)
+
+ for name, result in results.items():
+ with st.expander(f"{'✅' if result.success else '❌'} {name}"):
+ st.code(result.stdout if result.success else result.stderr, language="text")
+
+ if st.button("Analyze Category with AI", key="cat_ai"):
+ with st.spinner("Analyzing..."):
+ analysis = analyze_diagnostics(results, profile=profile)
+ st.markdown(analysis)
+
+ # ── Custom Command ────────────────────────────────────────────────────
+ with tab_custom:
+ st.markdown("### Run Custom Command")
+ st.warning("Commands execute on the control-plane node via SSH.")
+ custom_cmd = st.text_area(
+ "Command",
+ placeholder="kubectl get pods -A -o wide",
+ height=100,
+ )
+ if st.button("Execute", type="primary", key="exec_custom") and custom_cmd:
+ with st.spinner("Executing..."):
+ result = run_custom_command(cp_node, custom_cmd)
+ if result.success:
+ st.code(result.stdout, language="text")
+ else:
+ st.error("Command failed")
+ st.code(result.stderr, language="text")
+
+ # ── AI Debug Assistant ────────────────────────────────────────────────
+ with tab_ai:
+ st.markdown("### AI Debug Assistant")
+ st.markdown("Describe your issue and get AI-powered debugging help.")
+
+ issue = st.text_area(
+ "Describe the issue",
+ placeholder="e.g., Pods are stuck in CrashLoopBackOff in the default namespace",
+ height=120,
+ )
+
+ col1, col2 = st.columns(2)
+ with col1:
+ auto_collect = st.checkbox("Auto-collect relevant diagnostics", value=True)
+ with col2:
+ check_pods = st.checkbox("Check for problematic pods", value=True)
+
+ if st.button("Debug", type="primary", key="ai_debug") and issue:
+ collected_data = ""
+
+ if check_pods:
+ with st.spinner("Checking pod issues..."):
+ pod_result = check_pod_issues(cp_node)
+ if pod_result.success and pod_result.stdout.strip():
+ collected_data += f"\n\nProblematic Pods:\n{pod_result.stdout}"
+ with st.expander("Problematic Pods"):
+ st.code(pod_result.stdout, language="text")
+
+ if auto_collect:
+ with st.spinner("Collecting diagnostics..."):
+ diag_results = run_category_diagnostics(cp_node, "Cluster Overview")
+ for name, result in diag_results.items():
+ if result.success:
+ collected_data += f"\n\n{name}:\n{result.stdout}"
+
+ with st.spinner("AI is analyzing the issue..."):
+ full_context = f"Issue: {issue}\n\nCollected Data:{collected_data}"
+ suggestion = get_debug_suggestion(issue, collected_data)
+ st.markdown("### AI Recommendation")
+ st.markdown(suggestion)
+
+
+# ══════════════════════════════════════════════════════════════════════════
+# PAGE: Monitoring Setup
+# ══════════════════════════════════════════════════════════════════════════
+
+def page_monitoring_setup():
+ st.markdown("## Monitoring Setup")
+ st.markdown("Deploy Prometheus, Grafana, dashboards, and alerting rules.")
+
+ profile = _get_active_profile()
+ if not profile:
+ return
+
+ cp_nodes = profile.get_control_plane_nodes()
+ if not cp_nodes:
+ st.error("No control-plane node defined in this profile.")
+ return
+ cp_node = cp_nodes[0]
+
+ namespace = st.text_input("Monitoring Namespace", value="monitoring")
+
+ tab_install, tab_dashboards, tab_alerts, tab_status, tab_scripts, tab_advice = st.tabs([
+ "Install Stack",
+ "Dashboards",
+ "Alert Rules",
+ "Status",
+ "View Scripts",
+ "AI Advice",
+ ])
+
+ # ── Install ───────────────────────────────────────────────────────────
+ with tab_install:
+ st.markdown("### Install Monitoring Stack")
+ st.markdown("This installs **kube-prometheus-stack** (Prometheus + Grafana + exporters).")
+
+ col1, col2 = st.columns(2)
+ with col1:
+ install_helm_first = st.checkbox("Install Helm (if not present)", value=True)
+ with col2:
+ install_alerts_too = st.checkbox("Also install alert rules", value=True)
+
+ if st.button("Install Prometheus + Grafana", type="primary", use_container_width=True):
+ if install_helm_first:
+ with st.status("Installing Helm...", expanded=True):
+ result = install_helm(cp_node)
+ if result.success:
+ st.success("Helm ready!")
+ else:
+ st.error("Helm installation failed")
+ st.code(result.stderr, language="text")
+
+ with st.status("Installing kube-prometheus-stack (this may take several minutes)...", expanded=True):
+ result = install_prometheus_stack(cp_node, namespace)
+ if result.success:
+ st.success("Prometheus + Grafana installed!")
+ st.code(result.stdout[-2000:], language="text")
+ else:
+ st.error("Installation failed")
+ st.code(result.stderr, language="text")
+
+ if install_alerts_too:
+ with st.status("Installing alert rules...", expanded=True):
+ result = install_alert_rules(cp_node, namespace)
+ if result.success:
+ st.success("Alert rules installed!")
+ else:
+ st.error("Alert rules installation failed")
+ st.code(result.stderr, language="text")
+
+ # ── Dashboards ────────────────────────────────────────────────────────
+ with tab_dashboards:
+ st.markdown("### Grafana Dashboards")
+ st.markdown("Select dashboards to import into Grafana.")
+
+ selected_dashboards = []
+ cols = st.columns(2)
+ for i, (key, dash) in enumerate(GRAFANA_DASHBOARDS.items()):
+ with cols[i % 2]:
+ if st.checkbox(f"**{dash['name']}**\n{dash['description']}", value=True, key=f"dash_{key}"):
+ selected_dashboards.append(key)
+
+ if st.button("Import Dashboards", type="primary") and selected_dashboards:
+ with st.status("Importing dashboards...", expanded=True):
+ result = install_dashboards(cp_node, selected_dashboards, namespace)
+ if result.success:
+ st.success(f"Imported {len(selected_dashboards)} dashboards!")
+ st.code(result.stdout, language="text")
+ else:
+ st.error("Dashboard import failed")
+ st.code(result.stderr, language="text")
+
+ # ── Alert Rules ───────────────────────────────────────────────────────
+ with tab_alerts:
+ st.markdown("### Alerting Rules")
+ st.markdown("Install production-ready alerting rules for nodes, pods, and etcd.")
+
+ with st.expander("View Alert Rules", expanded=False):
+ st.code(generate_alerting_rules_script(namespace), language="yaml")
+
+ if st.button("Install Alert Rules", type="primary", key="install_alerts"):
+ with st.spinner("Installing alert rules..."):
+ result = install_alert_rules(cp_node, namespace)
+ if result.success:
+ st.success("Alert rules installed!")
+ st.code(result.stdout, language="text")
+ else:
+ st.error("Failed to install alert rules")
+ st.code(result.stderr, language="text")
+
+ # ── Status ────────────────────────────────────────────────────────────
+ with tab_status:
+ st.markdown("### Monitoring Stack Status")
+ if st.button("Check Status", type="primary", key="mon_status"):
+ with st.spinner("Checking monitoring stack..."):
+ result = get_monitoring_status(cp_node, namespace)
+ if result.success:
+ st.code(result.stdout, language="text")
+ else:
+ st.warning("Could not retrieve monitoring status")
+ st.code(result.stderr, language="text")
+
+ # ── View Scripts ──────────────────────────────────────────────────────
+ with tab_scripts:
+ st.markdown("### Generated Scripts")
+ with st.expander("Prometheus Install Script"):
+ st.code(generate_prometheus_install_script(namespace), language="bash")
+ with st.expander("Dashboard Import Script"):
+ all_keys = list(GRAFANA_DASHBOARDS.keys())
+ st.code(generate_dashboard_import_script(all_keys, namespace), language="bash")
+ with st.expander("Alert Rules Script"):
+ st.code(generate_alerting_rules_script(namespace), language="bash")
+
+ # ── AI Advice ─────────────────────────────────────────────────────────
+ with tab_advice:
+ st.markdown("### AI Monitoring Advisor")
+ if st.button("Get Monitoring Recommendations", type="primary", key="mon_advice"):
+ current_status = ""
+ status_result = get_monitoring_status(cp_node, namespace)
+ if status_result.success:
+ current_status = status_result.stdout
+
+ with st.spinner("Getting AI recommendations..."):
+ advice = get_monitoring_advice(profile, current_status)
+ st.markdown(advice)
+
+
+# ══════════════════════════════════════════════════════════════════════════
+# PAGE: Log Analysis
+# ══════════════════════════════════════════════════════════════════════════
+
+def page_log_analysis():
+ st.markdown("## Log Analysis & Error Correlation")
+ st.markdown("Collect, parse, and analyze logs from your cluster components.")
+
+ profile = _get_active_profile()
+ if not profile:
+ return
+
+ cp_nodes = profile.get_control_plane_nodes()
+ if not cp_nodes:
+ st.error("No control-plane node defined in this profile.")
+ return
+ cp_node = cp_nodes[0]
+
+ tab_system, tab_pod, tab_correlation, tab_ai = st.tabs([
+ "System Logs",
+ "Pod Logs",
+ "Error Correlation",
+ "AI Log Analysis",
+ ])
+
+ # ── System Logs ───────────────────────────────────────────────────────
+ with tab_system:
+ st.markdown("### System Component Logs")
+ col1, col2, col3 = st.columns(3)
+ with col1:
+ sources = st.multiselect(
+ "Log Sources",
+ options=list(LOG_SOURCES.keys()),
+ default=["Kubelet", "CRI-O", "Events"],
+ )
+ with col2:
+ log_lines = st.number_input("Lines to fetch", min_value=50, max_value=1000, value=200)
+ with col3:
+ since_options = {"Last 15 min": ("15 minutes ago", "15m"),
+ "Last 1 hour": ("1 hour ago", "1h"),
+ "Last 6 hours": ("6 hours ago", "6h"),
+ "Last 24 hours": ("24 hours ago", "24h")}
+ since_label = st.selectbox("Time Range", options=list(since_options.keys()), index=1)
+ since, since_k8s = since_options[since_label]
+
+ if st.button("Collect Logs", type="primary", key="collect_sys"):
+ log_data = {}
+ for source in sources:
+ with st.spinner(f"Collecting {source} logs..."):
+ result = collect_logs(cp_node, source, log_lines, since, since_k8s)
+ if result.success:
+ log_data[source] = result.stdout
+ analysis = analyze_logs(result.stdout, source)
+
+ with st.expander(
+ f"{'❌' if analysis.error_count > 0 else '✅'} {source} "
+ f"({analysis.error_count} errors, {analysis.warning_count} warnings)",
+ expanded=analysis.error_count > 0,
+ ):
+ # Metrics
+ m1, m2, m3 = st.columns(3)
+ m1.metric("Total Lines", analysis.total_lines)
+ m2.metric("Errors", analysis.error_count)
+ m3.metric("Warnings", analysis.warning_count)
+
+ if analysis.error_patterns:
+ st.markdown("**Top Error Patterns:**")
+ for pattern, count in list(analysis.error_patterns.items())[:5]:
+ st.markdown(f"- `{pattern}` (x{count})")
+
+ st.code(result.stdout[-3000:], language="text")
+ else:
+ with st.expander(f"❌ {source} — FAILED"):
+ st.code(result.stderr, language="text")
+
+ st.session_state.log_analysis_results = log_data
+
+ # ── Pod Logs ──────────────────────────────────────────────────────────
+ with tab_pod:
+ st.markdown("### Pod Logs")
+ col1, col2 = st.columns(2)
+ with col1:
+ pod_ns = st.text_input("Namespace", value="default", key="pod_ns")
+ pod_name = st.text_input("Pod Name", placeholder="my-pod-xyz", key="pod_name_input")
+ with col2:
+ container = st.text_input("Container (optional)", key="pod_container")
+ pod_lines = st.number_input("Lines", min_value=50, max_value=1000, value=200, key="pod_lines")
+ pod_previous = st.checkbox("Previous container logs (crash recovery)")
+
+ if st.button("Fetch Pod Logs", type="primary", key="fetch_pod") and pod_name:
+ with st.spinner(f"Fetching logs for {pod_ns}/{pod_name}..."):
+ result = collect_pod_logs(
+ cp_node, pod_ns, pod_name, container, pod_lines,
+ "1h", pod_previous,
+ )
+ if result.success:
+ analysis = analyze_logs(result.stdout, f"{pod_ns}/{pod_name}")
+ m1, m2, m3 = st.columns(3)
+ m1.metric("Total Lines", analysis.total_lines)
+ m2.metric("Errors", analysis.error_count)
+ m3.metric("Warnings", analysis.warning_count)
+
+ if analysis.error_patterns:
+ st.markdown("**Error Patterns:**")
+ for pattern, count in list(analysis.error_patterns.items())[:10]:
+ st.markdown(f"- `{pattern}` (x{count})")
+
+ st.code(result.stdout[-5000:], language="text")
+
+ if analysis.error_count > 0 and st.button("Analyze with AI", key="pod_ai"):
+ with st.spinner("AI analyzing pod logs..."):
+ ai_analysis = llm_analyze_logs(
+ result.stdout, f"{pod_ns}/{pod_name}"
+ )
+ st.markdown(ai_analysis)
+ else:
+ st.error("Failed to fetch pod logs")
+ st.code(result.stderr, language="text")
+
+ # ── Error Correlation ─────────────────────────────────────────────────
+ with tab_correlation:
+ st.markdown("### Cross-Source Error Correlation")
+ st.markdown("Collect logs from multiple sources and correlate errors across them.")
+
+ corr_sources = st.multiselect(
+ "Sources to correlate",
+ options=list(LOG_SOURCES.keys()),
+ default=["Kubelet", "CRI-O", "API Server", "Events"],
+ key="corr_sources",
+ )
+
+ if st.button("Collect & Correlate", type="primary", key="correlate"):
+ with st.spinner("Collecting logs from multiple sources..."):
+ results = collect_multi_source_logs(cp_node, corr_sources, lines=150)
+
+ correlated = correlate_errors(results)
+
+ if correlated:
+ st.markdown(f"### Found {len(correlated)} correlated error groups")
+ for i, group in enumerate(correlated):
+ with st.expander(
+ f"Correlation #{i + 1}: {', '.join(group['sources_involved'])}",
+ expanded=True,
+ ):
+ st.markdown(f"**Primary Error** ({group['primary']['source']}):")
+ st.code(group["primary"]["message"], language="text")
+ st.markdown("**Related Errors:**")
+ for related in group["related"]:
+ st.markdown(f"- **{related['source']}**: `{related['message'][:200]}`")
+ else:
+ st.info("No correlated errors found across sources.")
+
+ # LLM correlation analysis
+ if st.button("Deep AI Correlation Analysis", key="deep_corr"):
+ multi_logs = {
+ src: res.stdout for src, res in results.items() if res.success
+ }
+ with st.spinner("AI is performing deep correlation analysis..."):
+ analysis = llm_correlate_analysis(multi_logs)
+ st.markdown(analysis)
+
+ # ── AI Log Analysis ───────────────────────────────────────────────────
+ with tab_ai:
+ st.markdown("### AI-Powered Log Analysis")
+ st.markdown("Paste logs or describe an issue for AI analysis.")
+
+ log_input = st.text_area(
+ "Paste log output",
+ height=200,
+ placeholder="Paste your Kubernetes logs here...",
+ )
+ context_input = st.text_input(
+ "Additional context",
+ placeholder="e.g., This started happening after we upgraded to K8s 1.30",
+ )
+
+ if st.button("Analyze Logs", type="primary", key="ai_log_analyze") and log_input:
+ with st.spinner("AI is analyzing logs..."):
+ analysis = llm_analyze_logs(log_input, context=context_input)
+ st.markdown(analysis)
+
+
+# ══════════════════════════════════════════════════════════════════════════
+# PAGE: AI Assistant
+# ══════════════════════════════════════════════════════════════════════════
+
+def page_ai_assistant():
+ st.markdown("## AI Kubernetes Assistant")
+ st.markdown("Chat with the AI about any Kubernetes topic.")
+
+ # Chat history
+ for msg in st.session_state.chat_history:
+ with st.chat_message(msg["role"]):
+ st.markdown(msg["content"])
+
+ # Chat input
+ if prompt := st.chat_input("Ask about Kubernetes..."):
+ st.session_state.chat_history.append({"role": "user", "content": prompt})
+ with st.chat_message("user"):
+ st.markdown(prompt)
+
+ with st.chat_message("assistant"):
+ placeholder = st.empty()
+ full_response = ""
+ for chunk in stream_llm(
+ prompt,
+ conversation_history=st.session_state.chat_history[:-1],
+ ):
+ full_response += chunk
+ placeholder.markdown(full_response + "▌")
+ placeholder.markdown(full_response)
+
+ st.session_state.chat_history.append({"role": "assistant", "content": full_response})
+
+
+# ── Helper functions ──────────────────────────────────────────────────────
+
+def _get_active_profile() -> ClusterProfile | None:
+ """Get the active profile or show a warning."""
+ if not st.session_state.active_profile:
+ st.warning("No active cluster profile selected. Please create or select one in the Profile Manager.")
+ return None
+ profile = load_profile(st.session_state.active_profile)
+ if not profile:
+ st.error(f"Profile '{st.session_state.active_profile}' not found.")
+ return None
+ return profile
+
+
+def _show_profile_summary(profile: ClusterProfile):
+ """Display a compact profile summary."""
+ cols = st.columns(5)
+ cols[0].metric("Profile", profile.name)
+ cols[1].metric("K8s Version", profile.kubernetes_version)
+ cols[2].metric("Runtime", f"CRI-O {profile.crio_version}")
+ cols[3].metric("CNI", "Flannel")
+ cols[4].metric("Nodes", f"{len(profile.get_control_plane_nodes())} CP + {len(profile.get_worker_nodes())} W")
+
+
+# ── Main Router ───────────────────────────────────────────────────────────
+
+def main():
+ page = render_sidebar()
+
+ if page == "Profile Manager":
+ page_profile_manager()
+ elif page == "Cluster Creation":
+ page_cluster_creation()
+ elif page == "Cluster Debugger":
+ page_cluster_debugger()
+ elif page == "Monitoring Setup":
+ page_monitoring_setup()
+ elif page == "Log Analysis":
+ page_log_analysis()
+ elif page == "AI Assistant":
+ page_ai_assistant()
+
+
+if __name__ == "__main__":
+ main()
diff --git a/k8s-agent/config.py b/k8s-agent/config.py
new file mode 100644
index 0000000..e46dd95
--- /dev/null
+++ b/k8s-agent/config.py
@@ -0,0 +1,21 @@
+"""Configuration for the K8s Agent application."""
+
+import os
+
+# LLM Configuration
+LLM_API_URL = os.getenv(
+ "LLM_API_URL",
+ "https://aigateway-intern.ad.infosys.com/aigateway/chat/completions",
+)
+LLM_API_KEY = os.getenv("LLM_API_KEY", os.getenv("INFOSYS_CODER_API_KEY", ""))
+LLM_MODEL = os.getenv("LLM_MODEL", "gpt-4")
+LLM_TEMPERATURE = float(os.getenv("LLM_TEMPERATURE", "0.3"))
+LLM_MAX_TOKENS = int(os.getenv("LLM_MAX_TOKENS", "4096"))
+
+# Application paths
+DATA_DIR = os.path.join(os.path.dirname(__file__), "data")
+PROFILES_DIR = os.path.join(DATA_DIR, "profiles")
+TEMPLATES_DIR = os.path.join(os.path.dirname(__file__), "templates")
+
+# Ensure directories exist
+os.makedirs(PROFILES_DIR, exist_ok=True)
diff --git a/k8s-agent/data/profiles/.gitkeep b/k8s-agent/data/profiles/.gitkeep
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/k8s-agent/data/profiles/.gitkeep
@@ -0,0 +1 @@
+
diff --git a/k8s-agent/modules/__init__.py b/k8s-agent/modules/__init__.py
new file mode 100644
index 0000000..fc1c144
--- /dev/null
+++ b/k8s-agent/modules/__init__.py
@@ -0,0 +1 @@
+"""K8s Agent modules."""
diff --git a/k8s-agent/modules/cluster_creator.py b/k8s-agent/modules/cluster_creator.py
new file mode 100644
index 0000000..ef89dea
--- /dev/null
+++ b/k8s-agent/modules/cluster_creator.py
@@ -0,0 +1,545 @@
+"""Cluster Creator — SSH-based K8s cluster provisioning with CRI-O + Flannel."""
+
+import subprocess
+import time
+from dataclasses import dataclass
+from typing import Optional
+
+from modules.llm_client import query_llm
+from modules.profile_manager import ClusterProfile
+
+
+@dataclass
+class SSHResult:
+ """Result of an SSH command execution."""
+
+ hostname: str
+ command: str
+ return_code: int
+ stdout: str
+ stderr: str
+ success: bool
+
+
+def run_ssh_command(
+ ip_address: str,
+ command: str,
+ ssh_user: str = "root",
+ ssh_port: int = 22,
+ ssh_key_path: str = "~/.ssh/id_rsa",
+ timeout: int = 600,
+) -> SSHResult:
+ """Execute a command on a remote node via SSH.
+
+ Args:
+ ip_address: Target node IP.
+ command: Shell command to execute remotely.
+ ssh_user: SSH username.
+ ssh_port: SSH port number.
+ ssh_key_path: Path to SSH private key.
+ timeout: Command timeout in seconds.
+
+ Returns:
+ SSHResult with command output and status.
+ """
+ ssh_cmd = [
+ "ssh",
+ "-o", "StrictHostKeyChecking=no",
+ "-o", "UserKnownHostsFile=/dev/null",
+ "-o", "ConnectTimeout=10",
+ "-o", "BatchMode=yes",
+ "-i", ssh_key_path,
+ "-p", str(ssh_port),
+ f"{ssh_user}@{ip_address}",
+ command,
+ ]
+
+ try:
+ result = subprocess.run(
+ ssh_cmd,
+ capture_output=True,
+ text=True,
+ timeout=timeout,
+ )
+ return SSHResult(
+ hostname=ip_address,
+ command=command,
+ return_code=result.returncode,
+ stdout=result.stdout,
+ stderr=result.stderr,
+ success=result.returncode == 0,
+ )
+ except subprocess.TimeoutExpired:
+ return SSHResult(
+ hostname=ip_address,
+ command=command,
+ return_code=-1,
+ stdout="",
+ stderr=f"Command timed out after {timeout}s",
+ success=False,
+ )
+ except Exception as exc:
+ return SSHResult(
+ hostname=ip_address,
+ command=command,
+ return_code=-1,
+ stdout="",
+ stderr=str(exc),
+ success=False,
+ )
+
+
+def test_ssh_connectivity(node: dict) -> SSHResult:
+ """Test SSH connectivity to a node."""
+ return run_ssh_command(
+ ip_address=node["ip_address"],
+ command="echo 'SSH connection successful' && hostname && uname -r",
+ ssh_user=node.get("ssh_user", "root"),
+ ssh_port=node.get("ssh_port", 22),
+ ssh_key_path=node.get("ssh_key_path", "~/.ssh/id_rsa"),
+ timeout=15,
+ )
+
+
+def generate_common_setup_script(profile: ClusterProfile) -> str:
+ """Generate the common setup script that runs on ALL nodes (control-plane + workers)."""
+ return f"""#!/bin/bash
+set -euo pipefail
+
+echo "=== K8s Node Common Setup ==="
+echo "Kubernetes Version: {profile.kubernetes_version}"
+echo "CRI-O Version: {profile.crio_version}"
+echo "Timestamp: $(date -u)"
+
+# ── 1. System prerequisites ──────────────────────────────────────────────
+echo ">> Disabling swap..."
+swapoff -a
+sed -i '/\\bswap\\b/d' /etc/fstab
+
+echo ">> Loading kernel modules..."
+cat > /etc/modules-load.d/k8s.conf <> Setting sysctl parameters..."
+cat > /etc/sysctl.d/99-kubernetes.conf <> Disabling SELinux (if present)..."
+if command -v setenforce &>/dev/null; then
+ setenforce 0 || true
+ sed -i 's/^SELINUX=enforcing/SELINUX=permissive/' /etc/selinux/config 2>/dev/null || true
+fi
+
+echo ">> Configuring firewalld (if present)..."
+if systemctl is-active --quiet firewalld; then
+ firewall-cmd --permanent --add-port=6443/tcp # API server
+ firewall-cmd --permanent --add-port=2379-2380/tcp # etcd
+ firewall-cmd --permanent --add-port=10250/tcp # Kubelet API
+ firewall-cmd --permanent --add-port=10259/tcp # kube-scheduler
+ firewall-cmd --permanent --add-port=10257/tcp # kube-controller-manager
+ firewall-cmd --permanent --add-port=30000-32767/tcp # NodePort
+ firewall-cmd --permanent --add-port=8472/udp # Flannel VXLAN
+ firewall-cmd --reload
+fi
+
+# ── 2. Install CRI-O ─────────────────────────────────────────────────────
+echo ">> Installing CRI-O {profile.crio_version}..."
+
+OS="$(. /etc/os-release && echo "$ID")"
+VERSION_ID="$(. /etc/os-release && echo "$VERSION_ID")"
+
+if [[ "$OS" == "ubuntu" || "$OS" == "debian" ]]; then
+ apt-get update -y
+ apt-get install -y software-properties-common curl gnupg2
+
+ CRIO_VERSION="{profile.crio_version}"
+ curl -fsSL "https://pkgs.k8s.io/addons:/cri-o:/stable:/v$CRIO_VERSION/deb/Release.key" | \\
+ gpg --dearmor -o /etc/apt/keyrings/cri-o-apt-keyring.gpg
+ echo "deb [signed-by=/etc/apt/keyrings/cri-o-apt-keyring.gpg] https://pkgs.k8s.io/addons:/cri-o:/stable:/v$CRIO_VERSION/deb/ /" | \\
+ tee /etc/apt/sources.list.d/cri-o.list
+
+ apt-get update -y
+ apt-get install -y cri-o
+elif [[ "$OS" == "rhel" || "$OS" == "centos" || "$OS" == "rocky" || "$OS" == "almalinux" ]]; then
+ CRIO_VERSION="{profile.crio_version}"
+ cat > /etc/yum.repos.d/cri-o.repo <> CRI-O installed and running."
+
+# ── 3. Install kubeadm, kubelet, kubectl ──────────────────────────────────
+echo ">> Installing Kubernetes {profile.kubernetes_version} components..."
+
+K8S_VERSION="{profile.kubernetes_version}"
+
+if [[ "$OS" == "ubuntu" || "$OS" == "debian" ]]; then
+ curl -fsSL "https://pkgs.k8s.io/core:/stable:/v$K8S_VERSION/deb/Release.key" | \\
+ gpg --dearmor -o /etc/apt/keyrings/kubernetes-apt-keyring.gpg
+ echo "deb [signed-by=/etc/apt/keyrings/kubernetes-apt-keyring.gpg] https://pkgs.k8s.io/core:/stable:/v$K8S_VERSION/deb/ /" | \\
+ tee /etc/apt/sources.list.d/kubernetes.list
+
+ apt-get update -y
+ apt-get install -y kubelet kubeadm kubectl
+ apt-mark hold kubelet kubeadm kubectl
+elif [[ "$OS" == "rhel" || "$OS" == "centos" || "$OS" == "rocky" || "$OS" == "almalinux" ]]; then
+ cat > /etc/yum.repos.d/kubernetes.repo <> Kubernetes components installed."
+
+echo "=== Common setup complete ==="
+"""
+
+
+def generate_control_plane_init_script(profile: ClusterProfile) -> str:
+ """Generate the kubeadm init script for the control-plane node."""
+ cp_nodes = profile.get_control_plane_nodes()
+ cp_ip = cp_nodes[0]["ip_address"] if cp_nodes else "CONTROL_PLANE_IP"
+
+ return f"""#!/bin/bash
+set -euo pipefail
+
+echo "=== Initializing Kubernetes Control Plane ==="
+
+# ── kubeadm init ──────────────────────────────────────────────────────────
+cat > /tmp/kubeadm-config.yaml <> Running kubeadm init..."
+kubeadm init --config=/tmp/kubeadm-config.yaml --upload-certs | tee /tmp/kubeadm-init.log
+
+# ── Configure kubectl for root ────────────────────────────────────────────
+echo ">> Configuring kubectl..."
+mkdir -p /root/.kube
+cp /etc/kubernetes/admin.conf /root/.kube/config
+chown root:root /root/.kube/config
+
+# ── Install Flannel CNI ───────────────────────────────────────────────────
+echo ">> Installing Flannel CNI..."
+kubectl apply -f https://github.com/flannel-io/flannel/releases/latest/download/kube-flannel.yml
+
+# Wait for Flannel to be ready
+echo ">> Waiting for Flannel pods to be ready..."
+kubectl -n kube-flannel wait --for=condition=ready pod -l app=flannel --timeout=120s || true
+
+# ── Apply Pod Security Standards ──────────────────────────────────────────
+echo ">> Applying Pod Security Standards ({profile.pod_security_standard})..."
+kubectl label namespace default \\
+ pod-security.kubernetes.io/enforce={profile.pod_security_standard} \\
+ pod-security.kubernetes.io/warn={profile.pod_security_standard} \\
+ pod-security.kubernetes.io/audit={profile.pod_security_standard} \\
+ --overwrite
+
+# ── Generate join command ─────────────────────────────────────────────────
+echo ">> Generating worker join command..."
+kubeadm token create --print-join-command > /tmp/kubeadm-join-command.txt
+echo "Join command saved to /tmp/kubeadm-join-command.txt"
+
+echo ""
+echo "=== Control Plane initialization complete ==="
+echo "Join command:"
+cat /tmp/kubeadm-join-command.txt
+"""
+
+
+def generate_worker_join_script() -> str:
+ """Generate the script that runs on worker nodes to join the cluster."""
+ return """#!/bin/bash
+set -euo pipefail
+
+echo "=== Joining Worker Node to Cluster ==="
+
+JOIN_COMMAND="$1"
+
+if [ -z "$JOIN_COMMAND" ]; then
+ echo "ERROR: Join command not provided."
+ echo "Usage: $0 ''"
+ exit 1
+fi
+
+echo ">> Executing join command..."
+eval "$JOIN_COMMAND --cri-socket unix:///var/run/crio/crio.sock"
+
+echo "=== Worker node joined successfully ==="
+"""
+
+
+def generate_best_practices_script() -> str:
+ """Generate a post-install best practices hardening script."""
+ return """#!/bin/bash
+set -euo pipefail
+
+echo "=== Applying Kubernetes Best Practices ==="
+
+# ── Default Network Policy (deny-all) ────────────────────────────────────
+echo ">> Creating default-deny network policy for default namespace..."
+cat <> Setting resource quotas..."
+cat <> Setting limit ranges..."
+cat <> Creating read-only ClusterRole..."
+cat <> Ensuring audit log directory exists..."
+mkdir -p /var/log/kubernetes
+
+echo "=== Best practices applied ==="
+echo ""
+echo "Summary of applied best practices:"
+echo " - Default-deny NetworkPolicy in default namespace"
+echo " - ResourceQuota for default namespace (CPU: 4/8, Memory: 8/16Gi)"
+echo " - LimitRange with default container limits"
+echo " - Read-only ClusterRole (cluster-reader)"
+echo " - Audit logging directory configured"
+"""
+
+
+def provision_node_common(node: dict, profile: ClusterProfile) -> SSHResult:
+ """Run the common setup script on a single node via SSH."""
+ script = generate_common_setup_script(profile)
+ return run_ssh_command(
+ ip_address=node["ip_address"],
+ command=script,
+ ssh_user=node.get("ssh_user", "root"),
+ ssh_port=node.get("ssh_port", 22),
+ ssh_key_path=node.get("ssh_key_path", "~/.ssh/id_rsa"),
+ timeout=600,
+ )
+
+
+def init_control_plane(node: dict, profile: ClusterProfile) -> SSHResult:
+ """Initialize the control plane on the given node."""
+ script = generate_control_plane_init_script(profile)
+ return run_ssh_command(
+ ip_address=node["ip_address"],
+ command=script,
+ ssh_user=node.get("ssh_user", "root"),
+ ssh_port=node.get("ssh_port", 22),
+ ssh_key_path=node.get("ssh_key_path", "~/.ssh/id_rsa"),
+ timeout=600,
+ )
+
+
+def retrieve_join_command(control_plane_node: dict) -> Optional[str]:
+ """Retrieve the kubeadm join command from the control-plane node."""
+ result = run_ssh_command(
+ ip_address=control_plane_node["ip_address"],
+ command="cat /tmp/kubeadm-join-command.txt",
+ ssh_user=control_plane_node.get("ssh_user", "root"),
+ ssh_port=control_plane_node.get("ssh_port", 22),
+ ssh_key_path=control_plane_node.get("ssh_key_path", "~/.ssh/id_rsa"),
+ timeout=30,
+ )
+ if result.success:
+ return result.stdout.strip()
+ return None
+
+
+def join_worker_node(node: dict, join_command: str) -> SSHResult:
+ """Join a worker node to the cluster."""
+ full_command = f"{join_command} --cri-socket unix:///var/run/crio/crio.sock"
+ return run_ssh_command(
+ ip_address=node["ip_address"],
+ command=full_command,
+ ssh_user=node.get("ssh_user", "root"),
+ ssh_port=node.get("ssh_port", 22),
+ ssh_key_path=node.get("ssh_key_path", "~/.ssh/id_rsa"),
+ timeout=300,
+ )
+
+
+def apply_best_practices(control_plane_node: dict) -> SSHResult:
+ """Apply best practices hardening on the cluster via the control-plane."""
+ script = generate_best_practices_script()
+ return run_ssh_command(
+ ip_address=control_plane_node["ip_address"],
+ command=script,
+ ssh_user=control_plane_node.get("ssh_user", "root"),
+ ssh_port=control_plane_node.get("ssh_port", 22),
+ ssh_key_path=control_plane_node.get("ssh_key_path", "~/.ssh/id_rsa"),
+ timeout=120,
+ )
+
+
+def get_cluster_status(control_plane_node: dict) -> SSHResult:
+ """Get the cluster node status from the control-plane."""
+ return run_ssh_command(
+ ip_address=control_plane_node["ip_address"],
+ command="kubectl get nodes -o wide && echo '---' && kubectl get pods -A",
+ ssh_user=control_plane_node.get("ssh_user", "root"),
+ ssh_port=control_plane_node.get("ssh_port", 22),
+ ssh_key_path=control_plane_node.get("ssh_key_path", "~/.ssh/id_rsa"),
+ timeout=30,
+ )
+
+
+def get_llm_cluster_advice(profile: ClusterProfile, context: str = "") -> str:
+ """Ask the LLM for cluster setup advice based on the profile."""
+ nodes_desc = []
+ for n in profile.nodes:
+ nodes_desc.append(f" - {n.get('hostname', 'unknown')} ({n['ip_address']}) — role: {n['role']}")
+ nodes_str = "\n".join(nodes_desc)
+
+ prompt = f"""I am setting up an on-premises Kubernetes cluster with the following configuration:
+
+- Kubernetes Version: {profile.kubernetes_version}
+- Container Runtime: CRI-O {profile.crio_version}
+- CNI Plugin: Flannel
+- Pod CIDR: {profile.pod_cidr}
+- Service CIDR: {profile.service_cidr}
+- Pod Security Standard: {profile.pod_security_standard}
+
+Nodes:
+{nodes_str}
+
+{context}
+
+Please review this configuration and provide:
+1. Any potential issues or conflicts
+2. Recommended optimizations
+3. Security hardening recommendations specific to this setup
+4. Network configuration tips for Flannel with CRI-O
+"""
+ return query_llm(prompt)
diff --git a/k8s-agent/modules/cluster_debugger.py b/k8s-agent/modules/cluster_debugger.py
new file mode 100644
index 0000000..9ee9dd1
--- /dev/null
+++ b/k8s-agent/modules/cluster_debugger.py
@@ -0,0 +1,228 @@
+"""Cluster Debugger — Diagnose K8s issues and provide LLM-powered recommendations."""
+
+from modules.cluster_creator import run_ssh_command, SSHResult
+from modules.llm_client import query_llm, stream_llm
+from modules.profile_manager import ClusterProfile
+
+
+# ── Diagnostic command definitions ────────────────────────────────────────
+
+DIAGNOSTIC_COMMANDS = {
+ "Node Status": "kubectl get nodes -o wide",
+ "Pod Status (All Namespaces)": "kubectl get pods -A -o wide",
+ "Events (Recent)": "kubectl get events -A --sort-by='.lastTimestamp' | tail -50",
+ "Component Status": "kubectl get componentstatuses 2>/dev/null; kubectl get --raw='/healthz?verbose' 2>/dev/null || true",
+ "System Pods": "kubectl -n kube-system get pods -o wide",
+ "Node Resources": "kubectl top nodes 2>/dev/null || echo 'metrics-server not installed'",
+ "Pod Resources": "kubectl top pods -A 2>/dev/null || echo 'metrics-server not installed'",
+ "Cluster Info": "kubectl cluster-info",
+ "CRI-O Status": "systemctl status crio --no-pager -l",
+ "Kubelet Status": "systemctl status kubelet --no-pager -l",
+ "Kubelet Logs (Recent)": "journalctl -u kubelet --no-pager -n 50",
+ "CRI-O Logs (Recent)": "journalctl -u crio --no-pager -n 50",
+ "Flannel Status": "kubectl -n kube-flannel get pods -o wide 2>/dev/null || kubectl -n kube-system get pods -l app=flannel -o wide 2>/dev/null || echo 'Flannel pods not found'",
+ "Network Policies": "kubectl get networkpolicies -A",
+ "Services": "kubectl get svc -A",
+ "PVCs": "kubectl get pvc -A",
+ "Ingresses": "kubectl get ingress -A 2>/dev/null || true",
+ "Disk Usage": "df -h / /var/lib/containers /var/lib/kubelet 2>/dev/null || df -h /",
+ "Memory Info": "free -h",
+ "DNS Resolution": "kubectl run dns-test --image=busybox:1.36 --rm -it --restart=Never -- nslookup kubernetes.default 2>/dev/null || echo 'DNS test skipped'",
+ "Certificate Expiry": "kubeadm certs check-expiration 2>/dev/null || echo 'Not a kubeadm node or kubeadm not found'",
+}
+
+CATEGORY_MAP = {
+ "Cluster Overview": [
+ "Node Status",
+ "Pod Status (All Namespaces)",
+ "Cluster Info",
+ "Component Status",
+ ],
+ "Pod & Workload Health": [
+ "Pod Status (All Namespaces)",
+ "System Pods",
+ "Events (Recent)",
+ ],
+ "Resource Usage": [
+ "Node Resources",
+ "Pod Resources",
+ "Disk Usage",
+ "Memory Info",
+ ],
+ "Networking": [
+ "Flannel Status",
+ "Network Policies",
+ "Services",
+ "Ingresses",
+ "DNS Resolution",
+ ],
+ "Container Runtime & Kubelet": [
+ "CRI-O Status",
+ "Kubelet Status",
+ "CRI-O Logs (Recent)",
+ "Kubelet Logs (Recent)",
+ ],
+ "Security & Certificates": [
+ "Certificate Expiry",
+ "Network Policies",
+ ],
+ "Storage": [
+ "PVCs",
+ "Disk Usage",
+ ],
+}
+
+
+def run_diagnostic(
+ control_plane_node: dict,
+ command_name: str,
+) -> SSHResult:
+ """Run a single diagnostic command on the control-plane node."""
+ command = DIAGNOSTIC_COMMANDS.get(command_name)
+ if not command:
+ return SSHResult(
+ hostname=control_plane_node["ip_address"],
+ command=command_name,
+ return_code=1,
+ stdout="",
+ stderr=f"Unknown diagnostic command: {command_name}",
+ success=False,
+ )
+ return run_ssh_command(
+ ip_address=control_plane_node["ip_address"],
+ command=command,
+ ssh_user=control_plane_node.get("ssh_user", "root"),
+ ssh_port=control_plane_node.get("ssh_port", 22),
+ ssh_key_path=control_plane_node.get("ssh_key_path", "~/.ssh/id_rsa"),
+ timeout=60,
+ )
+
+
+def run_category_diagnostics(
+ control_plane_node: dict,
+ category: str,
+) -> dict[str, SSHResult]:
+ """Run all diagnostic commands for a given category."""
+ results = {}
+ command_names = CATEGORY_MAP.get(category, [])
+ for name in command_names:
+ results[name] = run_diagnostic(control_plane_node, name)
+ return results
+
+
+def run_all_diagnostics(control_plane_node: dict) -> dict[str, SSHResult]:
+ """Run every diagnostic command."""
+ results = {}
+ for name in DIAGNOSTIC_COMMANDS:
+ results[name] = run_diagnostic(control_plane_node, name)
+ return results
+
+
+def run_custom_command(
+ control_plane_node: dict,
+ command: str,
+) -> SSHResult:
+ """Run a custom command on the control-plane node."""
+ return run_ssh_command(
+ ip_address=control_plane_node["ip_address"],
+ command=command,
+ ssh_user=control_plane_node.get("ssh_user", "root"),
+ ssh_port=control_plane_node.get("ssh_port", 22),
+ ssh_key_path=control_plane_node.get("ssh_key_path", "~/.ssh/id_rsa"),
+ timeout=60,
+ )
+
+
+def format_diagnostics_for_llm(results: dict[str, SSHResult]) -> str:
+ """Format diagnostic results into a text block for the LLM."""
+ sections = []
+ for name, result in results.items():
+ status = "OK" if result.success else "FAILED"
+ output = result.stdout if result.success else result.stderr
+ sections.append(
+ f"### {name} [{status}]\n"
+ f"```\n{output.strip()}\n```\n"
+ )
+ return "\n".join(sections)
+
+
+def analyze_diagnostics(
+ results: dict[str, SSHResult],
+ user_description: str = "",
+ profile: ClusterProfile | None = None,
+) -> str:
+ """Send diagnostic results to the LLM for analysis and recommendations."""
+ diag_text = format_diagnostics_for_llm(results)
+
+ cluster_info = ""
+ if profile:
+ cluster_info = f"""
+Cluster Configuration:
+- Kubernetes: {profile.kubernetes_version}
+- Runtime: CRI-O {profile.crio_version}
+- CNI: Flannel
+- Pod CIDR: {profile.pod_cidr}
+- Service CIDR: {profile.service_cidr}
+"""
+
+ prompt = f"""Analyze the following Kubernetes cluster diagnostic output and provide a detailed assessment.
+{cluster_info}
+
+User's issue description: {user_description or 'General health check'}
+
+== Diagnostic Output ==
+{diag_text}
+== End Diagnostic Output ==
+
+Please provide:
+1. **Health Summary**: Overall cluster health status (Healthy / Degraded / Critical)
+2. **Issues Found**: List each issue with severity (Critical / Warning / Info)
+3. **Root Cause Analysis**: For each issue, explain the likely root cause
+4. **Remediation Steps**: Specific commands or actions to fix each issue
+5. **Preventive Recommendations**: Steps to prevent these issues in the future
+
+Format your response with clear headings and actionable commands where applicable.
+"""
+ return query_llm(prompt)
+
+
+def get_debug_suggestion(
+ error_message: str,
+ context: str = "",
+) -> str:
+ """Get a quick debugging suggestion from the LLM for a specific error."""
+ prompt = f"""I encountered the following error in my Kubernetes cluster (CRI-O + Flannel):
+
+Error: {error_message}
+
+Additional context: {context or 'None'}
+
+Provide a concise diagnosis and the exact commands to fix this issue.
+"""
+ return query_llm(prompt)
+
+
+def check_pod_issues(control_plane_node: dict, namespace: str = "") -> SSHResult:
+ """Check for pods in non-running states."""
+ ns_flag = f"-n {namespace}" if namespace else "-A"
+ command = (
+ f"kubectl get pods {ns_flag} --field-selector="
+ "'status.phase!=Running,status.phase!=Succeeded' -o wide 2>/dev/null; "
+ f"echo '---DESCRIBE---'; "
+ f"for pod in $(kubectl get pods {ns_flag} --field-selector="
+ "'status.phase!=Running,status.phase!=Succeeded' "
+ "-o jsonpath='{range .items[*]}{.metadata.namespace}/{.metadata.name} {end}' 2>/dev/null); do "
+ "ns=$(echo $pod | cut -d/ -f1); "
+ "name=$(echo $pod | cut -d/ -f2); "
+ "echo \"=== $ns/$name ===\"; "
+ "kubectl describe pod $name -n $ns 2>/dev/null | tail -20; "
+ "done"
+ )
+ return run_ssh_command(
+ ip_address=control_plane_node["ip_address"],
+ command=command,
+ ssh_user=control_plane_node.get("ssh_user", "root"),
+ ssh_port=control_plane_node.get("ssh_port", 22),
+ ssh_key_path=control_plane_node.get("ssh_key_path", "~/.ssh/id_rsa"),
+ timeout=60,
+ )
diff --git a/k8s-agent/modules/llm_client.py b/k8s-agent/modules/llm_client.py
new file mode 100644
index 0000000..2b7eb44
--- /dev/null
+++ b/k8s-agent/modules/llm_client.py
@@ -0,0 +1,145 @@
+"""LLM client for the Infosys AI Gateway."""
+
+import json
+from typing import Generator, Optional
+
+import requests
+
+import config
+
+
+SYSTEM_PROMPT = """You are an expert Kubernetes platform engineer specializing in on-premises
+cluster administration. You have deep knowledge of:
+- Kubernetes cluster setup with CRI-O container runtime and Flannel CNI
+- kubeadm-based cluster bootstrapping and lifecycle management
+- Cluster debugging, troubleshooting, and remediation
+- Prometheus and Grafana monitoring stack setup and dashboard design
+- Kubernetes log analysis, error correlation, and root cause analysis
+- Security best practices including RBAC, network policies, and pod security standards
+
+Always provide actionable, production-ready advice. When generating scripts, include
+error handling and idempotency. When diagnosing issues, ask clarifying questions if
+the provided information is insufficient."""
+
+
+def query_llm(
+ user_message: str,
+ system_message: Optional[str] = None,
+ conversation_history: Optional[list[dict]] = None,
+ temperature: Optional[float] = None,
+ max_tokens: Optional[int] = None,
+) -> str:
+ """Send a query to the LLM and return the response text.
+
+ Args:
+ user_message: The user's message/query.
+ system_message: Optional system prompt override.
+ conversation_history: Optional list of prior messages for context.
+ temperature: Optional temperature override.
+ max_tokens: Optional max tokens override.
+
+ Returns:
+ The assistant's response text.
+ """
+ messages = []
+
+ sys_msg = system_message or SYSTEM_PROMPT
+ messages.append({"role": "system", "content": sys_msg})
+
+ if conversation_history:
+ messages.extend(conversation_history)
+
+ messages.append({"role": "user", "content": user_message})
+
+ headers = {
+ "Content-Type": "application/json",
+ "Authorization": f"Bearer {config.LLM_API_KEY}",
+ }
+
+ payload = {
+ "model": config.LLM_MODEL,
+ "messages": messages,
+ "temperature": temperature if temperature is not None else config.LLM_TEMPERATURE,
+ "max_tokens": max_tokens if max_tokens is not None else config.LLM_MAX_TOKENS,
+ }
+
+ try:
+ response = requests.post(
+ config.LLM_API_URL,
+ headers=headers,
+ json=payload,
+ timeout=120,
+ )
+ response.raise_for_status()
+ data = response.json()
+ return data["choices"][0]["message"]["content"]
+ except requests.exceptions.Timeout:
+ return "Error: LLM request timed out. Please try again."
+ except requests.exceptions.ConnectionError:
+ return "Error: Could not connect to the LLM endpoint. Please check your network and LLM_API_URL configuration."
+ except requests.exceptions.HTTPError as exc:
+ return f"Error: LLM API returned HTTP {exc.response.status_code}: {exc.response.text}"
+ except (KeyError, IndexError, json.JSONDecodeError) as exc:
+ return f"Error: Unexpected LLM response format: {exc}"
+
+
+def stream_llm(
+ user_message: str,
+ system_message: Optional[str] = None,
+ conversation_history: Optional[list[dict]] = None,
+ temperature: Optional[float] = None,
+ max_tokens: Optional[int] = None,
+) -> Generator[str, None, None]:
+ """Stream a response from the LLM token-by-token.
+
+ Yields chunks of text as they arrive from the API.
+ """
+ messages = []
+
+ sys_msg = system_message or SYSTEM_PROMPT
+ messages.append({"role": "system", "content": sys_msg})
+
+ if conversation_history:
+ messages.extend(conversation_history)
+
+ messages.append({"role": "user", "content": user_message})
+
+ headers = {
+ "Content-Type": "application/json",
+ "Authorization": f"Bearer {config.LLM_API_KEY}",
+ }
+
+ payload = {
+ "model": config.LLM_MODEL,
+ "messages": messages,
+ "temperature": temperature if temperature is not None else config.LLM_TEMPERATURE,
+ "max_tokens": max_tokens if max_tokens is not None else config.LLM_MAX_TOKENS,
+ "stream": True,
+ }
+
+ try:
+ response = requests.post(
+ config.LLM_API_URL,
+ headers=headers,
+ json=payload,
+ timeout=120,
+ stream=True,
+ )
+ response.raise_for_status()
+
+ for line in response.iter_lines(decode_unicode=True):
+ if not line or not line.startswith("data: "):
+ continue
+ data_str = line[len("data: "):]
+ if data_str.strip() == "[DONE]":
+ break
+ try:
+ chunk = json.loads(data_str)
+ delta = chunk.get("choices", [{}])[0].get("delta", {})
+ content = delta.get("content", "")
+ if content:
+ yield content
+ except (json.JSONDecodeError, KeyError, IndexError):
+ continue
+ except requests.exceptions.RequestException as exc:
+ yield f"\n\nError during streaming: {exc}"
diff --git a/k8s-agent/modules/log_analyzer.py b/k8s-agent/modules/log_analyzer.py
new file mode 100644
index 0000000..9abc838
--- /dev/null
+++ b/k8s-agent/modules/log_analyzer.py
@@ -0,0 +1,345 @@
+"""Log Analyzer — Kubernetes log collection, parsing, error correlation, and analysis."""
+
+import re
+from collections import Counter
+from dataclasses import dataclass, field
+from typing import Optional
+
+from modules.cluster_creator import run_ssh_command, SSHResult
+from modules.llm_client import query_llm
+
+
+@dataclass
+class LogEntry:
+ """Represents a parsed log line."""
+
+ timestamp: str = ""
+ level: str = "INFO"
+ source: str = ""
+ message: str = ""
+ raw: str = ""
+
+
+@dataclass
+class LogAnalysisResult:
+ """Results from log analysis."""
+
+ total_lines: int = 0
+ error_count: int = 0
+ warning_count: int = 0
+ error_patterns: dict[str, int] = field(default_factory=dict)
+ warning_patterns: dict[str, int] = field(default_factory=dict)
+ timeline: list[dict] = field(default_factory=list)
+ correlated_errors: list[dict] = field(default_factory=list)
+
+
+# ── Log collection commands ───────────────────────────────────────────────
+
+LOG_SOURCES = {
+ "Kubelet": "journalctl -u kubelet --no-pager -n {lines} --since '{since}'",
+ "CRI-O": "journalctl -u crio --no-pager -n {lines} --since '{since}'",
+ "API Server": "kubectl logs -n kube-system -l component=kube-apiserver --tail={lines} --since={since_k8s}",
+ "Controller Manager": "kubectl logs -n kube-system -l component=kube-controller-manager --tail={lines} --since={since_k8s}",
+ "Scheduler": "kubectl logs -n kube-system -l component=kube-scheduler --tail={lines} --since={since_k8s}",
+ "CoreDNS": "kubectl logs -n kube-system -l k8s-app=kube-dns --tail={lines} --since={since_k8s}",
+ "Flannel": "kubectl logs -n kube-flannel -l app=flannel --tail={lines} --since={since_k8s} 2>/dev/null || kubectl logs -n kube-system -l app=flannel --tail={lines} --since={since_k8s} 2>/dev/null || echo 'Flannel logs not found'",
+ "etcd": "kubectl logs -n kube-system -l component=etcd --tail={lines} --since={since_k8s}",
+ "Events": "kubectl get events -A --sort-by='.lastTimestamp' | tail -{lines}",
+}
+
+POD_LOG_COMMAND = "kubectl logs {pod_ref} --tail={lines} --since={since_k8s} {container_flag}"
+POD_PREVIOUS_LOG_COMMAND = "kubectl logs {pod_ref} --previous --tail={lines} {container_flag} 2>/dev/null || echo 'No previous logs available'"
+
+
+def collect_logs(
+ control_plane_node: dict,
+ source: str,
+ lines: int = 200,
+ since: str = "1 hour ago",
+ since_k8s: str = "1h",
+) -> SSHResult:
+ """Collect logs from a specific source on the cluster."""
+ cmd_template = LOG_SOURCES.get(source)
+ if not cmd_template:
+ return SSHResult(
+ hostname=control_plane_node["ip_address"],
+ command=source,
+ return_code=1,
+ stdout="",
+ stderr=f"Unknown log source: {source}",
+ success=False,
+ )
+
+ command = cmd_template.format(
+ lines=lines,
+ since=since,
+ since_k8s=since_k8s,
+ )
+
+ return run_ssh_command(
+ ip_address=control_plane_node["ip_address"],
+ command=command,
+ ssh_user=control_plane_node.get("ssh_user", "root"),
+ ssh_port=control_plane_node.get("ssh_port", 22),
+ ssh_key_path=control_plane_node.get("ssh_key_path", "~/.ssh/id_rsa"),
+ timeout=60,
+ )
+
+
+def collect_pod_logs(
+ control_plane_node: dict,
+ namespace: str,
+ pod_name: str,
+ container: str = "",
+ lines: int = 200,
+ since_k8s: str = "1h",
+ previous: bool = False,
+) -> SSHResult:
+ """Collect logs from a specific pod."""
+ pod_ref = f"-n {namespace} {pod_name}"
+ container_flag = f"-c {container}" if container else ""
+
+ if previous:
+ command = POD_PREVIOUS_LOG_COMMAND.format(
+ pod_ref=pod_ref,
+ lines=lines,
+ container_flag=container_flag,
+ )
+ else:
+ command = POD_LOG_COMMAND.format(
+ pod_ref=pod_ref,
+ lines=lines,
+ since_k8s=since_k8s,
+ container_flag=container_flag,
+ )
+
+ return run_ssh_command(
+ ip_address=control_plane_node["ip_address"],
+ command=command,
+ ssh_user=control_plane_node.get("ssh_user", "root"),
+ ssh_port=control_plane_node.get("ssh_port", 22),
+ ssh_key_path=control_plane_node.get("ssh_key_path", "~/.ssh/id_rsa"),
+ timeout=60,
+ )
+
+
+def collect_multi_source_logs(
+ control_plane_node: dict,
+ sources: list[str],
+ lines: int = 100,
+ since: str = "1 hour ago",
+ since_k8s: str = "1h",
+) -> dict[str, SSHResult]:
+ """Collect logs from multiple sources."""
+ results = {}
+ for source in sources:
+ results[source] = collect_logs(
+ control_plane_node, source, lines, since, since_k8s
+ )
+ return results
+
+
+# ── Log parsing ───────────────────────────────────────────────────────────
+
+ERROR_PATTERNS = [
+ re.compile(r"\b(?:error|err|fatal|panic|fail(?:ed|ure)?)\b", re.IGNORECASE),
+]
+
+WARNING_PATTERNS = [
+ re.compile(r"\b(?:warn(?:ing)?|deprecated)\b", re.IGNORECASE),
+]
+
+TIMESTAMP_PATTERNS = [
+ re.compile(r"(\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}:\d{2})"),
+ re.compile(r"([A-Z][a-z]{2}\s+\d{1,2}\s+\d{2}:\d{2}:\d{2})"),
+]
+
+
+def parse_log_line(line: str, source: str = "") -> LogEntry:
+ """Parse a single log line into a LogEntry."""
+ entry = LogEntry(raw=line, source=source)
+
+ for pattern in TIMESTAMP_PATTERNS:
+ match = pattern.search(line)
+ if match:
+ entry.timestamp = match.group(1)
+ break
+
+ for pattern in ERROR_PATTERNS:
+ if pattern.search(line):
+ entry.level = "ERROR"
+ break
+ else:
+ for pattern in WARNING_PATTERNS:
+ if pattern.search(line):
+ entry.level = "WARNING"
+ break
+
+ entry.message = line.strip()
+ return entry
+
+
+def analyze_logs(log_text: str, source: str = "") -> LogAnalysisResult:
+ """Analyze a block of log text and extract patterns."""
+ result = LogAnalysisResult()
+ lines = log_text.strip().split("\n")
+ result.total_lines = len(lines)
+
+ error_messages = []
+ warning_messages = []
+
+ for line in lines:
+ if not line.strip():
+ continue
+ entry = parse_log_line(line, source)
+
+ if entry.level == "ERROR":
+ result.error_count += 1
+ normalized = _normalize_error(entry.message)
+ error_messages.append(normalized)
+ elif entry.level == "WARNING":
+ result.warning_count += 1
+ normalized = _normalize_error(entry.message)
+ warning_messages.append(normalized)
+
+ result.error_patterns = dict(Counter(error_messages).most_common(20))
+ result.warning_patterns = dict(Counter(warning_messages).most_common(20))
+
+ return result
+
+
+def _normalize_error(message: str) -> str:
+ """Normalize an error message by removing variable parts for grouping."""
+ normalized = re.sub(r"\b[0-9a-f]{8,}\b", "", message)
+ normalized = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}(:\d+)?", "", normalized)
+ normalized = re.sub(r"\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}:\d{2}[^\s]*", "", normalized)
+ normalized = re.sub(r"pod/[\w-]+", "pod/", normalized)
+ normalized = re.sub(r"node/[\w.-]+", "node/", normalized)
+ if len(normalized) > 150:
+ normalized = normalized[:150] + "..."
+ return normalized
+
+
+def correlate_errors(
+ multi_source_results: dict[str, SSHResult],
+) -> list[dict]:
+ """Correlate errors across multiple log sources to find related issues."""
+ all_errors = []
+
+ for source, result in multi_source_results.items():
+ if not result.success:
+ continue
+ for line in result.stdout.split("\n"):
+ entry = parse_log_line(line, source)
+ if entry.level == "ERROR":
+ all_errors.append({
+ "source": source,
+ "timestamp": entry.timestamp,
+ "message": entry.message,
+ })
+
+ all_errors.sort(key=lambda e: e.get("timestamp", ""))
+
+ correlated = []
+ window_seconds = 30
+ used = set()
+
+ for i, err in enumerate(all_errors):
+ if i in used:
+ continue
+ group = [err]
+ used.add(i)
+
+ for j in range(i + 1, len(all_errors)):
+ if j in used:
+ continue
+ if all_errors[j].get("source") != err.get("source"):
+ group.append(all_errors[j])
+ used.add(j)
+
+ if len(group) > 1:
+ correlated.append({
+ "primary": err,
+ "related": group[1:],
+ "sources_involved": list({e["source"] for e in group}),
+ })
+
+ return correlated
+
+
+# ── LLM-powered analysis ─────────────────────────────────────────────────
+
+def llm_analyze_logs(
+ log_text: str,
+ source: str = "",
+ context: str = "",
+) -> str:
+ """Send log output to the LLM for deep analysis."""
+ truncated = log_text[-8000:] if len(log_text) > 8000 else log_text
+
+ prompt = f"""Analyze the following Kubernetes logs and provide a detailed assessment.
+
+Log Source: {source or 'Multiple sources'}
+Context: {context or 'General analysis'}
+
+== Log Output ==
+{truncated}
+== End Log Output ==
+
+Please provide:
+1. **Error Summary**: List all distinct errors found with frequency
+2. **Root Cause Analysis**: For each error pattern, explain the likely root cause
+3. **Error Correlation**: Identify errors that are likely related / cascading
+4. **Impact Assessment**: What is the impact of these errors on the cluster?
+5. **Remediation Steps**: Specific commands to fix each issue
+6. **Patterns & Trends**: Any concerning patterns (increasing errors, recurring issues)
+"""
+ return query_llm(prompt)
+
+
+def llm_correlate_analysis(
+ multi_source_logs: dict[str, str],
+ issue_description: str = "",
+) -> str:
+ """Send logs from multiple sources to the LLM for cross-source correlation."""
+ log_sections = []
+ for source, log_text in multi_source_logs.items():
+ truncated = log_text[-3000:] if len(log_text) > 3000 else log_text
+ log_sections.append(f"### {source}\n```\n{truncated}\n```\n")
+
+ all_logs = "\n".join(log_sections)
+
+ prompt = f"""Perform a cross-source correlation analysis on these Kubernetes cluster logs.
+
+Issue Description: {issue_description or 'General health analysis'}
+
+== Multi-Source Logs ==
+{all_logs}
+== End Logs ==
+
+Please provide:
+1. **Cross-Source Correlation**: Identify errors that appear related across different components
+2. **Causal Chain**: Determine the sequence of events / root cause chain
+3. **Timeline Reconstruction**: Reconstruct what happened based on timestamps
+4. **Root Cause**: Identify the single most likely root cause
+5. **Remediation Plan**: Step-by-step plan to resolve the issue
+6. **Monitoring Recommendations**: What alerts/metrics should be added to catch this earlier
+"""
+ return query_llm(prompt)
+
+
+def get_pod_list(
+ control_plane_node: dict,
+ namespace: str = "",
+) -> SSHResult:
+ """Get list of pods for the log analysis UI."""
+ ns_flag = f"-n {namespace}" if namespace else "-A"
+ command = f"kubectl get pods {ns_flag} -o custom-columns='NAMESPACE:.metadata.namespace,NAME:.metadata.name,STATUS:.status.phase,CONTAINERS:.spec.containers[*].name' --no-headers"
+ return run_ssh_command(
+ ip_address=control_plane_node["ip_address"],
+ command=command,
+ ssh_user=control_plane_node.get("ssh_user", "root"),
+ ssh_port=control_plane_node.get("ssh_port", 22),
+ ssh_key_path=control_plane_node.get("ssh_key_path", "~/.ssh/id_rsa"),
+ timeout=30,
+ )
diff --git a/k8s-agent/modules/monitoring_setup.py b/k8s-agent/modules/monitoring_setup.py
new file mode 100644
index 0000000..fdf8b42
--- /dev/null
+++ b/k8s-agent/modules/monitoring_setup.py
@@ -0,0 +1,440 @@
+"""Monitoring Setup — Prometheus, Grafana, and dashboard provisioning via SSH."""
+
+from modules.cluster_creator import run_ssh_command, SSHResult
+from modules.llm_client import query_llm
+from modules.profile_manager import ClusterProfile
+
+
+def generate_helm_install_script() -> str:
+ """Generate script to install Helm on the control-plane node."""
+ return """#!/bin/bash
+set -euo pipefail
+
+echo "=== Installing Helm ==="
+
+if command -v helm &>/dev/null; then
+ echo "Helm already installed: $(helm version --short)"
+else
+ curl -fsSL https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
+ echo "Helm installed: $(helm version --short)"
+fi
+"""
+
+
+def generate_prometheus_install_script(namespace: str = "monitoring") -> str:
+ """Generate script to install kube-prometheus-stack via Helm."""
+ return f"""#!/bin/bash
+set -euo pipefail
+
+echo "=== Installing Prometheus Stack ==="
+
+# Add Helm repos
+helm repo add prometheus-community https://prometheus-community.github.io/helm-charts 2>/dev/null || true
+helm repo update
+
+# Create namespace
+kubectl create namespace {namespace} --dry-run=client -o yaml | kubectl apply -f -
+
+# Install kube-prometheus-stack
+helm upgrade --install prometheus prometheus-community/kube-prometheus-stack \\
+ --namespace {namespace} \\
+ --set prometheus.prometheusSpec.retention=15d \\
+ --set prometheus.prometheusSpec.resources.requests.memory=512Mi \\
+ --set prometheus.prometheusSpec.resources.requests.cpu=250m \\
+ --set prometheus.prometheusSpec.resources.limits.memory=2Gi \\
+ --set prometheus.prometheusSpec.resources.limits.cpu=1000m \\
+ --set prometheus.prometheusSpec.storageSpec.volumeClaimTemplate.spec.accessModes[0]=ReadWriteOnce \\
+ --set prometheus.prometheusSpec.storageSpec.volumeClaimTemplate.spec.resources.requests.storage=50Gi \\
+ --set alertmanager.alertmanagerSpec.resources.requests.memory=128Mi \\
+ --set alertmanager.alertmanagerSpec.resources.requests.cpu=50m \\
+ --set grafana.enabled=true \\
+ --set grafana.adminPassword=admin \\
+ --set grafana.persistence.enabled=true \\
+ --set grafana.persistence.size=10Gi \\
+ --set grafana.resources.requests.memory=256Mi \\
+ --set grafana.resources.requests.cpu=100m \\
+ --set grafana.resources.limits.memory=512Mi \\
+ --set grafana.resources.limits.cpu=500m \\
+ --set grafana.sidecar.dashboards.enabled=true \\
+ --set grafana.sidecar.dashboards.searchNamespace=ALL \\
+ --set prometheus.prometheusSpec.serviceMonitorSelectorNilUsesHelmValues=false \\
+ --set prometheus.prometheusSpec.podMonitorSelectorNilUsesHelmValues=false \\
+ --wait --timeout 10m
+
+echo ""
+echo "=== Prometheus Stack installed ==="
+echo ""
+kubectl -n {namespace} get pods
+"""
+
+
+def generate_standalone_grafana_script(namespace: str = "monitoring") -> str:
+ """Generate script to install standalone Grafana with provisioned dashboards."""
+ return f"""#!/bin/bash
+set -euo pipefail
+
+echo "=== Installing Standalone Grafana ==="
+
+helm repo add grafana https://grafana.github.io/helm-charts 2>/dev/null || true
+helm repo update
+
+kubectl create namespace {namespace} --dry-run=client -o yaml | kubectl apply -f -
+
+helm upgrade --install grafana grafana/grafana \\
+ --namespace {namespace} \\
+ --set adminPassword=admin \\
+ --set persistence.enabled=true \\
+ --set persistence.size=10Gi \\
+ --set resources.requests.memory=256Mi \\
+ --set resources.requests.cpu=100m \\
+ --set resources.limits.memory=512Mi \\
+ --set resources.limits.cpu=500m \\
+ --set sidecar.dashboards.enabled=true \\
+ --set sidecar.dashboards.searchNamespace=ALL \\
+ --set sidecar.datasources.enabled=true \\
+ --set 'datasources.datasources\\.yaml.apiVersion=1' \\
+ --set 'datasources.datasources\\.yaml.datasources[0].name=Prometheus' \\
+ --set 'datasources.datasources\\.yaml.datasources[0].type=prometheus' \\
+ --set 'datasources.datasources\\.yaml.datasources[0].url=http://prometheus-kube-prometheus-prometheus.{namespace}.svc:9090' \\
+ --set 'datasources.datasources\\.yaml.datasources[0].access=proxy' \\
+ --set 'datasources.datasources\\.yaml.datasources[0].isDefault=true' \\
+ --wait --timeout 5m
+
+echo ""
+echo "=== Grafana installed ==="
+kubectl -n {namespace} get pods -l app.kubernetes.io/name=grafana
+"""
+
+
+GRAFANA_DASHBOARDS = {
+ "cluster-overview": {
+ "name": "Kubernetes Cluster Overview",
+ "description": "Overall cluster health, node status, resource utilization",
+ "gnet_id": 15520,
+ },
+ "node-exporter": {
+ "name": "Node Exporter Full",
+ "description": "Detailed node metrics — CPU, memory, disk, network",
+ "gnet_id": 1860,
+ },
+ "pod-monitoring": {
+ "name": "Kubernetes Pods",
+ "description": "Pod-level CPU, memory, network, restarts",
+ "gnet_id": 15760,
+ },
+ "namespace-resources": {
+ "name": "Namespace Resources",
+ "description": "Resource usage per namespace with quota tracking",
+ "gnet_id": 15758,
+ },
+ "coredns": {
+ "name": "CoreDNS",
+ "description": "DNS query rates, latency, errors",
+ "gnet_id": 15762,
+ },
+ "etcd": {
+ "name": "etcd",
+ "description": "etcd cluster health, leader changes, WAL sync duration",
+ "gnet_id": 3070,
+ },
+ "api-server": {
+ "name": "Kubernetes API Server",
+ "description": "API server request rates, latency, errors",
+ "gnet_id": 15761,
+ },
+ "persistent-volumes": {
+ "name": "Persistent Volumes",
+ "description": "PV/PVC usage and capacity tracking",
+ "gnet_id": 13646,
+ },
+}
+
+
+def generate_dashboard_import_script(
+ dashboard_keys: list[str],
+ namespace: str = "monitoring",
+) -> str:
+ """Generate script to import Grafana dashboards as ConfigMaps."""
+ configmaps = []
+ for key in dashboard_keys:
+ dash = GRAFANA_DASHBOARDS.get(key)
+ if not dash:
+ continue
+ configmaps.append(f"""
+# Import: {dash['name']}
+cat <<'DASHEOF' | kubectl apply -f -
+apiVersion: v1
+kind: ConfigMap
+metadata:
+ name: grafana-dashboard-{key}
+ namespace: {namespace}
+ labels:
+ grafana_dashboard: "1"
+data:
+ {key}.json: |
+ {{
+ "annotations": {{"list": []}},
+ "description": "{dash['description']}",
+ "editable": true,
+ "gnetId": {dash['gnet_id']},
+ "title": "{dash['name']}",
+ "uid": "{key}",
+ "version": 1,
+ "__inputs": [
+ {{
+ "name": "DS_PROMETHEUS",
+ "label": "Prometheus",
+ "type": "datasource",
+ "pluginId": "prometheus"
+ }}
+ ]
+ }}
+DASHEOF
+echo " Imported: {dash['name']} (grafana.net #{dash['gnet_id']})"
+""")
+
+ script_body = "\n".join(configmaps)
+
+ return f"""#!/bin/bash
+set -euo pipefail
+
+echo "=== Importing Grafana Dashboards ==="
+{script_body}
+
+# Also import from grafana.net directly via Grafana API
+GRAFANA_POD=$(kubectl -n {namespace} get pods -l app.kubernetes.io/name=grafana -o jsonpath='{{.items[0].metadata.name}}' 2>/dev/null || echo "")
+
+if [ -n "$GRAFANA_POD" ]; then
+ echo ""
+ echo ">> Importing full dashboards from grafana.net via API..."
+ kubectl -n {namespace} port-forward "$GRAFANA_POD" 3000:3000 &
+ PF_PID=$!
+ sleep 3
+
+ for gnet_id in {' '.join(str(GRAFANA_DASHBOARDS[k]['gnet_id']) for k in dashboard_keys if k in GRAFANA_DASHBOARDS)}; do
+ curl -s -X POST http://localhost:3000/api/dashboards/import \\
+ -H "Content-Type: application/json" \\
+ -u admin:admin \\
+ -d "{{
+ \\"dashboard\\": {{\\"id\\": null}},
+ \\"overwrite\\": true,
+ \\"inputs\\": [{{\\"name\\": \\"DS_PROMETHEUS\\", \\"type\\": \\"datasource\\", \\"pluginId\\": \\"prometheus\\", \\"value\\": \\"Prometheus\\"}}],
+ \\"folderId\\": 0,
+ \\"gnetId\\": $gnet_id
+ }}" 2>/dev/null && echo " Imported grafana.net #$gnet_id" || echo " Failed grafana.net #$gnet_id (non-critical)"
+ done
+
+ kill $PF_PID 2>/dev/null || true
+fi
+
+echo ""
+echo "=== Dashboard import complete ==="
+"""
+
+
+def generate_alerting_rules_script(namespace: str = "monitoring") -> str:
+ """Generate PrometheusRule resources for common K8s alerts."""
+ return f"""#!/bin/bash
+set -euo pipefail
+
+echo "=== Installing Alert Rules ==="
+
+cat <<'EOF' | kubectl apply -f -
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+ name: k8s-cluster-alerts
+ namespace: {namespace}
+ labels:
+ release: prometheus
+spec:
+ groups:
+ - name: k8s-node-alerts
+ rules:
+ - alert: NodeNotReady
+ expr: kube_node_status_condition{{condition="Ready",status="true"}} == 0
+ for: 5m
+ labels:
+ severity: critical
+ annotations:
+ summary: "Node {{{{ $labels.node }}}} is not ready"
+ description: "Node has been in NotReady state for more than 5 minutes."
+ - alert: NodeHighCPU
+ expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{{mode="idle"}}[5m])) * 100) > 85
+ for: 10m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Node {{{{ $labels.instance }}}} has high CPU usage"
+ description: "CPU usage is above 85% for more than 10 minutes."
+ - alert: NodeHighMemory
+ expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 > 85
+ for: 10m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Node {{{{ $labels.instance }}}} has high memory usage"
+ description: "Memory usage is above 85% for more than 10 minutes."
+ - alert: NodeDiskPressure
+ expr: (1 - node_filesystem_avail_bytes{{mountpoint="/"}} / node_filesystem_size_bytes{{mountpoint="/"}}) * 100 > 85
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Node {{{{ $labels.instance }}}} disk usage is high"
+ description: "Root filesystem usage is above 85%."
+ - name: k8s-pod-alerts
+ rules:
+ - alert: PodCrashLooping
+ expr: rate(kube_pod_container_status_restarts_total[15m]) * 60 * 15 > 5
+ for: 5m
+ labels:
+ severity: critical
+ annotations:
+ summary: "Pod {{{{ $labels.namespace }}}}/{{{{ $labels.pod }}}} is crash looping"
+ description: "Pod has restarted more than 5 times in the last 15 minutes."
+ - alert: PodNotReady
+ expr: kube_pod_status_ready{{condition="true"}} == 0
+ for: 10m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Pod {{{{ $labels.namespace }}}}/{{{{ $labels.pod }}}} is not ready"
+ description: "Pod has been in a non-ready state for more than 10 minutes."
+ - alert: PVCAlmostFull
+ expr: kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes * 100 > 85
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: "PVC {{{{ $labels.persistentvolumeclaim }}}} is almost full"
+ description: "PVC in namespace {{{{ $labels.namespace }}}} is over 85% full."
+ - name: k8s-etcd-alerts
+ rules:
+ - alert: EtcdHighLatency
+ expr: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) > 0.5
+ for: 10m
+ labels:
+ severity: warning
+ annotations:
+ summary: "etcd WAL fsync latency is high"
+ description: "99th percentile etcd WAL fsync duration exceeds 500ms."
+EOF
+
+echo "=== Alert rules installed ==="
+"""
+
+
+def install_helm(control_plane_node: dict) -> SSHResult:
+ """Install Helm on the control-plane node."""
+ return run_ssh_command(
+ ip_address=control_plane_node["ip_address"],
+ command=generate_helm_install_script(),
+ ssh_user=control_plane_node.get("ssh_user", "root"),
+ ssh_port=control_plane_node.get("ssh_port", 22),
+ ssh_key_path=control_plane_node.get("ssh_key_path", "~/.ssh/id_rsa"),
+ timeout=120,
+ )
+
+
+def install_prometheus_stack(
+ control_plane_node: dict,
+ namespace: str = "monitoring",
+) -> SSHResult:
+ """Install the full kube-prometheus-stack."""
+ return run_ssh_command(
+ ip_address=control_plane_node["ip_address"],
+ command=generate_prometheus_install_script(namespace),
+ ssh_user=control_plane_node.get("ssh_user", "root"),
+ ssh_port=control_plane_node.get("ssh_port", 22),
+ ssh_key_path=control_plane_node.get("ssh_key_path", "~/.ssh/id_rsa"),
+ timeout=900,
+ )
+
+
+def install_dashboards(
+ control_plane_node: dict,
+ dashboard_keys: list[str],
+ namespace: str = "monitoring",
+) -> SSHResult:
+ """Import selected Grafana dashboards."""
+ return run_ssh_command(
+ ip_address=control_plane_node["ip_address"],
+ command=generate_dashboard_import_script(dashboard_keys, namespace),
+ ssh_user=control_plane_node.get("ssh_user", "root"),
+ ssh_port=control_plane_node.get("ssh_port", 22),
+ ssh_key_path=control_plane_node.get("ssh_key_path", "~/.ssh/id_rsa"),
+ timeout=300,
+ )
+
+
+def install_alert_rules(
+ control_plane_node: dict,
+ namespace: str = "monitoring",
+) -> SSHResult:
+ """Install Prometheus alerting rules."""
+ return run_ssh_command(
+ ip_address=control_plane_node["ip_address"],
+ command=generate_alerting_rules_script(namespace),
+ ssh_user=control_plane_node.get("ssh_user", "root"),
+ ssh_port=control_plane_node.get("ssh_port", 22),
+ ssh_key_path=control_plane_node.get("ssh_key_path", "~/.ssh/id_rsa"),
+ timeout=60,
+ )
+
+
+def get_monitoring_status(
+ control_plane_node: dict,
+ namespace: str = "monitoring",
+) -> SSHResult:
+ """Check the status of the monitoring stack."""
+ command = f"""
+echo "=== Monitoring Stack Status ==="
+echo ""
+echo ">> Pods:"
+kubectl -n {namespace} get pods -o wide
+echo ""
+echo ">> Services:"
+kubectl -n {namespace} get svc
+echo ""
+echo ">> PVCs:"
+kubectl -n {namespace} get pvc
+echo ""
+echo ">> PrometheusRules:"
+kubectl -n {namespace} get prometheusrules 2>/dev/null || echo "No PrometheusRules found"
+echo ""
+echo ">> ServiceMonitors:"
+kubectl -n {namespace} get servicemonitors 2>/dev/null || echo "No ServiceMonitors found"
+"""
+ return run_ssh_command(
+ ip_address=control_plane_node["ip_address"],
+ command=command,
+ ssh_user=control_plane_node.get("ssh_user", "root"),
+ ssh_port=control_plane_node.get("ssh_port", 22),
+ ssh_key_path=control_plane_node.get("ssh_key_path", "~/.ssh/id_rsa"),
+ timeout=30,
+ )
+
+
+def get_monitoring_advice(
+ profile: ClusterProfile,
+ current_status: str = "",
+) -> str:
+ """Ask the LLM for monitoring setup advice."""
+ prompt = f"""I have a Kubernetes cluster with the following setup:
+- Kubernetes: {profile.kubernetes_version}
+- Runtime: CRI-O {profile.crio_version}
+- CNI: Flannel
+- Nodes: {len(profile.nodes)} ({len(profile.get_control_plane_nodes())} control-plane, {len(profile.get_worker_nodes())} workers)
+
+Current monitoring status:
+{current_status or 'Not yet installed'}
+
+Please recommend:
+1. The optimal Prometheus retention and resource settings for this cluster size
+2. Essential Grafana dashboards to install
+3. Critical alerting rules beyond the standard set
+4. Any additional exporters I should install (e.g., blackbox, SNMP)
+5. Log aggregation recommendations (Loki, EFK, etc.)
+"""
+ return query_llm(prompt)
diff --git a/k8s-agent/modules/profile_manager.py b/k8s-agent/modules/profile_manager.py
new file mode 100644
index 0000000..a4ad9cf
--- /dev/null
+++ b/k8s-agent/modules/profile_manager.py
@@ -0,0 +1,119 @@
+"""Cluster Profile Manager — CRUD operations for K8s cluster profiles."""
+
+import json
+import os
+import time
+from dataclasses import asdict, dataclass, field
+from typing import Optional
+
+import config
+
+
+@dataclass
+class NodeInfo:
+ """Represents a node in the cluster."""
+
+ hostname: str
+ ip_address: str
+ role: str # "control-plane" or "worker"
+ ssh_user: str = "root"
+ ssh_port: int = 22
+ ssh_key_path: str = "~/.ssh/id_rsa"
+
+
+@dataclass
+class ClusterProfile:
+ """Represents a complete cluster profile configuration."""
+
+ name: str
+ description: str = ""
+ kubernetes_version: str = "1.30"
+ crio_version: str = "1.30"
+ cni_plugin: str = "flannel"
+ pod_cidr: str = "10.244.0.0/16"
+ service_cidr: str = "10.96.0.0/12"
+ dns_domain: str = "cluster.local"
+ nodes: list[dict] = field(default_factory=list)
+ created_at: str = ""
+ updated_at: str = ""
+ status: str = "draft" # draft, provisioning, active, error
+ kubeconfig_path: str = ""
+ monitoring_enabled: bool = False
+ pod_security_standard: str = "restricted" # privileged, baseline, restricted
+
+ def get_control_plane_nodes(self) -> list[dict]:
+ return [n for n in self.nodes if n.get("role") == "control-plane"]
+
+ def get_worker_nodes(self) -> list[dict]:
+ return [n for n in self.nodes if n.get("role") == "worker"]
+
+
+def _profile_path(name: str) -> str:
+ """Return the file path for a given profile name."""
+ safe_name = name.replace(" ", "_").replace("/", "_").lower()
+ return os.path.join(config.PROFILES_DIR, f"{safe_name}.json")
+
+
+def save_profile(profile: ClusterProfile) -> str:
+ """Save a cluster profile to disk.
+
+ Returns the file path where the profile was saved.
+ """
+ now = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
+ if not profile.created_at:
+ profile.created_at = now
+ profile.updated_at = now
+
+ path = _profile_path(profile.name)
+ with open(path, "w") as f:
+ json.dump(asdict(profile), f, indent=2)
+ return path
+
+
+def load_profile(name: str) -> Optional[ClusterProfile]:
+ """Load a cluster profile from disk by name."""
+ path = _profile_path(name)
+ if not os.path.exists(path):
+ return None
+ with open(path, "r") as f:
+ data = json.load(f)
+ return ClusterProfile(**data)
+
+
+def list_profiles() -> list[ClusterProfile]:
+ """List all saved cluster profiles."""
+ profiles = []
+ if not os.path.exists(config.PROFILES_DIR):
+ return profiles
+ for filename in sorted(os.listdir(config.PROFILES_DIR)):
+ if filename.endswith(".json"):
+ filepath = os.path.join(config.PROFILES_DIR, filename)
+ try:
+ with open(filepath, "r") as f:
+ data = json.load(f)
+ profiles.append(ClusterProfile(**data))
+ except (json.JSONDecodeError, TypeError):
+ continue
+ return profiles
+
+
+def delete_profile(name: str) -> bool:
+ """Delete a cluster profile by name.
+
+ Returns True if the profile was deleted, False if it didn't exist.
+ """
+ path = _profile_path(name)
+ if os.path.exists(path):
+ os.remove(path)
+ return True
+ return False
+
+
+def update_profile_status(name: str, status: str) -> bool:
+ """Update the status field of an existing profile."""
+ profile = load_profile(name)
+ if profile is None:
+ return False
+ profile.status = status
+ save_profile(profile)
+ return True
diff --git a/k8s-agent/requirements.txt b/k8s-agent/requirements.txt
new file mode 100644
index 0000000..effeada
--- /dev/null
+++ b/k8s-agent/requirements.txt
@@ -0,0 +1,6 @@
+streamlit>=1.32.0
+requests>=2.31.0
+pyyaml>=6.0.1
+pandas>=2.2.0
+plotly>=5.18.0
+streamlit-option-menu>=0.3.12
diff --git a/k8s-agent/templates/.gitkeep b/k8s-agent/templates/.gitkeep
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/k8s-agent/templates/.gitkeep
@@ -0,0 +1 @@
+
From dc1fb13c25429f4e01358a3e5bc683812301f26a Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Mon, 6 Apr 2026 09:49:27 +0000
Subject: [PATCH 02/31] Clean up unused imports and dependencies
- Remove unused NodeInfo import from app.py
- Remove unused pyyaml and pandas from requirements.txt
---
k8s-agent/app.py | 1 -
k8s-agent/requirements.txt | 2 --
2 files changed, 3 deletions(-)
diff --git a/k8s-agent/app.py b/k8s-agent/app.py
index 4225a81..59ff5aa 100644
--- a/k8s-agent/app.py
+++ b/k8s-agent/app.py
@@ -13,7 +13,6 @@
import config
from modules.profile_manager import (
ClusterProfile,
- NodeInfo,
save_profile,
load_profile,
list_profiles,
diff --git a/k8s-agent/requirements.txt b/k8s-agent/requirements.txt
index effeada..96fb8cf 100644
--- a/k8s-agent/requirements.txt
+++ b/k8s-agent/requirements.txt
@@ -1,6 +1,4 @@
streamlit>=1.32.0
requests>=2.31.0
-pyyaml>=6.0.1
-pandas>=2.2.0
plotly>=5.18.0
streamlit-option-menu>=0.3.12
From 5566c5e43aad878bdbbe05b0e9c1eb3b124354c9 Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Mon, 6 Apr 2026 09:59:19 +0000
Subject: [PATCH 03/31] Add CRI-O custom storage paths and proxy settings for
master node
- Add crio_root, crio_runroot, kubelet_root, log_root fields to ClusterProfile
- Add http_proxy, https_proxy, no_proxy, http_proxy_alt, https_proxy_alt fields
- Update generated scripts to configure CRI-O storage paths via crio.conf.d
- Update control-plane init script to use custom audit log dir and kubelet root
- Add proxy env vars to common setup and control-plane init scripts
- Add Storage Paths and Proxy Settings sections to Profile Manager UI
- Show storage/proxy details in Manage Profiles view and profile summary
---
k8s-agent/app.py | 105 +++++++++++++++++++++++++
k8s-agent/modules/cluster_creator.py | 111 +++++++++++++++++++++++++--
k8s-agent/modules/profile_manager.py | 11 +++
3 files changed, 220 insertions(+), 7 deletions(-)
diff --git a/k8s-agent/app.py b/k8s-agent/app.py
index 59ff5aa..471d1a0 100644
--- a/k8s-agent/app.py
+++ b/k8s-agent/app.py
@@ -269,6 +269,76 @@ def page_profile_manager():
service_cidr = st.text_input("Service CIDR", value="10.96.0.0/12")
dns_domain = st.text_input("DNS Domain", value="cluster.local")
+ st.divider()
+ st.markdown("### Storage Paths")
+ st.markdown(
+ "Configure where CRI-O stores container images, pods, and logs. "
+ "Change these to use a dedicated disk instead of the default `/var/lib`."
+ )
+ scol1, scol2 = st.columns(2)
+ with scol1:
+ crio_root = st.text_input(
+ "CRI-O Storage Root",
+ value="/var/lib/containers/storage",
+ help="Root directory for CRI-O container/image storage (default: /var/lib/containers/storage)",
+ )
+ crio_runroot = st.text_input(
+ "CRI-O Run Root",
+ value="/run/containers/storage",
+ help="Runtime root for CRI-O (default: /run/containers/storage)",
+ )
+ with scol2:
+ kubelet_root = st.text_input(
+ "Kubelet Data Directory",
+ value="/var/lib/kubelet",
+ help="Kubelet data directory for pods, volumes, etc. (default: /var/lib/kubelet)",
+ )
+ log_root = st.text_input(
+ "Log Root Directory",
+ value="/var/log",
+ help="Base directory for all logs — CRI-O pod logs, kubernetes audit logs, etc. (default: /var/log)",
+ )
+
+ st.divider()
+ st.markdown("### Proxy Settings (Master Node)")
+ st.markdown(
+ "Configure HTTP/HTTPS proxy for the master/control-plane node. "
+ "These are used during package installation and cluster initialization."
+ )
+ pcol1, pcol2 = st.columns(2)
+ with pcol1:
+ http_proxy = st.text_input(
+ "HTTP Proxy",
+ value="",
+ placeholder="http://proxy.example.com:8080",
+ help="Primary HTTP proxy for outbound connections",
+ )
+ https_proxy = st.text_input(
+ "HTTPS Proxy",
+ value="",
+ placeholder="http://proxy.example.com:8443",
+ help="Primary HTTPS proxy for outbound connections",
+ )
+ no_proxy = st.text_input(
+ "No Proxy",
+ value="",
+ placeholder="localhost,127.0.0.1,10.96.0.0/12,10.244.0.0/16",
+ help="Comma-separated list of hosts/CIDRs to bypass proxy",
+ )
+ with pcol2:
+ http_proxy_alt = st.text_input(
+ "Alternate HTTP Proxy",
+ value="",
+ placeholder="http://backup-proxy.example.com:8080",
+ help="Fallback HTTP proxy if the primary is unavailable",
+ )
+ https_proxy_alt = st.text_input(
+ "Alternate HTTPS Proxy",
+ value="",
+ placeholder="http://backup-proxy.example.com:8443",
+ help="Fallback HTTPS proxy if the primary is unavailable",
+ )
+
st.divider()
st.markdown("### Nodes")
st.markdown("Define your control-plane and worker nodes.")
@@ -322,6 +392,15 @@ def page_profile_manager():
dns_domain=dns_domain,
nodes=valid_nodes,
pod_security_standard=pod_security,
+ crio_root=crio_root,
+ crio_runroot=crio_runroot,
+ kubelet_root=kubelet_root,
+ log_root=log_root,
+ http_proxy=http_proxy,
+ https_proxy=https_proxy,
+ no_proxy=no_proxy,
+ http_proxy_alt=http_proxy_alt,
+ https_proxy_alt=https_proxy_alt,
)
path = save_profile(profile)
st.session_state.active_profile = name
@@ -343,6 +422,12 @@ def page_profile_manager():
st.markdown(f"**Kubernetes:** {profile.kubernetes_version} | **CRI-O:** {profile.crio_version}")
st.markdown(f"**Pod CIDR:** {profile.pod_cidr} | **Service CIDR:** {profile.service_cidr}")
st.markdown(f"**Pod Security:** {profile.pod_security_standard}")
+ st.markdown(f"**CRI-O Root:** `{profile.crio_root}` | **Kubelet Dir:** `{profile.kubelet_root}`")
+ st.markdown(f"**Log Root:** `{profile.log_root}`")
+ if profile.http_proxy or profile.https_proxy:
+ st.markdown(f"**Proxy:** `{profile.http_proxy or profile.https_proxy}`")
+ if profile.http_proxy_alt or profile.https_proxy_alt:
+ st.markdown(f"**Alt Proxy:** `{profile.http_proxy_alt or profile.https_proxy_alt}`")
with col2:
st.markdown("**Nodes:**")
for node in profile.nodes:
@@ -1086,6 +1171,26 @@ def _show_profile_summary(profile: ClusterProfile):
cols[3].metric("CNI", "Flannel")
cols[4].metric("Nodes", f"{len(profile.get_control_plane_nodes())} CP + {len(profile.get_worker_nodes())} W")
+ with st.expander("Storage & Proxy Details", expanded=False):
+ scol1, scol2, scol3 = st.columns(3)
+ with scol1:
+ st.markdown(f"**CRI-O Root:** `{profile.crio_root}`")
+ st.markdown(f"**CRI-O RunRoot:** `{profile.crio_runroot}`")
+ with scol2:
+ st.markdown(f"**Kubelet Dir:** `{profile.kubelet_root}`")
+ st.markdown(f"**Log Root:** `{profile.log_root}`")
+ with scol3:
+ if profile.http_proxy or profile.https_proxy:
+ st.markdown(f"**HTTP Proxy:** `{profile.http_proxy or 'N/A'}`")
+ st.markdown(f"**HTTPS Proxy:** `{profile.https_proxy or 'N/A'}`")
+ if profile.no_proxy:
+ st.markdown(f"**No Proxy:** `{profile.no_proxy}`")
+ if profile.http_proxy_alt or profile.https_proxy_alt:
+ st.markdown(f"**Alt HTTP Proxy:** `{profile.http_proxy_alt or 'N/A'}`")
+ st.markdown(f"**Alt HTTPS Proxy:** `{profile.https_proxy_alt or 'N/A'}`")
+ if not (profile.http_proxy or profile.https_proxy or profile.http_proxy_alt or profile.https_proxy_alt):
+ st.markdown("**Proxy:** Not configured")
+
# ── Main Router ───────────────────────────────────────────────────────────
diff --git a/k8s-agent/modules/cluster_creator.py b/k8s-agent/modules/cluster_creator.py
index ef89dea..830a56e 100644
--- a/k8s-agent/modules/cluster_creator.py
+++ b/k8s-agent/modules/cluster_creator.py
@@ -101,16 +101,76 @@ def test_ssh_connectivity(node: dict) -> SSHResult:
)
+def _proxy_env_block(profile: ClusterProfile) -> str:
+ """Generate shell export lines for proxy environment variables."""
+ lines = []
+ proxy = profile.http_proxy or profile.http_proxy_alt
+ proxys = profile.https_proxy or profile.https_proxy_alt
+ if proxy:
+ lines.append(f'export http_proxy="{proxy}"')
+ lines.append(f'export HTTP_PROXY="{proxy}"')
+ if proxys:
+ lines.append(f'export https_proxy="{proxys}"')
+ lines.append(f'export HTTPS_PROXY="{proxys}"')
+ if profile.no_proxy:
+ lines.append(f'export no_proxy="{profile.no_proxy}"')
+ lines.append(f'export NO_PROXY="{profile.no_proxy}"')
+ return "\n".join(lines)
+
+
def generate_common_setup_script(profile: ClusterProfile) -> str:
"""Generate the common setup script that runs on ALL nodes (control-plane + workers)."""
+ proxy_block = _proxy_env_block(profile)
+ proxy_section = ""
+ if proxy_block:
+ proxy_section = f"""
+# ── 0. Proxy configuration ───────────────────────────────────────────────
+echo ">> Configuring proxy settings..."
+{proxy_block}
+
+# Persist proxy in /etc/environment for all users
+cat >> /etc/environment <> Configuring CRI-O custom storage root: {profile.crio_root}"
+mkdir -p "{profile.crio_root}"
+mkdir -p "{profile.crio_runroot}"
+"""
+
+ kubelet_section = ""
+ if profile.kubelet_root != "/var/lib/kubelet":
+ kubelet_section = f"""
+# ── Custom kubelet data directory ────────────────────────────────────────
+echo ">> Configuring kubelet data directory: {profile.kubelet_root}"
+mkdir -p "{profile.kubelet_root}"
+"""
+
+ log_section = ""
+ if profile.log_root != "/var/log":
+ log_section = f"""
+# ── Custom log directory ─────────────────────────────────────────────────
+echo ">> Configuring custom log root: {profile.log_root}"
+mkdir -p "{profile.log_root}/pods"
+mkdir -p "{profile.log_root}/containers"
+"""
+
return f"""#!/bin/bash
set -euo pipefail
echo "=== K8s Node Common Setup ==="
echo "Kubernetes Version: {profile.kubernetes_version}"
echo "CRI-O Version: {profile.crio_version}"
+echo "CRI-O Storage Root: {profile.crio_root}"
+echo "Kubelet Data Dir: {profile.kubelet_root}"
+echo "Log Root: {profile.log_root}"
echo "Timestamp: $(date -u)"
-
+{proxy_section}{crio_storage_section}{kubelet_section}{log_section}
# ── 1. System prerequisites ──────────────────────────────────────────────
echo ">> Disabling swap..."
swapoff -a
@@ -182,8 +242,19 @@ def generate_common_setup_script(profile: ClusterProfile) -> str:
fi
systemctl daemon-reload
+
+# ── Configure CRI-O storage paths ────────────────────────────────────────
+echo ">> Configuring CRI-O storage to {profile.crio_root}..."
+mkdir -p /etc/crio/crio.conf.d
+cat > /etc/crio/crio.conf.d/01-storage.conf <> CRI-O installed and running."
+echo ">> CRI-O installed and configured (storage: {profile.crio_root})."
# ── 3. Install kubeadm, kubelet, kubectl ──────────────────────────────────
echo ">> Installing Kubernetes {profile.kubernetes_version} components..."
@@ -223,12 +294,31 @@ def generate_control_plane_init_script(profile: ClusterProfile) -> str:
cp_nodes = profile.get_control_plane_nodes()
cp_ip = cp_nodes[0]["ip_address"] if cp_nodes else "CONTROL_PLANE_IP"
+ # Build proxy environment block for the control-plane
+ proxy_block = _proxy_env_block(profile)
+ proxy_section = ""
+ if proxy_block:
+ proxy_section = f"""
+# ── Proxy configuration (master node) ───────────────────────────────────
+echo ">> Setting proxy environment for kubeadm..."
+{proxy_block}
+"""
+
+ # Audit log path respects custom log_root
+ audit_log_dir = f"{profile.log_root}/kubernetes"
+
+ # Extra kubelet args for custom root dir
+ kubelet_extra = ' container-runtime-endpoint: "unix:///var/run/crio/crio.sock"'
+ if profile.kubelet_root != "/var/lib/kubelet":
+ kubelet_extra += f'\n root-dir: "{profile.kubelet_root}"'
+
return f"""#!/bin/bash
set -euo pipefail
echo "=== Initializing Kubernetes Control Plane ==="
-
+{proxy_section}
# ── kubeadm init ──────────────────────────────────────────────────────────
+mkdir -p "{audit_log_dir}"
cat > /tmp/kubeadm-config.yaml < str:
nodeRegistration:
criSocket: "unix:///var/run/crio/crio.sock"
kubeletExtraArgs:
- container-runtime-endpoint: "unix:///var/run/crio/crio.sock"
+{kubelet_extra}
---
apiVersion: kubeadm.k8s.io/v1beta3
kind: ClusterConfiguration
@@ -252,14 +342,14 @@ def generate_control_plane_init_script(profile: ClusterProfile) -> str:
extraArgs:
authorization-mode: "Node,RBAC"
enable-admission-plugins: "NodeRestriction,PodSecurity"
- audit-log-path: "/var/log/kubernetes/audit.log"
+ audit-log-path: "{audit_log_dir}/audit.log"
audit-log-maxage: "30"
audit-log-maxbackup: "10"
audit-log-maxsize: "100"
extraVolumes:
- name: audit-log
- hostPath: "/var/log/kubernetes"
- mountPath: "/var/log/kubernetes"
+ hostPath: "{audit_log_dir}"
+ mountPath: "{audit_log_dir}"
pathType: DirectoryOrCreate
controllerManager:
extraArgs:
@@ -530,6 +620,13 @@ def get_llm_cluster_advice(profile: ClusterProfile, context: str = "") -> str:
- Pod CIDR: {profile.pod_cidr}
- Service CIDR: {profile.service_cidr}
- Pod Security Standard: {profile.pod_security_standard}
+- CRI-O Storage Root: {profile.crio_root}
+- Kubelet Data Dir: {profile.kubelet_root}
+- Log Root: {profile.log_root}
+- HTTP Proxy: {profile.http_proxy or 'none'}
+- HTTPS Proxy: {profile.https_proxy or 'none'}
+- Alternate HTTP Proxy: {profile.http_proxy_alt or 'none'}
+- Alternate HTTPS Proxy: {profile.https_proxy_alt or 'none'}
Nodes:
{nodes_str}
diff --git a/k8s-agent/modules/profile_manager.py b/k8s-agent/modules/profile_manager.py
index a4ad9cf..9754801 100644
--- a/k8s-agent/modules/profile_manager.py
+++ b/k8s-agent/modules/profile_manager.py
@@ -40,6 +40,17 @@ class ClusterProfile:
kubeconfig_path: str = ""
monitoring_enabled: bool = False
pod_security_standard: str = "restricted" # privileged, baseline, restricted
+ # CRI-O storage paths (override defaults in /var/lib)
+ crio_root: str = "/var/lib/containers/storage" # container storage root
+ crio_runroot: str = "/run/containers/storage" # runtime root
+ kubelet_root: str = "/var/lib/kubelet" # kubelet data dir
+ log_root: str = "/var/log" # base log directory
+ # Proxy settings for master node
+ http_proxy: str = ""
+ https_proxy: str = ""
+ no_proxy: str = ""
+ http_proxy_alt: str = "" # alternate proxy
+ https_proxy_alt: str = "" # alternate proxy
def get_control_plane_nodes(self) -> list[dict]:
return [n for n in self.nodes if n.get("role") == "control-plane"]
From b47ee9e8be44d3ff60dbcd560980e4a5ad47fc35 Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Mon, 6 Apr 2026 10:16:25 +0000
Subject: [PATCH 04/31] Add gitignore entries for profile data and pycache
---
.gitignore | 2 ++
1 file changed, 2 insertions(+)
diff --git a/.gitignore b/.gitignore
index 2a1bb18..dc0bed2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -23,3 +23,5 @@ charts/*/charts/
*.key
kubeconfig*
k8s-agent/__pycache__/
+k8s-agent/data/profiles/*.json
+k8s-agent/modules/__pycache__/
From cb3658f682f1d95f651afc823b7e7673d41baa6b Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Mon, 6 Apr 2026 10:34:30 +0000
Subject: [PATCH 05/31] Add step-by-step SSH provisioning with granular
per-step progress
---
k8s-agent/app.py | 170 ++++++---
k8s-agent/modules/cluster_creator.py | 523 ++++++++++++++++++++++++++-
2 files changed, 637 insertions(+), 56 deletions(-)
diff --git a/k8s-agent/app.py b/k8s-agent/app.py
index 471d1a0..c41ba61 100644
--- a/k8s-agent/app.py
+++ b/k8s-agent/app.py
@@ -32,6 +32,12 @@
apply_best_practices,
get_cluster_status,
get_llm_cluster_advice,
+ ProvisionStep,
+ _run_step,
+ get_common_setup_steps,
+ get_control_plane_steps,
+ get_worker_join_steps,
+ get_best_practices_steps,
)
from modules.cluster_debugger import (
DIAGNOSTIC_COMMANDS,
@@ -522,8 +528,8 @@ def page_cluster_creation():
with tab_provision:
st.markdown("### Automated Cluster Provisioning")
st.warning(
- "This will SSH into each node and install Kubernetes components. "
- "Ensure all nodes are accessible and you have root/sudo access."
+ "This will SSH into each node and execute every provisioning step "
+ "automatically. Ensure all nodes are accessible and you have root/sudo access."
)
cp_nodes = profile.get_control_plane_nodes()
@@ -542,80 +548,136 @@ def page_cluster_creation():
if st.button("Start Provisioning", type="primary", use_container_width=True):
update_profile_status(profile.name, "provisioning")
+ overall_success = True
- # Step 1: Common setup on all nodes
+ # ── Step 1: Common setup on ALL nodes (granular per-step) ────
if step1:
st.markdown("---")
- st.markdown("### Step 1: Common Setup")
+ st.markdown("### Step 1: Common Node Setup")
+ common_steps = get_common_setup_steps(profile)
for node in profile.nodes:
- with st.status(
- f"Setting up {node.get('hostname', node['ip_address'])} ({node['role']})...",
- expanded=True,
- ):
- result = provision_node_common(node, profile)
- if result.success:
- st.success(f"Common setup complete on {node['ip_address']}")
- else:
- st.error(f"Setup failed on {node['ip_address']}")
- st.code(result.stderr, language="text")
+ node_label = f"{node.get('hostname', node['ip_address'])} ({node['role']})"
+ st.markdown(f"#### Node: {node_label}")
+ node_ok = True
+ progress = st.progress(0, text=f"Starting setup on {node_label}...")
+ for idx, step in enumerate(common_steps):
+ pct = int((idx / len(common_steps)) * 100)
+ progress.progress(pct, text=f"[{idx+1}/{len(common_steps)}] {step.title}")
+ with st.status(f"{step.title}...", expanded=False) as status:
+ result = _run_step(node, step)
+ if result.success:
+ st.code(result.stdout[-1500:] if result.stdout else "(no output)", language="text")
+ status.update(label=f"{step.title} — done", state="complete")
+ else:
+ st.error(f"FAILED: {step.title}")
+ st.code(result.stderr or result.stdout, language="text")
+ status.update(label=f"{step.title} — FAILED", state="error")
+ node_ok = False
+ if step.fatal:
+ overall_success = False
+ break
+ progress.progress(100, text=f"{'Setup complete' if node_ok else 'Setup FAILED'} on {node_label}")
+ if node_ok:
+ st.success(f"Common setup complete on {node['ip_address']}")
+ else:
+ st.error(f"Common setup failed on {node['ip_address']}")
- # Step 2: Initialize control plane
- if step2 and cp_nodes:
+ # ── Step 2: Control plane init (granular per-step) ───────────
+ if step2 and cp_nodes and overall_success:
st.markdown("---")
st.markdown("### Step 2: Control Plane Initialization")
cp_node = cp_nodes[0]
- with st.status(f"Initializing control plane on {cp_node['ip_address']}...", expanded=True):
- result = init_control_plane(cp_node, profile)
- if result.success:
- st.success("Control plane initialized!")
- st.code(result.stdout[-2000:], language="text")
- else:
- st.error("Control plane initialization failed!")
- st.code(result.stderr, language="text")
+ cp_steps = get_control_plane_steps(profile)
+ progress = st.progress(0, text="Starting control plane init...")
+ for idx, step in enumerate(cp_steps):
+ pct = int((idx / len(cp_steps)) * 100)
+ progress.progress(pct, text=f"[{idx+1}/{len(cp_steps)}] {step.title}")
+ with st.status(f"{step.title}...", expanded=False) as status:
+ result = _run_step(cp_node, step)
+ if result.success:
+ st.code(result.stdout[-2000:] if result.stdout else "(no output)", language="text")
+ status.update(label=f"{step.title} — done", state="complete")
+ else:
+ st.error(f"FAILED: {step.title}")
+ st.code(result.stderr or result.stdout, language="text")
+ status.update(label=f"{step.title} — FAILED", state="error")
+ overall_success = False
+ if step.fatal:
+ break
+ progress.progress(100, text="Control plane initialization complete" if overall_success else "Control plane init FAILED")
+ if overall_success:
+ st.success("Control plane initialized!")
+ else:
+ st.error("Control plane initialization failed!")
- # Step 3: Join worker nodes
- if step3 and worker_nodes and cp_nodes:
+ # ── Step 3: Join workers (granular per-step) ─────────────────
+ if step3 and worker_nodes and cp_nodes and overall_success:
st.markdown("---")
st.markdown("### Step 3: Join Worker Nodes")
join_cmd = retrieve_join_command(cp_nodes[0])
if join_cmd:
+ worker_join_steps = get_worker_join_steps(join_cmd)
for node in worker_nodes:
- with st.status(f"Joining {node.get('hostname', node['ip_address'])}...", expanded=True):
- result = join_worker_node(node, join_cmd)
- if result.success:
- st.success(f"Worker {node['ip_address']} joined!")
- else:
- st.error(f"Failed to join {node['ip_address']}")
- st.code(result.stderr, language="text")
+ node_label = f"{node.get('hostname', node['ip_address'])}"
+ st.markdown(f"#### Worker: {node_label}")
+ for step in worker_join_steps:
+ with st.status(f"{step.title} on {node_label}...", expanded=False) as status:
+ result = _run_step(node, step)
+ if result.success:
+ st.code(result.stdout[-1500:] if result.stdout else "(no output)", language="text")
+ status.update(label=f"{step.title} — done", state="complete")
+ st.success(f"Worker {node['ip_address']} joined!")
+ else:
+ st.error(f"FAILED to join {node['ip_address']}")
+ st.code(result.stderr or result.stdout, language="text")
+ status.update(label=f"{step.title} — FAILED", state="error")
else:
st.error("Could not retrieve join command from control plane.")
- # Step 4: Best practices
- if step4 and cp_nodes:
+ # ── Step 4: Best practices (granular per-step) ───────────────
+ if step4 and cp_nodes and overall_success:
st.markdown("---")
- st.markdown("### Step 4: Best Practices")
- with st.status("Applying security and resource best practices...", expanded=True):
- result = apply_best_practices(cp_nodes[0])
+ st.markdown("### Step 4: Apply Best Practices")
+ bp_steps = get_best_practices_steps()
+ progress = st.progress(0, text="Applying best practices...")
+ for idx, step in enumerate(bp_steps):
+ pct = int((idx / len(bp_steps)) * 100)
+ progress.progress(pct, text=f"[{idx+1}/{len(bp_steps)}] {step.title}")
+ with st.status(f"{step.title}...", expanded=False) as status:
+ result = _run_step(cp_nodes[0], step)
+ if result.success:
+ st.code(result.stdout[-1000:] if result.stdout else "(no output)", language="text")
+ status.update(label=f"{step.title} — done", state="complete")
+ else:
+ st.error(f"FAILED: {step.title}")
+ st.code(result.stderr or result.stdout, language="text")
+ status.update(label=f"{step.title} — FAILED", state="error")
+ if step.fatal:
+ overall_success = False
+ break
+ progress.progress(100, text="Best practices applied" if overall_success else "Best practices FAILED")
+ if overall_success:
+ st.success("Best practices applied!")
+
+ # ── Final cluster status ─────────────────────────────────────
+ st.markdown("---")
+ st.markdown("### Cluster Status")
+ if cp_nodes and overall_success:
+ with st.status("Checking cluster status...", expanded=True) as status:
+ result = get_cluster_status(cp_nodes[0])
if result.success:
- st.success("Best practices applied!")
+ update_profile_status(profile.name, "active")
+ st.success("Cluster is active!")
st.code(result.stdout, language="text")
+ status.update(label="Cluster is active", state="complete")
else:
- st.error("Failed to apply best practices")
+ update_profile_status(profile.name, "error")
+ st.error("Could not verify cluster status")
st.code(result.stderr, language="text")
-
- # Final status
- st.markdown("---")
- st.markdown("### Cluster Status")
- if cp_nodes:
- result = get_cluster_status(cp_nodes[0])
- if result.success:
- update_profile_status(profile.name, "active")
- st.success("Cluster is active!")
- st.code(result.stdout, language="text")
- else:
- update_profile_status(profile.name, "error")
- st.error("Could not verify cluster status")
- st.code(result.stderr, language="text")
+ status.update(label="Status check failed", state="error")
+ elif not overall_success:
+ update_profile_status(profile.name, "error")
+ st.error("Provisioning did not complete successfully. Check the errors above.")
# ── View Scripts ──────────────────────────────────────────────────────
with tab_scripts:
diff --git a/k8s-agent/modules/cluster_creator.py b/k8s-agent/modules/cluster_creator.py
index 830a56e..d7935e1 100644
--- a/k8s-agent/modules/cluster_creator.py
+++ b/k8s-agent/modules/cluster_creator.py
@@ -2,8 +2,8 @@
import subprocess
import time
-from dataclasses import dataclass
-from typing import Optional
+from dataclasses import dataclass, field
+from typing import List, Optional
from modules.llm_client import query_llm
from modules.profile_manager import ClusterProfile
@@ -526,6 +526,525 @@ def generate_best_practices_script() -> str:
"""
+# ══════════════════════════════════════════════════════════════════════════
+# Step-based provisioning — granular SSH execution with per-step progress
+# ══════════════════════════════════════════════════════════════════════════
+
+
+@dataclass
+class ProvisionStep:
+ """A single discrete provisioning step to be executed over SSH."""
+
+ name: str # short identifier, e.g. "disable_swap"
+ title: str # human-readable label for the UI
+ script: str # shell snippet to execute
+ timeout: int = 300 # per-step timeout in seconds
+ fatal: bool = True # if True, abort provisioning on failure
+
+
+def _run_step(node: dict, step: ProvisionStep) -> SSHResult:
+ """Execute a single ProvisionStep on a node via SSH."""
+ return run_ssh_command(
+ ip_address=node["ip_address"],
+ command=step.script,
+ ssh_user=node.get("ssh_user", "root"),
+ ssh_port=node.get("ssh_port", 22),
+ ssh_key_path=node.get("ssh_key_path", "~/.ssh/id_rsa"),
+ timeout=step.timeout,
+ )
+
+
+def get_common_setup_steps(profile: ClusterProfile) -> List[ProvisionStep]:
+ """Return the ordered list of discrete steps for common node setup."""
+ proxy_block = _proxy_env_block(profile)
+ steps: List[ProvisionStep] = []
+
+ # 0. Proxy (optional)
+ if proxy_block:
+ steps.append(ProvisionStep(
+ name="configure_proxy",
+ title="Configure Proxy Settings",
+ script=f"""set -euo pipefail
+echo '>> Configuring proxy settings...'
+{proxy_block}
+# Persist proxy in /etc/environment for all users
+cat >> /etc/environment <<'PROXYEOF'
+{proxy_block}
+PROXYEOF
+echo 'Proxy configured.'
+""",
+ timeout=30,
+ ))
+
+ # 1. System prerequisites
+ steps.append(ProvisionStep(
+ name="system_prerequisites",
+ title="System Prerequisites (swap, modules, sysctl, firewall)",
+ script="""set -euo pipefail
+echo '>> Disabling swap...'
+swapoff -a
+sed -i '/\\bswap\\b/d' /etc/fstab
+
+echo '>> Loading kernel modules...'
+cat > /etc/modules-load.d/k8s.conf <> Setting sysctl parameters...'
+cat > /etc/sysctl.d/99-kubernetes.conf <> Disabling SELinux (if present)...'
+if command -v setenforce &>/dev/null; then
+ setenforce 0 || true
+ sed -i 's/^SELINUX=enforcing/SELINUX=permissive/' /etc/selinux/config 2>/dev/null || true
+fi
+
+echo '>> Configuring firewalld (if present)...'
+if systemctl is-active --quiet firewalld; then
+ firewall-cmd --permanent --add-port=6443/tcp
+ firewall-cmd --permanent --add-port=2379-2380/tcp
+ firewall-cmd --permanent --add-port=10250/tcp
+ firewall-cmd --permanent --add-port=10259/tcp
+ firewall-cmd --permanent --add-port=10257/tcp
+ firewall-cmd --permanent --add-port=30000-32767/tcp
+ firewall-cmd --permanent --add-port=8472/udp
+ firewall-cmd --reload
+fi
+echo 'System prerequisites configured.'
+""",
+ timeout=120,
+ ))
+
+ # 2. Custom storage directories (optional)
+ dir_cmds = []
+ if profile.crio_root != "/var/lib/containers/storage":
+ dir_cmds.append(f'mkdir -p "{profile.crio_root}"')
+ dir_cmds.append(f'mkdir -p "{profile.crio_runroot}"')
+ if profile.kubelet_root != "/var/lib/kubelet":
+ dir_cmds.append(f'mkdir -p "{profile.kubelet_root}"')
+ if profile.log_root != "/var/log":
+ dir_cmds.append(f'mkdir -p "{profile.log_root}/pods"')
+ dir_cmds.append(f'mkdir -p "{profile.log_root}/containers"')
+ if dir_cmds:
+ steps.append(ProvisionStep(
+ name="create_custom_dirs",
+ title="Create Custom Storage Directories",
+ script="set -euo pipefail\necho '>> Creating custom storage directories...'\n"
+ + "\n".join(dir_cmds) + "\necho 'Custom directories created.'",
+ timeout=30,
+ ))
+
+ # 3. Install CRI-O
+ steps.append(ProvisionStep(
+ name="install_crio",
+ title=f"Install CRI-O {profile.crio_version}",
+ script=f"""set -euo pipefail
+echo '>> Installing CRI-O {profile.crio_version}...'
+
+OS="$(. /etc/os-release && echo "$ID")"
+VERSION_ID="$(. /etc/os-release && echo "$VERSION_ID")"
+
+if [[ "$OS" == "ubuntu" || "$OS" == "debian" ]]; then
+ apt-get update -y
+ apt-get install -y software-properties-common curl gnupg2
+ CRIO_VERSION="{profile.crio_version}"
+ curl -fsSL "https://pkgs.k8s.io/addons:/cri-o:/stable:/v$CRIO_VERSION/deb/Release.key" | \\
+ gpg --dearmor -o /etc/apt/keyrings/cri-o-apt-keyring.gpg
+ echo "deb [signed-by=/etc/apt/keyrings/cri-o-apt-keyring.gpg] https://pkgs.k8s.io/addons:/cri-o:/stable:/v$CRIO_VERSION/deb/ /" | \\
+ tee /etc/apt/sources.list.d/cri-o.list
+ apt-get update -y
+ apt-get install -y cri-o
+elif [[ "$OS" == "rhel" || "$OS" == "centos" || "$OS" == "rocky" || "$OS" == "almalinux" ]]; then
+ CRIO_VERSION="{profile.crio_version}"
+ cat > /etc/yum.repos.d/cri-o.repo <> Configuring CRI-O storage to {profile.crio_root}...'
+systemctl daemon-reload
+mkdir -p /etc/crio/crio.conf.d
+cat > /etc/crio/crio.conf.d/01-storage.conf <> Installing Kubernetes {profile.kubernetes_version} components...'
+
+OS="$(. /etc/os-release && echo "$ID")"
+K8S_VERSION="{profile.kubernetes_version}"
+
+if [[ "$OS" == "ubuntu" || "$OS" == "debian" ]]; then
+ curl -fsSL "https://pkgs.k8s.io/core:/stable:/v$K8S_VERSION/deb/Release.key" | \\
+ gpg --dearmor -o /etc/apt/keyrings/kubernetes-apt-keyring.gpg
+ echo "deb [signed-by=/etc/apt/keyrings/kubernetes-apt-keyring.gpg] https://pkgs.k8s.io/core:/stable:/v$K8S_VERSION/deb/ /" | \\
+ tee /etc/apt/sources.list.d/kubernetes.list
+ apt-get update -y
+ apt-get install -y kubelet kubeadm kubectl
+ apt-mark hold kubelet kubeadm kubectl
+elif [[ "$OS" == "rhel" || "$OS" == "centos" || "$OS" == "rocky" || "$OS" == "almalinux" ]]; then
+ cat > /etc/yum.repos.d/kubernetes.repo < List[ProvisionStep]:
+ """Return the ordered list of discrete steps for control-plane init."""
+ cp_nodes = profile.get_control_plane_nodes()
+ cp_ip = cp_nodes[0]["ip_address"] if cp_nodes else "CONTROL_PLANE_IP"
+
+ proxy_block = _proxy_env_block(profile)
+ audit_log_dir = f"{profile.log_root}/kubernetes"
+
+ kubelet_extra = ' container-runtime-endpoint: "unix:///var/run/crio/crio.sock"'
+ if profile.kubelet_root != "/var/lib/kubelet":
+ kubelet_extra += f'\n root-dir: "{profile.kubelet_root}"'
+
+ steps: List[ProvisionStep] = []
+
+ # 0. Proxy on CP (optional)
+ if proxy_block:
+ steps.append(ProvisionStep(
+ name="cp_proxy",
+ title="Set Proxy Environment for kubeadm",
+ script=f"""set -euo pipefail
+echo '>> Setting proxy environment for kubeadm...'
+{proxy_block}
+echo 'Proxy environment set.'
+""",
+ timeout=15,
+ ))
+
+ # 1. kubeadm init
+ steps.append(ProvisionStep(
+ name="kubeadm_init",
+ title="Run kubeadm init",
+ script=f"""set -euo pipefail
+echo '>> Preparing kubeadm config...'
+mkdir -p "{audit_log_dir}"
+cat > /tmp/kubeadm-config.yaml <> Running kubeadm init (this may take a few minutes)...'
+kubeadm init --config=/tmp/kubeadm-config.yaml --upload-certs | tee /tmp/kubeadm-init.log
+echo 'kubeadm init complete.'
+""",
+ timeout=600,
+ ))
+
+ # 2. Configure kubectl
+ steps.append(ProvisionStep(
+ name="configure_kubectl",
+ title="Configure kubectl for root user",
+ script="""set -euo pipefail
+echo '>> Configuring kubectl...'
+mkdir -p /root/.kube
+cp /etc/kubernetes/admin.conf /root/.kube/config
+chown root:root /root/.kube/config
+kubectl get nodes
+echo 'kubectl configured.'
+""",
+ timeout=30,
+ ))
+
+ # 3. Install Flannel CNI
+ steps.append(ProvisionStep(
+ name="install_flannel",
+ title="Install Flannel CNI",
+ script="""set -euo pipefail
+echo '>> Installing Flannel CNI...'
+kubectl apply -f https://github.com/flannel-io/flannel/releases/latest/download/kube-flannel.yml
+echo '>> Waiting for Flannel pods to be ready...'
+kubectl -n kube-flannel wait --for=condition=ready pod -l app=flannel --timeout=120s || true
+echo 'Flannel CNI installed.'
+""",
+ timeout=180,
+ ))
+
+ # 4. Pod Security Standards
+ steps.append(ProvisionStep(
+ name="pod_security",
+ title=f"Apply Pod Security Standards ({profile.pod_security_standard})",
+ script=f"""set -euo pipefail
+echo '>> Applying Pod Security Standards ({profile.pod_security_standard})...'
+kubectl label namespace default \\
+ pod-security.kubernetes.io/enforce={profile.pod_security_standard} \\
+ pod-security.kubernetes.io/warn={profile.pod_security_standard} \\
+ pod-security.kubernetes.io/audit={profile.pod_security_standard} \\
+ --overwrite
+echo 'Pod Security Standards applied.'
+""",
+ timeout=30,
+ ))
+
+ # 5. Generate join command
+ steps.append(ProvisionStep(
+ name="generate_join_cmd",
+ title="Generate Worker Join Command",
+ script="""set -euo pipefail
+echo '>> Generating worker join command...'
+kubeadm token create --print-join-command > /tmp/kubeadm-join-command.txt
+echo 'Join command:'
+cat /tmp/kubeadm-join-command.txt
+""",
+ timeout=30,
+ ))
+
+ return steps
+
+
+def get_worker_join_steps(join_command: str) -> List[ProvisionStep]:
+ """Return the step(s) to join a worker node to the cluster."""
+ return [
+ ProvisionStep(
+ name="join_cluster",
+ title="Join Cluster",
+ script=f"""set -euo pipefail
+echo '>> Joining cluster...'
+{join_command} --cri-socket unix:///var/run/crio/crio.sock
+echo 'Successfully joined the cluster.'
+""",
+ timeout=300,
+ ),
+ ]
+
+
+def get_best_practices_steps() -> List[ProvisionStep]:
+ """Return the ordered list of best-practices hardening steps."""
+ return [
+ ProvisionStep(
+ name="network_policy",
+ title="Apply Default-Deny NetworkPolicy",
+ script="""set -euo pipefail
+echo '>> Creating default-deny network policy for default namespace...'
+cat <> Setting resource quotas...'
+cat <> Setting limit ranges...'
+cat <> Creating read-only ClusterRole...'
+cat <> Ensuring audit log directory exists...'
+mkdir -p /var/log/kubernetes
+echo 'Audit log directory ready.'
+""",
+ timeout=15,
+ fatal=False,
+ ),
+ ]
+
+
+def execute_provision_steps(
+ node: dict,
+ steps: List[ProvisionStep],
+) -> List[tuple]:
+ """Execute a list of provision steps on a node.
+
+ Returns a list of (ProvisionStep, SSHResult) tuples.
+ Stops at the first fatal failure.
+ """
+ results: List[tuple] = []
+ for step in steps:
+ result = _run_step(node, step)
+ results.append((step, result))
+ if not result.success and step.fatal:
+ break
+ return results
+
+
+# ── Legacy wrapper functions (kept for backward compatibility) ────────────
+
+
def provision_node_common(node: dict, profile: ClusterProfile) -> SSHResult:
"""Run the common setup script on a single node via SSH."""
script = generate_common_setup_script(profile)
From 3abe2dd5e9ff7346bc49940c1b7a81f5f670b80b Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Mon, 6 Apr 2026 10:48:10 +0000
Subject: [PATCH 06/31] Replace streamlit-option-menu with native st.radio for
reliable sidebar navigation
---
k8s-agent/app.py | 36 +++++++++++++++---------------------
k8s-agent/requirements.txt | 1 -
2 files changed, 15 insertions(+), 22 deletions(-)
diff --git a/k8s-agent/app.py b/k8s-agent/app.py
index c41ba61..8c354dd 100644
--- a/k8s-agent/app.py
+++ b/k8s-agent/app.py
@@ -8,7 +8,7 @@
import json
import streamlit as st
-from streamlit_option_menu import option_menu
+# Navigation uses native st.radio — no third-party component needed.
import config
from modules.profile_manager import (
@@ -193,26 +193,20 @@ def render_sidebar():
st.divider()
# ── Navigation ──
- selected_page = option_menu(
- menu_title="Navigation",
- options=[
- "Profile Manager",
- "Cluster Creation",
- "Cluster Debugger",
- "Monitoring Setup",
- "Log Analysis",
- "AI Assistant",
- ],
- icons=[
- "person-gear",
- "hdd-rack",
- "bug",
- "graph-up",
- "journal-text",
- "robot",
- ],
- menu_icon="list",
- default_index=0,
+ st.markdown("### Navigation")
+ nav_options = [
+ "Profile Manager",
+ "Cluster Creation",
+ "Cluster Debugger",
+ "Monitoring Setup",
+ "Log Analysis",
+ "AI Assistant",
+ ]
+ selected_page = st.radio(
+ "Go to",
+ options=nav_options,
+ index=0,
+ label_visibility="collapsed",
)
st.divider()
diff --git a/k8s-agent/requirements.txt b/k8s-agent/requirements.txt
index 96fb8cf..c9c4741 100644
--- a/k8s-agent/requirements.txt
+++ b/k8s-agent/requirements.txt
@@ -1,4 +1,3 @@
streamlit>=1.32.0
requests>=2.31.0
plotly>=5.18.0
-streamlit-option-menu>=0.3.12
From 067861e9f28b902273d51cd7b2908b9be25092c4 Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Mon, 6 Apr 2026 11:04:57 +0000
Subject: [PATCH 07/31] Make LLM fully optional with graceful fallbacks and add
offline manifest uploads
- Add is_llm_configured() helper to detect when LLM is not set up
- Make all LLM imports lazy to avoid errors when LLM deps missing
- Guard all AI-powered UI features with is_llm_configured() checks
- Show informative fallback messages when LLM is not configured
- Add Offline Manifests tab for uploading Flannel YAML and other files
- Add flannel_manifest_path/prometheus_manifest_path to ClusterProfile
- SCP user-provided Flannel manifest to nodes during provisioning
- Core features (cluster creation, debugging, monitoring, logs) work without LLM
---
k8s-agent/app.py | 294 ++++++++++++++++++--------
k8s-agent/config.py | 8 +
k8s-agent/modules/cluster_creator.py | 78 ++++++-
k8s-agent/modules/cluster_debugger.py | 15 +-
k8s-agent/modules/llm_client.py | 19 +-
k8s-agent/modules/log_analyzer.py | 15 +-
k8s-agent/modules/monitoring_setup.py | 8 +-
k8s-agent/modules/profile_manager.py | 3 +
8 files changed, 336 insertions(+), 104 deletions(-)
diff --git a/k8s-agent/app.py b/k8s-agent/app.py
index 8c354dd..18f55fb 100644
--- a/k8s-agent/app.py
+++ b/k8s-agent/app.py
@@ -11,6 +11,7 @@
# Navigation uses native st.radio — no third-party component needed.
import config
+from config import is_llm_configured
from modules.profile_manager import (
ClusterProfile,
save_profile,
@@ -32,6 +33,7 @@
apply_best_practices,
get_cluster_status,
get_llm_cluster_advice,
+ upload_flannel_manifest_to_node,
ProvisionStep,
_run_step,
get_common_setup_steps,
@@ -495,10 +497,11 @@ def page_cluster_creation():
_show_profile_summary(profile)
- tab_preflight, tab_provision, tab_scripts, tab_advice = st.tabs([
+ tab_preflight, tab_provision, tab_scripts, tab_manifests, tab_advice = st.tabs([
"Pre-flight Checks",
"Provision Cluster",
"View Scripts",
+ "Offline Manifests",
"AI Advice",
])
@@ -690,17 +693,83 @@ def page_cluster_creation():
with st.expander("Best Practices Script", expanded=False):
st.code(generate_best_practices_script(), language="bash")
+ # ── Offline Manifests ───────────────────────────────────────────────────
+ with tab_manifests:
+ st.markdown("### Offline / Custom Manifests")
+ st.markdown(
+ "If your environment cannot download manifests directly (air-gapped / proxy-restricted), "
+ "upload them here. They will be used instead of the default download URLs during provisioning."
+ )
+
+ st.markdown("#### Flannel CNI Manifest")
+ flannel_file = st.file_uploader(
+ "Upload kube-flannel.yml",
+ type=["yml", "yaml"],
+ key="flannel_upload",
+ help="Download from: https://github.com/flannel-io/flannel/releases/latest/download/kube-flannel.yml",
+ )
+ if flannel_file is not None:
+ flannel_path = os.path.join(config.UPLOADS_DIR, "kube-flannel.yml")
+ with open(flannel_path, "wb") as f:
+ f.write(flannel_file.getvalue())
+ profile.flannel_manifest_path = flannel_path
+ save_profile(profile)
+ st.success(f"Flannel manifest saved. It will be SCP'd to nodes during provisioning.")
+
+ if profile.flannel_manifest_path:
+ st.info(f"Current Flannel manifest: `{profile.flannel_manifest_path}`")
+ if st.button("Clear Flannel manifest (use default URL)", key="clear_flannel"):
+ profile.flannel_manifest_path = ""
+ save_profile(profile)
+ st.rerun()
+ else:
+ st.info("No custom manifest — Flannel will be downloaded from the official GitHub release URL.")
+
+ st.markdown("---")
+ st.markdown("#### Other Manifests")
+ st.markdown(
+ "You can also upload any additional YAML manifests. They will be stored "
+ "and can be applied manually via the **Custom Command** feature in the Cluster Debugger."
+ )
+ extra_file = st.file_uploader(
+ "Upload additional manifest (YAML)",
+ type=["yml", "yaml"],
+ key="extra_manifest_upload",
+ )
+ if extra_file is not None:
+ extra_path = os.path.join(config.UPLOADS_DIR, extra_file.name)
+ with open(extra_path, "wb") as f:
+ f.write(extra_file.getvalue())
+ st.success(f"Saved `{extra_file.name}` to uploads.")
+
+ # List existing uploaded files
+ if os.path.exists(config.UPLOADS_DIR):
+ uploaded_files = [
+ f for f in os.listdir(config.UPLOADS_DIR)
+ if f.endswith((".yml", ".yaml"))
+ ]
+ if uploaded_files:
+ st.markdown("**Uploaded manifests:**")
+ for fname in sorted(uploaded_files):
+ st.markdown(f"- `{fname}`")
+
# ── AI Advice ─────────────────────────────────────────────────────────
with tab_advice:
st.markdown("### AI Cluster Setup Advisor")
- context = st.text_area(
- "Additional context or questions",
- placeholder="e.g., We have 3 nodes with 16GB RAM each. Any special considerations?",
- )
- if st.button("Get AI Recommendations", type="primary"):
- with st.spinner("Analyzing your cluster configuration..."):
- advice = get_llm_cluster_advice(profile, context)
- st.markdown(advice)
+ if not is_llm_configured():
+ st.info(
+ "LLM is not configured. Set `LLM_API_URL` and `LLM_API_KEY` "
+ "environment variables to enable AI-powered recommendations."
+ )
+ else:
+ context = st.text_area(
+ "Additional context or questions",
+ placeholder="e.g., We have 3 nodes with 16GB RAM each. Any special considerations?",
+ )
+ if st.button("Get AI Recommendations", type="primary"):
+ with st.spinner("Analyzing your cluster configuration..."):
+ advice = get_llm_cluster_advice(profile, context)
+ st.markdown(advice)
# ══════════════════════════════════════════════════════════════════════════
@@ -709,7 +778,7 @@ def page_cluster_creation():
def page_cluster_debugger():
st.markdown("## Cluster Debugger")
- st.markdown("Diagnose issues and get AI-powered recommendations.")
+ st.markdown("Diagnose issues and get recommendations.")
profile = _get_active_profile()
if not profile:
@@ -758,13 +827,16 @@ def page_cluster_debugger():
with st.expander(f"{'✅' if result.success else '❌'} {name}", expanded=not result.success):
st.code(result.stdout if result.success else result.stderr, language="text")
- if st.session_state.debug_results and st.button("Analyze with AI", type="secondary"):
- with st.spinner("AI is analyzing diagnostics..."):
- analysis = analyze_diagnostics(
- st.session_state.debug_results,
- profile=profile,
- )
- st.markdown(analysis)
+ if st.session_state.debug_results:
+ if not is_llm_configured():
+ st.info("Enable AI analysis by setting `LLM_API_URL` and `LLM_API_KEY` env vars.")
+ elif st.button("Analyze with AI", type="secondary"):
+ with st.spinner("AI is analyzing diagnostics..."):
+ analysis = analyze_diagnostics(
+ st.session_state.debug_results,
+ profile=profile,
+ )
+ st.markdown(analysis)
# ── Category Scan ─────────────────────────────────────────────────────
with tab_category:
@@ -779,10 +851,11 @@ def page_cluster_debugger():
with st.expander(f"{'✅' if result.success else '❌'} {name}"):
st.code(result.stdout if result.success else result.stderr, language="text")
- if st.button("Analyze Category with AI", key="cat_ai"):
- with st.spinner("Analyzing..."):
- analysis = analyze_diagnostics(results, profile=profile)
- st.markdown(analysis)
+ if is_llm_configured():
+ if st.button("Analyze Category with AI", key="cat_ai"):
+ with st.spinner("Analyzing..."):
+ analysis = analyze_diagnostics(results, profile=profile)
+ st.markdown(analysis)
# ── Custom Command ────────────────────────────────────────────────────
with tab_custom:
@@ -805,43 +878,53 @@ def page_cluster_debugger():
# ── AI Debug Assistant ────────────────────────────────────────────────
with tab_ai:
st.markdown("### AI Debug Assistant")
- st.markdown("Describe your issue and get AI-powered debugging help.")
+ if not is_llm_configured():
+ st.info(
+ "LLM is not configured. Set `LLM_API_URL` and `LLM_API_KEY` "
+ "environment variables to enable AI-powered debugging."
+ )
+ st.markdown(
+ "You can still use the **Quick Diagnostics**, **Category Scan**, and "
+ "**Custom Command** tabs to collect diagnostic data without an LLM."
+ )
+ else:
+ st.markdown("Describe your issue and get AI-powered debugging help.")
- issue = st.text_area(
- "Describe the issue",
- placeholder="e.g., Pods are stuck in CrashLoopBackOff in the default namespace",
- height=120,
- )
+ issue = st.text_area(
+ "Describe the issue",
+ placeholder="e.g., Pods are stuck in CrashLoopBackOff in the default namespace",
+ height=120,
+ )
- col1, col2 = st.columns(2)
- with col1:
- auto_collect = st.checkbox("Auto-collect relevant diagnostics", value=True)
- with col2:
- check_pods = st.checkbox("Check for problematic pods", value=True)
-
- if st.button("Debug", type="primary", key="ai_debug") and issue:
- collected_data = ""
-
- if check_pods:
- with st.spinner("Checking pod issues..."):
- pod_result = check_pod_issues(cp_node)
- if pod_result.success and pod_result.stdout.strip():
- collected_data += f"\n\nProblematic Pods:\n{pod_result.stdout}"
- with st.expander("Problematic Pods"):
- st.code(pod_result.stdout, language="text")
-
- if auto_collect:
- with st.spinner("Collecting diagnostics..."):
- diag_results = run_category_diagnostics(cp_node, "Cluster Overview")
- for name, result in diag_results.items():
- if result.success:
- collected_data += f"\n\n{name}:\n{result.stdout}"
+ col1, col2 = st.columns(2)
+ with col1:
+ auto_collect = st.checkbox("Auto-collect relevant diagnostics", value=True)
+ with col2:
+ check_pods = st.checkbox("Check for problematic pods", value=True)
+
+ if st.button("Debug", type="primary", key="ai_debug") and issue:
+ collected_data = ""
+
+ if check_pods:
+ with st.spinner("Checking pod issues..."):
+ pod_result = check_pod_issues(cp_node)
+ if pod_result.success and pod_result.stdout.strip():
+ collected_data += f"\n\nProblematic Pods:\n{pod_result.stdout}"
+ with st.expander("Problematic Pods"):
+ st.code(pod_result.stdout, language="text")
+
+ if auto_collect:
+ with st.spinner("Collecting diagnostics..."):
+ diag_results = run_category_diagnostics(cp_node, "Cluster Overview")
+ for name, result in diag_results.items():
+ if result.success:
+ collected_data += f"\n\n{name}:\n{result.stdout}"
- with st.spinner("AI is analyzing the issue..."):
- full_context = f"Issue: {issue}\n\nCollected Data:{collected_data}"
- suggestion = get_debug_suggestion(issue, collected_data)
- st.markdown("### AI Recommendation")
- st.markdown(suggestion)
+ with st.spinner("AI is analyzing the issue..."):
+ full_context = f"Issue: {issue}\n\nCollected Data:{collected_data}"
+ suggestion = get_debug_suggestion(issue, collected_data)
+ st.markdown("### AI Recommendation")
+ st.markdown(suggestion)
# ══════════════════════════════════════════════════════════════════════════
@@ -978,15 +1061,21 @@ def page_monitoring_setup():
# ── AI Advice ─────────────────────────────────────────────────────────
with tab_advice:
st.markdown("### AI Monitoring Advisor")
- if st.button("Get Monitoring Recommendations", type="primary", key="mon_advice"):
- current_status = ""
- status_result = get_monitoring_status(cp_node, namespace)
- if status_result.success:
- current_status = status_result.stdout
+ if not is_llm_configured():
+ st.info(
+ "LLM is not configured. Set `LLM_API_URL` and `LLM_API_KEY` "
+ "environment variables to enable AI-powered monitoring advice."
+ )
+ else:
+ if st.button("Get Monitoring Recommendations", type="primary", key="mon_advice"):
+ current_status = ""
+ status_result = get_monitoring_status(cp_node, namespace)
+ if status_result.success:
+ current_status = status_result.stdout
- with st.spinner("Getting AI recommendations..."):
- advice = get_monitoring_advice(profile, current_status)
- st.markdown(advice)
+ with st.spinner("Getting AI recommendations..."):
+ advice = get_monitoring_advice(profile, current_status)
+ st.markdown(advice)
# ══════════════════════════════════════════════════════════════════════════
@@ -1098,12 +1187,13 @@ def page_log_analysis():
st.code(result.stdout[-5000:], language="text")
- if analysis.error_count > 0 and st.button("Analyze with AI", key="pod_ai"):
- with st.spinner("AI analyzing pod logs..."):
- ai_analysis = llm_analyze_logs(
- result.stdout, f"{pod_ns}/{pod_name}"
- )
- st.markdown(ai_analysis)
+ if analysis.error_count > 0 and is_llm_configured():
+ if st.button("Analyze with AI", key="pod_ai"):
+ with st.spinner("AI analyzing pod logs..."):
+ ai_analysis = llm_analyze_logs(
+ result.stdout, f"{pod_ns}/{pod_name}"
+ )
+ st.markdown(ai_analysis)
else:
st.error("Failed to fetch pod logs")
st.code(result.stderr, language="text")
@@ -1142,33 +1232,45 @@ def page_log_analysis():
st.info("No correlated errors found across sources.")
# LLM correlation analysis
- if st.button("Deep AI Correlation Analysis", key="deep_corr"):
- multi_logs = {
- src: res.stdout for src, res in results.items() if res.success
- }
- with st.spinner("AI is performing deep correlation analysis..."):
- analysis = llm_correlate_analysis(multi_logs)
- st.markdown(analysis)
+ if is_llm_configured():
+ if st.button("Deep AI Correlation Analysis", key="deep_corr"):
+ multi_logs = {
+ src: res.stdout for src, res in results.items() if res.success
+ }
+ with st.spinner("AI is performing deep correlation analysis..."):
+ analysis = llm_correlate_analysis(multi_logs)
+ st.markdown(analysis)
# ── AI Log Analysis ───────────────────────────────────────────────────
with tab_ai:
st.markdown("### AI-Powered Log Analysis")
- st.markdown("Paste logs or describe an issue for AI analysis.")
+ if not is_llm_configured():
+ st.info(
+ "LLM is not configured. Set `LLM_API_URL` and `LLM_API_KEY` "
+ "environment variables to enable AI-powered log analysis."
+ )
+ st.markdown(
+ "You can still use the **System Logs**, **Pod Logs**, and "
+ "**Error Correlation** tabs — they work without an LLM and provide "
+ "automated pattern matching and error grouping."
+ )
+ else:
+ st.markdown("Paste logs or describe an issue for AI analysis.")
- log_input = st.text_area(
- "Paste log output",
- height=200,
- placeholder="Paste your Kubernetes logs here...",
- )
- context_input = st.text_input(
- "Additional context",
- placeholder="e.g., This started happening after we upgraded to K8s 1.30",
- )
+ log_input = st.text_area(
+ "Paste log output",
+ height=200,
+ placeholder="Paste your Kubernetes logs here...",
+ )
+ context_input = st.text_input(
+ "Additional context",
+ placeholder="e.g., This started happening after we upgraded to K8s 1.30",
+ )
- if st.button("Analyze Logs", type="primary", key="ai_log_analyze") and log_input:
- with st.spinner("AI is analyzing logs..."):
- analysis = llm_analyze_logs(log_input, context=context_input)
- st.markdown(analysis)
+ if st.button("Analyze Logs", type="primary", key="ai_log_analyze") and log_input:
+ with st.spinner("AI is analyzing logs..."):
+ analysis = llm_analyze_logs(log_input, context=context_input)
+ st.markdown(analysis)
# ══════════════════════════════════════════════════════════════════════════
@@ -1177,6 +1279,18 @@ def page_log_analysis():
def page_ai_assistant():
st.markdown("## AI Kubernetes Assistant")
+
+ if not is_llm_configured():
+ st.info(
+ "LLM is not configured. Set `LLM_API_URL` and `LLM_API_KEY` "
+ "environment variables to enable the AI chat assistant."
+ )
+ st.markdown(
+ "All other features (Cluster Creation, Debugging, Monitoring, Log Analysis) "
+ "work without an LLM. Only the AI-powered analysis and chat features require it."
+ )
+ return
+
st.markdown("Chat with the AI about any Kubernetes topic.")
# Chat history
diff --git a/k8s-agent/config.py b/k8s-agent/config.py
index e46dd95..14fb427 100644
--- a/k8s-agent/config.py
+++ b/k8s-agent/config.py
@@ -12,10 +12,18 @@
LLM_TEMPERATURE = float(os.getenv("LLM_TEMPERATURE", "0.3"))
LLM_MAX_TOKENS = int(os.getenv("LLM_MAX_TOKENS", "4096"))
+
+def is_llm_configured() -> bool:
+ """Return True if the LLM endpoint and API key are both set."""
+ return bool(LLM_API_URL and LLM_API_KEY)
+
+
# Application paths
DATA_DIR = os.path.join(os.path.dirname(__file__), "data")
PROFILES_DIR = os.path.join(DATA_DIR, "profiles")
TEMPLATES_DIR = os.path.join(os.path.dirname(__file__), "templates")
+UPLOADS_DIR = os.path.join(DATA_DIR, "uploads")
# Ensure directories exist
os.makedirs(PROFILES_DIR, exist_ok=True)
+os.makedirs(UPLOADS_DIR, exist_ok=True)
diff --git a/k8s-agent/modules/cluster_creator.py b/k8s-agent/modules/cluster_creator.py
index d7935e1..22dcfd9 100644
--- a/k8s-agent/modules/cluster_creator.py
+++ b/k8s-agent/modules/cluster_creator.py
@@ -5,9 +5,11 @@
from dataclasses import dataclass, field
from typing import List, Optional
-from modules.llm_client import query_llm
from modules.profile_manager import ClusterProfile
+# Default Flannel manifest URL — can be overridden by user-uploaded file
+FLANNEL_MANIFEST_URL = "https://github.com/flannel-io/flannel/releases/latest/download/kube-flannel.yml"
+
@dataclass
class SSHResult:
@@ -384,7 +386,12 @@ def generate_control_plane_init_script(profile: ClusterProfile) -> str:
# ── Install Flannel CNI ───────────────────────────────────────────────────
echo ">> Installing Flannel CNI..."
-kubectl apply -f https://github.com/flannel-io/flannel/releases/latest/download/kube-flannel.yml
+if [ -f /tmp/kube-flannel-custom.yml ]; then
+ echo ">> Using user-provided Flannel manifest..."
+ kubectl apply -f /tmp/kube-flannel-custom.yml
+else
+ kubectl apply -f {FLANNEL_MANIFEST_URL}
+fi
# Wait for Flannel to be ready
echo ">> Waiting for Flannel pods to be ready..."
@@ -850,12 +857,22 @@ def get_control_plane_steps(profile: ClusterProfile) -> List[ProvisionStep]:
))
# 3. Install Flannel CNI
+ flannel_manifest = profile.flannel_manifest_path or FLANNEL_MANIFEST_URL
+ # If the user uploaded a local file we SCP it first; otherwise download URL
+ if profile.flannel_manifest_path:
+ flannel_apply = (
+ "echo '>> Using user-provided Flannel manifest...'\n"
+ "kubectl apply -f /tmp/kube-flannel-custom.yml"
+ )
+ else:
+ flannel_apply = f"kubectl apply -f {FLANNEL_MANIFEST_URL}"
+
steps.append(ProvisionStep(
name="install_flannel",
title="Install Flannel CNI",
- script="""set -euo pipefail
+ script=f"""set -euo pipefail
echo '>> Installing Flannel CNI...'
-kubectl apply -f https://github.com/flannel-io/flannel/releases/latest/download/kube-flannel.yml
+{flannel_apply}
echo '>> Waiting for Flannel pods to be ready...'
kubectl -n kube-flannel wait --for=condition=ready pod -l app=flannel --timeout=120s || true
echo 'Flannel CNI installed.'
@@ -1125,7 +1142,12 @@ def get_cluster_status(control_plane_node: dict) -> SSHResult:
def get_llm_cluster_advice(profile: ClusterProfile, context: str = "") -> str:
- """Ask the LLM for cluster setup advice based on the profile."""
+ """Ask the LLM for cluster setup advice based on the profile.
+
+ Returns a graceful message when the LLM is not configured.
+ """
+ from modules.llm_client import query_llm # lazy import — LLM is optional
+
nodes_desc = []
for n in profile.nodes:
nodes_desc.append(f" - {n.get('hostname', 'unknown')} ({n['ip_address']}) — role: {n['role']}")
@@ -1159,3 +1181,49 @@ def get_llm_cluster_advice(profile: ClusterProfile, context: str = "") -> str:
4. Network configuration tips for Flannel with CRI-O
"""
return query_llm(prompt)
+
+
+def upload_flannel_manifest_to_node(node: dict, local_path: str) -> SSHResult:
+ """SCP a user-provided Flannel manifest to a node as /tmp/kube-flannel-custom.yml."""
+ scp_cmd = [
+ "scp",
+ "-o", "StrictHostKeyChecking=no",
+ "-o", "UserKnownHostsFile=/dev/null",
+ "-P", str(node.get("ssh_port", 22)),
+ "-i", node.get("ssh_key_path", "~/.ssh/id_rsa"),
+ local_path,
+ f"{node.get('ssh_user', 'root')}@{node['ip_address']}:/tmp/kube-flannel-custom.yml",
+ ]
+ try:
+ proc = subprocess.run(
+ scp_cmd,
+ capture_output=True,
+ text=True,
+ timeout=60,
+ )
+ return SSHResult(
+ hostname=node["ip_address"],
+ command="scp flannel manifest",
+ return_code=proc.returncode,
+ stdout=proc.stdout,
+ stderr=proc.stderr,
+ success=proc.returncode == 0,
+ )
+ except subprocess.TimeoutExpired:
+ return SSHResult(
+ hostname=node["ip_address"],
+ command="scp flannel manifest",
+ return_code=-1,
+ stdout="",
+ stderr="SCP timed out after 60 seconds",
+ success=False,
+ )
+ except Exception as exc:
+ return SSHResult(
+ hostname=node["ip_address"],
+ command="scp flannel manifest",
+ return_code=-1,
+ stdout="",
+ stderr=str(exc),
+ success=False,
+ )
diff --git a/k8s-agent/modules/cluster_debugger.py b/k8s-agent/modules/cluster_debugger.py
index 9ee9dd1..b6798cc 100644
--- a/k8s-agent/modules/cluster_debugger.py
+++ b/k8s-agent/modules/cluster_debugger.py
@@ -1,7 +1,6 @@
"""Cluster Debugger — Diagnose K8s issues and provide LLM-powered recommendations."""
from modules.cluster_creator import run_ssh_command, SSHResult
-from modules.llm_client import query_llm, stream_llm
from modules.profile_manager import ClusterProfile
@@ -151,7 +150,12 @@ def analyze_diagnostics(
user_description: str = "",
profile: ClusterProfile | None = None,
) -> str:
- """Send diagnostic results to the LLM for analysis and recommendations."""
+ """Send diagnostic results to the LLM for analysis and recommendations.
+
+ Returns a graceful message when the LLM is not configured.
+ """
+ from modules.llm_client import query_llm # lazy import — LLM is optional
+
diag_text = format_diagnostics_for_llm(results)
cluster_info = ""
@@ -190,7 +194,12 @@ def get_debug_suggestion(
error_message: str,
context: str = "",
) -> str:
- """Get a quick debugging suggestion from the LLM for a specific error."""
+ """Get a quick debugging suggestion from the LLM for a specific error.
+
+ Returns a graceful message when the LLM is not configured.
+ """
+ from modules.llm_client import query_llm # lazy import — LLM is optional
+
prompt = f"""I encountered the following error in my Kubernetes cluster (CRI-O + Flannel):
Error: {error_message}
diff --git a/k8s-agent/modules/llm_client.py b/k8s-agent/modules/llm_client.py
index 2b7eb44..fc77be1 100644
--- a/k8s-agent/modules/llm_client.py
+++ b/k8s-agent/modules/llm_client.py
@@ -1,4 +1,9 @@
-"""LLM client for the Infosys AI Gateway."""
+"""LLM client — optional integration with an OpenAI-compatible endpoint.
+
+All public functions gracefully return a fallback message when the LLM is not
+configured (i.e. ``LLM_API_KEY`` or ``LLM_API_URL`` is empty). The rest of the
+application works without any LLM dependency.
+"""
import json
from typing import Generator, Optional
@@ -7,6 +12,11 @@
import config
+_NOT_CONFIGURED_MSG = (
+ "LLM is not configured. Set the LLM_API_URL and LLM_API_KEY environment "
+ "variables to enable AI-powered features."
+)
+
SYSTEM_PROMPT = """You are an expert Kubernetes platform engineer specializing in on-premises
cluster administration. You have deep knowledge of:
@@ -63,6 +73,9 @@ def query_llm(
"max_tokens": max_tokens if max_tokens is not None else config.LLM_MAX_TOKENS,
}
+ if not config.is_llm_configured():
+ return _NOT_CONFIGURED_MSG
+
try:
response = requests.post(
config.LLM_API_URL,
@@ -117,6 +130,10 @@ def stream_llm(
"stream": True,
}
+ if not config.is_llm_configured():
+ yield _NOT_CONFIGURED_MSG
+ return
+
try:
response = requests.post(
config.LLM_API_URL,
diff --git a/k8s-agent/modules/log_analyzer.py b/k8s-agent/modules/log_analyzer.py
index 9abc838..fea9438 100644
--- a/k8s-agent/modules/log_analyzer.py
+++ b/k8s-agent/modules/log_analyzer.py
@@ -6,7 +6,6 @@
from typing import Optional
from modules.cluster_creator import run_ssh_command, SSHResult
-from modules.llm_client import query_llm
@dataclass
@@ -274,7 +273,12 @@ def llm_analyze_logs(
source: str = "",
context: str = "",
) -> str:
- """Send log output to the LLM for deep analysis."""
+ """Send log output to the LLM for deep analysis.
+
+ Returns a graceful message when the LLM is not configured.
+ """
+ from modules.llm_client import query_llm # lazy import — LLM is optional
+
truncated = log_text[-8000:] if len(log_text) > 8000 else log_text
prompt = f"""Analyze the following Kubernetes logs and provide a detailed assessment.
@@ -301,7 +305,12 @@ def llm_correlate_analysis(
multi_source_logs: dict[str, str],
issue_description: str = "",
) -> str:
- """Send logs from multiple sources to the LLM for cross-source correlation."""
+ """Send logs from multiple sources to the LLM for cross-source correlation.
+
+ Returns a graceful message when the LLM is not configured.
+ """
+ from modules.llm_client import query_llm # lazy import — LLM is optional
+
log_sections = []
for source, log_text in multi_source_logs.items():
truncated = log_text[-3000:] if len(log_text) > 3000 else log_text
diff --git a/k8s-agent/modules/monitoring_setup.py b/k8s-agent/modules/monitoring_setup.py
index fdf8b42..a2272d7 100644
--- a/k8s-agent/modules/monitoring_setup.py
+++ b/k8s-agent/modules/monitoring_setup.py
@@ -1,7 +1,6 @@
"""Monitoring Setup — Prometheus, Grafana, and dashboard provisioning via SSH."""
from modules.cluster_creator import run_ssh_command, SSHResult
-from modules.llm_client import query_llm
from modules.profile_manager import ClusterProfile
@@ -420,7 +419,12 @@ def get_monitoring_advice(
profile: ClusterProfile,
current_status: str = "",
) -> str:
- """Ask the LLM for monitoring setup advice."""
+ """Ask the LLM for monitoring setup advice.
+
+ Returns a graceful message when the LLM is not configured.
+ """
+ from modules.llm_client import query_llm # lazy import — LLM is optional
+
prompt = f"""I have a Kubernetes cluster with the following setup:
- Kubernetes: {profile.kubernetes_version}
- Runtime: CRI-O {profile.crio_version}
diff --git a/k8s-agent/modules/profile_manager.py b/k8s-agent/modules/profile_manager.py
index 9754801..90cd9bf 100644
--- a/k8s-agent/modules/profile_manager.py
+++ b/k8s-agent/modules/profile_manager.py
@@ -51,6 +51,9 @@ class ClusterProfile:
no_proxy: str = ""
http_proxy_alt: str = "" # alternate proxy
https_proxy_alt: str = "" # alternate proxy
+ # Offline manifest paths — user-provided files for air-gapped environments
+ flannel_manifest_path: str = "" # local path to kube-flannel.yml
+ prometheus_manifest_path: str = "" # local path to prometheus manifest
def get_control_plane_nodes(self) -> list[dict]:
return [n for n in self.nodes if n.get("role") == "control-plane"]
From 58342262d21025b5946f31c715f11eeb28396ef9 Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Mon, 6 Apr 2026 12:34:53 +0000
Subject: [PATCH 08/31] Add imported cluster support, upgrade planner, version
1.35, PSS explanations, graphical events timeline
---
k8s-agent/app.py | 1148 +++++++++++++++++++++++--
k8s-agent/modules/cluster_creator.py | 84 ++
k8s-agent/modules/cluster_debugger.py | 138 ++-
k8s-agent/modules/log_analyzer.py | 115 ++-
k8s-agent/modules/monitoring_setup.py | 117 +--
k8s-agent/modules/profile_manager.py | 3 +
6 files changed, 1441 insertions(+), 164 deletions(-)
diff --git a/k8s-agent/app.py b/k8s-agent/app.py
index 18f55fb..0000017 100644
--- a/k8s-agent/app.py
+++ b/k8s-agent/app.py
@@ -34,6 +34,7 @@
get_cluster_status,
get_llm_cluster_advice,
upload_flannel_manifest_to_node,
+ run_kubectl,
ProvisionStep,
_run_step,
get_common_setup_steps,
@@ -43,7 +44,9 @@
)
from modules.cluster_debugger import (
DIAGNOSTIC_COMMANDS,
+ KUBECTL_DIAGNOSTIC_COMMANDS,
CATEGORY_MAP,
+ get_available_commands,
run_diagnostic,
run_category_diagnostics,
run_all_diagnostics,
@@ -66,6 +69,7 @@
)
from modules.log_analyzer import (
LOG_SOURCES,
+ get_available_log_sources,
collect_logs,
collect_pod_logs,
collect_multi_source_logs,
@@ -182,11 +186,16 @@ def render_sidebar():
f"**Status:** {profile.status.upper()}",
unsafe_allow_html=True,
)
- st.caption(
- f"K8s {profile.kubernetes_version} | CRI-O {profile.crio_version} | "
- f"{len(profile.get_control_plane_nodes())} CP + "
- f"{len(profile.get_worker_nodes())} Workers"
- )
+ if profile.cluster_source == "imported":
+ st.caption(
+ f"K8s {profile.kubernetes_version} | Imported Cluster"
+ )
+ else:
+ st.caption(
+ f"K8s {profile.kubernetes_version} | CRI-O {profile.crio_version} | "
+ f"{len(profile.get_control_plane_nodes())} CP + "
+ f"{len(profile.get_worker_nodes())} Workers"
+ )
else:
st.session_state.active_profile = None
else:
@@ -199,9 +208,11 @@ def render_sidebar():
nav_options = [
"Profile Manager",
"Cluster Creation",
+ "Resource Viewer",
"Cluster Debugger",
"Monitoring Setup",
"Log Analysis",
+ "Upgrade Planner",
"AI Assistant",
]
selected_page = st.radio(
@@ -247,7 +258,9 @@ def page_profile_manager():
st.markdown("## Cluster Profile Manager")
st.markdown("Create, edit, and manage profiles for your on-prem Kubernetes clusters.")
- tab_create, tab_list, tab_import = st.tabs(["Create Profile", "Manage Profiles", "Import / Export"])
+ tab_create, tab_import_cluster, tab_list, tab_import = st.tabs([
+ "Create Profile", "Import Existing Cluster", "Manage Profiles", "Import / Export",
+ ])
# ── Create Profile ────────────────────────────────────────────────────
with tab_create:
@@ -258,13 +271,46 @@ def page_profile_manager():
with col1:
name = st.text_input("Profile Name *", placeholder="production-cluster")
description = st.text_area("Description", placeholder="Production on-prem cluster")
- k8s_version = st.selectbox("Kubernetes Version", ["1.30", "1.29", "1.28", "1.27"], index=0)
- crio_version = st.selectbox("CRI-O Version", ["1.30", "1.29", "1.28", "1.27"], index=0)
+ k8s_version = st.selectbox(
+ "Kubernetes Version",
+ ["1.35", "1.34", "1.33", "1.32", "1.31", "1.30", "1.29", "1.28", "1.27"],
+ index=0,
+ )
+ crio_version = st.selectbox(
+ "CRI-O Version",
+ ["1.35", "1.34", "1.33", "1.32", "1.31", "1.30", "1.29", "1.28", "1.27"],
+ index=0,
+ )
pod_security = st.selectbox(
"Pod Security Standard",
["restricted", "baseline", "privileged"],
index=0,
+ help="Controls what pods are allowed to run in the cluster.",
)
+ # Explain each PSS level
+ with st.expander("What do these Pod Security Standards mean?"):
+ st.markdown(
+ "**Restricted** (most secure)\n"
+ "- Heavily restricted policy following Pod hardening best practices.\n"
+ "- Disallows privilege escalation, host namespaces, host paths, and most Linux capabilities.\n"
+ "- Containers must run as non-root with a read-only root filesystem.\n"
+ "- Only allows seccomp profile RuntimeDefault or Localhost.\n"
+ "- Best for: production workloads, multi-tenant clusters, security-sensitive environments.\n\n"
+ "**Baseline** (moderate)\n"
+ "- Minimally restrictive policy that prevents known privilege escalations.\n"
+ "- Allows most default Kubernetes configurations but blocks hostNetwork, hostPID, hostIPC.\n"
+ "- Containers can run as root but cannot use privileged mode.\n"
+ "- Allows all seccomp profiles.\n"
+ "- Best for: general workloads, development/staging, teams new to PSS.\n\n"
+ "**Privileged** (unrestricted)\n"
+ "- Completely unrestricted policy — no security restrictions enforced.\n"
+ "- Allows privileged containers, host namespaces, host paths, any capabilities.\n"
+ "- Containers can run as root with full access to the host.\n"
+ "- Best for: system-level workloads (monitoring agents, CNI plugins, storage drivers), "
+ "trusted single-tenant clusters.\n\n"
+ "**Recommendation:** Start with *Restricted* and relax to *Baseline* only for "
+ "workloads that require it. Avoid *Privileged* unless absolutely necessary."
+ )
with col2:
pod_cidr = st.text_input("Pod CIDR", value="10.244.0.0/16")
@@ -409,6 +455,64 @@ def page_profile_manager():
st.success(f"Profile '{name}' created successfully!")
st.rerun()
+ # ── Import Existing Cluster ──────────────────────────────────────────
+ with tab_import_cluster:
+ st.markdown("### Import Existing Kubernetes Cluster")
+ st.markdown(
+ "Connect to an existing K8s cluster by uploading its **kubeconfig** file. "
+ "This lets you use the Debugger, Monitoring, Log Analysis, and Resource Viewer "
+ "without provisioning a new cluster."
+ )
+
+ with st.form("import_cluster_form"):
+ import_name = st.text_input(
+ "Profile Name *",
+ placeholder="my-existing-cluster",
+ )
+ import_desc = st.text_area(
+ "Description",
+ placeholder="Production cluster running in datacenter A",
+ )
+ kubeconfig_file = st.file_uploader(
+ "Upload kubeconfig file",
+ type=["yaml", "yml", "conf", "config"],
+ key="kubeconfig_upload",
+ help="Usually found at ~/.kube/config on your cluster's control-plane node.",
+ )
+ k8s_ver = st.text_input(
+ "Kubernetes Version (optional)",
+ placeholder="1.30",
+ value="1.30",
+ )
+
+ submitted_import = st.form_submit_button(
+ "Import Cluster", type="primary", use_container_width=True,
+ )
+
+ if submitted_import:
+ if not import_name:
+ st.error("Profile name is required.")
+ elif not kubeconfig_file:
+ st.error("Please upload a kubeconfig file.")
+ else:
+ kubeconfig_content = kubeconfig_file.read().decode("utf-8")
+ profile = ClusterProfile(
+ name=import_name,
+ description=import_desc,
+ kubernetes_version=k8s_ver or "1.30",
+ status="active",
+ cluster_source="imported",
+ kubeconfig_content=kubeconfig_content,
+ )
+ save_profile(profile)
+ st.session_state.active_profile = import_name
+ st.success(
+ f"Cluster '{import_name}' imported! "
+ "Select it from the sidebar to start using Debugger, Monitoring, "
+ "Resource Viewer, etc."
+ )
+ st.rerun()
+
# ── Manage Profiles ───────────────────────────────────────────────────
with tab_list:
profiles = list_profiles()
@@ -784,11 +888,16 @@ def page_cluster_debugger():
if not profile:
return
- cp_nodes = profile.get_control_plane_nodes()
- if not cp_nodes:
- st.error("No control-plane node defined in this profile.")
- return
- cp_node = cp_nodes[0]
+ # For imported clusters we don't need a CP node — commands run locally via kubeconfig
+ cp_node = None
+ if profile.cluster_source != "imported":
+ cp_nodes = profile.get_control_plane_nodes()
+ if not cp_nodes:
+ st.error("No control-plane node defined in this profile.")
+ return
+ cp_node = cp_nodes[0]
+
+ available_commands = get_available_commands(profile)
tab_quick, tab_category, tab_custom, tab_ai = st.tabs([
"Quick Diagnostics",
@@ -804,7 +913,7 @@ def page_cluster_debugger():
with col1:
selected_checks = st.multiselect(
"Select checks to run",
- options=list(DIAGNOSTIC_COMMANDS.keys()),
+ options=list(available_commands.keys()),
default=["Node Status", "Pod Status (All Namespaces)", "Events (Recent)"],
)
with col2:
@@ -813,12 +922,12 @@ def page_cluster_debugger():
if st.button("Run Diagnostics", type="primary"):
if run_all:
with st.spinner("Running all diagnostics..."):
- results = run_all_diagnostics(cp_node)
+ results = run_all_diagnostics(cp_node, profile=profile)
else:
results = {}
for check in selected_checks:
with st.spinner(f"Running: {check}..."):
- results[check] = run_diagnostic(cp_node, check)
+ results[check] = run_diagnostic(cp_node, check, profile=profile)
st.session_state.debug_results = results
@@ -845,7 +954,7 @@ def page_cluster_debugger():
if st.button("Run Category Scan", type="primary", key="cat_scan"):
with st.spinner(f"Running {category} diagnostics..."):
- results = run_category_diagnostics(cp_node, category)
+ results = run_category_diagnostics(cp_node, category, profile=profile)
for name, result in results.items():
with st.expander(f"{'✅' if result.success else '❌'} {name}"):
@@ -860,7 +969,10 @@ def page_cluster_debugger():
# ── Custom Command ────────────────────────────────────────────────────
with tab_custom:
st.markdown("### Run Custom Command")
- st.warning("Commands execute on the control-plane node via SSH.")
+ if profile.cluster_source == "imported":
+ st.info("Commands run locally via kubectl using the imported kubeconfig.")
+ else:
+ st.warning("Commands execute on the control-plane node via SSH.")
custom_cmd = st.text_area(
"Command",
placeholder="kubectl get pods -A -o wide",
@@ -868,7 +980,7 @@ def page_cluster_debugger():
)
if st.button("Execute", type="primary", key="exec_custom") and custom_cmd:
with st.spinner("Executing..."):
- result = run_custom_command(cp_node, custom_cmd)
+ result = run_custom_command(cp_node, custom_cmd, profile=profile)
if result.success:
st.code(result.stdout, language="text")
else:
@@ -907,7 +1019,7 @@ def page_cluster_debugger():
if check_pods:
with st.spinner("Checking pod issues..."):
- pod_result = check_pod_issues(cp_node)
+ pod_result = check_pod_issues(cp_node, profile=profile)
if pod_result.success and pod_result.stdout.strip():
collected_data += f"\n\nProblematic Pods:\n{pod_result.stdout}"
with st.expander("Problematic Pods"):
@@ -915,10 +1027,10 @@ def page_cluster_debugger():
if auto_collect:
with st.spinner("Collecting diagnostics..."):
- diag_results = run_category_diagnostics(cp_node, "Cluster Overview")
- for name, result in diag_results.items():
- if result.success:
- collected_data += f"\n\n{name}:\n{result.stdout}"
+ diag_results = run_category_diagnostics(cp_node, "Cluster Overview", profile=profile)
+ for name, result in diag_results.items():
+ if result.success:
+ collected_data += f"\n\n{name}:\n{result.stdout}"
with st.spinner("AI is analyzing the issue..."):
full_context = f"Issue: {issue}\n\nCollected Data:{collected_data}"
@@ -939,11 +1051,14 @@ def page_monitoring_setup():
if not profile:
return
- cp_nodes = profile.get_control_plane_nodes()
- if not cp_nodes:
- st.error("No control-plane node defined in this profile.")
- return
- cp_node = cp_nodes[0]
+ # For imported clusters we don't need a CP node
+ cp_node = None
+ if profile.cluster_source != "imported":
+ cp_nodes = profile.get_control_plane_nodes()
+ if not cp_nodes:
+ st.error("No control-plane node defined in this profile.")
+ return
+ cp_node = cp_nodes[0]
namespace = st.text_input("Monitoring Namespace", value="monitoring")
@@ -970,7 +1085,7 @@ def page_monitoring_setup():
if st.button("Install Prometheus + Grafana", type="primary", use_container_width=True):
if install_helm_first:
with st.status("Installing Helm...", expanded=True):
- result = install_helm(cp_node)
+ result = install_helm(cp_node, profile=profile)
if result.success:
st.success("Helm ready!")
else:
@@ -978,7 +1093,7 @@ def page_monitoring_setup():
st.code(result.stderr, language="text")
with st.status("Installing kube-prometheus-stack (this may take several minutes)...", expanded=True):
- result = install_prometheus_stack(cp_node, namespace)
+ result = install_prometheus_stack(cp_node, namespace, profile=profile)
if result.success:
st.success("Prometheus + Grafana installed!")
st.code(result.stdout[-2000:], language="text")
@@ -988,7 +1103,7 @@ def page_monitoring_setup():
if install_alerts_too:
with st.status("Installing alert rules...", expanded=True):
- result = install_alert_rules(cp_node, namespace)
+ result = install_alert_rules(cp_node, namespace, profile=profile)
if result.success:
st.success("Alert rules installed!")
else:
@@ -1009,7 +1124,7 @@ def page_monitoring_setup():
if st.button("Import Dashboards", type="primary") and selected_dashboards:
with st.status("Importing dashboards...", expanded=True):
- result = install_dashboards(cp_node, selected_dashboards, namespace)
+ result = install_dashboards(cp_node, selected_dashboards, namespace, profile=profile)
if result.success:
st.success(f"Imported {len(selected_dashboards)} dashboards!")
st.code(result.stdout, language="text")
@@ -1027,7 +1142,7 @@ def page_monitoring_setup():
if st.button("Install Alert Rules", type="primary", key="install_alerts"):
with st.spinner("Installing alert rules..."):
- result = install_alert_rules(cp_node, namespace)
+ result = install_alert_rules(cp_node, namespace, profile=profile)
if result.success:
st.success("Alert rules installed!")
st.code(result.stdout, language="text")
@@ -1040,7 +1155,7 @@ def page_monitoring_setup():
st.markdown("### Monitoring Stack Status")
if st.button("Check Status", type="primary", key="mon_status"):
with st.spinner("Checking monitoring stack..."):
- result = get_monitoring_status(cp_node, namespace)
+ result = get_monitoring_status(cp_node, namespace, profile=profile)
if result.success:
st.code(result.stdout, language="text")
else:
@@ -1069,7 +1184,7 @@ def page_monitoring_setup():
else:
if st.button("Get Monitoring Recommendations", type="primary", key="mon_advice"):
current_status = ""
- status_result = get_monitoring_status(cp_node, namespace)
+ status_result = get_monitoring_status(cp_node, namespace, profile=profile)
if status_result.success:
current_status = status_result.stdout
@@ -1090,11 +1205,16 @@ def page_log_analysis():
if not profile:
return
- cp_nodes = profile.get_control_plane_nodes()
- if not cp_nodes:
- st.error("No control-plane node defined in this profile.")
- return
- cp_node = cp_nodes[0]
+ # For imported clusters we don't need a CP node
+ cp_node = None
+ if profile.cluster_source != "imported":
+ cp_nodes = profile.get_control_plane_nodes()
+ if not cp_nodes:
+ st.error("No control-plane node defined in this profile.")
+ return
+ cp_node = cp_nodes[0]
+
+ available_log_sources = get_available_log_sources(profile)
tab_system, tab_pod, tab_correlation, tab_ai = st.tabs([
"System Logs",
@@ -1108,10 +1228,13 @@ def page_log_analysis():
st.markdown("### System Component Logs")
col1, col2, col3 = st.columns(3)
with col1:
+ default_sources = [s for s in ["Kubelet", "CRI-O", "Events"] if s in available_log_sources]
+ if not default_sources:
+ default_sources = available_log_sources[:3] if available_log_sources else []
sources = st.multiselect(
"Log Sources",
- options=list(LOG_SOURCES.keys()),
- default=["Kubelet", "CRI-O", "Events"],
+ options=available_log_sources,
+ default=default_sources,
)
with col2:
log_lines = st.number_input("Lines to fetch", min_value=50, max_value=1000, value=200)
@@ -1127,7 +1250,7 @@ def page_log_analysis():
log_data = {}
for source in sources:
with st.spinner(f"Collecting {source} logs..."):
- result = collect_logs(cp_node, source, log_lines, since, since_k8s)
+ result = collect_logs(cp_node, source, log_lines, since, since_k8s, profile=profile)
if result.success:
log_data[source] = result.stdout
analysis = analyze_logs(result.stdout, source)
@@ -1171,7 +1294,7 @@ def page_log_analysis():
with st.spinner(f"Fetching logs for {pod_ns}/{pod_name}..."):
result = collect_pod_logs(
cp_node, pod_ns, pod_name, container, pod_lines,
- "1h", pod_previous,
+ "1h", pod_previous, profile=profile,
)
if result.success:
analysis = analyze_logs(result.stdout, f"{pod_ns}/{pod_name}")
@@ -1203,16 +1326,19 @@ def page_log_analysis():
st.markdown("### Cross-Source Error Correlation")
st.markdown("Collect logs from multiple sources and correlate errors across them.")
+ default_corr = [s for s in ["Kubelet", "CRI-O", "API Server", "Events"] if s in available_log_sources]
+ if not default_corr:
+ default_corr = available_log_sources[:4] if available_log_sources else []
corr_sources = st.multiselect(
"Sources to correlate",
- options=list(LOG_SOURCES.keys()),
- default=["Kubelet", "CRI-O", "API Server", "Events"],
+ options=available_log_sources,
+ default=default_corr,
key="corr_sources",
)
if st.button("Collect & Correlate", type="primary", key="correlate"):
with st.spinner("Collecting logs from multiple sources..."):
- results = collect_multi_source_logs(cp_node, corr_sources, lines=150)
+ results = collect_multi_source_logs(cp_node, corr_sources, lines=150, profile=profile)
correlated = correlate_errors(results)
@@ -1277,6 +1403,866 @@ def page_log_analysis():
# PAGE: AI Assistant
# ══════════════════════════════════════════════════════════════════════════
+# ══════════════════════════════════════════════════════════════════════════
+# PAGE: Resource Viewer
+# ══════════════════════════════════════════════════════════════════════════
+
+# Resource definitions: (display_name, kubectl_command, supports_namespace)
+_RESOURCE_TYPES = {
+ "Pods": ("get pods", True),
+ "Deployments": ("get deployments", True),
+ "Services": ("get services", True),
+ "ConfigMaps": ("get configmaps", True),
+ "Secrets": ("get secrets", True),
+ "StatefulSets": ("get statefulsets", True),
+ "DaemonSets": ("get daemonsets", True),
+ "ReplicaSets": ("get replicasets", True),
+ "Jobs": ("get jobs", True),
+ "CronJobs": ("get cronjobs", True),
+ "Ingresses": ("get ingress", True),
+ "NetworkPolicies": ("get networkpolicies", True),
+ "PersistentVolumeClaims": ("get pvc", True),
+ "PersistentVolumes": ("get pv", False),
+ "StorageClasses": ("get storageclasses", False),
+ "Namespaces": ("get namespaces", False),
+ "Nodes": ("get nodes", False),
+ "ServiceAccounts": ("get serviceaccounts", True),
+ "DestinationRules": ("get destinationrules", True),
+ "VirtualServices": ("get virtualservices", True),
+ "HorizontalPodAutoscalers": ("get hpa", True),
+ "PodDisruptionBudgets": ("get pdb", True),
+ "Endpoints": ("get endpoints", True),
+}
+
+
+def page_resource_viewer():
+ st.markdown("## Resource Viewer")
+ st.markdown("Browse live Kubernetes resources from your cluster.")
+
+ profile = _get_active_profile()
+ if not profile:
+ return
+
+ if not profile.kubeconfig_content and not profile.get_control_plane_nodes():
+ st.error(
+ "This profile has no kubeconfig and no control-plane node. "
+ "Import a kubeconfig or add nodes in the Profile Manager."
+ )
+ return
+
+ tab_resources, tab_node_health, tab_rbac, tab_helm, tab_events = st.tabs([
+ "Cluster Resources",
+ "Node Health",
+ "RBAC Viewer",
+ "Helm Releases",
+ "Events Timeline",
+ ])
+
+ # ── Cluster Resources ────────────────────────────────────────────────
+ with tab_resources:
+ st.markdown("### Browse Cluster Resources")
+
+ col1, col2, col3 = st.columns([2, 2, 1])
+ with col1:
+ resource_type = st.selectbox(
+ "Resource Type",
+ options=list(_RESOURCE_TYPES.keys()),
+ index=0,
+ )
+ with col2:
+ cmd_base, ns_supported = _RESOURCE_TYPES[resource_type]
+ if ns_supported:
+ ns_choice = st.radio(
+ "Namespace",
+ ["All Namespaces", "Specific"],
+ horizontal=True,
+ key="res_ns_choice",
+ )
+ if ns_choice == "Specific":
+ namespace = st.text_input("Namespace", value="default", key="res_ns")
+ else:
+ namespace = ""
+ else:
+ namespace = ""
+ st.info(f"{resource_type} is a cluster-scoped resource.")
+ with col3:
+ output_format = st.selectbox(
+ "Output",
+ ["wide", "yaml", "json", "name"],
+ index=0,
+ key="res_output",
+ )
+
+ if st.button("Fetch Resources", type="primary", key="fetch_res"):
+ kubectl_cmd = cmd_base
+ if ns_supported and not namespace:
+ kubectl_cmd += " -A"
+ elif ns_supported and namespace:
+ kubectl_cmd += f" -n {namespace}"
+ kubectl_cmd += f" -o {output_format}"
+
+ with st.spinner(f"Fetching {resource_type}..."):
+ result = run_kubectl(profile, kubectl_cmd, timeout=30)
+ if result.success:
+ st.code(result.stdout or "(no resources found)", language="text")
+ else:
+ if "the server doesn't have a resource type" in result.stderr:
+ st.warning(
+ f"{resource_type} is not available on this cluster "
+ "(CRD may not be installed)."
+ )
+ else:
+ st.error("Failed to fetch resources")
+ st.code(result.stderr, language="text")
+
+ # Describe a specific resource
+ st.markdown("---")
+ st.markdown("#### Describe a Resource")
+ desc_col1, desc_col2 = st.columns(2)
+ with desc_col1:
+ desc_name = st.text_input(
+ "Resource name",
+ placeholder="e.g., my-pod-xyz",
+ key="desc_name",
+ )
+ with desc_col2:
+ desc_ns = st.text_input(
+ "Namespace (if applicable)",
+ value="default",
+ key="desc_ns",
+ )
+
+ if st.button("Describe", key="describe_res") and desc_name:
+ # Determine the singular resource type for describe
+ res_singular = resource_type.rstrip("s")
+ if resource_type == "Ingresses":
+ res_singular = "ingress"
+ elif resource_type == "Namespaces":
+ res_singular = "namespace"
+ elif resource_type == "StorageClasses":
+ res_singular = "storageclass"
+ elif resource_type == "Endpoints":
+ res_singular = "endpoints"
+
+ desc_cmd = f"describe {res_singular.lower()} {desc_name}"
+ if ns_supported and desc_ns:
+ desc_cmd += f" -n {desc_ns}"
+
+ with st.spinner(f"Describing {desc_name}..."):
+ result = run_kubectl(profile, desc_cmd, timeout=30)
+ if result.success:
+ st.code(result.stdout, language="yaml")
+ else:
+ st.error("Describe failed")
+ st.code(result.stderr, language="text")
+
+ # ── Node Health ──────────────────────────────────────────────────────
+ with tab_node_health:
+ st.markdown("### Node Health Overview")
+ st.markdown("View node status, resource usage, and conditions.")
+
+ if st.button("Refresh Node Health", type="primary", key="node_health"):
+ col_status, col_top = st.columns(2)
+
+ with col_status:
+ st.markdown("#### Node Status")
+ with st.spinner("Fetching nodes..."):
+ result = run_kubectl(profile, "get nodes -o wide", timeout=15)
+ if result.success:
+ st.code(result.stdout, language="text")
+ else:
+ st.error("Failed to get nodes")
+ st.code(result.stderr, language="text")
+
+ with col_top:
+ st.markdown("#### Resource Usage")
+ with st.spinner("Fetching node metrics..."):
+ result = run_kubectl(profile, "top nodes", timeout=15)
+ if result.success:
+ st.code(result.stdout, language="text")
+ else:
+ st.warning("kubectl top requires metrics-server to be installed.")
+ st.code(result.stderr, language="text")
+
+ st.markdown("---")
+ st.markdown("#### Node Conditions")
+ with st.spinner("Checking node conditions..."):
+ result = run_kubectl(
+ profile,
+ 'get nodes -o custom-columns='
+ '"NAME:.metadata.name,'
+ 'READY:.status.conditions[?(@.type==\\"Ready\\")].status,'
+ 'DISK:.status.conditions[?(@.type==\\"DiskPressure\\")].status,'
+ 'MEMORY:.status.conditions[?(@.type==\\"MemoryPressure\\")].status,'
+ 'PID:.status.conditions[?(@.type==\\"PIDPressure\\")].status"',
+ timeout=15,
+ )
+ if result.success:
+ st.code(result.stdout, language="text")
+ else:
+ st.code(result.stderr, language="text")
+
+ st.markdown("#### Pod Distribution per Node")
+ with st.spinner("Fetching pod distribution..."):
+ result = run_kubectl(
+ profile,
+ 'get pods -A -o custom-columns='
+ '"NODE:.spec.nodeName,NAMESPACE:.metadata.namespace,'
+ 'POD:.metadata.name,STATUS:.status.phase" '
+ '--sort-by=.spec.nodeName',
+ timeout=15,
+ )
+ if result.success:
+ st.code(result.stdout, language="text")
+ else:
+ st.code(result.stderr, language="text")
+
+ # ── RBAC Viewer ──────────────────────────────────────────────────────
+ with tab_rbac:
+ st.markdown("### RBAC Viewer")
+ st.markdown("Browse Roles, ClusterRoles, Bindings, and ServiceAccounts.")
+
+ rbac_type = st.selectbox(
+ "RBAC Resource",
+ [
+ "ClusterRoles",
+ "ClusterRoleBindings",
+ "Roles (namespaced)",
+ "RoleBindings (namespaced)",
+ "ServiceAccounts",
+ ],
+ key="rbac_type",
+ )
+
+ rbac_ns = ""
+ if "(namespaced)" in rbac_type or rbac_type == "ServiceAccounts":
+ rbac_ns = st.text_input(
+ "Namespace",
+ value="default",
+ key="rbac_ns",
+ help="Leave blank for all namespaces",
+ )
+
+ if st.button("Fetch RBAC Resources", type="primary", key="fetch_rbac"):
+ cmd_map = {
+ "ClusterRoles": "get clusterroles",
+ "ClusterRoleBindings": "get clusterrolebindings",
+ "Roles (namespaced)": "get roles",
+ "RoleBindings (namespaced)": "get rolebindings",
+ "ServiceAccounts": "get serviceaccounts",
+ }
+ rbac_cmd = cmd_map[rbac_type]
+ if rbac_ns:
+ rbac_cmd += f" -n {rbac_ns}"
+ elif "(namespaced)" in rbac_type or rbac_type == "ServiceAccounts":
+ rbac_cmd += " -A"
+
+ with st.spinner(f"Fetching {rbac_type}..."):
+ result = run_kubectl(profile, rbac_cmd, timeout=15)
+ if result.success:
+ st.code(result.stdout or "(none found)", language="text")
+ else:
+ st.error("Failed to fetch RBAC resources")
+ st.code(result.stderr, language="text")
+
+ # Describe a specific RBAC resource
+ st.markdown("---")
+ st.markdown("#### Inspect RBAC Resource")
+ rbac_name = st.text_input(
+ "Resource name to describe",
+ placeholder="e.g., cluster-admin",
+ key="rbac_desc_name",
+ )
+ if st.button("Describe RBAC", key="desc_rbac") and rbac_name:
+ type_map = {
+ "ClusterRoles": "clusterrole",
+ "ClusterRoleBindings": "clusterrolebinding",
+ "Roles (namespaced)": "role",
+ "RoleBindings (namespaced)": "rolebinding",
+ "ServiceAccounts": "serviceaccount",
+ }
+ desc_cmd = f"describe {type_map[rbac_type]} {rbac_name}"
+ if rbac_ns:
+ desc_cmd += f" -n {rbac_ns}"
+
+ with st.spinner(f"Describing {rbac_name}..."):
+ result = run_kubectl(profile, desc_cmd, timeout=15)
+ if result.success:
+ st.code(result.stdout, language="yaml")
+ else:
+ st.error("Describe failed")
+ st.code(result.stderr, language="text")
+
+ # ── Helm Releases ────────────────────────────────────────────────────
+ with tab_helm:
+ st.markdown("### Helm Release Manager")
+ st.markdown("List, inspect, and manage Helm releases on your cluster.")
+
+ helm_tab_list, helm_tab_install, helm_tab_history = st.tabs([
+ "List Releases", "Install Chart", "Release History",
+ ])
+
+ with helm_tab_list:
+ helm_ns_all = st.checkbox("All namespaces", value=True, key="helm_ns_all")
+ helm_ns = ""
+ if not helm_ns_all:
+ helm_ns = st.text_input("Namespace", value="default", key="helm_ns")
+
+ if st.button("List Helm Releases", type="primary", key="helm_list"):
+ helm_cmd = "helm list"
+ if helm_ns_all:
+ helm_cmd += " -A"
+ elif helm_ns:
+ helm_cmd += f" -n {helm_ns}"
+ helm_cmd += " -o table"
+
+ with st.spinner("Fetching Helm releases..."):
+ result = run_kubectl(profile, helm_cmd.replace("kubectl ", ""), timeout=15)
+ if result.success:
+ st.code(result.stdout or "(no releases found)", language="text")
+ else:
+ st.warning("Helm may not be installed on this cluster.")
+ st.code(result.stderr, language="text")
+
+ with helm_tab_install:
+ st.markdown("#### Install a Helm Chart")
+ hcol1, hcol2 = st.columns(2)
+ with hcol1:
+ helm_release_name = st.text_input("Release Name", placeholder="my-release", key="helm_rel")
+ helm_chart = st.text_input("Chart", placeholder="prometheus-community/kube-prometheus-stack", key="helm_chart")
+ with hcol2:
+ helm_install_ns = st.text_input("Namespace", value="default", key="helm_install_ns")
+ helm_create_ns = st.checkbox("Create namespace if not exists", value=True, key="helm_create_ns")
+ helm_values = st.text_area(
+ "Values (YAML, optional)",
+ placeholder="# Custom values.yaml content here",
+ height=150,
+ key="helm_values",
+ )
+
+ if st.button("Install Chart", type="primary", key="helm_install") and helm_release_name and helm_chart:
+ install_cmd = f"helm install {helm_release_name} {helm_chart} -n {helm_install_ns}"
+ if helm_create_ns:
+ install_cmd += " --create-namespace"
+ # If user provided values, write to temp file
+ if helm_values.strip():
+ values_path = os.path.join(config.UPLOADS_DIR, f"helm-values-{helm_release_name}.yaml")
+ with open(values_path, "w") as vf:
+ vf.write(helm_values)
+ install_cmd += f" -f {values_path}"
+
+ with st.spinner(f"Installing {helm_chart}..."):
+ result = run_kubectl(profile, install_cmd.replace("kubectl ", ""), timeout=120)
+ if result.success:
+ st.success(f"Release '{helm_release_name}' installed!")
+ st.code(result.stdout, language="text")
+ else:
+ st.error("Helm install failed")
+ st.code(result.stderr, language="text")
+
+ with helm_tab_history:
+ st.markdown("#### Release History")
+ hist_name = st.text_input("Release name", placeholder="my-release", key="helm_hist_name")
+ hist_ns = st.text_input("Namespace", value="default", key="helm_hist_ns")
+
+ if st.button("Get History", key="helm_hist") and hist_name:
+ hist_cmd = f"helm history {hist_name} -n {hist_ns}"
+ with st.spinner("Fetching history..."):
+ result = run_kubectl(profile, hist_cmd.replace("kubectl ", ""), timeout=15)
+ if result.success:
+ st.code(result.stdout, language="text")
+ else:
+ st.error("Could not get release history")
+ st.code(result.stderr, language="text")
+
+ st.markdown("---")
+ st.markdown("#### Rollback Release")
+ rb_name = st.text_input("Release name", placeholder="my-release", key="helm_rb_name")
+ rb_ns = st.text_input("Namespace", value="default", key="helm_rb_ns")
+ rb_rev = st.number_input("Revision number", min_value=1, value=1, key="helm_rb_rev")
+
+ if st.button("Rollback", key="helm_rollback") and rb_name:
+ rb_cmd = f"helm rollback {rb_name} {rb_rev} -n {rb_ns}"
+ with st.spinner(f"Rolling back {rb_name} to revision {rb_rev}..."):
+ result = run_kubectl(profile, rb_cmd.replace("kubectl ", ""), timeout=60)
+ if result.success:
+ st.success(f"Rolled back '{rb_name}' to revision {rb_rev}")
+ st.code(result.stdout, language="text")
+ else:
+ st.error("Rollback failed")
+ st.code(result.stderr, language="text")
+
+ # ── Events Timeline ──────────────────────────────────────────────────
+ with tab_events:
+ st.markdown("### Cluster Events Timeline")
+ st.markdown("View recent Kubernetes events with graphical analysis.")
+
+ ev_col1, ev_col2, ev_col3 = st.columns(3)
+ with ev_col1:
+ ev_ns_all = st.checkbox("All namespaces", value=True, key="ev_ns_all")
+ ev_ns = ""
+ if not ev_ns_all:
+ ev_ns = st.text_input("Namespace", value="default", key="ev_ns")
+ with ev_col2:
+ ev_type = st.selectbox(
+ "Event Type",
+ ["All", "Normal", "Warning"],
+ key="ev_type",
+ )
+ with ev_col3:
+ ev_sort = st.selectbox(
+ "Sort by",
+ ["Last Timestamp", "First Timestamp", "Count"],
+ key="ev_sort",
+ )
+
+ if st.button("Fetch Events", type="primary", key="fetch_events"):
+ # Fetch events in JSON for graphical display
+ ev_json_cmd = "get events"
+ if ev_ns_all:
+ ev_json_cmd += " -A"
+ elif ev_ns:
+ ev_json_cmd += f" -n {ev_ns}"
+ if ev_type != "All":
+ ev_json_cmd += f" --field-selector type={ev_type}"
+ ev_json_cmd += " -o json"
+
+ with st.spinner("Fetching events..."):
+ result = run_kubectl(profile, ev_json_cmd, timeout=15)
+
+ if result.success and result.stdout.strip():
+ try:
+ events_data = json.loads(result.stdout)
+ items = events_data.get("items", [])
+
+ if not items:
+ st.info("No events found.")
+ else:
+ # Parse events into structured data
+ ev_records = []
+ for item in items:
+ ev_records.append({
+ "Namespace": item.get("metadata", {}).get("namespace", ""),
+ "Type": item.get("type", ""),
+ "Reason": item.get("reason", ""),
+ "Object": item.get("involvedObject", {}).get("name", ""),
+ "Kind": item.get("involvedObject", {}).get("kind", ""),
+ "Message": (item.get("message", "") or "")[:120],
+ "Count": item.get("count", 1),
+ "Last Seen": item.get("lastTimestamp", item.get("eventTime", "")),
+ })
+
+ import pandas as pd
+
+ df = pd.DataFrame(ev_records)
+
+ # ── Graphical Summary ────────────────────────
+ st.markdown("#### Event Summary Charts")
+
+ chart_col1, chart_col2 = st.columns(2)
+
+ with chart_col1:
+ st.markdown("**Events by Type**")
+ type_counts = df["Type"].value_counts().reset_index()
+ type_counts.columns = ["Type", "Count"]
+ st.bar_chart(type_counts.set_index("Type"))
+
+ with chart_col2:
+ st.markdown("**Events by Reason (Top 10)**")
+ reason_counts = df["Reason"].value_counts().head(10).reset_index()
+ reason_counts.columns = ["Reason", "Count"]
+ st.bar_chart(reason_counts.set_index("Reason"))
+
+ chart_col3, chart_col4 = st.columns(2)
+
+ with chart_col3:
+ st.markdown("**Events by Namespace (Top 10)**")
+ ns_counts = df["Namespace"].value_counts().head(10).reset_index()
+ ns_counts.columns = ["Namespace", "Count"]
+ st.bar_chart(ns_counts.set_index("Namespace"))
+
+ with chart_col4:
+ st.markdown("**Events by Object Kind**")
+ kind_counts = df["Kind"].value_counts().reset_index()
+ kind_counts.columns = ["Kind", "Count"]
+ st.bar_chart(kind_counts.set_index("Kind"))
+
+ # ── Timeline Chart ────────────────────────────
+ st.markdown("---")
+ st.markdown("#### Event Timeline")
+ if df["Last Seen"].notna().any() and df["Last Seen"].str.strip().any():
+ try:
+ df["Timestamp"] = pd.to_datetime(
+ df["Last Seen"], errors="coerce", utc=True,
+ )
+ ts_df = df.dropna(subset=["Timestamp"])
+ if not ts_df.empty:
+ ts_df = ts_df.set_index("Timestamp")
+ # Events over time grouped by type
+ timeline = ts_df.groupby(
+ [pd.Grouper(freq="1min"), "Type"]
+ ).size().unstack(fill_value=0)
+ if not timeline.empty:
+ st.line_chart(timeline)
+ else:
+ st.info("Not enough timestamp data for timeline chart.")
+ else:
+ st.info("Could not parse event timestamps for timeline.")
+ except Exception:
+ st.info("Could not render timeline chart from event data.")
+ else:
+ st.info("No timestamp data available for timeline chart.")
+
+ # ── High-Count Events ─────────────────────────
+ st.markdown("---")
+ st.markdown("#### High-Frequency Events")
+ high_count = df[df["Count"] > 1].sort_values("Count", ascending=False).head(20)
+ if not high_count.empty:
+ st.dataframe(
+ high_count[["Namespace", "Type", "Reason", "Object", "Count", "Message"]],
+ use_container_width=True,
+ hide_index=True,
+ )
+ else:
+ st.info("No repeated events found.")
+
+ # ── Full Events Table ─────────────────────────
+ st.markdown("---")
+ st.markdown("#### All Events")
+ st.dataframe(df, use_container_width=True, hide_index=True)
+
+ except (json.JSONDecodeError, KeyError):
+ # Fallback to text display
+ st.code(result.stdout, language="text")
+ elif result.success:
+ st.info("No events found.")
+ else:
+ st.error("Failed to fetch events")
+ st.code(result.stderr, language="text")
+
+ # Warning events summary
+ st.markdown("---")
+ st.markdown("#### Warning Events Summary")
+ if st.button("Show Warning Events", key="warn_events"):
+ warn_cmd = (
+ "get events -A --field-selector type=Warning "
+ "-o custom-columns="
+ "'NAMESPACE:.metadata.namespace,"
+ "LAST_SEEN:.lastTimestamp,"
+ "COUNT:.count,"
+ "REASON:.reason,"
+ "OBJECT:.involvedObject.name,"
+ "MESSAGE:.message' "
+ "--sort-by=.lastTimestamp"
+ )
+ with st.spinner("Fetching warning events..."):
+ result = run_kubectl(profile, warn_cmd, timeout=15)
+ if result.success:
+ st.code(result.stdout or "(no warning events)", language="text")
+ else:
+ st.code(result.stderr, language="text")
+
+
+# ══════════════════════════════════════════════════════════════════════════
+# PAGE: Upgrade Planner
+# ══════════════════════════════════════════════════════════════════════════
+
+_K8S_VERSIONS_DETAIL = [
+ {
+ "version": "1.35",
+ "release": "2026-04",
+ "end_of_life": "2027-08",
+ "highlights": "Sidecar containers GA, improved pod lifecycle management, dynamic resource allocation enhancements.",
+ },
+ {
+ "version": "1.34",
+ "release": "2025-12",
+ "end_of_life": "2027-04",
+ "highlights": "Structured authorization config GA, recursive read-only mounts, traffic distribution improvements.",
+ },
+ {
+ "version": "1.33",
+ "release": "2025-08",
+ "end_of_life": "2027-01",
+ "highlights": "In-place pod resize beta, multi-network pods alpha, nftables kube-proxy backend.",
+ },
+ {
+ "version": "1.32",
+ "release": "2025-04",
+ "end_of_life": "2026-08",
+ "highlights": "Dynamic resource allocation (DRA) beta, auto-remove PV claims, job success policy GA.",
+ },
+ {
+ "version": "1.31",
+ "release": "2024-12",
+ "end_of_life": "2026-04",
+ "highlights": "AppArmor GA, nftables proxy GA, improved ingress connectivity reliability, cgroup v2 enhancements.",
+ },
+ {
+ "version": "1.30",
+ "release": "2024-04",
+ "end_of_life": "2025-08",
+ "highlights": "Contextual logging GA, CEL admission improvements, pod scheduling readiness.",
+ },
+ {
+ "version": "1.29",
+ "release": "2023-12",
+ "end_of_life": "2025-02",
+ "highlights": "KMS v2 GA, ReadWriteOncePod GA, networking improvements, node memory manager.",
+ },
+ {
+ "version": "1.28",
+ "release": "2023-08",
+ "end_of_life": "2024-10",
+ "highlights": "Sidecar containers alpha, recovery from non-graceful node shutdown, mixed version proxy.",
+ },
+ {
+ "version": "1.27",
+ "release": "2023-04",
+ "end_of_life": "2024-06",
+ "highlights": "In-place pod resize alpha, VPA improvements, SeccompDefault GA.",
+ },
+]
+
+
+def page_upgrade_planner():
+ st.markdown("## Upgrade Planner")
+ st.markdown("Plan and prepare Kubernetes version upgrades for your cluster.")
+
+ profile = _get_active_profile()
+ if not profile:
+ return
+
+ current_ver = profile.kubernetes_version
+
+ tab_overview, tab_preflight, tab_plan, tab_changelog = st.tabs([
+ "Version Overview",
+ "Pre-flight Checks",
+ "Upgrade Steps",
+ "Changelog & Compatibility",
+ ])
+
+ # ── Version Overview ─────────────────────────────────────────────────
+ with tab_overview:
+ st.markdown("### Kubernetes Version Matrix")
+ st.info(f"Your current cluster version: **{current_ver}**")
+
+ # Build a table
+ rows = []
+ for v in _K8S_VERSIONS_DETAIL:
+ status = ""
+ if v["version"] == current_ver:
+ status = "CURRENT"
+ elif v["version"] > current_ver:
+ status = "UPGRADE AVAILABLE"
+ else:
+ status = "OLDER"
+ rows.append({
+ "Version": v["version"],
+ "Status": status,
+ "Release Date": v["release"],
+ "End of Life": v["end_of_life"],
+ "Highlights": v["highlights"],
+ })
+
+ st.dataframe(rows, use_container_width=True, hide_index=True)
+
+ # Upgrade target selection
+ st.markdown("---")
+ available_upgrades = [
+ v["version"] for v in _K8S_VERSIONS_DETAIL if v["version"] > current_ver
+ ]
+ if available_upgrades:
+ target_version = st.selectbox(
+ "Select target upgrade version",
+ available_upgrades,
+ key="upgrade_target",
+ )
+ skipped = [
+ v for v in _K8S_VERSIONS_DETAIL
+ if current_ver < v["version"] <= target_version
+ ]
+ if len(skipped) > 1:
+ st.warning(
+ f"You are skipping {len(skipped) - 1} minor version(s). "
+ "Kubernetes supports upgrading one minor version at a time. "
+ "Plan incremental upgrades for production clusters."
+ )
+ st.markdown("#### Upgrade Path")
+ path_versions = [current_ver] + [v["version"] for v in reversed(skipped)]
+ st.markdown(" → ".join([f"**{v}**" for v in path_versions]))
+ else:
+ st.success("You are running the latest version!")
+
+ # ── Pre-flight Checks ────────────────────────────────────────────────
+ with tab_preflight:
+ st.markdown("### Pre-Upgrade Checks")
+ st.markdown("Run these checks before starting the upgrade process.")
+
+ checks = [
+ ("Cluster Health", "get nodes -o wide"),
+ ("All Pods Running", "get pods -A --field-selector 'status.phase!=Running,status.phase!=Succeeded'"),
+ ("etcd Health", "get --raw=/healthz"),
+ ("API Server Version", "version"),
+ ("PodDisruptionBudgets", "get pdb -A"),
+ ("Deprecated APIs", "api-resources --api-group=extensions"),
+ ("Persistent Volumes", "get pv"),
+ ("Component Statuses", "get cs 2>/dev/null || echo 'Deprecated in newer versions'"),
+ ]
+
+ if st.button("Run All Pre-flight Checks", type="primary", key="preflight"):
+ all_ok = True
+ for name, cmd in checks:
+ with st.status(f"Checking: {name}...", expanded=False) as status:
+ result = run_kubectl(profile, cmd, timeout=15)
+ if result.success:
+ st.code(result.stdout or "(no output)", language="text")
+ status.update(label=f"{name} — OK", state="complete")
+ else:
+ st.code(result.stderr, language="text")
+ status.update(label=f"{name} — ISSUE", state="error")
+ all_ok = False
+
+ if all_ok:
+ st.success("All pre-flight checks passed! The cluster looks ready for upgrade.")
+ else:
+ st.warning(
+ "Some checks reported issues. Review the output above before proceeding."
+ )
+
+ st.markdown("---")
+ st.markdown("#### Backup Checklist")
+ st.markdown(
+ "Before upgrading, ensure you have:\n\n"
+ "- [ ] **etcd snapshot backup**: `ETCDCTL_API=3 etcdctl snapshot save /backup/etcd-snapshot.db`\n"
+ "- [ ] **Cluster state export**: `kubectl get all -A -o yaml > cluster-backup.yaml`\n"
+ "- [ ] **PV/PVC data backed up** (if applicable)\n"
+ "- [ ] **CNI configuration backed up**: `/etc/cni/net.d/`\n"
+ "- [ ] **kubeadm config backed up**: `kubeadm config view > kubeadm-config.yaml`\n"
+ "- [ ] **VM/node snapshots taken** (if running on VMs)\n"
+ )
+
+ # ── Upgrade Steps ────────────────────────────────────────────────────
+ with tab_plan:
+ st.markdown("### Step-by-Step Upgrade Plan")
+
+ target = st.selectbox(
+ "Target Version",
+ [v["version"] for v in _K8S_VERSIONS_DETAIL if v["version"] > current_ver] or [current_ver],
+ key="upgrade_plan_target",
+ )
+
+ st.markdown(f"#### Upgrading from {current_ver} → {target}")
+
+ st.markdown(
+ f"""
+**Phase 1: Prepare (Control Plane)**
+```bash
+# 1. Update package repositories
+sudo apt-get update
+
+# 2. Check available kubeadm versions
+apt-cache madison kubeadm | grep {target}
+
+# 3. Upgrade kubeadm
+sudo apt-mark unhold kubeadm
+sudo apt-get install -y kubeadm={target}.*
+sudo apt-mark hold kubeadm
+
+# 4. Verify kubeadm version
+kubeadm version
+
+# 5. Check upgrade plan
+sudo kubeadm upgrade plan
+```
+
+**Phase 2: Upgrade Control Plane**
+```bash
+# 1. Drain the control-plane node
+kubectl drain --ignore-daemonsets --delete-emptydir-data
+
+# 2. Apply the upgrade
+sudo kubeadm upgrade apply v{target}.0
+
+# 3. Upgrade kubelet & kubectl
+sudo apt-mark unhold kubelet kubectl
+sudo apt-get install -y kubelet={target}.* kubectl={target}.*
+sudo apt-mark hold kubelet kubectl
+
+# 4. Restart kubelet
+sudo systemctl daemon-reload
+sudo systemctl restart kubelet
+
+# 5. Uncordon the node
+kubectl uncordon
+```
+
+**Phase 3: Upgrade Worker Nodes** (repeat for each worker)
+```bash
+# On each worker node:
+# 1. Drain the worker
+kubectl drain --ignore-daemonsets --delete-emptydir-data
+
+# 2. Upgrade kubeadm, kubelet, kubectl
+sudo apt-mark unhold kubeadm kubelet kubectl
+sudo apt-get install -y kubeadm={target}.* kubelet={target}.* kubectl={target}.*
+sudo apt-mark hold kubeadm kubelet kubectl
+
+# 3. Upgrade node config
+sudo kubeadm upgrade node
+
+# 4. Restart kubelet
+sudo systemctl daemon-reload
+sudo systemctl restart kubelet
+
+# 5. Uncordon
+kubectl uncordon
+```
+
+**Phase 4: Upgrade CRI-O** (on each node)
+```bash
+# Update CRI-O to match the K8s version
+sudo apt-get install -y cri-o={target}.*
+sudo systemctl restart crio
+sudo systemctl restart kubelet
+```
+
+**Phase 5: Verify**
+```bash
+kubectl get nodes -o wide
+kubectl get pods -A
+kubectl version
+```
+"""
+ )
+
+ # ── Changelog & Compatibility ────────────────────────────────────────
+ with tab_changelog:
+ st.markdown("### Version Changelog & Compatibility Notes")
+
+ for v in _K8S_VERSIONS_DETAIL:
+ marker = " ← CURRENT" if v["version"] == current_ver else ""
+ with st.expander(f"Kubernetes {v['version']}{marker}", expanded=(v["version"] == current_ver)):
+ st.markdown(f"**Release Date:** {v['release']}")
+ st.markdown(f"**End of Life:** {v['end_of_life']}")
+ st.markdown(f"**Key Highlights:** {v['highlights']}")
+ st.markdown("---")
+ st.markdown(
+ f"**Compatibility:**\n"
+ f"- CRI-O: {v['version']}.x\n"
+ f"- Flannel: Compatible (check release notes for CNI spec changes)\n"
+ f"- etcd: 3.5.x+ recommended\n"
+ f"- CoreDNS: 1.11.x+ recommended\n"
+ )
+ st.markdown(
+ f"**Upgrade Notes:**\n"
+ f"- Always upgrade one minor version at a time\n"
+ f"- Check deprecated API versions before upgrading\n"
+ f"- Run `kubeadm upgrade plan` to verify compatibility\n"
+ f"- Back up etcd before starting\n"
+ )
+
+
def page_ai_assistant():
st.markdown("## AI Kubernetes Assistant")
@@ -1334,32 +2320,42 @@ def _get_active_profile() -> ClusterProfile | None:
def _show_profile_summary(profile: ClusterProfile):
"""Display a compact profile summary."""
- cols = st.columns(5)
- cols[0].metric("Profile", profile.name)
- cols[1].metric("K8s Version", profile.kubernetes_version)
- cols[2].metric("Runtime", f"CRI-O {profile.crio_version}")
- cols[3].metric("CNI", "Flannel")
- cols[4].metric("Nodes", f"{len(profile.get_control_plane_nodes())} CP + {len(profile.get_worker_nodes())} W")
-
- with st.expander("Storage & Proxy Details", expanded=False):
- scol1, scol2, scol3 = st.columns(3)
- with scol1:
- st.markdown(f"**CRI-O Root:** `{profile.crio_root}`")
- st.markdown(f"**CRI-O RunRoot:** `{profile.crio_runroot}`")
- with scol2:
- st.markdown(f"**Kubelet Dir:** `{profile.kubelet_root}`")
- st.markdown(f"**Log Root:** `{profile.log_root}`")
- with scol3:
- if profile.http_proxy or profile.https_proxy:
- st.markdown(f"**HTTP Proxy:** `{profile.http_proxy or 'N/A'}`")
- st.markdown(f"**HTTPS Proxy:** `{profile.https_proxy or 'N/A'}`")
- if profile.no_proxy:
- st.markdown(f"**No Proxy:** `{profile.no_proxy}`")
- if profile.http_proxy_alt or profile.https_proxy_alt:
- st.markdown(f"**Alt HTTP Proxy:** `{profile.http_proxy_alt or 'N/A'}`")
- st.markdown(f"**Alt HTTPS Proxy:** `{profile.https_proxy_alt or 'N/A'}`")
- if not (profile.http_proxy or profile.https_proxy or profile.http_proxy_alt or profile.https_proxy_alt):
- st.markdown("**Proxy:** Not configured")
+ if profile.cluster_source == "imported":
+ cols = st.columns(4)
+ cols[0].metric("Profile", profile.name)
+ cols[1].metric("K8s Version", profile.kubernetes_version)
+ cols[2].metric("Source", "Imported (kubeconfig)")
+ cols[3].metric("Status", profile.status.upper())
+ with st.expander("Cluster Details", expanded=False):
+ st.markdown(f"**Description:** {profile.description or 'N/A'}")
+ st.markdown(f"**Kubeconfig:** {'Loaded' if profile.kubeconfig_content else 'Not loaded'}")
+ else:
+ cols = st.columns(5)
+ cols[0].metric("Profile", profile.name)
+ cols[1].metric("K8s Version", profile.kubernetes_version)
+ cols[2].metric("Runtime", f"CRI-O {profile.crio_version}")
+ cols[3].metric("CNI", "Flannel")
+ cols[4].metric("Nodes", f"{len(profile.get_control_plane_nodes())} CP + {len(profile.get_worker_nodes())} W")
+
+ with st.expander("Storage & Proxy Details", expanded=False):
+ scol1, scol2, scol3 = st.columns(3)
+ with scol1:
+ st.markdown(f"**CRI-O Root:** `{profile.crio_root}`")
+ st.markdown(f"**CRI-O RunRoot:** `{profile.crio_runroot}`")
+ with scol2:
+ st.markdown(f"**Kubelet Dir:** `{profile.kubelet_root}`")
+ st.markdown(f"**Log Root:** `{profile.log_root}`")
+ with scol3:
+ if profile.http_proxy or profile.https_proxy:
+ st.markdown(f"**HTTP Proxy:** `{profile.http_proxy or 'N/A'}`")
+ st.markdown(f"**HTTPS Proxy:** `{profile.https_proxy or 'N/A'}`")
+ if profile.no_proxy:
+ st.markdown(f"**No Proxy:** `{profile.no_proxy}`")
+ if profile.http_proxy_alt or profile.https_proxy_alt:
+ st.markdown(f"**Alt HTTP Proxy:** `{profile.http_proxy_alt or 'N/A'}`")
+ st.markdown(f"**Alt HTTPS Proxy:** `{profile.https_proxy_alt or 'N/A'}`")
+ if not (profile.http_proxy or profile.https_proxy or profile.http_proxy_alt or profile.https_proxy_alt):
+ st.markdown("**Proxy:** Not configured")
# ── Main Router ───────────────────────────────────────────────────────────
@@ -1371,12 +2367,16 @@ def main():
page_profile_manager()
elif page == "Cluster Creation":
page_cluster_creation()
+ elif page == "Resource Viewer":
+ page_resource_viewer()
elif page == "Cluster Debugger":
page_cluster_debugger()
elif page == "Monitoring Setup":
page_monitoring_setup()
elif page == "Log Analysis":
page_log_analysis()
+ elif page == "Upgrade Planner":
+ page_upgrade_planner()
elif page == "AI Assistant":
page_ai_assistant()
diff --git a/k8s-agent/modules/cluster_creator.py b/k8s-agent/modules/cluster_creator.py
index 22dcfd9..79c94cd 100644
--- a/k8s-agent/modules/cluster_creator.py
+++ b/k8s-agent/modules/cluster_creator.py
@@ -1,10 +1,12 @@
"""Cluster Creator — SSH-based K8s cluster provisioning with CRI-O + Flannel."""
+import os
import subprocess
import time
from dataclasses import dataclass, field
from typing import List, Optional
+import config
from modules.profile_manager import ClusterProfile
# Default Flannel manifest URL — can be overridden by user-uploaded file
@@ -1183,6 +1185,88 @@ def get_llm_cluster_advice(profile: ClusterProfile, context: str = "") -> str:
return query_llm(prompt)
+def run_kubectl(profile: ClusterProfile, command: str, timeout: int = 30) -> SSHResult:
+ """Run a kubectl (or helm) command against the cluster.
+
+ For imported clusters (with kubeconfig), commands run locally.
+ For provisioned clusters, commands run via SSH on the control-plane node.
+
+ If `command` starts with 'helm ', it is treated as a helm command and
+ the KUBECONFIG env var is set instead of prefixing with 'kubectl'.
+ """
+ is_helm = command.strip().startswith("helm ")
+
+ if profile.kubeconfig_content:
+ # Write kubeconfig to a file and run locally
+ kubeconfig_path = os.path.join(
+ config.DATA_DIR, "kubeconfigs", f"{profile.name}.kubeconfig"
+ )
+ os.makedirs(os.path.dirname(kubeconfig_path), exist_ok=True)
+ with open(kubeconfig_path, "w") as f:
+ f.write(profile.kubeconfig_content)
+
+ if is_helm:
+ full_cmd = f"KUBECONFIG={kubeconfig_path} {command}"
+ else:
+ full_cmd = f"kubectl --kubeconfig={kubeconfig_path} {command}"
+ try:
+ proc = subprocess.run(
+ full_cmd,
+ shell=True,
+ capture_output=True,
+ text=True,
+ timeout=timeout,
+ )
+ return SSHResult(
+ hostname="local (kubeconfig)",
+ command=full_cmd,
+ return_code=proc.returncode,
+ stdout=proc.stdout,
+ stderr=proc.stderr,
+ success=proc.returncode == 0,
+ )
+ except subprocess.TimeoutExpired:
+ return SSHResult(
+ hostname="local (kubeconfig)",
+ command=full_cmd,
+ return_code=-1,
+ stdout="",
+ stderr=f"Command timed out after {timeout}s",
+ success=False,
+ )
+ except Exception as exc:
+ return SSHResult(
+ hostname="local (kubeconfig)",
+ command=full_cmd,
+ return_code=-1,
+ stdout="",
+ stderr=str(exc),
+ success=False,
+ )
+ else:
+ # Provisioned cluster — SSH to control-plane
+ cp_nodes = profile.get_control_plane_nodes()
+ if not cp_nodes:
+ return SSHResult(
+ hostname="N/A",
+ command=command,
+ return_code=-1,
+ stdout="",
+ stderr="No control-plane node defined",
+ success=False,
+ )
+ cp = cp_nodes[0]
+ remote_cmd = command if is_helm else f"kubectl {command}"
+ return run_ssh_command(
+ ip_address=cp["ip_address"],
+ command=remote_cmd,
+ ssh_user=cp.get("ssh_user", "root"),
+ ssh_port=cp.get("ssh_port", 22),
+ ssh_key_path=cp.get("ssh_key_path", "~/.ssh/id_rsa"),
+ timeout=timeout,
+ )
+
+
def upload_flannel_manifest_to_node(node: dict, local_path: str) -> SSHResult:
"""SCP a user-provided Flannel manifest to a node as /tmp/kube-flannel-custom.yml."""
scp_cmd = [
diff --git a/k8s-agent/modules/cluster_debugger.py b/k8s-agent/modules/cluster_debugger.py
index b6798cc..390e6ed 100644
--- a/k8s-agent/modules/cluster_debugger.py
+++ b/k8s-agent/modules/cluster_debugger.py
@@ -1,11 +1,36 @@
-"""Cluster Debugger — Diagnose K8s issues and provide LLM-powered recommendations."""
+"""Cluster Debugger — Diagnose K8s issues and provide LLM-powered recommendations.
+
+Supports both provisioned clusters (SSH-based) and imported clusters (kubeconfig-based).
+"""
+
+import os
+import subprocess
from modules.cluster_creator import run_ssh_command, SSHResult
from modules.profile_manager import ClusterProfile
+import config
# ── Diagnostic command definitions ────────────────────────────────────────
+# kubectl-only commands (work for both imported and provisioned clusters)
+KUBECTL_DIAGNOSTIC_COMMANDS = {
+ "Node Status": "get nodes -o wide",
+ "Pod Status (All Namespaces)": "get pods -A -o wide",
+ "Events (Recent)": "get events -A --sort-by=.lastTimestamp",
+ "Component Status": "get componentstatuses",
+ "System Pods": "-n kube-system get pods -o wide",
+ "Node Resources": "top nodes",
+ "Pod Resources": "top pods -A",
+ "Cluster Info": "cluster-info",
+ "Flannel Status": "-n kube-flannel get pods -o wide",
+ "Network Policies": "get networkpolicies -A",
+ "Services": "get svc -A",
+ "PVCs": "get pvc -A",
+ "Ingresses": "get ingress -A",
+}
+
+# Full SSH commands (backward-compat for provisioned clusters)
DIAGNOSTIC_COMMANDS = {
"Node Status": "kubectl get nodes -o wide",
"Pod Status (All Namespaces)": "kubectl get pods -A -o wide",
@@ -30,6 +55,39 @@
"Certificate Expiry": "kubeadm certs check-expiration 2>/dev/null || echo 'Not a kubeadm node or kubeadm not found'",
}
+
+def _run_local_kubectl(kubeconfig_content: str, kubectl_args: str, timeout: int = 60) -> SSHResult:
+ """Run a kubectl command locally using the given kubeconfig content."""
+ kubeconfig_path = os.path.join(config.DATA_DIR, "kubeconfigs", "_debug_temp.kubeconfig")
+ os.makedirs(os.path.dirname(kubeconfig_path), exist_ok=True)
+ with open(kubeconfig_path, "w") as f:
+ f.write(kubeconfig_content)
+ full_cmd = f"kubectl --kubeconfig={kubeconfig_path} {kubectl_args}"
+ try:
+ proc = subprocess.run(full_cmd, shell=True, capture_output=True, text=True, timeout=timeout)
+ return SSHResult(
+ hostname="local", command=full_cmd, return_code=proc.returncode,
+ stdout=proc.stdout, stderr=proc.stderr, success=proc.returncode == 0,
+ )
+ except subprocess.TimeoutExpired:
+ return SSHResult(
+ hostname="local", command=full_cmd, return_code=-1,
+ stdout="", stderr=f"Command timed out after {timeout}s", success=False,
+ )
+ except Exception as e:
+ return SSHResult(
+ hostname="local", command=full_cmd, return_code=-1,
+ stdout="", stderr=str(e), success=False,
+ )
+
+
+def get_available_commands(profile: ClusterProfile) -> dict[str, str]:
+ """Return available diagnostic commands based on cluster source."""
+ if profile.cluster_source == "imported":
+ return dict(KUBECTL_DIAGNOSTIC_COMMANDS)
+ return dict(DIAGNOSTIC_COMMANDS)
+
+
CATEGORY_MAP = {
"Cluster Overview": [
"Node Status",
@@ -73,10 +131,33 @@
def run_diagnostic(
- control_plane_node: dict,
+ control_plane_node: dict | None,
command_name: str,
+ profile: ClusterProfile | None = None,
) -> SSHResult:
- """Run a single diagnostic command on the control-plane node."""
+ """Run a single diagnostic command.
+
+ For imported clusters, uses kubectl locally with kubeconfig.
+ For provisioned clusters, uses SSH to the control-plane node.
+ """
+ # Imported cluster path
+ if profile and profile.cluster_source == "imported" and profile.kubeconfig_content:
+ kubectl_args = KUBECTL_DIAGNOSTIC_COMMANDS.get(command_name)
+ if kubectl_args is None:
+ return SSHResult(
+ hostname="local", command=command_name, return_code=1,
+ stdout="",
+ stderr=f"Command '{command_name}' requires SSH (not available for imported clusters).",
+ success=False,
+ )
+ return _run_local_kubectl(profile.kubeconfig_content, kubectl_args, timeout=60)
+
+ # Provisioned cluster path (SSH)
+ if not control_plane_node:
+ return SSHResult(
+ hostname="unknown", command=command_name, return_code=1,
+ stdout="", stderr="No control-plane node available.", success=False,
+ )
command = DIAGNOSTIC_COMMANDS.get(command_name)
if not command:
return SSHResult(
@@ -98,30 +179,47 @@ def run_diagnostic(
def run_category_diagnostics(
- control_plane_node: dict,
+ control_plane_node: dict | None,
category: str,
+ profile: ClusterProfile | None = None,
) -> dict[str, SSHResult]:
"""Run all diagnostic commands for a given category."""
results = {}
command_names = CATEGORY_MAP.get(category, [])
for name in command_names:
- results[name] = run_diagnostic(control_plane_node, name)
+ results[name] = run_diagnostic(control_plane_node, name, profile=profile)
return results
-def run_all_diagnostics(control_plane_node: dict) -> dict[str, SSHResult]:
+def run_all_diagnostics(
+ control_plane_node: dict | None,
+ profile: ClusterProfile | None = None,
+) -> dict[str, SSHResult]:
"""Run every diagnostic command."""
+ commands = get_available_commands(profile) if profile else DIAGNOSTIC_COMMANDS
results = {}
- for name in DIAGNOSTIC_COMMANDS:
- results[name] = run_diagnostic(control_plane_node, name)
+ for name in commands:
+ results[name] = run_diagnostic(control_plane_node, name, profile=profile)
return results
def run_custom_command(
- control_plane_node: dict,
+ control_plane_node: dict | None,
command: str,
+ profile: ClusterProfile | None = None,
) -> SSHResult:
- """Run a custom command on the control-plane node."""
+ """Run a custom command. For imported clusters, runs kubectl locally."""
+ if profile and profile.cluster_source == "imported" and profile.kubeconfig_content:
+ cmd = command.strip()
+ if cmd.startswith("kubectl "):
+ cmd = cmd[len("kubectl "):]
+ return _run_local_kubectl(profile.kubeconfig_content, cmd, timeout=60)
+
+ if not control_plane_node:
+ return SSHResult(
+ hostname="unknown", command=command, return_code=1,
+ stdout="", stderr="No control-plane node available.", success=False,
+ )
return run_ssh_command(
ip_address=control_plane_node["ip_address"],
command=command,
@@ -211,9 +309,27 @@ def get_debug_suggestion(
return query_llm(prompt)
-def check_pod_issues(control_plane_node: dict, namespace: str = "") -> SSHResult:
+def check_pod_issues(
+ control_plane_node: dict | None,
+ namespace: str = "",
+ profile: ClusterProfile | None = None,
+) -> SSHResult:
"""Check for pods in non-running states."""
ns_flag = f"-n {namespace}" if namespace else "-A"
+
+ if profile and profile.cluster_source == "imported" and profile.kubeconfig_content:
+ kubectl_args = (
+ f"get pods {ns_flag} "
+ "--field-selector=status.phase!=Running,status.phase!=Succeeded -o wide"
+ )
+ return _run_local_kubectl(profile.kubeconfig_content, kubectl_args, timeout=60)
+
+ if not control_plane_node:
+ return SSHResult(
+ hostname="unknown", command="check_pod_issues", return_code=1,
+ stdout="", stderr="No control-plane node available.", success=False,
+ )
+
command = (
f"kubectl get pods {ns_flag} --field-selector="
"'status.phase!=Running,status.phase!=Succeeded' -o wide 2>/dev/null; "
diff --git a/k8s-agent/modules/log_analyzer.py b/k8s-agent/modules/log_analyzer.py
index fea9438..c58c86d 100644
--- a/k8s-agent/modules/log_analyzer.py
+++ b/k8s-agent/modules/log_analyzer.py
@@ -1,11 +1,62 @@
-"""Log Analyzer — Kubernetes log collection, parsing, error correlation, and analysis."""
+"""Log Analyzer — Kubernetes log collection, parsing, error correlation, and analysis.
+Supports both provisioned clusters (SSH-based) and imported clusters (kubeconfig-based).
+"""
+
+import os
import re
+import subprocess
from collections import Counter
from dataclasses import dataclass, field
from typing import Optional
from modules.cluster_creator import run_ssh_command, SSHResult
+from modules.profile_manager import ClusterProfile
+import config
+
+
+def _run_local_shell(kubeconfig_content: str, command: str, timeout: int = 60) -> SSHResult:
+ """Run a shell command locally with KUBECONFIG set from profile content."""
+ kubeconfig_path = os.path.join(config.DATA_DIR, "kubeconfigs", "_log_temp.kubeconfig")
+ os.makedirs(os.path.dirname(kubeconfig_path), exist_ok=True)
+ with open(kubeconfig_path, "w") as f:
+ f.write(kubeconfig_content)
+ env = dict(os.environ, KUBECONFIG=kubeconfig_path)
+ try:
+ proc = subprocess.run(command, shell=True, capture_output=True, text=True, timeout=timeout, env=env)
+ return SSHResult(
+ hostname="local", command=command, return_code=proc.returncode,
+ stdout=proc.stdout, stderr=proc.stderr, success=proc.returncode == 0,
+ )
+ except subprocess.TimeoutExpired:
+ return SSHResult(
+ hostname="local", command=command, return_code=-1,
+ stdout="", stderr=f"Command timed out after {timeout}s", success=False,
+ )
+ except Exception as e:
+ return SSHResult(
+ hostname="local", command=command, return_code=-1,
+ stdout="", stderr=str(e), success=False,
+ )
+
+
+def _run_on_cluster(control_plane_node: dict | None, command: str, profile: ClusterProfile | None = None, timeout: int = 60) -> SSHResult:
+ """Route command to local shell or SSH based on cluster source."""
+ if profile and profile.cluster_source == "imported" and profile.kubeconfig_content:
+ return _run_local_shell(profile.kubeconfig_content, command, timeout=timeout)
+ if not control_plane_node:
+ return SSHResult(
+ hostname="unknown", command=command, return_code=1,
+ stdout="", stderr="No control-plane node available.", success=False,
+ )
+ return run_ssh_command(
+ ip_address=control_plane_node["ip_address"],
+ command=command,
+ ssh_user=control_plane_node.get("ssh_user", "root"),
+ ssh_port=control_plane_node.get("ssh_port", 22),
+ ssh_key_path=control_plane_node.get("ssh_key_path", "~/.ssh/id_rsa"),
+ timeout=timeout,
+ )
@dataclass
@@ -34,6 +85,9 @@ class LogAnalysisResult:
# ── Log collection commands ───────────────────────────────────────────────
+# SSH-only sources (journalctl requires node access)
+SSH_ONLY_LOG_SOURCES = {"Kubelet", "CRI-O"}
+
LOG_SOURCES = {
"Kubelet": "journalctl -u kubelet --no-pager -n {lines} --since '{since}'",
"CRI-O": "journalctl -u crio --no-pager -n {lines} --since '{since}'",
@@ -46,22 +100,39 @@ class LogAnalysisResult:
"Events": "kubectl get events -A --sort-by='.lastTimestamp' | tail -{lines}",
}
+def get_available_log_sources(profile: ClusterProfile | None = None) -> list[str]:
+ """Return log sources available for the given cluster type."""
+ if profile and profile.cluster_source == "imported":
+ return [s for s in LOG_SOURCES if s not in SSH_ONLY_LOG_SOURCES]
+ return list(LOG_SOURCES.keys())
+
+
POD_LOG_COMMAND = "kubectl logs {pod_ref} --tail={lines} --since={since_k8s} {container_flag}"
POD_PREVIOUS_LOG_COMMAND = "kubectl logs {pod_ref} --previous --tail={lines} {container_flag} 2>/dev/null || echo 'No previous logs available'"
def collect_logs(
- control_plane_node: dict,
+ control_plane_node: dict | None,
source: str,
lines: int = 200,
since: str = "1 hour ago",
since_k8s: str = "1h",
+ profile: ClusterProfile | None = None,
) -> SSHResult:
"""Collect logs from a specific source on the cluster."""
+ # Block SSH-only sources for imported clusters
+ if profile and profile.cluster_source == "imported" and source in SSH_ONLY_LOG_SOURCES:
+ return SSHResult(
+ hostname="local", command=source, return_code=1,
+ stdout="",
+ stderr=f"'{source}' logs require SSH access (not available for imported clusters).",
+ success=False,
+ )
+
cmd_template = LOG_SOURCES.get(source)
if not cmd_template:
return SSHResult(
- hostname=control_plane_node["ip_address"],
+ hostname=control_plane_node["ip_address"] if control_plane_node else "local",
command=source,
return_code=1,
stdout="",
@@ -75,24 +146,18 @@ def collect_logs(
since_k8s=since_k8s,
)
- return run_ssh_command(
- ip_address=control_plane_node["ip_address"],
- command=command,
- ssh_user=control_plane_node.get("ssh_user", "root"),
- ssh_port=control_plane_node.get("ssh_port", 22),
- ssh_key_path=control_plane_node.get("ssh_key_path", "~/.ssh/id_rsa"),
- timeout=60,
- )
+ return _run_on_cluster(control_plane_node, command, profile=profile, timeout=60)
def collect_pod_logs(
- control_plane_node: dict,
+ control_plane_node: dict | None,
namespace: str,
pod_name: str,
container: str = "",
lines: int = 200,
since_k8s: str = "1h",
previous: bool = False,
+ profile: ClusterProfile | None = None,
) -> SSHResult:
"""Collect logs from a specific pod."""
pod_ref = f"-n {namespace} {pod_name}"
@@ -112,28 +177,22 @@ def collect_pod_logs(
container_flag=container_flag,
)
- return run_ssh_command(
- ip_address=control_plane_node["ip_address"],
- command=command,
- ssh_user=control_plane_node.get("ssh_user", "root"),
- ssh_port=control_plane_node.get("ssh_port", 22),
- ssh_key_path=control_plane_node.get("ssh_key_path", "~/.ssh/id_rsa"),
- timeout=60,
- )
+ return _run_on_cluster(control_plane_node, command, profile=profile, timeout=60)
def collect_multi_source_logs(
- control_plane_node: dict,
+ control_plane_node: dict | None,
sources: list[str],
lines: int = 100,
since: str = "1 hour ago",
since_k8s: str = "1h",
+ profile: ClusterProfile | None = None,
) -> dict[str, SSHResult]:
"""Collect logs from multiple sources."""
results = {}
for source in sources:
results[source] = collect_logs(
- control_plane_node, source, lines, since, since_k8s
+ control_plane_node, source, lines, since, since_k8s, profile=profile
)
return results
@@ -338,17 +397,11 @@ def llm_correlate_analysis(
def get_pod_list(
- control_plane_node: dict,
+ control_plane_node: dict | None,
namespace: str = "",
+ profile: ClusterProfile | None = None,
) -> SSHResult:
"""Get list of pods for the log analysis UI."""
ns_flag = f"-n {namespace}" if namespace else "-A"
command = f"kubectl get pods {ns_flag} -o custom-columns='NAMESPACE:.metadata.namespace,NAME:.metadata.name,STATUS:.status.phase,CONTAINERS:.spec.containers[*].name' --no-headers"
- return run_ssh_command(
- ip_address=control_plane_node["ip_address"],
- command=command,
- ssh_user=control_plane_node.get("ssh_user", "root"),
- ssh_port=control_plane_node.get("ssh_port", 22),
- ssh_key_path=control_plane_node.get("ssh_key_path", "~/.ssh/id_rsa"),
- timeout=30,
- )
+ return _run_on_cluster(control_plane_node, command, profile=profile, timeout=30)
diff --git a/k8s-agent/modules/monitoring_setup.py b/k8s-agent/modules/monitoring_setup.py
index a2272d7..3df9ba4 100644
--- a/k8s-agent/modules/monitoring_setup.py
+++ b/k8s-agent/modules/monitoring_setup.py
@@ -1,7 +1,58 @@
-"""Monitoring Setup — Prometheus, Grafana, and dashboard provisioning via SSH."""
+"""Monitoring Setup — Prometheus, Grafana, and dashboard provisioning.
+
+Supports both provisioned clusters (SSH-based) and imported clusters (kubeconfig-based).
+"""
+
+import os
+import subprocess
from modules.cluster_creator import run_ssh_command, SSHResult
from modules.profile_manager import ClusterProfile
+import config
+
+
+def _run_local_shell(kubeconfig_content: str, command: str, timeout: int = 120) -> SSHResult:
+ """Run a shell command locally with KUBECONFIG set from profile content."""
+ kubeconfig_path = os.path.join(config.DATA_DIR, "kubeconfigs", "_monitor_temp.kubeconfig")
+ os.makedirs(os.path.dirname(kubeconfig_path), exist_ok=True)
+ with open(kubeconfig_path, "w") as f:
+ f.write(kubeconfig_content)
+ env = dict(os.environ, KUBECONFIG=kubeconfig_path)
+ try:
+ proc = subprocess.run(command, shell=True, capture_output=True, text=True, timeout=timeout, env=env)
+ return SSHResult(
+ hostname="local", command=command, return_code=proc.returncode,
+ stdout=proc.stdout, stderr=proc.stderr, success=proc.returncode == 0,
+ )
+ except subprocess.TimeoutExpired:
+ return SSHResult(
+ hostname="local", command=command, return_code=-1,
+ stdout="", stderr=f"Command timed out after {timeout}s", success=False,
+ )
+ except Exception as e:
+ return SSHResult(
+ hostname="local", command=command, return_code=-1,
+ stdout="", stderr=str(e), success=False,
+ )
+
+
+def _run_on_cluster(control_plane_node: dict | None, command: str, profile: ClusterProfile | None = None, timeout: int = 120) -> SSHResult:
+ """Route command to local kubectl or SSH based on cluster source."""
+ if profile and profile.cluster_source == "imported" and profile.kubeconfig_content:
+ return _run_local_shell(profile.kubeconfig_content, command, timeout=timeout)
+ if not control_plane_node:
+ return SSHResult(
+ hostname="unknown", command=command, return_code=1,
+ stdout="", stderr="No control-plane node available.", success=False,
+ )
+ return run_ssh_command(
+ ip_address=control_plane_node["ip_address"],
+ command=command,
+ ssh_user=control_plane_node.get("ssh_user", "root"),
+ ssh_port=control_plane_node.get("ssh_port", 22),
+ ssh_key_path=control_plane_node.get("ssh_key_path", "~/.ssh/id_rsa"),
+ timeout=timeout,
+ )
def generate_helm_install_script() -> str:
@@ -324,67 +375,44 @@ def generate_alerting_rules_script(namespace: str = "monitoring") -> str:
"""
-def install_helm(control_plane_node: dict) -> SSHResult:
- """Install Helm on the control-plane node."""
- return run_ssh_command(
- ip_address=control_plane_node["ip_address"],
- command=generate_helm_install_script(),
- ssh_user=control_plane_node.get("ssh_user", "root"),
- ssh_port=control_plane_node.get("ssh_port", 22),
- ssh_key_path=control_plane_node.get("ssh_key_path", "~/.ssh/id_rsa"),
- timeout=120,
- )
+def install_helm(control_plane_node: dict | None = None, profile: ClusterProfile | None = None) -> SSHResult:
+ """Install Helm on the control-plane node or locally for imported clusters."""
+ return _run_on_cluster(control_plane_node, generate_helm_install_script(), profile=profile, timeout=120)
def install_prometheus_stack(
- control_plane_node: dict,
+ control_plane_node: dict | None = None,
namespace: str = "monitoring",
+ profile: ClusterProfile | None = None,
) -> SSHResult:
"""Install the full kube-prometheus-stack."""
- return run_ssh_command(
- ip_address=control_plane_node["ip_address"],
- command=generate_prometheus_install_script(namespace),
- ssh_user=control_plane_node.get("ssh_user", "root"),
- ssh_port=control_plane_node.get("ssh_port", 22),
- ssh_key_path=control_plane_node.get("ssh_key_path", "~/.ssh/id_rsa"),
- timeout=900,
- )
+ return _run_on_cluster(control_plane_node, generate_prometheus_install_script(namespace), profile=profile, timeout=900)
def install_dashboards(
- control_plane_node: dict,
- dashboard_keys: list[str],
+ control_plane_node: dict | None = None,
+ dashboard_keys: list[str] | None = None,
namespace: str = "monitoring",
+ profile: ClusterProfile | None = None,
) -> SSHResult:
"""Import selected Grafana dashboards."""
- return run_ssh_command(
- ip_address=control_plane_node["ip_address"],
- command=generate_dashboard_import_script(dashboard_keys, namespace),
- ssh_user=control_plane_node.get("ssh_user", "root"),
- ssh_port=control_plane_node.get("ssh_port", 22),
- ssh_key_path=control_plane_node.get("ssh_key_path", "~/.ssh/id_rsa"),
- timeout=300,
- )
+ dashboard_keys = dashboard_keys or []
+ return _run_on_cluster(control_plane_node, generate_dashboard_import_script(dashboard_keys, namespace), profile=profile, timeout=300)
def install_alert_rules(
- control_plane_node: dict,
+ control_plane_node: dict | None = None,
namespace: str = "monitoring",
+ profile: ClusterProfile | None = None,
) -> SSHResult:
"""Install Prometheus alerting rules."""
- return run_ssh_command(
- ip_address=control_plane_node["ip_address"],
- command=generate_alerting_rules_script(namespace),
- ssh_user=control_plane_node.get("ssh_user", "root"),
- ssh_port=control_plane_node.get("ssh_port", 22),
- ssh_key_path=control_plane_node.get("ssh_key_path", "~/.ssh/id_rsa"),
- timeout=60,
- )
+ return _run_on_cluster(control_plane_node, generate_alerting_rules_script(namespace), profile=profile, timeout=60)
def get_monitoring_status(
- control_plane_node: dict,
+ control_plane_node: dict | None = None,
namespace: str = "monitoring",
+ profile: ClusterProfile | None = None,
) -> SSHResult:
"""Check the status of the monitoring stack."""
command = f"""
@@ -405,14 +433,7 @@ def get_monitoring_status(
echo ">> ServiceMonitors:"
kubectl -n {namespace} get servicemonitors 2>/dev/null || echo "No ServiceMonitors found"
"""
- return run_ssh_command(
- ip_address=control_plane_node["ip_address"],
- command=command,
- ssh_user=control_plane_node.get("ssh_user", "root"),
- ssh_port=control_plane_node.get("ssh_port", 22),
- ssh_key_path=control_plane_node.get("ssh_key_path", "~/.ssh/id_rsa"),
- timeout=30,
- )
+ return _run_on_cluster(control_plane_node, command, profile=profile, timeout=30)
def get_monitoring_advice(
diff --git a/k8s-agent/modules/profile_manager.py b/k8s-agent/modules/profile_manager.py
index 90cd9bf..d5707cd 100644
--- a/k8s-agent/modules/profile_manager.py
+++ b/k8s-agent/modules/profile_manager.py
@@ -54,6 +54,9 @@ class ClusterProfile:
# Offline manifest paths — user-provided files for air-gapped environments
flannel_manifest_path: str = "" # local path to kube-flannel.yml
prometheus_manifest_path: str = "" # local path to prometheus manifest
+ # Kubeconfig for existing clusters (imported, not provisioned)
+ kubeconfig_content: str = "" # raw kubeconfig YAML content
+ cluster_source: str = "provisioned" # "provisioned" or "imported"
def get_control_plane_nodes(self) -> list[dict]:
return [n for n in self.nodes if n.get("role") == "control-plane"]
From fc3b900ceccbbec2004e5683341fe8dc2d2d5558 Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Mon, 6 Apr 2026 12:57:16 +0000
Subject: [PATCH 09/31] Fix kubeconfig import: move file_uploader outside
st.form to prevent silent reset on submit
---
k8s-agent/app.py | 96 ++++++++++++++++++++++++------------------------
1 file changed, 49 insertions(+), 47 deletions(-)
diff --git a/k8s-agent/app.py b/k8s-agent/app.py
index 0000017..d2d0bd0 100644
--- a/k8s-agent/app.py
+++ b/k8s-agent/app.py
@@ -464,54 +464,56 @@ def page_profile_manager():
"without provisioning a new cluster."
)
- with st.form("import_cluster_form"):
- import_name = st.text_input(
- "Profile Name *",
- placeholder="my-existing-cluster",
- )
- import_desc = st.text_area(
- "Description",
- placeholder="Production cluster running in datacenter A",
- )
- kubeconfig_file = st.file_uploader(
- "Upload kubeconfig file",
- type=["yaml", "yml", "conf", "config"],
- key="kubeconfig_upload",
- help="Usually found at ~/.kube/config on your cluster's control-plane node.",
- )
- k8s_ver = st.text_input(
- "Kubernetes Version (optional)",
- placeholder="1.30",
- value="1.30",
- )
-
- submitted_import = st.form_submit_button(
- "Import Cluster", type="primary", use_container_width=True,
- )
+ # NOTE: file_uploader is kept OUTSIDE st.form because Streamlit
+ # resets the uploaded file on form submission, causing the import
+ # to silently do nothing.
+ import_name = st.text_input(
+ "Profile Name *",
+ placeholder="my-existing-cluster",
+ key="import_cluster_name",
+ )
+ import_desc = st.text_area(
+ "Description",
+ placeholder="Production cluster running in datacenter A",
+ key="import_cluster_desc",
+ )
+ kubeconfig_file = st.file_uploader(
+ "Upload kubeconfig file",
+ type=["yaml", "yml", "conf", "config", "txt"],
+ key="kubeconfig_upload",
+ help="Usually found at ~/.kube/config on your cluster's control-plane node. "
+ "If your file has no extension, rename it to config.yaml or config.txt before uploading.",
+ )
+ k8s_ver = st.text_input(
+ "Kubernetes Version (optional)",
+ placeholder="1.30",
+ value="1.30",
+ key="import_cluster_k8s_ver",
+ )
- if submitted_import:
- if not import_name:
- st.error("Profile name is required.")
- elif not kubeconfig_file:
- st.error("Please upload a kubeconfig file.")
- else:
- kubeconfig_content = kubeconfig_file.read().decode("utf-8")
- profile = ClusterProfile(
- name=import_name,
- description=import_desc,
- kubernetes_version=k8s_ver or "1.30",
- status="active",
- cluster_source="imported",
- kubeconfig_content=kubeconfig_content,
- )
- save_profile(profile)
- st.session_state.active_profile = import_name
- st.success(
- f"Cluster '{import_name}' imported! "
- "Select it from the sidebar to start using Debugger, Monitoring, "
- "Resource Viewer, etc."
- )
- st.rerun()
+ if st.button("Import Cluster", type="primary", use_container_width=True, key="import_cluster_btn"):
+ if not import_name:
+ st.error("Profile name is required.")
+ elif not kubeconfig_file:
+ st.error("Please upload a kubeconfig file.")
+ else:
+ kubeconfig_content = kubeconfig_file.read().decode("utf-8")
+ profile = ClusterProfile(
+ name=import_name,
+ description=import_desc,
+ kubernetes_version=k8s_ver or "1.30",
+ status="imported",
+ cluster_source="imported",
+ kubeconfig_content=kubeconfig_content,
+ )
+ save_profile(profile)
+ st.session_state.active_profile = import_name
+ st.success(
+ f"Cluster '{import_name}' imported! "
+ "Select it from the sidebar to start using Debugger, Monitoring, "
+ "Resource Viewer, etc."
+ )
+ st.rerun()
# ── Manage Profiles ───────────────────────────────────────────────────
with tab_list:
From d68e2699bb16ddda8a838f7ac9fbf243c237518e Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Mon, 6 Apr 2026 13:04:09 +0000
Subject: [PATCH 10/31] Add kubectl path detection and namespace auto-fetch for
imported clusters
---
k8s-agent/app.py | 138 ++++++++++++++++++++++----
k8s-agent/config.py | 77 ++++++++++++++
k8s-agent/modules/cluster_debugger.py | 20 +++-
k8s-agent/modules/log_analyzer.py | 29 +++++-
k8s-agent/modules/monitoring_setup.py | 29 +++++-
5 files changed, 265 insertions(+), 28 deletions(-)
diff --git a/k8s-agent/app.py b/k8s-agent/app.py
index d2d0bd0..b2342e6 100644
--- a/k8s-agent/app.py
+++ b/k8s-agent/app.py
@@ -11,7 +11,7 @@
# Navigation uses native st.radio — no third-party component needed.
import config
-from config import is_llm_configured
+from config import is_llm_configured, get_kubectl_path, fetch_namespaces, get_kubeconfig_path
from modules.profile_manager import (
ClusterProfile,
save_profile,
@@ -899,6 +899,17 @@ def page_cluster_debugger():
return
cp_node = cp_nodes[0]
+ # kubectl availability check for imported clusters
+ if profile.cluster_source == "imported" and not get_kubectl_path():
+ st.error(
+ "**kubectl not found** on this machine.\n\n"
+ "Install it with:\n```\n"
+ "curl -LO https://dl.k8s.io/release/$(curl -Ls https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl\n"
+ "chmod +x kubectl && sudo mv kubectl /usr/local/bin/\n```\n"
+ "Or see: https://kubernetes.io/docs/tasks/tools/"
+ )
+ return
+
available_commands = get_available_commands(profile)
tab_quick, tab_category, tab_custom, tab_ai = st.tabs([
@@ -1062,7 +1073,29 @@ def page_monitoring_setup():
return
cp_node = cp_nodes[0]
- namespace = st.text_input("Monitoring Namespace", value="monitoring")
+ # kubectl availability check for imported clusters
+ if profile.cluster_source == "imported" and not get_kubectl_path():
+ st.error(
+ "**kubectl not found** on this machine.\n\n"
+ "Install it with:\n```\n"
+ "curl -LO https://dl.k8s.io/release/$(curl -Ls https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl\n"
+ "chmod +x kubectl && sudo mv kubectl /usr/local/bin/\n```\n"
+ "Or see: https://kubernetes.io/docs/tasks/tools/"
+ )
+ return
+
+ # Namespace selection — auto-fetch from cluster for imported clusters
+ if profile.cluster_source == "imported" and profile.kubeconfig_content:
+ cluster_ns = fetch_namespaces(profile.kubeconfig_content)
+ if cluster_ns:
+ # Ensure "monitoring" is an option even if it doesn't exist yet
+ ns_options = cluster_ns if "monitoring" in cluster_ns else cluster_ns + ["monitoring"]
+ default_idx = ns_options.index("monitoring") if "monitoring" in ns_options else 0
+ namespace = st.selectbox("Monitoring Namespace", options=ns_options, index=default_idx, key="mon_ns")
+ else:
+ namespace = st.text_input("Monitoring Namespace", value="monitoring", key="mon_ns_txt")
+ else:
+ namespace = st.text_input("Monitoring Namespace", value="monitoring", key="mon_ns_txt")
tab_install, tab_dashboards, tab_alerts, tab_status, tab_scripts, tab_advice = st.tabs([
"Install Stack",
@@ -1216,6 +1249,22 @@ def page_log_analysis():
return
cp_node = cp_nodes[0]
+ # kubectl availability check for imported clusters
+ if profile.cluster_source == "imported" and not get_kubectl_path():
+ st.error(
+ "**kubectl not found** on this machine.\n\n"
+ "Install it with:\n```\n"
+ "curl -LO https://dl.k8s.io/release/$(curl -Ls https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl\n"
+ "chmod +x kubectl && sudo mv kubectl /usr/local/bin/\n```\n"
+ "Or see: https://kubernetes.io/docs/tasks/tools/"
+ )
+ return
+
+ # Pre-fetch namespaces for imported clusters (used by Pod Logs tab)
+ _cluster_namespaces: list[str] = []
+ if profile.cluster_source == "imported" and profile.kubeconfig_content:
+ _cluster_namespaces = fetch_namespaces(profile.kubeconfig_content)
+
available_log_sources = get_available_log_sources(profile)
tab_system, tab_pod, tab_correlation, tab_ai = st.tabs([
@@ -1285,7 +1334,12 @@ def page_log_analysis():
st.markdown("### Pod Logs")
col1, col2 = st.columns(2)
with col1:
- pod_ns = st.text_input("Namespace", value="default", key="pod_ns")
+ if _cluster_namespaces:
+ pod_ns = st.selectbox("Namespace", options=_cluster_namespaces,
+ index=_cluster_namespaces.index("default") if "default" in _cluster_namespaces else 0,
+ key="pod_ns")
+ else:
+ pod_ns = st.text_input("Namespace", value="default", key="pod_ns")
pod_name = st.text_input("Pod Name", placeholder="my-pod-xyz", key="pod_name_input")
with col2:
container = st.text_input("Container (optional)", key="pod_container")
@@ -1452,6 +1506,22 @@ def page_resource_viewer():
)
return
+ # kubectl availability check for imported clusters
+ if profile.cluster_source == "imported" and not get_kubectl_path():
+ st.error(
+ "**kubectl not found** on this machine.\n\n"
+ "Install it with:\n```\n"
+ "curl -LO https://dl.k8s.io/release/$(curl -Ls https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl\n"
+ "chmod +x kubectl && sudo mv kubectl /usr/local/bin/\n```\n"
+ "Or see: https://kubernetes.io/docs/tasks/tools/"
+ )
+ return
+
+ # Pre-fetch namespaces for imported clusters
+ _rv_namespaces: list[str] = []
+ if profile.cluster_source == "imported" and profile.kubeconfig_content:
+ _rv_namespaces = fetch_namespaces(profile.kubeconfig_content)
+
tab_resources, tab_node_health, tab_rbac, tab_helm, tab_events = st.tabs([
"Cluster Resources",
"Node Health",
@@ -1481,7 +1551,14 @@ def page_resource_viewer():
key="res_ns_choice",
)
if ns_choice == "Specific":
- namespace = st.text_input("Namespace", value="default", key="res_ns")
+ if _rv_namespaces:
+ namespace = st.selectbox(
+ "Namespace", options=_rv_namespaces,
+ index=_rv_namespaces.index("default") if "default" in _rv_namespaces else 0,
+ key="res_ns",
+ )
+ else:
+ namespace = st.text_input("Namespace", value="default", key="res_ns")
else:
namespace = ""
else:
@@ -1528,11 +1605,19 @@ def page_resource_viewer():
key="desc_name",
)
with desc_col2:
- desc_ns = st.text_input(
- "Namespace (if applicable)",
- value="default",
- key="desc_ns",
- )
+ if _rv_namespaces:
+ desc_ns = st.selectbox(
+ "Namespace (if applicable)",
+ options=_rv_namespaces,
+ index=_rv_namespaces.index("default") if "default" in _rv_namespaces else 0,
+ key="desc_ns",
+ )
+ else:
+ desc_ns = st.text_input(
+ "Namespace (if applicable)",
+ value="default",
+ key="desc_ns",
+ )
if st.button("Describe", key="describe_res") and desc_name:
# Determine the singular resource type for describe
@@ -1638,12 +1723,21 @@ def page_resource_viewer():
rbac_ns = ""
if "(namespaced)" in rbac_type or rbac_type == "ServiceAccounts":
- rbac_ns = st.text_input(
- "Namespace",
- value="default",
- key="rbac_ns",
- help="Leave blank for all namespaces",
- )
+ if _rv_namespaces:
+ rbac_ns = st.selectbox(
+ "Namespace (blank = all)",
+ options=[""] + _rv_namespaces,
+ index=0,
+ key="rbac_ns",
+ format_func=lambda x: "All Namespaces" if x == "" else x,
+ )
+ else:
+ rbac_ns = st.text_input(
+ "Namespace",
+ value="default",
+ key="rbac_ns",
+ help="Leave blank for all namespaces",
+ )
if st.button("Fetch RBAC Resources", type="primary", key="fetch_rbac"):
cmd_map = {
@@ -1708,7 +1802,12 @@ def page_resource_viewer():
helm_ns_all = st.checkbox("All namespaces", value=True, key="helm_ns_all")
helm_ns = ""
if not helm_ns_all:
- helm_ns = st.text_input("Namespace", value="default", key="helm_ns")
+ if _rv_namespaces:
+ helm_ns = st.selectbox("Namespace", options=_rv_namespaces,
+ index=_rv_namespaces.index("default") if "default" in _rv_namespaces else 0,
+ key="helm_ns")
+ else:
+ helm_ns = st.text_input("Namespace", value="default", key="helm_ns")
if st.button("List Helm Releases", type="primary", key="helm_list"):
helm_cmd = "helm list"
@@ -1733,7 +1832,12 @@ def page_resource_viewer():
helm_release_name = st.text_input("Release Name", placeholder="my-release", key="helm_rel")
helm_chart = st.text_input("Chart", placeholder="prometheus-community/kube-prometheus-stack", key="helm_chart")
with hcol2:
- helm_install_ns = st.text_input("Namespace", value="default", key="helm_install_ns")
+ if _rv_namespaces:
+ helm_install_ns = st.selectbox("Namespace", options=_rv_namespaces,
+ index=_rv_namespaces.index("default") if "default" in _rv_namespaces else 0,
+ key="helm_install_ns")
+ else:
+ helm_install_ns = st.text_input("Namespace", value="default", key="helm_install_ns")
helm_create_ns = st.checkbox("Create namespace if not exists", value=True, key="helm_create_ns")
helm_values = st.text_area(
"Values (YAML, optional)",
diff --git a/k8s-agent/config.py b/k8s-agent/config.py
index 14fb427..2317be1 100644
--- a/k8s-agent/config.py
+++ b/k8s-agent/config.py
@@ -1,6 +1,9 @@
"""Configuration for the K8s Agent application."""
import os
+import shutil
+import subprocess
+
# LLM Configuration
LLM_API_URL = os.getenv(
@@ -27,3 +30,77 @@ def is_llm_configured() -> bool:
# Ensure directories exist
os.makedirs(PROFILES_DIR, exist_ok=True)
os.makedirs(UPLOADS_DIR, exist_ok=True)
+
+
+# ── kubectl / helm path detection ─────────────────────────────────────────
+
+# Common install locations to check when kubectl/helm are not in PATH
+_KUBECTL_SEARCH_PATHS = [
+ "/usr/local/bin/kubectl",
+ "/usr/bin/kubectl",
+ "/snap/bin/kubectl",
+ os.path.expanduser("~/.local/bin/kubectl"),
+ os.path.expanduser("~/bin/kubectl"),
+ "/opt/bin/kubectl",
+]
+
+_HELM_SEARCH_PATHS = [
+ "/usr/local/bin/helm",
+ "/usr/bin/helm",
+ "/snap/bin/helm",
+ os.path.expanduser("~/.local/bin/helm"),
+ os.path.expanduser("~/bin/helm"),
+ "/opt/bin/helm",
+]
+
+
+def _find_binary(name: str, search_paths: list[str]) -> str:
+ """Find a binary by name, checking PATH first then common locations."""
+ found = shutil.which(name)
+ if found:
+ return found
+ for path in search_paths:
+ if os.path.isfile(path) and os.access(path, os.X_OK):
+ return path
+ return ""
+
+
+def get_kubectl_path() -> str:
+ """Return the full path to kubectl, or empty string if not found."""
+ return _find_binary("kubectl", _KUBECTL_SEARCH_PATHS)
+
+
+def get_helm_path() -> str:
+ """Return the full path to helm, or empty string if not found."""
+ return _find_binary("helm", _HELM_SEARCH_PATHS)
+
+
+def get_kubeconfig_path(profile_name: str = "_temp") -> str:
+ """Return the path where a kubeconfig file should be written for local commands."""
+ kc_dir = os.path.join(DATA_DIR, "kubeconfigs")
+ os.makedirs(kc_dir, exist_ok=True)
+ return os.path.join(kc_dir, f"{profile_name}.kubeconfig")
+
+
+def fetch_namespaces(kubeconfig_content: str) -> list[str]:
+ """Fetch all namespaces from a cluster using kubectl with the given kubeconfig.
+
+ Returns a list of namespace names, or an empty list on failure.
+ """
+ kubectl = get_kubectl_path()
+ if not kubectl:
+ return []
+ kc_path = get_kubeconfig_path("_ns_fetch")
+ os.makedirs(os.path.dirname(kc_path), exist_ok=True)
+ with open(kc_path, "w") as f:
+ f.write(kubeconfig_content)
+ try:
+ proc = subprocess.run(
+ f"{kubectl} --kubeconfig={kc_path} get namespaces -o jsonpath='{{.items[*].metadata.name}}'",
+ shell=True, capture_output=True, text=True, timeout=15,
+ )
+ if proc.returncode == 0 and proc.stdout.strip():
+ return sorted(proc.stdout.strip().split())
+ return []
+ except Exception:
+ return []
diff --git a/k8s-agent/modules/cluster_debugger.py b/k8s-agent/modules/cluster_debugger.py
index 390e6ed..45feb91 100644
--- a/k8s-agent/modules/cluster_debugger.py
+++ b/k8s-agent/modules/cluster_debugger.py
@@ -58,11 +58,25 @@
def _run_local_kubectl(kubeconfig_content: str, kubectl_args: str, timeout: int = 60) -> SSHResult:
"""Run a kubectl command locally using the given kubeconfig content."""
- kubeconfig_path = os.path.join(config.DATA_DIR, "kubeconfigs", "_debug_temp.kubeconfig")
- os.makedirs(os.path.dirname(kubeconfig_path), exist_ok=True)
+ kubectl = config.get_kubectl_path()
+ if not kubectl:
+ return SSHResult(
+ hostname="local", command="kubectl " + kubectl_args, return_code=1,
+ stdout="",
+ stderr=(
+ "kubectl not found on this machine.\n\n"
+ "Install kubectl:\n"
+ " curl -LO https://dl.k8s.io/release/$(curl -Ls https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl\n"
+ " chmod +x kubectl && sudo mv kubectl /usr/local/bin/\n\n"
+ "Or on macOS: brew install kubectl\n"
+ "Or see: https://kubernetes.io/docs/tasks/tools/"
+ ),
+ success=False,
+ )
+ kubeconfig_path = config.get_kubeconfig_path("_debug_temp")
with open(kubeconfig_path, "w") as f:
f.write(kubeconfig_content)
- full_cmd = f"kubectl --kubeconfig={kubeconfig_path} {kubectl_args}"
+ full_cmd = f"{kubectl} --kubeconfig={kubeconfig_path} {kubectl_args}"
try:
proc = subprocess.run(full_cmd, shell=True, capture_output=True, text=True, timeout=timeout)
return SSHResult(
diff --git a/k8s-agent/modules/log_analyzer.py b/k8s-agent/modules/log_analyzer.py
index c58c86d..08a25e1 100644
--- a/k8s-agent/modules/log_analyzer.py
+++ b/k8s-agent/modules/log_analyzer.py
@@ -16,14 +16,35 @@
def _run_local_shell(kubeconfig_content: str, command: str, timeout: int = 60) -> SSHResult:
- """Run a shell command locally with KUBECONFIG set from profile content."""
- kubeconfig_path = os.path.join(config.DATA_DIR, "kubeconfigs", "_log_temp.kubeconfig")
- os.makedirs(os.path.dirname(kubeconfig_path), exist_ok=True)
+ """Run a shell command locally with KUBECONFIG set from profile content.
+
+ Replaces bare ``kubectl`` and ``helm`` references with their full paths
+ so the command works even when these binaries are not in $PATH.
+ """
+ kubectl = config.get_kubectl_path()
+ helm = config.get_helm_path()
+ if not kubectl:
+ return SSHResult(
+ hostname="local", command=command, return_code=1,
+ stdout="",
+ stderr=(
+ "kubectl not found on this machine.\n\n"
+ "Install kubectl:\n"
+ " curl -LO https://dl.k8s.io/release/$(curl -Ls https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl\n"
+ " chmod +x kubectl && sudo mv kubectl /usr/local/bin/\n\n"
+ "Or on macOS: brew install kubectl\n"
+ "Or see: https://kubernetes.io/docs/tasks/tools/"
+ ),
+ success=False,
+ )
+ # Replace bare kubectl/helm with full paths
+ resolved_cmd = command.replace("kubectl ", f"{kubectl} ").replace("helm ", f"{helm} " if helm else "helm ")
+ kubeconfig_path = config.get_kubeconfig_path("_log_temp")
with open(kubeconfig_path, "w") as f:
f.write(kubeconfig_content)
env = dict(os.environ, KUBECONFIG=kubeconfig_path)
try:
- proc = subprocess.run(command, shell=True, capture_output=True, text=True, timeout=timeout, env=env)
+ proc = subprocess.run(resolved_cmd, shell=True, capture_output=True, text=True, timeout=timeout, env=env)
return SSHResult(
hostname="local", command=command, return_code=proc.returncode,
stdout=proc.stdout, stderr=proc.stderr, success=proc.returncode == 0,
diff --git a/k8s-agent/modules/monitoring_setup.py b/k8s-agent/modules/monitoring_setup.py
index 3df9ba4..c62be44 100644
--- a/k8s-agent/modules/monitoring_setup.py
+++ b/k8s-agent/modules/monitoring_setup.py
@@ -12,14 +12,35 @@
def _run_local_shell(kubeconfig_content: str, command: str, timeout: int = 120) -> SSHResult:
- """Run a shell command locally with KUBECONFIG set from profile content."""
- kubeconfig_path = os.path.join(config.DATA_DIR, "kubeconfigs", "_monitor_temp.kubeconfig")
- os.makedirs(os.path.dirname(kubeconfig_path), exist_ok=True)
+ """Run a shell command locally with KUBECONFIG set from profile content.
+
+ Replaces bare ``kubectl`` and ``helm`` references with their full paths
+ so the command works even when these binaries are not in $PATH.
+ """
+ kubectl = config.get_kubectl_path()
+ helm = config.get_helm_path()
+ if not kubectl:
+ return SSHResult(
+ hostname="local", command=command, return_code=1,
+ stdout="",
+ stderr=(
+ "kubectl not found on this machine.\n\n"
+ "Install kubectl:\n"
+ " curl -LO https://dl.k8s.io/release/$(curl -Ls https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl\n"
+ " chmod +x kubectl && sudo mv kubectl /usr/local/bin/\n\n"
+ "Or on macOS: brew install kubectl\n"
+ "Or see: https://kubernetes.io/docs/tasks/tools/"
+ ),
+ success=False,
+ )
+ # Replace bare kubectl/helm with full paths
+ resolved_cmd = command.replace("kubectl ", f"{kubectl} ").replace("helm ", f"{helm} " if helm else "helm ")
+ kubeconfig_path = config.get_kubeconfig_path("_monitor_temp")
with open(kubeconfig_path, "w") as f:
f.write(kubeconfig_content)
env = dict(os.environ, KUBECONFIG=kubeconfig_path)
try:
- proc = subprocess.run(command, shell=True, capture_output=True, text=True, timeout=timeout, env=env)
+ proc = subprocess.run(resolved_cmd, shell=True, capture_output=True, text=True, timeout=timeout, env=env)
return SSHResult(
hostname="local", command=command, return_code=proc.returncode,
stdout=proc.stdout, stderr=proc.stderr, success=proc.returncode == 0,
From 07a1b725053b95b746009bcfac7249cb70fa45a3 Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Mon, 6 Apr 2026 13:22:00 +0000
Subject: [PATCH 11/31] Improve kubectl detection: drop os.access check, add
subprocess which fallback
---
k8s-agent/config.py | 28 ++++++++++++++++++++++++++--
1 file changed, 26 insertions(+), 2 deletions(-)
diff --git a/k8s-agent/config.py b/k8s-agent/config.py
index 2317be1..90eb95b 100644
--- a/k8s-agent/config.py
+++ b/k8s-agent/config.py
@@ -55,13 +55,37 @@ def is_llm_configured() -> bool:
def _find_binary(name: str, search_paths: list[str]) -> str:
- """Find a binary by name, checking PATH first then common locations."""
+ """Find a binary by name, checking PATH first then common locations.
+
+ Strategy:
+ 1. ``shutil.which`` — honours $PATH as seen by the Python process.
+ 2. Probe well-known install directories with ``os.path.isfile``.
+ (Skip the ``os.access`` X_OK check because some SELinux / mount
+ configurations report False even though the file *is* executable.)
+ 3. Last resort: ask the OS via ``/usr/bin/which`` in a subprocess,
+ which may see a different PATH than the Python process (e.g. when
+ Streamlit is started through systemd or a virtualenv wrapper).
+ """
+ # 1. shutil.which
found = shutil.which(name)
if found:
return found
+ # 2. well-known paths — only check existence (skip os.access)
for path in search_paths:
- if os.path.isfile(path) and os.access(path, os.X_OK):
+ if os.path.isfile(path):
return path
+ # 3. subprocess fallback — works when shell PATH differs from Python PATH
+ for which_cmd in ("which", "/usr/bin/which", "/bin/which"):
+ try:
+ proc = subprocess.run(
+ f"{which_cmd} {name}",
+ shell=True, capture_output=True, text=True, timeout=5,
+ )
+ result = proc.stdout.strip()
+ if proc.returncode == 0 and result and os.path.isfile(result):
+ return result
+ except Exception:
+ continue
return ""
From b239963413cf2a84bdd9790b17eba3c73dd9f68a Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Tue, 7 Apr 2026 04:47:46 +0000
Subject: [PATCH 12/31] Add metrics install, deployment scaling, pod shell,
resource dropdown, fix disk usage for imported clusters
---
k8s-agent/app.py | 494 +++++++++++++++++++++++---
k8s-agent/modules/cluster_creator.py | 28 +-
k8s-agent/modules/cluster_debugger.py | 1 +
3 files changed, 469 insertions(+), 54 deletions(-)
diff --git a/k8s-agent/app.py b/k8s-agent/app.py
index b2342e6..6ac3fe4 100644
--- a/k8s-agent/app.py
+++ b/k8s-agent/app.py
@@ -899,16 +899,12 @@ def page_cluster_debugger():
return
cp_node = cp_nodes[0]
- # kubectl availability check for imported clusters
+ # kubectl availability warning for imported clusters
if profile.cluster_source == "imported" and not get_kubectl_path():
- st.error(
- "**kubectl not found** on this machine.\n\n"
- "Install it with:\n```\n"
- "curl -LO https://dl.k8s.io/release/$(curl -Ls https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl\n"
- "chmod +x kubectl && sudo mv kubectl /usr/local/bin/\n```\n"
- "Or see: https://kubernetes.io/docs/tasks/tools/"
+ st.warning(
+ "kubectl not found on this machine. Commands will fail until kubectl is installed.\n\n"
+ "Install: `curl -LO https://dl.k8s.io/release/$(curl -Ls https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl && chmod +x kubectl && mv kubectl ~/.local/bin/`"
)
- return
available_commands = get_available_commands(profile)
@@ -1073,17 +1069,6 @@ def page_monitoring_setup():
return
cp_node = cp_nodes[0]
- # kubectl availability check for imported clusters
- if profile.cluster_source == "imported" and not get_kubectl_path():
- st.error(
- "**kubectl not found** on this machine.\n\n"
- "Install it with:\n```\n"
- "curl -LO https://dl.k8s.io/release/$(curl -Ls https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl\n"
- "chmod +x kubectl && sudo mv kubectl /usr/local/bin/\n```\n"
- "Or see: https://kubernetes.io/docs/tasks/tools/"
- )
- return
-
# Namespace selection — auto-fetch from cluster for imported clusters
if profile.cluster_source == "imported" and profile.kubeconfig_content:
cluster_ns = fetch_namespaces(profile.kubeconfig_content)
@@ -1097,8 +1082,9 @@ def page_monitoring_setup():
else:
namespace = st.text_input("Monitoring Namespace", value="monitoring", key="mon_ns_txt")
- tab_install, tab_dashboards, tab_alerts, tab_status, tab_scripts, tab_advice = st.tabs([
+ tab_install, tab_metrics, tab_dashboards, tab_alerts, tab_status, tab_scripts, tab_advice = st.tabs([
"Install Stack",
+ "Metrics Components",
"Dashboards",
"Alert Rules",
"Status",
@@ -1145,6 +1131,142 @@ def page_monitoring_setup():
st.error("Alert rules installation failed")
st.code(result.stderr, language="text")
+ # ── Metrics Components ────────────────────────────────────────────────
+ with tab_metrics:
+ st.markdown("### Metrics Components")
+ st.markdown(
+ "Install **metrics-server** (enables `kubectl top`) and/or "
+ "**kube-state-metrics** (exposes workload/object-level metrics to Prometheus)."
+ )
+
+ met_col1, met_col2 = st.columns(2)
+
+ with met_col1:
+ st.markdown("#### metrics-server")
+ st.markdown(
+ "Provides CPU/memory usage for pods and nodes. "
+ "Required for `kubectl top` and HPA autoscaling."
+ )
+ ms_insecure = st.checkbox(
+ "Add `--kubelet-insecure-tls` flag (self-signed certs)",
+ value=True,
+ key="ms_insecure",
+ )
+ if st.button("Install metrics-server", type="primary", key="install_ms"):
+ ms_url = (
+ "https://github.com/kubernetes-sigs/metrics-server"
+ "/releases/latest/download/components.yaml"
+ )
+ with st.status("Installing metrics-server...", expanded=True):
+ # Apply the manifest
+ apply_result = run_kubectl(
+ profile,
+ f"apply -f {ms_url}",
+ timeout=60,
+ )
+ if apply_result.success:
+ st.write("Manifest applied successfully.")
+ st.code(apply_result.stdout, language="text")
+ # Patch for insecure TLS if requested
+ if ms_insecure:
+ patch_cmd = (
+ "patch deployment metrics-server -n kube-system "
+ "--type=json -p="
+ "'[{\"op\":\"add\",\"path\":\"/spec/template/spec/containers/0/args/-\","
+ "\"value\":\"--kubelet-insecure-tls\"}]'"
+ )
+ patch_result = run_kubectl(profile, patch_cmd, timeout=30)
+ if patch_result.success:
+ st.success("metrics-server installed with --kubelet-insecure-tls!")
+ else:
+ st.warning("Installed but TLS patch may have failed (already applied?).")
+ st.code(patch_result.stderr, language="text")
+ else:
+ st.success("metrics-server installed!")
+ else:
+ st.error("metrics-server installation failed")
+ st.code(apply_result.stderr, language="text")
+
+ # Check status
+ if st.button("Check metrics-server status", key="ms_status"):
+ with st.spinner("Checking..."):
+ result = run_kubectl(
+ profile,
+ "get deployment metrics-server -n kube-system -o wide",
+ timeout=15,
+ )
+ if result.success:
+ st.code(result.stdout, language="text")
+ else:
+ st.warning("metrics-server not found or not ready.")
+ st.code(result.stderr, language="text")
+
+ with met_col2:
+ st.markdown("#### kube-state-metrics")
+ st.markdown(
+ "Exposes object-level metrics (Deployments, Pods, Nodes, etc.) "
+ "to Prometheus for dashboards and alerting."
+ )
+ ksm_ns = namespace # reuse the monitoring namespace
+ if st.button("Install kube-state-metrics", type="primary", key="install_ksm"):
+ with st.status("Installing kube-state-metrics...", expanded=True):
+ # Use helm if available, otherwise apply raw manifest
+ helm_cmd = (
+ f"helm install kube-state-metrics "
+ f"oci://registry-1.docker.io/bitnamicharts/kube-state-metrics "
+ f"-n {ksm_ns} --create-namespace"
+ )
+ result = run_kubectl(profile, helm_cmd, timeout=120)
+ if result.success:
+ st.success("kube-state-metrics installed via Helm!")
+ st.code(result.stdout, language="text")
+ else:
+ st.warning("Helm install failed, trying kubectl apply...")
+ st.code(result.stderr, language="text")
+ # Fallback: direct manifest from GitHub
+ ksm_url = (
+ "https://raw.githubusercontent.com/kubernetes/"
+ "kube-state-metrics/main/examples/standard/service.yaml"
+ )
+ apply_result = run_kubectl(
+ profile,
+ f"apply -f https://raw.githubusercontent.com/kubernetes/kube-state-metrics/main/examples/standard/ 2>/dev/null || echo 'Manual install required'",
+ timeout=60,
+ )
+ if apply_result.success:
+ st.success("kube-state-metrics applied!")
+ st.code(apply_result.stdout, language="text")
+ else:
+ st.error(
+ "Could not install kube-state-metrics automatically.\n\n"
+ "Manual install:\n"
+ "```\nhelm repo add prometheus-community "
+ "https://prometheus-community.github.io/helm-charts\n"
+ "helm install kube-state-metrics "
+ f"prometheus-community/kube-state-metrics -n {ksm_ns}\n```"
+ )
+
+ if st.button("Check kube-state-metrics status", key="ksm_status"):
+ with st.spinner("Checking..."):
+ result = run_kubectl(
+ profile,
+ f"get pods -n {ksm_ns} -l app.kubernetes.io/name=kube-state-metrics -o wide",
+ timeout=15,
+ )
+ if result.success and result.stdout.strip():
+ st.code(result.stdout, language="text")
+ else:
+ # Try broader search
+ result2 = run_kubectl(
+ profile,
+ "get pods -A -l app.kubernetes.io/name=kube-state-metrics -o wide",
+ timeout=15,
+ )
+ if result2.success and result2.stdout.strip():
+ st.code(result2.stdout, language="text")
+ else:
+ st.warning("kube-state-metrics not found on the cluster.")
+
# ── Dashboards ────────────────────────────────────────────────────────
with tab_dashboards:
st.markdown("### Grafana Dashboards")
@@ -1249,17 +1371,6 @@ def page_log_analysis():
return
cp_node = cp_nodes[0]
- # kubectl availability check for imported clusters
- if profile.cluster_source == "imported" and not get_kubectl_path():
- st.error(
- "**kubectl not found** on this machine.\n\n"
- "Install it with:\n```\n"
- "curl -LO https://dl.k8s.io/release/$(curl -Ls https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl\n"
- "chmod +x kubectl && sudo mv kubectl /usr/local/bin/\n```\n"
- "Or see: https://kubernetes.io/docs/tasks/tools/"
- )
- return
-
# Pre-fetch namespaces for imported clusters (used by Pod Logs tab)
_cluster_namespaces: list[str] = []
if profile.cluster_source == "imported" and profile.kubeconfig_content:
@@ -1506,24 +1617,15 @@ def page_resource_viewer():
)
return
- # kubectl availability check for imported clusters
- if profile.cluster_source == "imported" and not get_kubectl_path():
- st.error(
- "**kubectl not found** on this machine.\n\n"
- "Install it with:\n```\n"
- "curl -LO https://dl.k8s.io/release/$(curl -Ls https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl\n"
- "chmod +x kubectl && sudo mv kubectl /usr/local/bin/\n```\n"
- "Or see: https://kubernetes.io/docs/tasks/tools/"
- )
- return
-
# Pre-fetch namespaces for imported clusters
_rv_namespaces: list[str] = []
if profile.cluster_source == "imported" and profile.kubeconfig_content:
_rv_namespaces = fetch_namespaces(profile.kubeconfig_content)
- tab_resources, tab_node_health, tab_rbac, tab_helm, tab_events = st.tabs([
+ tab_resources, tab_scaling, tab_shell, tab_node_health, tab_rbac, tab_helm, tab_events = st.tabs([
"Cluster Resources",
+ "Scaling",
+ "Pod Shell",
"Node Health",
"RBAC Viewer",
"Helm Releases",
@@ -1597,27 +1699,66 @@ def page_resource_viewer():
# Describe a specific resource
st.markdown("---")
st.markdown("#### Describe a Resource")
- desc_col1, desc_col2 = st.columns(2)
- with desc_col1:
- desc_name = st.text_input(
- "Resource name",
- placeholder="e.g., my-pod-xyz",
- key="desc_name",
- )
+ desc_col1, desc_col2, desc_col3 = st.columns([2, 2, 1])
with desc_col2:
if _rv_namespaces:
desc_ns = st.selectbox(
- "Namespace (if applicable)",
+ "Namespace",
options=_rv_namespaces,
index=_rv_namespaces.index("default") if "default" in _rv_namespaces else 0,
key="desc_ns",
)
else:
desc_ns = st.text_input(
- "Namespace (if applicable)",
+ "Namespace",
value="default",
key="desc_ns",
)
+ with desc_col3:
+ desc_refresh = st.button("Load names", key="desc_load_names")
+
+ # Fetch resource names for the dropdown
+ _desc_resource_names: list[str] = []
+ if desc_refresh or st.session_state.get("_desc_cached_names"):
+ if desc_refresh:
+ # Determine the kubectl get command for names
+ _desc_cmd_base, _desc_ns_supported = _RESOURCE_TYPES[resource_type]
+ _names_cmd = f"{_desc_cmd_base} -o name"
+ if _desc_ns_supported and desc_ns:
+ _names_cmd += f" -n {desc_ns}"
+ elif _desc_ns_supported:
+ _names_cmd += " -A"
+ _names_result = run_kubectl(profile, _names_cmd, timeout=15)
+ if _names_result.success and _names_result.stdout.strip():
+ raw_names = _names_result.stdout.strip().split("\n")
+ # Strip resource type prefix (e.g. "pod/my-pod" -> "my-pod")
+ _desc_resource_names = [
+ n.split("/", 1)[-1] if "/" in n else n
+ for n in raw_names if n.strip()
+ ]
+ st.session_state["_desc_cached_names"] = _desc_resource_names
+ st.session_state["_desc_cached_type"] = resource_type
+ else:
+ _desc_resource_names = []
+ st.session_state["_desc_cached_names"] = []
+ else:
+ # Use cached names if resource type matches
+ if st.session_state.get("_desc_cached_type") == resource_type:
+ _desc_resource_names = st.session_state.get("_desc_cached_names", [])
+
+ with desc_col1:
+ if _desc_resource_names:
+ desc_name = st.selectbox(
+ "Resource name",
+ options=_desc_resource_names,
+ key="desc_name_select",
+ )
+ else:
+ desc_name = st.text_input(
+ "Resource name",
+ placeholder="Click 'Load names' or type a name",
+ key="desc_name",
+ )
if st.button("Describe", key="describe_res") and desc_name:
# Determine the singular resource type for describe
@@ -1643,6 +1784,255 @@ def page_resource_viewer():
st.error("Describe failed")
st.code(result.stderr, language="text")
+ # ── Scaling ──────────────────────────────────────────────────────────
+ with tab_scaling:
+ st.markdown("### Deployment Scaling")
+ st.markdown("Scale deployment replicas up or down.")
+
+ sc_col1, sc_col2 = st.columns(2)
+ with sc_col1:
+ if _rv_namespaces:
+ sc_ns = st.selectbox(
+ "Namespace",
+ options=_rv_namespaces,
+ index=_rv_namespaces.index("default") if "default" in _rv_namespaces else 0,
+ key="sc_ns",
+ )
+ else:
+ sc_ns = st.text_input("Namespace", value="default", key="sc_ns")
+ with sc_col2:
+ sc_load = st.button("Load Deployments", key="sc_load")
+
+ # Fetch deployments for the dropdown
+ _sc_deployments: list[str] = []
+ _sc_dep_info: dict[str, str] = {}
+ if sc_load or st.session_state.get("_sc_cached_deps"):
+ if sc_load:
+ dep_result = run_kubectl(
+ profile,
+ f"get deployments -n {sc_ns} -o custom-columns=NAME:.metadata.name,REPLICAS:.spec.replicas,AVAILABLE:.status.availableReplicas --no-headers",
+ timeout=15,
+ )
+ if dep_result.success and dep_result.stdout.strip():
+ for line in dep_result.stdout.strip().split("\n"):
+ parts = line.split()
+ if parts:
+ dep_name = parts[0]
+ _sc_deployments.append(dep_name)
+ replicas = parts[1] if len(parts) > 1 else "?"
+ available = parts[2] if len(parts) > 2 else "?"
+ _sc_dep_info[dep_name] = f"{replicas} replicas ({available} available)"
+ st.session_state["_sc_cached_deps"] = _sc_deployments
+ st.session_state["_sc_cached_dep_info"] = _sc_dep_info
+ st.session_state["_sc_cached_ns"] = sc_ns
+ else:
+ st.session_state["_sc_cached_deps"] = []
+ st.session_state["_sc_cached_dep_info"] = {}
+ if dep_result.success:
+ st.info(f"No deployments found in namespace '{sc_ns}'.")
+ else:
+ st.error("Failed to list deployments")
+ st.code(dep_result.stderr, language="text")
+ else:
+ if st.session_state.get("_sc_cached_ns") == sc_ns:
+ _sc_deployments = st.session_state.get("_sc_cached_deps", [])
+ _sc_dep_info = st.session_state.get("_sc_cached_dep_info", {})
+
+ if _sc_deployments:
+ sc_dep_col1, sc_dep_col2, sc_dep_col3 = st.columns([3, 1, 1])
+ with sc_dep_col1:
+ sc_selected = st.selectbox(
+ "Deployment",
+ options=_sc_deployments,
+ format_func=lambda d: f"{d} ({_sc_dep_info.get(d, '')})",
+ key="sc_selected",
+ )
+ with sc_dep_col2:
+ sc_replicas = st.number_input(
+ "Target replicas",
+ min_value=0,
+ max_value=100,
+ value=1,
+ key="sc_replicas",
+ )
+ with sc_dep_col3:
+ st.markdown("
", unsafe_allow_html=True)
+ if st.button("Scale", type="primary", key="sc_apply"):
+ scale_cmd = f"scale deployment {sc_selected} --replicas={sc_replicas} -n {sc_ns}"
+ with st.spinner(f"Scaling {sc_selected} to {sc_replicas} replicas..."):
+ result = run_kubectl(profile, scale_cmd, timeout=30)
+ if result.success:
+ st.success(f"Scaled **{sc_selected}** to **{sc_replicas}** replicas!")
+ st.code(result.stdout, language="text")
+ # Refresh to show updated state
+ verify = run_kubectl(
+ profile,
+ f"get deployment {sc_selected} -n {sc_ns} -o wide",
+ timeout=15,
+ )
+ if verify.success:
+ st.code(verify.stdout, language="text")
+ else:
+ st.error("Scaling failed")
+ st.code(result.stderr, language="text")
+
+ # Quick scale buttons
+ st.markdown("---")
+ st.markdown("#### Quick Actions")
+ qa_col1, qa_col2, qa_col3, qa_col4 = st.columns(4)
+ with qa_col1:
+ if st.button("Scale to 0 (stop)", key="sc_0"):
+ with st.spinner("Scaling to 0..."):
+ result = run_kubectl(profile, f"scale deployment {sc_selected} --replicas=0 -n {sc_ns}", timeout=30)
+ st.success("Scaled to 0") if result.success else st.error(result.stderr)
+ with qa_col2:
+ if st.button("Scale to 1", key="sc_1"):
+ with st.spinner("Scaling to 1..."):
+ result = run_kubectl(profile, f"scale deployment {sc_selected} --replicas=1 -n {sc_ns}", timeout=30)
+ st.success("Scaled to 1") if result.success else st.error(result.stderr)
+ with qa_col3:
+ if st.button("Scale to 3", key="sc_3"):
+ with st.spinner("Scaling to 3..."):
+ result = run_kubectl(profile, f"scale deployment {sc_selected} --replicas=3 -n {sc_ns}", timeout=30)
+ st.success("Scaled to 3") if result.success else st.error(result.stderr)
+ with qa_col4:
+ if st.button("Scale to 5", key="sc_5"):
+ with st.spinner("Scaling to 5..."):
+ result = run_kubectl(profile, f"scale deployment {sc_selected} --replicas=5 -n {sc_ns}", timeout=30)
+ st.success("Scaled to 5") if result.success else st.error(result.stderr)
+ elif not sc_load:
+ st.info("Click **Load Deployments** to see deployments in the selected namespace.")
+
+ # ── Pod Shell ────────────────────────────────────────────────────────
+ with tab_shell:
+ st.markdown("### Pod Shell (Exec)")
+ st.markdown("Execute commands inside a running pod/container.")
+
+ sh_col1, sh_col2 = st.columns(2)
+ with sh_col1:
+ if _rv_namespaces:
+ sh_ns = st.selectbox(
+ "Namespace",
+ options=_rv_namespaces,
+ index=_rv_namespaces.index("default") if "default" in _rv_namespaces else 0,
+ key="sh_ns",
+ )
+ else:
+ sh_ns = st.text_input("Namespace", value="default", key="sh_ns")
+ with sh_col2:
+ sh_load = st.button("Load Pods", key="sh_load")
+
+ # Fetch running pods
+ _sh_pods: list[str] = []
+ _sh_containers: dict[str, list[str]] = {}
+ if sh_load or st.session_state.get("_sh_cached_pods"):
+ if sh_load:
+ pod_result = run_kubectl(
+ profile,
+ f"get pods -n {sh_ns} --field-selector=status.phase=Running -o jsonpath="
+ "'{range .items[*]}{.metadata.name}{\"\\n\"}{end}'",
+ timeout=15,
+ )
+ if pod_result.success and pod_result.stdout.strip():
+ _sh_pods = [p.strip() for p in pod_result.stdout.strip().split("\n") if p.strip()]
+ st.session_state["_sh_cached_pods"] = _sh_pods
+ st.session_state["_sh_cached_ns"] = sh_ns
+ # Fetch container names for each pod
+ _sh_containers = {}
+ for pod_name in _sh_pods[:20]: # limit to first 20 for perf
+ ctr_result = run_kubectl(
+ profile,
+ f"get pod {pod_name} -n {sh_ns} -o jsonpath="
+ "'{range .spec.containers[*]}{.name}{\"\\n\"}{end}'",
+ timeout=10,
+ )
+ if ctr_result.success and ctr_result.stdout.strip():
+ _sh_containers[pod_name] = [
+ c.strip() for c in ctr_result.stdout.strip().split("\n") if c.strip()
+ ]
+ else:
+ _sh_containers[pod_name] = []
+ st.session_state["_sh_cached_containers"] = _sh_containers
+ else:
+ st.session_state["_sh_cached_pods"] = []
+ st.session_state["_sh_cached_containers"] = {}
+ if pod_result.success:
+ st.info(f"No running pods found in namespace '{sh_ns}'.")
+ else:
+ st.error("Failed to list pods")
+ st.code(pod_result.stderr, language="text")
+ else:
+ if st.session_state.get("_sh_cached_ns") == sh_ns:
+ _sh_pods = st.session_state.get("_sh_cached_pods", [])
+ _sh_containers = st.session_state.get("_sh_cached_containers", {})
+
+ if _sh_pods:
+ sh_pod_col1, sh_pod_col2 = st.columns(2)
+ with sh_pod_col1:
+ sh_selected_pod = st.selectbox("Pod", options=_sh_pods, key="sh_pod")
+ with sh_pod_col2:
+ containers = _sh_containers.get(sh_selected_pod, [])
+ if containers:
+ sh_selected_ctr = st.selectbox("Container", options=containers, key="sh_ctr")
+ else:
+ sh_selected_ctr = st.text_input("Container (optional)", key="sh_ctr")
+
+ st.info(
+ "**Note:** This runs non-interactive commands via `kubectl exec`. "
+ "For a fully interactive shell, use your terminal:\n\n"
+ f"`kubectl exec -it {sh_selected_pod} -n {sh_ns}"
+ f"{' -c ' + sh_selected_ctr if sh_selected_ctr else ''} -- /bin/sh`"
+ )
+
+ sh_cmd = st.text_input(
+ "Command to execute",
+ value="sh -c 'hostname && cat /etc/os-release && df -h'",
+ key="sh_cmd",
+ help="Enter the command to run inside the container",
+ )
+
+ sh_preset_col1, sh_preset_col2, sh_preset_col3, sh_preset_col4 = st.columns(4)
+ with sh_preset_col1:
+ if st.button("env", key="sh_p_env"):
+ sh_cmd = "env"
+ with sh_preset_col2:
+ if st.button("ps aux", key="sh_p_ps"):
+ sh_cmd = "ps aux"
+ with sh_preset_col3:
+ if st.button("df -h", key="sh_p_df"):
+ sh_cmd = "df -h"
+ with sh_preset_col4:
+ if st.button("cat /etc/resolv.conf", key="sh_p_dns"):
+ sh_cmd = "cat /etc/resolv.conf"
+
+ if st.button("Execute", type="primary", key="sh_exec") and sh_cmd:
+ ctr_flag = f" -c {sh_selected_ctr}" if sh_selected_ctr else ""
+ exec_cmd = f"exec {sh_selected_pod} -n {sh_ns}{ctr_flag} -- {sh_cmd}"
+ with st.spinner(f"Executing in {sh_selected_pod}..."):
+ result = run_kubectl(profile, exec_cmd, timeout=30)
+ if result.success:
+ st.code(result.stdout or "(no output)", language="text")
+ else:
+ st.error("Exec failed")
+ st.code(result.stderr, language="text")
+
+ # Pod logs quick access
+ st.markdown("---")
+ st.markdown("#### Quick Pod Logs")
+ log_lines = st.number_input("Tail lines", min_value=10, max_value=500, value=50, key="sh_log_lines")
+ if st.button("View Logs", key="sh_logs"):
+ ctr_flag = f" -c {sh_selected_ctr}" if sh_selected_ctr else ""
+ log_cmd = f"logs {sh_selected_pod} -n {sh_ns}{ctr_flag} --tail={log_lines}"
+ with st.spinner("Fetching logs..."):
+ result = run_kubectl(profile, log_cmd, timeout=30)
+ if result.success:
+ st.code(result.stdout or "(no logs)", language="text")
+ else:
+ st.error("Failed to fetch logs")
+ st.code(result.stderr, language="text")
+ elif not sh_load:
+ st.info("Click **Load Pods** to see running pods in the selected namespace.")
+
# ── Node Health ──────────────────────────────────────────────────────
with tab_node_health:
st.markdown("### Node Health Overview")
diff --git a/k8s-agent/modules/cluster_creator.py b/k8s-agent/modules/cluster_creator.py
index 79c94cd..644bf78 100644
--- a/k8s-agent/modules/cluster_creator.py
+++ b/k8s-agent/modules/cluster_creator.py
@@ -1205,10 +1205,34 @@ def run_kubectl(profile: ClusterProfile, command: str, timeout: int = 30) -> SSH
with open(kubeconfig_path, "w") as f:
f.write(profile.kubeconfig_content)
+ kubectl = config.get_kubectl_path()
+ helm = config.get_helm_path()
+
if is_helm:
- full_cmd = f"KUBECONFIG={kubeconfig_path} {command}"
+ bin_path = helm or "helm"
+ resolved = command.strip()
+ if resolved.startswith("helm "):
+ resolved = bin_path + resolved[4:]
+ full_cmd = f"KUBECONFIG={kubeconfig_path} {resolved}"
else:
- full_cmd = f"kubectl --kubeconfig={kubeconfig_path} {command}"
+ if not kubectl:
+ return SSHResult(
+ hostname="local (kubeconfig)",
+ command=command,
+ return_code=1,
+ stdout="",
+ stderr=(
+ "kubectl not found on this machine.\n\n"
+ "Install kubectl:\n"
+ " curl -LO https://dl.k8s.io/release/"
+ "$(curl -Ls https://dl.k8s.io/release/stable.txt)"
+ "/bin/linux/amd64/kubectl\n"
+ " chmod +x kubectl && mv kubectl ~/.local/bin/\n\n"
+ "Or see: https://kubernetes.io/docs/tasks/tools/"
+ ),
+ success=False,
+ )
+ full_cmd = f"{kubectl} --kubeconfig={kubeconfig_path} {command}"
try:
proc = subprocess.run(
full_cmd,
diff --git a/k8s-agent/modules/cluster_debugger.py b/k8s-agent/modules/cluster_debugger.py
index 45feb91..c5216f1 100644
--- a/k8s-agent/modules/cluster_debugger.py
+++ b/k8s-agent/modules/cluster_debugger.py
@@ -28,6 +28,7 @@
"Services": "get svc -A",
"PVCs": "get pvc -A",
"Ingresses": "get ingress -A",
+ "Disk Usage (Nodes)": "top nodes",
}
# Full SSH commands (backward-compat for provisioned clusters)
From 2285ea5904e02e04f714855233976318d5c8f773 Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Tue, 7 Apr 2026 05:47:49 +0000
Subject: [PATCH 13/31] Add Node Containers (crictl) tab: view containers per
node via SSH or kubectl
---
k8s-agent/app.py | 118 ++++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 117 insertions(+), 1 deletion(-)
diff --git a/k8s-agent/app.py b/k8s-agent/app.py
index 6ac3fe4..19bf862 100644
--- a/k8s-agent/app.py
+++ b/k8s-agent/app.py
@@ -22,6 +22,7 @@
)
from modules.cluster_creator import (
test_ssh_connectivity,
+ run_ssh_command,
generate_common_setup_script,
generate_control_plane_init_script,
generate_worker_join_script,
@@ -1622,10 +1623,11 @@ def page_resource_viewer():
if profile.cluster_source == "imported" and profile.kubeconfig_content:
_rv_namespaces = fetch_namespaces(profile.kubeconfig_content)
- tab_resources, tab_scaling, tab_shell, tab_node_health, tab_rbac, tab_helm, tab_events = st.tabs([
+ tab_resources, tab_scaling, tab_shell, tab_crictl, tab_node_health, tab_rbac, tab_helm, tab_events = st.tabs([
"Cluster Resources",
"Scaling",
"Pod Shell",
+ "Node Containers",
"Node Health",
"RBAC Viewer",
"Helm Releases",
@@ -2033,6 +2035,120 @@ def page_resource_viewer():
elif not sh_load:
st.info("Click **Load Pods** to see running pods in the selected namespace.")
+ # ── Node Containers (crictl) ────────────────────────────────────────
+ with tab_crictl:
+ st.markdown("### Node Containers (crictl)")
+ st.markdown("View containers running on each node using `crictl ps -a`.")
+
+ if profile.cluster_source == "imported":
+ # Imported clusters — no SSH, but we can still get node list and show
+ # container info via kubectl debug or just list pods per node
+ st.info(
+ "**crictl** requires SSH access to each node and is available for "
+ "provisioned clusters. For imported clusters, container-level "
+ "information is shown via kubectl below."
+ )
+ if st.button("Show containers per node (kubectl)", type="primary", key="crictl_kubectl"):
+ with st.spinner("Fetching node list..."):
+ node_result = run_kubectl(
+ profile,
+ "get nodes -o jsonpath='{range .items[*]}{.metadata.name}{\"\\n\"}{end}'",
+ timeout=15,
+ )
+ if node_result.success and node_result.stdout.strip():
+ node_names = [n.strip() for n in node_result.stdout.strip().split("\n") if n.strip()]
+ for node_name in node_names:
+ with st.expander(f"Node: **{node_name}**", expanded=True):
+ with st.spinner(f"Fetching containers on {node_name}..."):
+ pod_result = run_kubectl(
+ profile,
+ f"get pods -A --field-selector spec.nodeName={node_name} "
+ "-o custom-columns="
+ "'NAMESPACE:.metadata.namespace,"
+ "POD:.metadata.name,"
+ "CONTAINERS:.spec.containers[*].name,"
+ "STATUS:.status.phase,"
+ "RESTARTS:.status.containerStatuses[0].restartCount,"
+ "NODE:.spec.nodeName'",
+ timeout=15,
+ )
+ if pod_result.success:
+ st.code(pod_result.stdout or "(no pods on this node)", language="text")
+ else:
+ st.error(f"Failed to get pods on {node_name}")
+ st.code(pod_result.stderr, language="text")
+ else:
+ st.error("Failed to list nodes")
+ if node_result.stderr:
+ st.code(node_result.stderr, language="text")
+ else:
+ # Provisioned clusters — SSH into each node and run crictl
+ all_nodes = profile.nodes
+ if not all_nodes:
+ st.warning("No nodes defined in this profile.")
+ else:
+ crictl_cmd = st.text_input(
+ "CRI command",
+ value="crictl ps -a",
+ key="crictl_cmd",
+ help="Command to run on each node (e.g. crictl ps -a, crictl images, crictl stats)",
+ )
+
+ crictl_presets = st.columns(5)
+ with crictl_presets[0]:
+ if st.button("crictl ps -a", key="cp_ps"):
+ crictl_cmd = "crictl ps -a"
+ with crictl_presets[1]:
+ if st.button("crictl images", key="cp_img"):
+ crictl_cmd = "crictl images"
+ with crictl_presets[2]:
+ if st.button("crictl stats", key="cp_stats"):
+ crictl_cmd = "crictl stats"
+ with crictl_presets[3]:
+ if st.button("crictl pods", key="cp_pods"):
+ crictl_cmd = "crictl pods"
+ with crictl_presets[4]:
+ if st.button("crictl info", key="cp_info"):
+ crictl_cmd = "crictl info"
+
+ # Node selection
+ node_labels = [
+ f"{n.get('hostname', n.get('ip_address', '?'))} ({n.get('ip_address', '?')}) [{n.get('role', '?')}]"
+ for n in all_nodes
+ ]
+ cr_select_all = st.checkbox("Run on all nodes", value=True, key="cr_all")
+
+ if not cr_select_all:
+ selected_nodes_idx = st.multiselect(
+ "Select nodes",
+ options=list(range(len(all_nodes))),
+ format_func=lambda i: node_labels[i],
+ default=list(range(len(all_nodes))),
+ key="cr_nodes",
+ )
+ selected_nodes = [all_nodes[i] for i in selected_nodes_idx]
+ else:
+ selected_nodes = all_nodes
+
+ if st.button("Run on selected nodes", type="primary", key="crictl_run"):
+ for node in selected_nodes:
+ node_label = f"{node.get('hostname', node.get('ip_address', '?'))} ({node.get('ip_address', '')})"
+ with st.expander(f"Node: **{node_label}** [{node.get('role', '')}]", expanded=True):
+ with st.spinner(f"Running `{crictl_cmd}` on {node_label}..."):
+ result = run_ssh_command(
+ ip_address=node["ip_address"],
+ command=crictl_cmd,
+ ssh_user=node.get("ssh_user", "root"),
+ ssh_port=node.get("ssh_port", 22),
+ ssh_key_path=node.get("ssh_key_path", "~/.ssh/id_rsa"),
+ timeout=30,
+ )
+ if result.success:
+ st.code(result.stdout or "(no output)", language="text")
+ else:
+ st.error(f"Command failed on {node_label}")
+ st.code(result.stderr, language="text")
+
# ── Node Health ──────────────────────────────────────────────────────
with tab_node_health:
st.markdown("### Node Health Overview")
From 252e941add3fe037c5067b2c4c62d026d179e485 Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Tue, 7 Apr 2026 05:52:31 +0000
Subject: [PATCH 14/31] Add cluster reset/teardown feature with re-provision
option
---
k8s-agent/app.py | 148 ++++++++++++++++++++++++++-
k8s-agent/modules/cluster_creator.py | 143 ++++++++++++++++++++++++++
2 files changed, 290 insertions(+), 1 deletion(-)
diff --git a/k8s-agent/app.py b/k8s-agent/app.py
index 19bf862..fcf776d 100644
--- a/k8s-agent/app.py
+++ b/k8s-agent/app.py
@@ -42,6 +42,7 @@
get_control_plane_steps,
get_worker_join_steps,
get_best_practices_steps,
+ get_cluster_reset_steps,
)
from modules.cluster_debugger import (
DIAGNOSTIC_COMMANDS,
@@ -604,9 +605,10 @@ def page_cluster_creation():
_show_profile_summary(profile)
- tab_preflight, tab_provision, tab_scripts, tab_manifests, tab_advice = st.tabs([
+ tab_preflight, tab_provision, tab_reset, tab_scripts, tab_manifests, tab_advice = st.tabs([
"Pre-flight Checks",
"Provision Cluster",
+ "Reset Cluster",
"View Scripts",
"Offline Manifests",
"AI Advice",
@@ -783,6 +785,150 @@ def page_cluster_creation():
update_profile_status(profile.name, "error")
st.error("Provisioning did not complete successfully. Check the errors above.")
+ # ── Reset Cluster ────────────────────────────────────────────────────
+ with tab_reset:
+ st.markdown("### Reset / Tear Down Cluster")
+ st.markdown(
+ "Completely reset the Kubernetes cluster on all (or selected) nodes. "
+ "This will run `kubeadm reset`, stop services, remove CRI-O data, "
+ "CNI configs, etcd data, and flush iptables — preparing nodes for a "
+ "fresh cluster installation."
+ )
+
+ if profile.cluster_source == "imported":
+ st.info(
+ "Cluster reset requires SSH access to each node and is only "
+ "available for **provisioned** clusters. For imported clusters, "
+ "run `kubeadm reset` directly on each node."
+ )
+ else:
+ all_nodes = profile.nodes
+ if not all_nodes:
+ st.warning("No nodes defined in this profile.")
+ else:
+ st.error(
+ "**WARNING:** This is a destructive operation. All Kubernetes data, "
+ "containers, etcd data, and configuration will be permanently deleted "
+ "from the selected nodes. This cannot be undone."
+ )
+
+ # Node selection
+ reset_node_labels = [
+ f"{n.get('hostname', n.get('ip_address', '?'))} ({n.get('ip_address', '?')}) [{n.get('role', '?')}]"
+ for n in all_nodes
+ ]
+ reset_all = st.checkbox("Reset ALL nodes", value=True, key="reset_all_nodes")
+
+ if not reset_all:
+ reset_idx = st.multiselect(
+ "Select nodes to reset",
+ options=list(range(len(all_nodes))),
+ format_func=lambda i: reset_node_labels[i],
+ default=list(range(len(all_nodes))),
+ key="reset_node_select",
+ )
+ reset_nodes = [all_nodes[i] for i in reset_idx]
+ else:
+ reset_nodes = all_nodes
+
+ # Options
+ col_r1, col_r2 = st.columns(2)
+ with col_r1:
+ remove_packages = st.checkbox(
+ "Also remove kubeadm/kubelet/kubectl packages",
+ value=False,
+ key="reset_remove_pkgs",
+ )
+ with col_r2:
+ auto_reprovision = st.checkbox(
+ "Re-provision cluster after reset",
+ value=False,
+ key="reset_reprovision",
+ help="After reset completes, automatically start fresh provisioning using the Provision Cluster flow.",
+ )
+
+ # Confirmation
+ confirm_text = st.text_input(
+ 'Type **RESET** to confirm',
+ key="reset_confirm",
+ help="Type RESET (all caps) to enable the reset button.",
+ )
+
+ reset_enabled = confirm_text.strip() == "RESET" and len(reset_nodes) > 0
+ if st.button(
+ f"Reset {len(reset_nodes)} Node(s)",
+ type="primary",
+ disabled=not reset_enabled,
+ use_container_width=True,
+ key="reset_go",
+ ):
+ update_profile_status(profile.name, "provisioning")
+ reset_steps = get_cluster_reset_steps(profile)
+
+ # Optionally add package removal step
+ if remove_packages:
+ reset_steps.append(
+ ProvisionStep(
+ name="remove_packages",
+ title="Remove kubeadm/kubelet/kubectl packages",
+ script="""set -uo pipefail
+echo '>> Removing Kubernetes packages...'
+if command -v yum &>/dev/null; then
+ yum remove -y kubeadm kubelet kubectl cri-o 2>/dev/null || true
+elif command -v apt-get &>/dev/null; then
+ apt-get remove -y --purge kubeadm kubelet kubectl cri-o 2>/dev/null || true
+fi
+echo 'Packages removed.'
+""",
+ timeout=120,
+ fatal=False,
+ )
+ )
+
+ reset_success = True
+ for node in reset_nodes:
+ node_label = f"{node.get('hostname', node.get('ip_address', '?'))} ({node.get('ip_address', '')})"
+ st.markdown(f"---\n#### Resetting: {node_label} [{node.get('role', '')}]")
+ progress = st.progress(0, text=f"Starting reset on {node_label}...")
+ node_ok = True
+ for idx, step in enumerate(reset_steps):
+ pct = int((idx / len(reset_steps)) * 100)
+ progress.progress(pct, text=f"[{idx+1}/{len(reset_steps)}] {step.title}")
+ with st.status(f"{step.title}...", expanded=False) as status:
+ result = _run_step(node, step)
+ if result.success:
+ st.code(result.stdout[-1500:] if result.stdout else "(no output)", language="text")
+ status.update(label=f"{step.title} — done", state="complete")
+ else:
+ st.warning(f"{step.title} — issue encountered")
+ st.code(result.stderr or result.stdout, language="text")
+ status.update(label=f"{step.title} — issue", state="error")
+ node_ok = False
+ if step.fatal:
+ reset_success = False
+ break
+ progress.progress(100, text=f"{'Reset complete' if node_ok else 'Reset had issues'} on {node_label}")
+ if node_ok:
+ st.success(f"Node {node_label} reset successfully.")
+ else:
+ st.warning(f"Node {node_label} reset completed with some issues. Check details above.")
+
+ if reset_success:
+ update_profile_status(profile.name, "draft")
+ st.success("All selected nodes have been reset. The cluster has been torn down.")
+ st.info("You can now go to the **Provision Cluster** tab to create a new cluster on these nodes.")
+
+ if auto_reprovision:
+ st.markdown("---")
+ st.markdown("### Auto Re-provisioning")
+ st.info(
+ "Auto re-provision is enabled. Please switch to the **Provision Cluster** tab "
+ "and click **Start Provisioning** to set up a fresh cluster with the current profile settings."
+ )
+ else:
+ update_profile_status(profile.name, "error")
+ st.error("Reset encountered fatal errors on some nodes. Review the output above before re-provisioning.")
+
# ── View Scripts ──────────────────────────────────────────────────────
with tab_scripts:
st.markdown("### Generated Scripts")
diff --git a/k8s-agent/modules/cluster_creator.py b/k8s-agent/modules/cluster_creator.py
index 644bf78..d9f694d 100644
--- a/k8s-agent/modules/cluster_creator.py
+++ b/k8s-agent/modules/cluster_creator.py
@@ -1043,6 +1043,149 @@ def get_best_practices_steps() -> List[ProvisionStep]:
]
+def get_cluster_reset_steps(profile: ClusterProfile) -> List[ProvisionStep]:
+ """Return the ordered list of steps to fully reset/teardown a K8s node.
+
+ This runs kubeadm reset, stops services, removes CRI-O data, CNI configs,
+ and cleans up iptables — preparing the node for a fresh cluster install.
+ """
+ crio_root = profile.crio_root or "/var/lib/containers/storage"
+ kubelet_root = profile.kubelet_root or "/var/lib/kubelet"
+ log_root = profile.log_root or "/var/log"
+
+ return [
+ ProvisionStep(
+ name="drain_node",
+ title="Drain & Cordon Node (best effort)",
+ script="""set -uo pipefail
+echo '>> Attempting to drain this node (best effort)...'
+HOSTNAME=$(hostname)
+kubectl drain "$HOSTNAME" --ignore-daemonsets --delete-emptydir-data --force --timeout=60s 2>/dev/null || true
+kubectl cordon "$HOSTNAME" 2>/dev/null || true
+echo 'Drain/cordon complete (or skipped if kubectl not available).'
+""",
+ timeout=90,
+ fatal=False,
+ ),
+ ProvisionStep(
+ name="kubeadm_reset",
+ title="Run kubeadm reset",
+ script="""set -uo pipefail
+echo '>> Running kubeadm reset...'
+kubeadm reset -f --cri-socket unix:///var/run/crio/crio.sock 2>/dev/null || \
+kubeadm reset -f 2>/dev/null || \
+echo 'kubeadm reset returned non-zero (may already be reset)'
+echo 'kubeadm reset complete.'
+""",
+ timeout=120,
+ ),
+ ProvisionStep(
+ name="stop_services",
+ title="Stop kubelet & CRI-O services",
+ script="""set -uo pipefail
+echo '>> Stopping kubelet...'
+systemctl stop kubelet 2>/dev/null || true
+systemctl disable kubelet 2>/dev/null || true
+echo '>> Stopping CRI-O...'
+systemctl stop crio 2>/dev/null || true
+systemctl disable crio 2>/dev/null || true
+echo 'Services stopped.'
+""",
+ timeout=60,
+ ),
+ ProvisionStep(
+ name="clean_cni",
+ title="Remove CNI configuration & network interfaces",
+ script="""set -uo pipefail
+echo '>> Removing CNI configs...'
+rm -rf /etc/cni/net.d/*
+echo '>> Removing flannel interface...'
+ip link delete flannel.1 2>/dev/null || true
+ip link delete cni0 2>/dev/null || true
+ip link delete flannel-wg 2>/dev/null || true
+echo 'CNI cleanup complete.'
+""",
+ timeout=30,
+ ),
+ ProvisionStep(
+ name="clean_iptables",
+ title="Flush iptables rules",
+ script="""set -uo pipefail
+echo '>> Flushing iptables...'
+iptables -F && iptables -t nat -F && iptables -t mangle -F && iptables -X
+ip6tables -F && ip6tables -t nat -F && ip6tables -t mangle -F && ip6tables -X 2>/dev/null || true
+echo 'iptables flushed.'
+""",
+ timeout=30,
+ ),
+ ProvisionStep(
+ name="clean_kubelet_data",
+ title="Remove kubelet data",
+ script=f"""set -uo pipefail
+echo '>> Removing kubelet data at {kubelet_root}...'
+rm -rf {kubelet_root}/*
+rm -rf /etc/kubernetes/*
+rm -rf /tmp/kubeadm-join-command.txt
+echo 'Kubelet data removed.'
+""",
+ timeout=60,
+ ),
+ ProvisionStep(
+ name="clean_crio_data",
+ title="Remove CRI-O container data",
+ script=f"""set -uo pipefail
+echo '>> Removing CRI-O storage at {crio_root}...'
+rm -rf {crio_root}/*
+echo '>> Removing CRI-O run root...'
+rm -rf /run/containers/storage/*
+echo 'CRI-O data removed.'
+""",
+ timeout=60,
+ fatal=False,
+ ),
+ ProvisionStep(
+ name="clean_etcd",
+ title="Remove etcd data (control-plane only, best effort)",
+ script="""set -uo pipefail
+echo '>> Removing etcd data...'
+rm -rf /var/lib/etcd/*
+echo 'etcd data removed (if present).'
+""",
+ timeout=30,
+ fatal=False,
+ ),
+ ProvisionStep(
+ name="clean_logs",
+ title="Clean K8s-related logs",
+ script=f"""set -uo pipefail
+echo '>> Cleaning K8s logs at {log_root}...'
+rm -rf {log_root}/pods/*
+rm -rf {log_root}/containers/*
+rm -rf /var/log/kubernetes/* 2>/dev/null || true
+echo 'Logs cleaned.'
+""",
+ timeout=30,
+ fatal=False,
+ ),
+ ProvisionStep(
+ name="verify_clean",
+ title="Verify cleanup",
+ script="""set -uo pipefail
+echo '>> Verifying cleanup...'
+echo "kubelet active: $(systemctl is-active kubelet 2>/dev/null || echo 'not found')"
+echo "crio active: $(systemctl is-active crio 2>/dev/null || echo 'not found')"
+echo "kubeadm present: $(which kubeadm 2>/dev/null || echo 'not found')"
+echo "kubectl present: $(which kubectl 2>/dev/null || echo 'not found')"
+echo "CNI configs: $(ls /etc/cni/net.d/ 2>/dev/null || echo 'empty/missing')"
+echo ""
+echo "Node is ready for a fresh cluster installation."
+""",
+ timeout=30,
+ fatal=False,
+ ),
+ ]
+
+
def execute_provision_steps(
node: dict,
steps: List[ProvisionStep],
From 88b1984f0af24aa6f7edc2c28af0eacdc24e3d20 Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Tue, 7 Apr 2026 06:54:52 +0000
Subject: [PATCH 15/31] Add Resource Requests/Limits tab: tabular view of CPU,
memory, ephemeral-storage per namespace
---
k8s-agent/app.py | 167 ++++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 166 insertions(+), 1 deletion(-)
diff --git a/k8s-agent/app.py b/k8s-agent/app.py
index fcf776d..582f377 100644
--- a/k8s-agent/app.py
+++ b/k8s-agent/app.py
@@ -1769,10 +1769,11 @@ def page_resource_viewer():
if profile.cluster_source == "imported" and profile.kubeconfig_content:
_rv_namespaces = fetch_namespaces(profile.kubeconfig_content)
- tab_resources, tab_scaling, tab_shell, tab_crictl, tab_node_health, tab_rbac, tab_helm, tab_events = st.tabs([
+ tab_resources, tab_scaling, tab_shell, tab_res_limits, tab_crictl, tab_node_health, tab_rbac, tab_helm, tab_events = st.tabs([
"Cluster Resources",
"Scaling",
"Pod Shell",
+ "Resource Requests/Limits",
"Node Containers",
"Node Health",
"RBAC Viewer",
@@ -2181,6 +2182,170 @@ def page_resource_viewer():
elif not sh_load:
st.info("Click **Load Pods** to see running pods in the selected namespace.")
+ # ── Resource Requests / Limits ───────────────────────────────────────
+ with tab_res_limits:
+ st.markdown("### Container Resource Requests & Limits")
+ st.markdown(
+ "View CPU, memory, and ephemeral-storage requests and limits for all "
+ "containers in a namespace (from Deployments, StatefulSets, DaemonSets, and Jobs)."
+ )
+
+ rl_col1, rl_col2 = st.columns(2)
+ with rl_col1:
+ if _rv_namespaces:
+ rl_ns = st.selectbox(
+ "Namespace",
+ options=_rv_namespaces,
+ index=_rv_namespaces.index("default") if "default" in _rv_namespaces else 0,
+ key="rl_ns",
+ )
+ else:
+ rl_ns = st.text_input("Namespace", value="default", key="rl_ns")
+ with rl_col2:
+ rl_workload = st.selectbox(
+ "Workload Type",
+ options=["Deployments", "StatefulSets", "DaemonSets", "Jobs", "All"],
+ index=0,
+ key="rl_workload",
+ )
+
+ if st.button("Fetch Resource Requests/Limits", type="primary", key="rl_fetch"):
+ import json as _json
+
+ workload_map = {
+ "Deployments": "deploy",
+ "StatefulSets": "statefulsets",
+ "DaemonSets": "daemonsets",
+ "Jobs": "jobs",
+ }
+ if rl_workload == "All":
+ types_to_fetch = list(workload_map.items())
+ else:
+ types_to_fetch = [(rl_workload, workload_map[rl_workload])]
+
+ all_rows: list[dict] = []
+ for wl_label, wl_cmd in types_to_fetch:
+ with st.spinner(f"Fetching {wl_label}..."):
+ result = run_kubectl(
+ profile,
+ f"get {wl_cmd} -n {rl_ns} -o json",
+ timeout=30,
+ )
+ if result.success and result.stdout.strip():
+ try:
+ data = _json.loads(result.stdout)
+ for item in data.get("items", []):
+ workload_name = item.get("metadata", {}).get("name", "?")
+ spec = item.get("spec", {})
+ # For Jobs the pod template is at spec.template,
+ # for Deployments/StatefulSets/DaemonSets it's spec.template
+ template = spec.get("template", {})
+ pod_spec = template.get("spec", {})
+ containers = pod_spec.get("containers", [])
+ init_containers = pod_spec.get("initContainers", [])
+ for ctr in containers:
+ res = ctr.get("resources", {})
+ req = res.get("requests", {})
+ lim = res.get("limits", {})
+ all_rows.append({
+ "Type": wl_label,
+ "Workload": workload_name,
+ "Container": ctr.get("name", "?"),
+ "Init": "",
+ "CPU Req": req.get("cpu", "-"),
+ "CPU Lim": lim.get("cpu", "-"),
+ "Mem Req": req.get("memory", "-"),
+ "Mem Lim": lim.get("memory", "-"),
+ "Eph Req": req.get("ephemeral-storage", "-"),
+ "Eph Lim": lim.get("ephemeral-storage", "-"),
+ })
+ for ctr in init_containers:
+ res = ctr.get("resources", {})
+ req = res.get("requests", {})
+ lim = res.get("limits", {})
+ all_rows.append({
+ "Type": wl_label,
+ "Workload": workload_name,
+ "Container": ctr.get("name", "?"),
+ "Init": "init",
+ "CPU Req": req.get("cpu", "-"),
+ "CPU Lim": lim.get("cpu", "-"),
+ "Mem Req": req.get("memory", "-"),
+ "Mem Lim": lim.get("memory", "-"),
+ "Eph Req": req.get("ephemeral-storage", "-"),
+ "Eph Lim": lim.get("ephemeral-storage", "-"),
+ })
+ except _json.JSONDecodeError:
+ st.warning(f"Could not parse JSON for {wl_label}")
+ elif not result.success:
+ st.warning(f"Failed to fetch {wl_label}: {result.stderr}")
+
+ if all_rows:
+ st.markdown(f"**{len(all_rows)} container(s)** found in namespace `{rl_ns}`:")
+ st.dataframe(
+ all_rows,
+ use_container_width=True,
+ column_config={
+ "Type": st.column_config.TextColumn(width="small"),
+ "Workload": st.column_config.TextColumn(width="medium"),
+ "Container": st.column_config.TextColumn(width="medium"),
+ "Init": st.column_config.TextColumn(width="small"),
+ "CPU Req": st.column_config.TextColumn(width="small"),
+ "CPU Lim": st.column_config.TextColumn(width="small"),
+ "Mem Req": st.column_config.TextColumn(width="small"),
+ "Mem Lim": st.column_config.TextColumn(width="small"),
+ "Eph Req": st.column_config.TextColumn(width="small"),
+ "Eph Lim": st.column_config.TextColumn(width="small"),
+ },
+ )
+
+ # Summary stats
+ st.markdown("---")
+ st.markdown("#### Summary")
+ no_cpu_req = sum(1 for r in all_rows if r["CPU Req"] == "-" and r["Init"] == "")
+ no_mem_req = sum(1 for r in all_rows if r["Mem Req"] == "-" and r["Init"] == "")
+ no_cpu_lim = sum(1 for r in all_rows if r["CPU Lim"] == "-" and r["Init"] == "")
+ no_mem_lim = sum(1 for r in all_rows if r["Mem Lim"] == "-" and r["Init"] == "")
+ non_init = sum(1 for r in all_rows if r["Init"] == "")
+ sc1, sc2, sc3, sc4 = st.columns(4)
+ with sc1:
+ st.metric("No CPU Request", f"{no_cpu_req}/{non_init}")
+ with sc2:
+ st.metric("No CPU Limit", f"{no_cpu_lim}/{non_init}")
+ with sc3:
+ st.metric("No Mem Request", f"{no_mem_req}/{non_init}")
+ with sc4:
+ st.metric("No Mem Limit", f"{no_mem_lim}/{non_init}")
+
+ if no_cpu_req > 0 or no_mem_req > 0:
+ st.warning(
+ f"{no_cpu_req + no_mem_req} container(s) are missing resource requests. "
+ "This can affect scheduling and QoS class assignment."
+ )
+ if no_cpu_lim > 0 or no_mem_lim > 0:
+ st.info(
+ f"{no_cpu_lim + no_mem_lim} container(s) are missing resource limits. "
+ "Consider setting limits to prevent resource contention."
+ )
+
+ # Download as TSV
+ tsv_lines = ["Type\tWorkload\tContainer\tInit\tCPU Req\tCPU Lim\tMem Req\tMem Lim\tEph Req\tEph Lim"]
+ for r in all_rows:
+ tsv_lines.append(
+ f"{r['Type']}\t{r['Workload']}\t{r['Container']}\t{r['Init']}\t"
+ f"{r['CPU Req']}\t{r['CPU Lim']}\t{r['Mem Req']}\t{r['Mem Lim']}\t"
+ f"{r['Eph Req']}\t{r['Eph Lim']}"
+ )
+ st.download_button(
+ "Download as TSV",
+ data="\n".join(tsv_lines),
+ file_name=f"resource_limits_{rl_ns}.tsv",
+ mime="text/tab-separated-values",
+ key="rl_download",
+ )
+ else:
+ st.info(f"No containers found in namespace `{rl_ns}` for the selected workload type(s).")
+
# ── Node Containers (crictl) ────────────────────────────────────────
with tab_crictl:
st.markdown("### Node Containers (crictl)")
From b63824a69cca3aab33483432d6db3d136bae523a Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Tue, 7 Apr 2026 07:02:40 +0000
Subject: [PATCH 16/31] Fix Devin Review: Disk Usage key mismatch in
CATEGORY_MAP, add timestamp window to correlate_errors
---
k8s-agent/modules/cluster_debugger.py | 2 +-
k8s-agent/modules/log_analyzer.py | 35 ++++++++++++++++++++++++---
2 files changed, 32 insertions(+), 5 deletions(-)
diff --git a/k8s-agent/modules/cluster_debugger.py b/k8s-agent/modules/cluster_debugger.py
index c5216f1..6815a42 100644
--- a/k8s-agent/modules/cluster_debugger.py
+++ b/k8s-agent/modules/cluster_debugger.py
@@ -28,7 +28,7 @@
"Services": "get svc -A",
"PVCs": "get pvc -A",
"Ingresses": "get ingress -A",
- "Disk Usage (Nodes)": "top nodes",
+ "Disk Usage": "top nodes",
}
# Full SSH commands (backward-compat for provisioned clusters)
diff --git a/k8s-agent/modules/log_analyzer.py b/k8s-agent/modules/log_analyzer.py
index 08a25e1..917bc3e 100644
--- a/k8s-agent/modules/log_analyzer.py
+++ b/k8s-agent/modules/log_analyzer.py
@@ -319,22 +319,49 @@ def correlate_errors(
all_errors.sort(key=lambda e: e.get("timestamp", ""))
+ def _parse_ts(ts_str: str):
+ """Try to parse a timestamp string into a datetime object."""
+ from datetime import datetime
+ for fmt in (
+ "%Y-%m-%dT%H:%M:%S",
+ "%Y-%m-%dT%H:%M:%S.%f",
+ "%Y-%m-%dT%H:%M:%SZ",
+ "%Y-%m-%dT%H:%M:%S.%fZ",
+ "%b %d %H:%M:%S",
+ "%Y-%m-%d %H:%M:%S",
+ "%Y-%m-%d %H:%M:%S.%f",
+ ):
+ try:
+ return datetime.strptime(ts_str.strip(), fmt)
+ except (ValueError, AttributeError):
+ continue
+ return None
+
correlated = []
window_seconds = 30
- used = set()
+ used: set[int] = set()
for i, err in enumerate(all_errors):
if i in used:
continue
group = [err]
used.add(i)
+ err_ts = _parse_ts(err.get("timestamp", ""))
for j in range(i + 1, len(all_errors)):
if j in used:
continue
- if all_errors[j].get("source") != err.get("source"):
- group.append(all_errors[j])
- used.add(j)
+ other = all_errors[j]
+ if other.get("source") == err.get("source"):
+ continue
+ # If both timestamps are parseable, enforce the time window
+ other_ts = _parse_ts(other.get("timestamp", ""))
+ if err_ts and other_ts:
+ diff = abs((other_ts - err_ts).total_seconds())
+ if diff > window_seconds:
+ continue
+ group.append(other)
+ used.add(j)
if len(group) > 1:
correlated.append({
From 23324fd75b136d231f95ded532c72ac88af840a2 Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Tue, 7 Apr 2026 07:05:59 +0000
Subject: [PATCH 17/31] Add proper feedback messages for buttons: imported
cluster guards, sudo for crictl, test summary
---
k8s-agent/app.py | 147 +++++++++++++++++++++++++++++++++--------------
1 file changed, 103 insertions(+), 44 deletions(-)
diff --git a/k8s-agent/app.py b/k8s-agent/app.py
index 582f377..53e2b30 100644
--- a/k8s-agent/app.py
+++ b/k8s-agent/app.py
@@ -619,40 +619,71 @@ def page_cluster_creation():
st.markdown("### SSH Connectivity Test")
st.markdown("Test SSH access to all nodes before provisioning.")
- if st.button("Test All Nodes", type="primary"):
- for node in profile.nodes:
- with st.status(f"Testing {node.get('hostname', node['ip_address'])}...", expanded=True):
- result = test_ssh_connectivity(node)
- if result.success:
- st.success(f"Connected to {node['ip_address']}")
- st.code(result.stdout, language="text")
- else:
- st.error(f"Failed to connect to {node['ip_address']}")
- st.code(result.stderr, language="text")
+ if profile.cluster_source == "imported":
+ st.info(
+ "SSH connectivity tests are not applicable for imported clusters. "
+ "Imported clusters connect via kubeconfig — no SSH access is needed. "
+ "Use the **Cluster Debugger** or **Resource Viewer** to verify connectivity."
+ )
+ elif not profile.nodes:
+ st.warning("No nodes defined in this profile. Add nodes in the Profile Manager first.")
+ else:
+ if st.button("Test All Nodes", type="primary"):
+ all_ok = True
+ for node in profile.nodes:
+ with st.status(f"Testing {node.get('hostname', node['ip_address'])}...", expanded=True):
+ result = test_ssh_connectivity(node)
+ if result.success:
+ st.success(f"Connected to {node['ip_address']}")
+ st.code(result.stdout, language="text")
+ else:
+ all_ok = False
+ st.error(f"Failed to connect to {node['ip_address']}")
+ st.code(result.stderr, language="text")
+ if all_ok:
+ st.success("All nodes are reachable via SSH. You can proceed to provisioning.")
+ else:
+ st.error("Some nodes failed SSH connectivity. Fix the issues above before provisioning.")
# ── Provision ─────────────────────────────────────────────────────────
with tab_provision:
st.markdown("### Automated Cluster Provisioning")
- st.warning(
- "This will SSH into each node and execute every provisioning step "
- "automatically. Ensure all nodes are accessible and you have root/sudo access."
- )
+
+ if profile.cluster_source == "imported":
+ st.info(
+ "Provisioning is not available for imported clusters. "
+ "This cluster was imported via kubeconfig and is managed externally. "
+ "Use the **Resource Viewer**, **Cluster Debugger**, or **Monitoring Setup** "
+ "pages to work with your cluster."
+ )
+ elif not profile.nodes:
+ st.warning("No nodes defined in this profile. Add nodes in the Profile Manager first.")
+ else:
+ st.warning(
+ "This will SSH into each node and execute every provisioning step "
+ "automatically. Ensure all nodes are accessible and you have root/sudo access."
+ )
cp_nodes = profile.get_control_plane_nodes()
worker_nodes = profile.get_worker_nodes()
- st.markdown(f"**Control Plane:** {len(cp_nodes)} node(s) | **Workers:** {len(worker_nodes)} node(s)")
+ if profile.cluster_source != "imported" and profile.nodes:
+ st.markdown(f"**Control Plane:** {len(cp_nodes)} node(s) | **Workers:** {len(worker_nodes)} node(s)")
- col1, col2, col3 = st.columns(3)
- with col1:
- step1 = st.checkbox("Step 1: Common Setup (all nodes)", value=True)
- with col2:
- step2 = st.checkbox("Step 2: Init Control Plane", value=True)
- with col3:
- step3 = st.checkbox("Step 3: Join Workers", value=True)
- step4 = st.checkbox("Step 4: Apply Best Practices", value=True)
+ col1, col2, col3 = st.columns(3)
+ with col1:
+ step1 = st.checkbox("Step 1: Common Setup (all nodes)", value=True)
+ with col2:
+ step2 = st.checkbox("Step 2: Init Control Plane", value=True)
+ with col3:
+ step3 = st.checkbox("Step 3: Join Workers", value=True)
+ step4 = st.checkbox("Step 4: Apply Best Practices", value=True)
+ else:
+ step1 = step2 = step3 = step4 = False
- if st.button("Start Provisioning", type="primary", use_container_width=True):
+ if profile.cluster_source == "imported" or not profile.nodes:
+ pass # messages shown above
+ elif st.button("Start Provisioning", type="primary", use_container_width=True):
update_profile_status(profile.name, "provisioning")
overall_success = True
@@ -2355,9 +2386,15 @@ def page_resource_viewer():
# Imported clusters — no SSH, but we can still get node list and show
# container info via kubectl debug or just list pods per node
st.info(
- "**crictl** requires SSH access to each node and is available for "
- "provisioned clusters. For imported clusters, container-level "
- "information is shown via kubectl below."
+ "**crictl** requires SSH access to each node and is only available for "
+ "provisioned clusters. For imported clusters, pod and container "
+ "information per node is shown via `kubectl` below."
+ )
+ st.markdown(
+ "This view uses `kubectl get pods --field-selector spec.nodeName=` "
+ "to list pods/containers on each node. For full container-level details "
+ "(container IDs, image digests, runtime state), SSH into the node and run "
+ "`sudo crictl ps -a` directly."
)
if st.button("Show containers per node (kubectl)", type="primary", key="crictl_kubectl"):
with st.spinner("Fetching node list..."):
@@ -2398,6 +2435,16 @@ def page_resource_viewer():
if not all_nodes:
st.warning("No nodes defined in this profile.")
else:
+ st.markdown(
+ "> **Note:** `crictl` typically requires **root/sudo** access. "
+ "If your SSH user is not root, the command will be prefixed with `sudo`."
+ )
+ use_sudo = st.checkbox(
+ "Run with sudo (required if SSH user is not root)",
+ value=True,
+ key="crictl_sudo",
+ help="Prefix the command with 'sudo' for non-root SSH users.",
+ )
crictl_cmd = st.text_input(
"CRI command",
value="crictl ps -a",
@@ -2442,23 +2489,35 @@ def page_resource_viewer():
selected_nodes = all_nodes
if st.button("Run on selected nodes", type="primary", key="crictl_run"):
- for node in selected_nodes:
- node_label = f"{node.get('hostname', node.get('ip_address', '?'))} ({node.get('ip_address', '')})"
- with st.expander(f"Node: **{node_label}** [{node.get('role', '')}]", expanded=True):
- with st.spinner(f"Running `{crictl_cmd}` on {node_label}..."):
- result = run_ssh_command(
- ip_address=node["ip_address"],
- command=crictl_cmd,
- ssh_user=node.get("ssh_user", "root"),
- ssh_port=node.get("ssh_port", 22),
- ssh_key_path=node.get("ssh_key_path", "~/.ssh/id_rsa"),
- timeout=30,
- )
- if result.success:
- st.code(result.stdout or "(no output)", language="text")
- else:
- st.error(f"Command failed on {node_label}")
- st.code(result.stderr, language="text")
+ if not selected_nodes:
+ st.warning("No nodes selected. Please select at least one node.")
+ else:
+ actual_cmd = f"sudo {crictl_cmd}" if use_sudo and not crictl_cmd.strip().startswith("sudo") else crictl_cmd
+ all_success = True
+ for node in selected_nodes:
+ node_label = f"{node.get('hostname', node.get('ip_address', '?'))} ({node.get('ip_address', '')})"
+ with st.expander(f"Node: **{node_label}** [{node.get('role', '')}]", expanded=True):
+ with st.spinner(f"Running `{actual_cmd}` on {node_label}..."):
+ result = run_ssh_command(
+ ip_address=node["ip_address"],
+ command=actual_cmd,
+ ssh_user=node.get("ssh_user", "root"),
+ ssh_port=node.get("ssh_port", 22),
+ ssh_key_path=node.get("ssh_key_path", "~/.ssh/id_rsa"),
+ timeout=30,
+ )
+ if result.success:
+ st.code(result.stdout or "(no output)", language="text")
+ else:
+ all_success = False
+ st.error(f"Command failed on {node_label}")
+ st.code(result.stderr, language="text")
+ if "permission denied" in (result.stderr or "").lower():
+ st.info("Tip: Enable the 'Run with sudo' checkbox above if your SSH user needs elevated privileges.")
+ if all_success:
+ st.success(f"Command completed successfully on {len(selected_nodes)} node(s).")
+ else:
+ st.warning("Command failed on some nodes. Check the details above.")
# ── Node Health ──────────────────────────────────────────────────────
with tab_node_health:
From 6a43a3a3fccea33ad617b1033fa0ad33f8a21ef1 Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Tue, 7 Apr 2026 07:08:15 +0000
Subject: [PATCH 18/31] Add flash messages for Import Cluster and Create
Profile so success/error persists after rerun
---
k8s-agent/app.py | 34 +++++++++++++++++++++++++---------
1 file changed, 25 insertions(+), 9 deletions(-)
diff --git a/k8s-agent/app.py b/k8s-agent/app.py
index 53e2b30..4df7f83 100644
--- a/k8s-agent/app.py
+++ b/k8s-agent/app.py
@@ -145,6 +145,7 @@ def init_session_state():
"provisioning_log": [],
"debug_results": {},
"log_analysis_results": {},
+ "_flash_message": None,
}
for key, value in defaults.items():
if key not in st.session_state:
@@ -260,6 +261,17 @@ def page_profile_manager():
st.markdown("## Cluster Profile Manager")
st.markdown("Create, edit, and manage profiles for your on-prem Kubernetes clusters.")
+ # Show any flash message from a previous action (e.g. after st.rerun)
+ if st.session_state.get("_flash_message"):
+ _flash = st.session_state._flash_message
+ if _flash[0] == "success":
+ st.success(_flash[1])
+ elif _flash[0] == "error":
+ st.error(_flash[1])
+ elif _flash[0] == "info":
+ st.info(_flash[1])
+ st.session_state._flash_message = None
+
tab_create, tab_import_cluster, tab_list, tab_import = st.tabs([
"Create Profile", "Import Existing Cluster", "Manage Profiles", "Import / Export",
])
@@ -454,7 +466,7 @@ def page_profile_manager():
)
path = save_profile(profile)
st.session_state.active_profile = name
- st.success(f"Profile '{name}' created successfully!")
+ st.session_state._flash_message = ("success", f"Profile '{name}' created successfully! Select it from the sidebar to get started.")
st.rerun()
# ── Import Existing Cluster ──────────────────────────────────────────
@@ -508,14 +520,18 @@ def page_profile_manager():
cluster_source="imported",
kubeconfig_content=kubeconfig_content,
)
- save_profile(profile)
- st.session_state.active_profile = import_name
- st.success(
- f"Cluster '{import_name}' imported! "
- "Select it from the sidebar to start using Debugger, Monitoring, "
- "Resource Viewer, etc."
- )
- st.rerun()
+ try:
+ save_profile(profile)
+ st.session_state.active_profile = import_name
+ st.session_state._flash_message = (
+ "success",
+ f"Cluster '{import_name}' imported successfully! "
+ "It is now the active profile. Use the sidebar navigation to go to "
+ "Cluster Debugger, Resource Viewer, Monitoring Setup, etc."
+ )
+ st.rerun()
+ except Exception as e:
+ st.error(f"Failed to import cluster: {e}")
# ── Manage Profiles ───────────────────────────────────────────────────
with tab_list:
From ad0ad398dcadc355db7d2dbb09eb15a66a06d7d4 Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Tue, 7 Apr 2026 07:11:40 +0000
Subject: [PATCH 19/31] Enrich Cluster Details for imported clusters: show node
IPs, roles, kubelet version, OS, container runtime, cluster-info
---
k8s-agent/app.py | 61 ++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 61 insertions(+)
diff --git a/k8s-agent/app.py b/k8s-agent/app.py
index 4df7f83..2e4e13a 100644
--- a/k8s-agent/app.py
+++ b/k8s-agent/app.py
@@ -3327,6 +3327,67 @@ def _show_profile_summary(profile: ClusterProfile):
with st.expander("Cluster Details", expanded=False):
st.markdown(f"**Description:** {profile.description or 'N/A'}")
st.markdown(f"**Kubeconfig:** {'Loaded' if profile.kubeconfig_content else 'Not loaded'}")
+
+ # Fetch live cluster info from kubeconfig
+ if profile.kubeconfig_content:
+ node_result = run_kubectl(
+ profile,
+ "get nodes -o wide --no-headers",
+ timeout=10,
+ )
+ if node_result.success and node_result.stdout.strip():
+ st.markdown("---")
+ st.markdown("**Cluster Nodes:**")
+ node_lines = [l for l in node_result.stdout.strip().split("\n") if l.strip()]
+ node_data = []
+ for line in node_lines:
+ parts = line.split()
+ if len(parts) >= 5:
+ node_data.append({
+ "Name": parts[0],
+ "Status": parts[1],
+ "Roles": parts[2] if parts[2] != "" else "worker",
+ "Age": parts[3],
+ "Kubelet Version": parts[4],
+ "Internal IP": parts[5] if len(parts) > 5 else "N/A",
+ "OS Image": " ".join(parts[7:9]) if len(parts) > 8 else (parts[7] if len(parts) > 7 else "N/A"),
+ "Container Runtime": parts[-1] if len(parts) > 9 else "N/A",
+ })
+ if node_data:
+ import pandas as pd
+ st.dataframe(
+ pd.DataFrame(node_data),
+ use_container_width=True,
+ hide_index=True,
+ )
+ # Summary
+ cp_count = sum(1 for n in node_data if "control-plane" in n["Roles"] or "master" in n["Roles"])
+ worker_count = len(node_data) - cp_count
+ ready_count = sum(1 for n in node_data if "Ready" in n["Status"])
+ st.markdown(
+ f"**Total:** {len(node_data)} node(s) — "
+ f"{cp_count} control-plane, {worker_count} worker | "
+ f"**Ready:** {ready_count}/{len(node_data)}"
+ )
+ else:
+ st.code(node_result.stdout, language="text")
+
+ # Cluster info (API server endpoint)
+ info_result = run_kubectl(profile, "cluster-info", timeout=10)
+ if info_result.success and info_result.stdout.strip():
+ st.markdown("---")
+ st.markdown("**Cluster Info:**")
+ # Strip ANSI color codes for clean display
+ import re
+ clean_info = re.sub(r'\x1b\[[0-9;]*m', '', info_result.stdout)
+ st.code(clean_info.strip(), language="text")
+ elif node_result.success:
+ st.info("Connected to cluster but no nodes found.")
+ else:
+ st.warning(
+ f"Could not fetch cluster details: {node_result.stderr or 'kubectl command failed'}. "
+ "Verify that kubectl is installed and the kubeconfig is valid."
+ )
else:
cols = st.columns(5)
cols[0].metric("Profile", profile.name)
From 937bc798a5320836d596b56f74b5741ce21cdd5c Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Tue, 7 Apr 2026 07:23:05 +0000
Subject: [PATCH 20/31] Add Multi-Cluster Dashboard, Certificate Manager, Cost
Optimizer, Pod Restart Tracker, Network Policy Visualizer, PVC/Storage
Dashboard
---
k8s-agent/app.py | 1085 +++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 1083 insertions(+), 2 deletions(-)
diff --git a/k8s-agent/app.py b/k8s-agent/app.py
index 2e4e13a..aadb5ba 100644
--- a/k8s-agent/app.py
+++ b/k8s-agent/app.py
@@ -209,6 +209,7 @@ def render_sidebar():
# ── Navigation ──
st.markdown("### Navigation")
nav_options = [
+ "Multi-Cluster Dashboard",
"Profile Manager",
"Cluster Creation",
"Resource Viewer",
@@ -216,6 +217,8 @@ def render_sidebar():
"Monitoring Setup",
"Log Analysis",
"Upgrade Planner",
+ "Certificate Manager",
+ "Cost Optimizer",
"AI Assistant",
]
selected_page = st.radio(
@@ -1816,7 +1819,9 @@ def page_resource_viewer():
if profile.cluster_source == "imported" and profile.kubeconfig_content:
_rv_namespaces = fetch_namespaces(profile.kubeconfig_content)
- tab_resources, tab_scaling, tab_shell, tab_res_limits, tab_crictl, tab_node_health, tab_rbac, tab_helm, tab_events = st.tabs([
+ (tab_resources, tab_scaling, tab_shell, tab_res_limits, tab_crictl,
+ tab_node_health, tab_rbac, tab_helm, tab_events,
+ tab_restart_tracker, tab_netpol, tab_pvc) = st.tabs([
"Cluster Resources",
"Scaling",
"Pod Shell",
@@ -1826,6 +1831,9 @@ def page_resource_viewer():
"RBAC Viewer",
"Helm Releases",
"Events Timeline",
+ "Pod Restart Tracker",
+ "Network Policies",
+ "PVC / Storage",
])
# ── Cluster Resources ────────────────────────────────────────────────
@@ -2960,6 +2968,429 @@ def page_resource_viewer():
else:
st.code(result.stderr, language="text")
+ # ── Pod Restart Tracker ───────────────────────────────────────────────
+ with tab_restart_tracker:
+ st.markdown("### Pod Restart Tracker")
+ st.markdown("Identify pods with frequent restarts, OOMKilled containers, and CrashLoopBackOff issues.")
+
+ rcol1, rcol2 = st.columns([2, 1])
+ with rcol1:
+ if _rv_namespaces:
+ restart_ns = st.selectbox("Namespace", ["All Namespaces"] + _rv_namespaces, key="restart_ns")
+ else:
+ restart_ns = st.text_input("Namespace (blank = all)", value="", key="restart_ns_text")
+ if not restart_ns:
+ restart_ns = "All Namespaces"
+ with rcol2:
+ min_restarts = st.number_input("Min restarts to show", min_value=0, value=1, key="min_restarts")
+
+ if st.button("Load Pod Restarts", type="primary", key="load_restarts"):
+ ns_flag = "-A" if restart_ns == "All Namespaces" else f"-n {restart_ns}"
+ cmd = f"get pods {ns_flag} -o json"
+ with st.spinner("Fetching pod data..."):
+ result = run_kubectl(profile, cmd, timeout=30)
+ if result.success and result.stdout.strip():
+ try:
+ import pandas as pd
+ pods_json = json.loads(result.stdout)
+ restart_data = []
+ for pod in pods_json.get("items", []):
+ pod_name = pod.get("metadata", {}).get("name", "?")
+ pod_ns = pod.get("metadata", {}).get("namespace", "?")
+ for cs in pod.get("status", {}).get("containerStatuses", []):
+ restarts = cs.get("restartCount", 0)
+ if restarts < min_restarts:
+ continue
+ container_name = cs.get("name", "?")
+ ready = cs.get("ready", False)
+ # Detect OOMKilled
+ last_state = cs.get("lastState", {})
+ terminated = last_state.get("terminated", {})
+ reason = terminated.get("reason", "")
+ exit_code = terminated.get("exitCode", "")
+ # Current state
+ state = cs.get("state", {})
+ if "running" in state:
+ current_state = "Running"
+ elif "waiting" in state:
+ current_state = state["waiting"].get("reason", "Waiting")
+ elif "terminated" in state:
+ current_state = state["terminated"].get("reason", "Terminated")
+ else:
+ current_state = "Unknown"
+ restart_data.append({
+ "Namespace": pod_ns,
+ "Pod": pod_name,
+ "Container": container_name,
+ "Restarts": restarts,
+ "Ready": ready,
+ "State": current_state,
+ "Last Termination": reason or "N/A",
+ "Exit Code": str(exit_code) if exit_code != "" else "N/A",
+ })
+ if restart_data:
+ df = pd.DataFrame(restart_data).sort_values("Restarts", ascending=False)
+ # Summary metrics
+ total_restarts = df["Restarts"].sum()
+ oom_count = len(df[df["Last Termination"] == "OOMKilled"])
+ crash_count = len(df[df["State"] == "CrashLoopBackOff"])
+ mcol1, mcol2, mcol3, mcol4 = st.columns(4)
+ mcol1.metric("Containers with Restarts", len(df))
+ mcol2.metric("Total Restarts", int(total_restarts))
+ mcol3.metric("OOMKilled", oom_count)
+ mcol4.metric("CrashLoopBackOff", crash_count)
+ st.dataframe(df, use_container_width=True, hide_index=True)
+ # Highlight problematic pods
+ if oom_count > 0:
+ st.warning(
+ f"{oom_count} container(s) were terminated due to **OOMKilled** — "
+ "consider increasing memory limits for those workloads."
+ )
+ if crash_count > 0:
+ st.error(
+ f"{crash_count} container(s) are in **CrashLoopBackOff** — "
+ "check logs with `kubectl logs -c --previous`."
+ )
+ else:
+ st.success(f"No containers found with {min_restarts}+ restarts. Cluster looks healthy!")
+ except (json.JSONDecodeError, KeyError) as e:
+ st.error(f"Failed to parse pod data: {e}")
+ st.code(result.stdout[:2000], language="text")
+ elif result.success:
+ st.info("No pods found.")
+ else:
+ st.error("Failed to fetch pods")
+ st.code(result.stderr, language="text")
+
+ # ── Network Policy Visualizer ─────────────────────────────────────────
+ with tab_netpol:
+ st.markdown("### Network Policy Visualizer")
+ st.markdown("View and analyze NetworkPolicies to understand pod-to-pod communication rules.")
+
+ npcol1, npcol2 = st.columns([2, 1])
+ with npcol1:
+ if _rv_namespaces:
+ np_ns = st.selectbox("Namespace", ["All Namespaces"] + _rv_namespaces, key="netpol_ns")
+ else:
+ np_ns = st.text_input("Namespace (blank = all)", value="", key="netpol_ns_text")
+ if not np_ns:
+ np_ns = "All Namespaces"
+
+ if st.button("Load Network Policies", type="primary", key="load_netpol"):
+ ns_flag = "-A" if np_ns == "All Namespaces" else f"-n {np_ns}"
+ cmd = f"get networkpolicies {ns_flag} -o json"
+ with st.spinner("Fetching network policies..."):
+ result = run_kubectl(profile, cmd, timeout=15)
+ if result.success and result.stdout.strip():
+ try:
+ import pandas as pd
+ np_json = json.loads(result.stdout)
+ policies = np_json.get("items", [])
+ if not policies:
+ st.info("No NetworkPolicies found. All pod-to-pod traffic is allowed by default.")
+ else:
+ st.markdown(f"**Found {len(policies)} NetworkPolicies**")
+
+ policy_summary = []
+ for pol in policies:
+ meta = pol.get("metadata", {})
+ spec = pol.get("spec", {})
+ pol_name = meta.get("name", "?")
+ pol_ns = meta.get("namespace", "?")
+ # Pod selector
+ pod_sel = spec.get("podSelector", {})
+ match_labels = pod_sel.get("matchLabels", {})
+ selector_str = ", ".join(f"{k}={v}" for k, v in match_labels.items()) if match_labels else "(all pods)"
+ # Policy types
+ policy_types = spec.get("policyTypes", [])
+ # Ingress rules count
+ ingress_rules = spec.get("ingress", [])
+ egress_rules = spec.get("egress", [])
+
+ policy_summary.append({
+ "Namespace": pol_ns,
+ "Policy": pol_name,
+ "Pod Selector": selector_str,
+ "Types": ", ".join(policy_types) if policy_types else "N/A",
+ "Ingress Rules": len(ingress_rules),
+ "Egress Rules": len(egress_rules),
+ })
+
+ st.dataframe(pd.DataFrame(policy_summary), use_container_width=True, hide_index=True)
+
+ # Detailed view per policy
+ for pol in policies:
+ meta = pol.get("metadata", {})
+ spec = pol.get("spec", {})
+ pol_name = meta.get("name", "?")
+ pol_ns = meta.get("namespace", "?")
+ with st.expander(f"{pol_ns}/{pol_name}", expanded=False):
+ # Pod selector
+ pod_sel = spec.get("podSelector", {})
+ match_labels = pod_sel.get("matchLabels", {})
+ if match_labels:
+ st.markdown("**Applies to pods matching:** " + ", ".join(f"`{k}={v}`" for k, v in match_labels.items()))
+ else:
+ st.markdown("**Applies to:** All pods in namespace")
+
+ # Ingress
+ ingress_rules = spec.get("ingress", [])
+ if ingress_rules:
+ st.markdown("**Ingress Rules:**")
+ for i, rule in enumerate(ingress_rules):
+ sources = []
+ for fr in rule.get("from", []):
+ if "podSelector" in fr:
+ labels = fr["podSelector"].get("matchLabels", {})
+ sources.append("Pods: " + (", ".join(f"{k}={v}" for k, v in labels.items()) if labels else "all"))
+ if "namespaceSelector" in fr:
+ labels = fr["namespaceSelector"].get("matchLabels", {})
+ sources.append("Namespaces: " + (", ".join(f"{k}={v}" for k, v in labels.items()) if labels else "all"))
+ if "ipBlock" in fr:
+ sources.append(f"CIDR: {fr['ipBlock'].get('cidr', '?')}")
+ ports = []
+ for p in rule.get("ports", []):
+ ports.append(f"{p.get('protocol', 'TCP')}/{p.get('port', '*')}")
+ src_str = ", ".join(sources) if sources else "any"
+ port_str = ", ".join(ports) if ports else "all ports"
+ st.markdown(f" - Rule {i+1}: Allow from **{src_str}** on **{port_str}**")
+ elif "Ingress" in spec.get("policyTypes", []):
+ st.warning("Ingress type declared but no rules — all ingress traffic is **denied**.")
+
+ # Egress
+ egress_rules = spec.get("egress", [])
+ if egress_rules:
+ st.markdown("**Egress Rules:**")
+ for i, rule in enumerate(egress_rules):
+ destinations = []
+ for to in rule.get("to", []):
+ if "podSelector" in to:
+ labels = to["podSelector"].get("matchLabels", {})
+ destinations.append("Pods: " + (", ".join(f"{k}={v}" for k, v in labels.items()) if labels else "all"))
+ if "namespaceSelector" in to:
+ labels = to["namespaceSelector"].get("matchLabels", {})
+ destinations.append("Namespaces: " + (", ".join(f"{k}={v}" for k, v in labels.items()) if labels else "all"))
+ if "ipBlock" in to:
+ destinations.append(f"CIDR: {to['ipBlock'].get('cidr', '?')}")
+ ports = []
+ for p in rule.get("ports", []):
+ ports.append(f"{p.get('protocol', 'TCP')}/{p.get('port', '*')}")
+ dest_str = ", ".join(destinations) if destinations else "any"
+ port_str = ", ".join(ports) if ports else "all ports"
+ st.markdown(f" - Rule {i+1}: Allow to **{dest_str}** on **{port_str}**")
+ elif "Egress" in spec.get("policyTypes", []):
+ st.warning("Egress type declared but no rules — all egress traffic is **denied**.")
+
+ st.markdown("---")
+ st.markdown("**Raw YAML:**")
+ import yaml
+ st.code(yaml.dump(pol, default_flow_style=False), language="yaml")
+
+ # Coverage check
+ st.markdown("---")
+ st.markdown("#### Coverage Analysis")
+ if st.button("Check Unprotected Pods", key="netpol_coverage"):
+ # Get all pods and check which are selected by a policy
+ pod_ns_flag = f"-n {np_ns}" if np_ns != "All Namespaces" else "-A"
+ pod_cmd = f"get pods {pod_ns_flag} -o json"
+ with st.spinner("Analyzing coverage..."):
+ pod_result = run_kubectl(profile, pod_cmd, timeout=15)
+ if pod_result.success and pod_result.stdout.strip():
+ try:
+ all_pods = json.loads(pod_result.stdout).get("items", [])
+ protected_pods = set()
+ for pol in policies:
+ pol_ns_name = pol.get("metadata", {}).get("namespace", "")
+ pod_sel = pol.get("spec", {}).get("podSelector", {})
+ match_labels = pod_sel.get("matchLabels", {})
+ for p in all_pods:
+ p_ns = p.get("metadata", {}).get("namespace", "")
+ p_name = p.get("metadata", {}).get("name", "")
+ p_labels = p.get("metadata", {}).get("labels", {})
+ if p_ns != pol_ns_name:
+ continue
+ if not match_labels or all(p_labels.get(k) == v for k, v in match_labels.items()):
+ protected_pods.add(f"{p_ns}/{p_name}")
+ unprotected = []
+ for p in all_pods:
+ p_ns = p.get("metadata", {}).get("namespace", "")
+ p_name = p.get("metadata", {}).get("name", "")
+ if f"{p_ns}/{p_name}" not in protected_pods:
+ unprotected.append({"Namespace": p_ns, "Pod": p_name})
+ if unprotected:
+ st.warning(f"{len(unprotected)} pod(s) are **not covered** by any NetworkPolicy (all traffic allowed):")
+ st.dataframe(pd.DataFrame(unprotected), use_container_width=True, hide_index=True)
+ else:
+ st.success("All pods are covered by at least one NetworkPolicy.")
+ except (json.JSONDecodeError, KeyError):
+ st.error("Failed to parse pod data for coverage analysis.")
+
+ except (json.JSONDecodeError, KeyError) as e:
+ st.error(f"Failed to parse network policy data: {e}")
+ elif result.success:
+ st.info("No NetworkPolicies found. All pod-to-pod traffic is allowed by default.")
+ else:
+ st.error("Failed to fetch network policies")
+ st.code(result.stderr, language="text")
+
+ # ── PVC / Storage Dashboard ───────────────────────────────────────────
+ with tab_pvc:
+ st.markdown("### PVC / Storage Dashboard")
+ st.markdown("View PersistentVolumeClaims, PersistentVolumes, and StorageClasses.")
+
+ pvc_sub = st.radio(
+ "View",
+ ["PVCs", "PersistentVolumes", "StorageClasses"],
+ horizontal=True,
+ key="pvc_view",
+ )
+
+ if pvc_sub == "PVCs":
+ pcol1, pcol2 = st.columns([2, 1])
+ with pcol1:
+ if _rv_namespaces:
+ pvc_ns = st.selectbox("Namespace", ["All Namespaces"] + _rv_namespaces, key="pvc_ns")
+ else:
+ pvc_ns = st.text_input("Namespace (blank = all)", value="", key="pvc_ns_text")
+ if not pvc_ns:
+ pvc_ns = "All Namespaces"
+
+ if st.button("Load PVCs", type="primary", key="load_pvcs"):
+ ns_flag = "-A" if pvc_ns == "All Namespaces" else f"-n {pvc_ns}"
+ cmd = f"get pvc {ns_flag} -o json"
+ with st.spinner("Fetching PVCs..."):
+ result = run_kubectl(profile, cmd, timeout=15)
+ if result.success and result.stdout.strip():
+ try:
+ import pandas as pd
+ pvc_json = json.loads(result.stdout)
+ pvcs = pvc_json.get("items", [])
+ if not pvcs:
+ st.info("No PVCs found.")
+ else:
+ pvc_data = []
+ for pvc in pvcs:
+ meta = pvc.get("metadata", {})
+ spec = pvc.get("spec", {})
+ status = pvc.get("status", {})
+ capacity = status.get("capacity", {}).get("storage", "N/A")
+ requested = spec.get("resources", {}).get("requests", {}).get("storage", "N/A")
+ pvc_data.append({
+ "Namespace": meta.get("namespace", "?"),
+ "Name": meta.get("name", "?"),
+ "Status": status.get("phase", "?"),
+ "Volume": spec.get("volumeName", "N/A"),
+ "Capacity": capacity,
+ "Requested": requested,
+ "Access Modes": ", ".join(spec.get("accessModes", [])),
+ "Storage Class": spec.get("storageClassName", "N/A"),
+ })
+ df = pd.DataFrame(pvc_data)
+ # Summary
+ bound = len(df[df["Status"] == "Bound"])
+ pending = len(df[df["Status"] == "Pending"])
+ lost = len(df[df["Status"] == "Lost"])
+ scol1, scol2, scol3, scol4 = st.columns(4)
+ scol1.metric("Total PVCs", len(df))
+ scol2.metric("Bound", bound)
+ scol3.metric("Pending", pending)
+ scol4.metric("Lost", lost)
+ if pending > 0:
+ st.warning(f"{pending} PVC(s) are **Pending** — check StorageClass availability and provisioner status.")
+ if lost > 0:
+ st.error(f"{lost} PVC(s) are **Lost** — the bound PV has been deleted. Data may be lost.")
+ st.dataframe(df, use_container_width=True, hide_index=True)
+ except (json.JSONDecodeError, KeyError) as e:
+ st.error(f"Failed to parse PVC data: {e}")
+ elif result.success:
+ st.info("No PVCs found.")
+ else:
+ st.error("Failed to fetch PVCs")
+ st.code(result.stderr, language="text")
+
+ elif pvc_sub == "PersistentVolumes":
+ if st.button("Load PVs", type="primary", key="load_pvs"):
+ cmd = "get pv -o json"
+ with st.spinner("Fetching PersistentVolumes..."):
+ result = run_kubectl(profile, cmd, timeout=15)
+ if result.success and result.stdout.strip():
+ try:
+ import pandas as pd
+ pv_json = json.loads(result.stdout)
+ pvs = pv_json.get("items", [])
+ if not pvs:
+ st.info("No PersistentVolumes found.")
+ else:
+ pv_data = []
+ for pv in pvs:
+ meta = pv.get("metadata", {})
+ spec = pv.get("spec", {})
+ status = pv.get("status", {})
+ claim_ref = spec.get("claimRef", {})
+ claim = f"{claim_ref.get('namespace', '')}/{claim_ref.get('name', '')}" if claim_ref else "Unbound"
+ pv_data.append({
+ "Name": meta.get("name", "?"),
+ "Capacity": spec.get("capacity", {}).get("storage", "N/A"),
+ "Access Modes": ", ".join(spec.get("accessModes", [])),
+ "Reclaim Policy": spec.get("persistentVolumeReclaimPolicy", "N/A"),
+ "Status": status.get("phase", "?"),
+ "Claim": claim,
+ "Storage Class": spec.get("storageClassName", "N/A"),
+ "Volume Mode": spec.get("volumeMode", "N/A"),
+ })
+ df = pd.DataFrame(pv_data)
+ avail = len(df[df["Status"] == "Available"])
+ bound = len(df[df["Status"] == "Bound"])
+ released = len(df[df["Status"] == "Released"])
+ scol1, scol2, scol3, scol4 = st.columns(4)
+ scol1.metric("Total PVs", len(df))
+ scol2.metric("Bound", bound)
+ scol3.metric("Available", avail)
+ scol4.metric("Released", released)
+ st.dataframe(df, use_container_width=True, hide_index=True)
+ except (json.JSONDecodeError, KeyError) as e:
+ st.error(f"Failed to parse PV data: {e}")
+ elif result.success:
+ st.info("No PersistentVolumes found.")
+ else:
+ st.error("Failed to fetch PVs")
+ st.code(result.stderr, language="text")
+
+ elif pvc_sub == "StorageClasses":
+ if st.button("Load Storage Classes", type="primary", key="load_sc"):
+ cmd = "get storageclasses -o json"
+ with st.spinner("Fetching StorageClasses..."):
+ result = run_kubectl(profile, cmd, timeout=15)
+ if result.success and result.stdout.strip():
+ try:
+ import pandas as pd
+ sc_json = json.loads(result.stdout)
+ scs = sc_json.get("items", [])
+ if not scs:
+ st.info("No StorageClasses found.")
+ else:
+ sc_data = []
+ for sc in scs:
+ meta = sc.get("metadata", {})
+ annotations = meta.get("annotations", {})
+ is_default = annotations.get("storageclass.kubernetes.io/is-default-class", "false") == "true"
+ sc_data.append({
+ "Name": meta.get("name", "?"),
+ "Provisioner": sc.get("provisioner", "N/A"),
+ "Reclaim Policy": sc.get("reclaimPolicy", "N/A"),
+ "Volume Binding": sc.get("volumeBindingMode", "N/A"),
+ "Allow Expansion": sc.get("allowVolumeExpansion", False),
+ "Default": is_default,
+ })
+ st.dataframe(pd.DataFrame(sc_data), use_container_width=True, hide_index=True)
+ except (json.JSONDecodeError, KeyError) as e:
+ st.error(f"Failed to parse StorageClass data: {e}")
+ elif result.success:
+ st.info("No StorageClasses found.")
+ else:
+ st.error("Failed to fetch StorageClasses")
+ st.code(result.stderr, language="text")
+
# ══════════════════════════════════════════════════════════════════════════
# PAGE: Upgrade Planner
@@ -3302,6 +3733,650 @@ def page_ai_assistant():
st.session_state.chat_history.append({"role": "assistant", "content": full_response})
+# ══════════════════════════════════════════════════════════════════════════
+# PAGE: Multi-Cluster Dashboard
+# ══════════════════════════════════════════════════════════════════════════
+
+def page_multi_cluster_dashboard():
+ st.markdown("## Multi-Cluster Dashboard")
+ st.markdown("Overview of all registered cluster profiles at a glance.")
+
+ profiles = list_profiles()
+ if not profiles:
+ st.info("No cluster profiles yet. Create one in the **Profile Manager** or import a cluster via kubeconfig.")
+ return
+
+ # Summary metrics
+ total = len(profiles)
+ imported = sum(1 for p in profiles if p.cluster_source == "imported")
+ provisioned = total - imported
+ active_count = sum(1 for p in profiles if p.status == "active")
+ draft_count = sum(1 for p in profiles if p.status == "draft")
+ error_count = sum(1 for p in profiles if p.status == "error")
+
+ mcol1, mcol2, mcol3, mcol4, mcol5 = st.columns(5)
+ mcol1.metric("Total Clusters", total)
+ mcol2.metric("Provisioned", provisioned)
+ mcol3.metric("Imported", imported)
+ mcol4.metric("Active", active_count)
+ mcol5.metric("Errors", error_count)
+
+ st.markdown("---")
+
+ # Cluster cards
+ for profile in profiles:
+ status_icon = {"active": "🟢", "error": "🔴", "draft": "⚪", "provisioning": "🟡"}.get(profile.status, "⚪")
+ source_label = "Imported" if profile.cluster_source == "imported" else "Provisioned"
+
+ with st.expander(
+ f"{status_icon} **{profile.name}** — {source_label} | {profile.status.upper()}",
+ expanded=(profile.status == "error"),
+ ):
+ col1, col2, col3 = st.columns(3)
+ with col1:
+ st.markdown(f"**K8s Version:** {profile.kubernetes_version}")
+ st.markdown(f"**Source:** {source_label}")
+ st.markdown(f"**Status:** {profile.status.upper()}")
+ with col2:
+ if profile.cluster_source == "imported":
+ st.markdown(f"**Kubeconfig:** {'Loaded' if profile.kubeconfig_content else 'Not loaded'}")
+ else:
+ cp = len(profile.get_control_plane_nodes())
+ wk = len(profile.get_worker_nodes())
+ st.markdown(f"**Nodes:** {cp} control-plane + {wk} worker")
+ st.markdown(f"**CRI-O:** {profile.crio_version}")
+ st.markdown(f"**CNI:** Flannel")
+ with col3:
+ if profile.description:
+ st.markdown(f"**Description:** {profile.description}")
+
+ # Live cluster health check for imported clusters
+ if profile.cluster_source == "imported" and profile.kubeconfig_content:
+ if st.button(f"Check Health", key=f"health_{profile.name}"):
+ with st.spinner("Checking cluster health..."):
+ node_result = run_kubectl(profile, "get nodes --no-headers", timeout=10)
+ if node_result.success and node_result.stdout.strip():
+ lines = [l for l in node_result.stdout.strip().split("\n") if l.strip()]
+ total_nodes = len(lines)
+ ready_nodes = sum(1 for l in lines if "Ready" in l.split()[1] if len(l.split()) > 1)
+ not_ready = total_nodes - ready_nodes
+ hcol1, hcol2, hcol3 = st.columns(3)
+ hcol1.metric("Nodes", total_nodes)
+ hcol2.metric("Ready", ready_nodes)
+ hcol3.metric("Not Ready", not_ready)
+ if not_ready > 0:
+ st.warning(f"{not_ready} node(s) are not Ready.")
+ else:
+ st.success("All nodes are Ready.")
+ # Pod summary
+ pod_result = run_kubectl(profile, "get pods -A --no-headers", timeout=15)
+ if pod_result.success and pod_result.stdout.strip():
+ pod_lines = [l for l in pod_result.stdout.strip().split("\n") if l.strip()]
+ total_pods = len(pod_lines)
+ running_pods = sum(1 for l in pod_lines if "Running" in l)
+ failed_pods = sum(1 for l in pod_lines if any(s in l for s in ["Error", "CrashLoopBackOff", "ImagePullBackOff"]))
+ pcol1, pcol2, pcol3 = st.columns(3)
+ pcol1.metric("Total Pods", total_pods)
+ pcol2.metric("Running", running_pods)
+ pcol3.metric("Failed/Error", failed_pods)
+ elif node_result.success:
+ st.info("Connected but no nodes found.")
+ else:
+ st.error(f"Could not connect: {node_result.stderr or 'kubectl failed'}")
+
+ # Quick actions
+ if profile.cluster_source == "imported" and profile.kubeconfig_content:
+ qcol1, qcol2, qcol3 = st.columns(3)
+ with qcol1:
+ if st.button("View Nodes", key=f"qnodes_{profile.name}"):
+ result = run_kubectl(profile, "get nodes -o wide", timeout=10)
+ if result.success:
+ st.code(result.stdout or "(no output)", language="text")
+ else:
+ st.error(result.stderr or "Failed")
+ with qcol2:
+ if st.button("View Namespaces", key=f"qns_{profile.name}"):
+ result = run_kubectl(profile, "get namespaces", timeout=10)
+ if result.success:
+ st.code(result.stdout or "(no output)", language="text")
+ else:
+ st.error(result.stderr or "Failed")
+ with qcol3:
+ if st.button("Warning Events", key=f"qevents_{profile.name}"):
+ result = run_kubectl(
+ profile,
+ "get events -A --field-selector type=Warning --sort-by=.lastTimestamp",
+ timeout=15,
+ )
+ if result.success:
+ st.code(result.stdout or "(no warning events)", language="text")
+ else:
+ st.error(result.stderr or "Failed")
+
+
+# ══════════════════════════════════════════════════════════════════════════
+# PAGE: Certificate Manager
+# ══════════════════════════════════════════════════════════════════════════
+
+def page_certificate_manager():
+ st.markdown("## Certificate Manager")
+ st.markdown("View cluster certificate expiration dates, TLS secrets, and plan renewals.")
+
+ profile = _get_active_profile()
+ if not profile:
+ return
+
+ _show_profile_summary(profile)
+
+ tab_certs, tab_tls, tab_renew = st.tabs([
+ "Cluster Certificates",
+ "TLS Secrets",
+ "Renewal Guide",
+ ])
+
+ # ── Cluster Certificates (kubeadm) ────────────────────────────────────
+ with tab_certs:
+ st.markdown("### Cluster Certificates (kubeadm)")
+
+ if profile.cluster_source == "imported":
+ st.info(
+ "Certificate inspection via `kubeadm certs check-expiration` requires SSH access "
+ "to control-plane nodes. For imported clusters, use the **TLS Secrets** tab to view "
+ "TLS certificates stored in the cluster."
+ )
+ # Still try to get API server cert info
+ if st.button("Check API Server Certificate", key="api_cert_check"):
+ with st.spinner("Checking API server certificate..."):
+ cmd = (
+ "get --raw /healthz -v=6 2>&1 || true"
+ )
+ result = run_kubectl(profile, "version --short", timeout=10)
+ if result.success:
+ st.success("API server is reachable and serving valid TLS.")
+ st.code(result.stdout, language="text")
+ else:
+ if "certificate" in (result.stderr or "").lower():
+ st.error("Certificate issue detected:")
+ st.code(result.stderr, language="text")
+ else:
+ st.warning(f"Could not check: {result.stderr}")
+ else:
+ cp_nodes = profile.get_control_plane_nodes()
+ if not cp_nodes:
+ st.warning("No control-plane nodes defined.")
+ else:
+ st.markdown(
+ "Runs `kubeadm certs check-expiration` on control-plane nodes via SSH "
+ "to show certificate validity and expiration dates."
+ )
+ if st.button("Check Certificate Expiration", type="primary", key="check_certs"):
+ for node in cp_nodes:
+ node_label = f"{node.get('hostname', node.get('ip_address', '?'))} ({node.get('ip_address', '')})"
+ with st.expander(f"Node: {node_label}", expanded=True):
+ with st.spinner(f"Checking certificates on {node_label}..."):
+ result = run_ssh_command(
+ ip_address=node["ip_address"],
+ command="sudo kubeadm certs check-expiration 2>/dev/null || echo 'kubeadm certs command not available'",
+ ssh_user=node.get("ssh_user", "root"),
+ ssh_port=node.get("ssh_port", 22),
+ ssh_key_path=node.get("ssh_key_path", "~/.ssh/id_rsa"),
+ timeout=30,
+ )
+ if result.success and result.stdout.strip():
+ st.code(result.stdout, language="text")
+ # Parse for expiring soon
+ if "RESIDUAL TIME" in result.stdout:
+ for line in result.stdout.split("\n"):
+ if any(warn in line.lower() for warn in ["invalid", "expired"]):
+ st.error(f"Certificate issue: {line.strip()}")
+ else:
+ st.error(f"Failed: {result.stderr or 'No output'}")
+
+ # ── TLS Secrets ───────────────────────────────────────────────────────
+ with tab_tls:
+ st.markdown("### TLS Secrets")
+ st.markdown("View Kubernetes TLS secrets and their certificate details.")
+
+ if st.button("Load TLS Secrets", type="primary", key="load_tls"):
+ cmd = "get secrets -A -o json"
+ with st.spinner("Fetching secrets..."):
+ result = run_kubectl(profile, cmd, timeout=20)
+ if result.success and result.stdout.strip():
+ try:
+ import pandas as pd
+ secrets_json = json.loads(result.stdout)
+ tls_secrets = []
+ for secret in secrets_json.get("items", []):
+ if secret.get("type") == "kubernetes.io/tls":
+ meta = secret.get("metadata", {})
+ annotations = meta.get("annotations", {})
+ tls_secrets.append({
+ "Namespace": meta.get("namespace", "?"),
+ "Name": meta.get("name", "?"),
+ "Type": "kubernetes.io/tls",
+ "Created": meta.get("creationTimestamp", "N/A"),
+ "Issuer": annotations.get("cert-manager.io/issuer-name", annotations.get("cert-manager.io/cluster-issuer", "N/A")),
+ "Has cert": "tls.crt" in secret.get("data", {}),
+ "Has key": "tls.key" in secret.get("data", {}),
+ })
+ if tls_secrets:
+ st.markdown(f"**Found {len(tls_secrets)} TLS secret(s)**")
+ st.dataframe(pd.DataFrame(tls_secrets), use_container_width=True, hide_index=True)
+ else:
+ st.info("No TLS secrets found in the cluster.")
+ except (json.JSONDecodeError, KeyError) as e:
+ st.error(f"Failed to parse secrets: {e}")
+ elif result.success:
+ st.info("No secrets found.")
+ else:
+ st.error("Failed to fetch secrets")
+ st.code(result.stderr, language="text")
+
+ # cert-manager status
+ st.markdown("---")
+ st.markdown("#### cert-manager Status")
+ if st.button("Check cert-manager", key="check_certmanager"):
+ with st.spinner("Checking cert-manager..."):
+ result = run_kubectl(profile, "get pods -n cert-manager --no-headers", timeout=10)
+ if result.success and result.stdout.strip():
+ st.success("cert-manager is installed:")
+ st.code(result.stdout, language="text")
+ # Check certificates
+ cert_result = run_kubectl(profile, "get certificates -A --no-headers", timeout=10)
+ if cert_result.success and cert_result.stdout.strip():
+ st.markdown("**Managed Certificates:**")
+ st.code(cert_result.stdout, language="text")
+ elif result.success:
+ st.info("cert-manager namespace exists but no pods found.")
+ else:
+ st.info("cert-manager does not appear to be installed.")
+
+ # ── Renewal Guide ─────────────────────────────────────────────────────
+ with tab_renew:
+ st.markdown("### Certificate Renewal Guide")
+
+ st.markdown("""
+#### Automatic Renewal (kubeadm)
+
+kubeadm automatically renews certificates during `kubeadm upgrade`. For manual renewal:
+
+```bash
+# Renew all certificates
+sudo kubeadm certs renew all
+
+# Renew specific certificate
+sudo kubeadm certs renew apiserver
+sudo kubeadm certs renew apiserver-kubelet-client
+sudo kubeadm certs renew front-proxy-client
+sudo kubeadm certs renew etcd-server
+sudo kubeadm certs renew etcd-peer
+sudo kubeadm certs renew etcd-healthcheck-client
+
+# After renewal, restart control plane components
+sudo systemctl restart kubelet
+```
+
+#### Certificate Authority (CA) Rotation
+
+CA rotation is more complex and requires:
+1. Generate new CA certificate and key
+2. Distribute to all nodes
+3. Re-sign all component certificates
+4. Rolling restart of all components
+
+#### cert-manager Renewal
+
+If using cert-manager, certificates are automatically renewed before expiration.
+Check cert-manager logs for renewal status:
+
+```bash
+kubectl logs -n cert-manager deploy/cert-manager -f
+```
+
+#### Best Practices
+- Monitor certificate expiration dates regularly
+- Set up alerts for certificates expiring within 30 days
+- Keep kubeadm version aligned with cluster version for smooth renewals
+- Back up `/etc/kubernetes/pki/` before any certificate operations
+- Test renewal in a staging environment first
+ """)
+
+
+# ══════════════════════════════════════════════════════════════════════════
+# PAGE: Cost Optimizer
+# ══════════════════════════════════════════════════════════════════════════
+
+def page_cost_optimizer():
+ st.markdown("## Cost Estimator / Resource Optimizer")
+ st.markdown("Analyze resource usage vs requests/limits and identify optimization opportunities.")
+
+ profile = _get_active_profile()
+ if not profile:
+ return
+
+ _show_profile_summary(profile)
+
+ tab_usage, tab_right_size, tab_idle = st.tabs([
+ "Resource Usage",
+ "Right-Sizing",
+ "Idle Resources",
+ ])
+
+ # ── Resource Usage ────────────────────────────────────────────────────
+ with tab_usage:
+ st.markdown("### Actual Resource Usage vs Requests")
+ st.markdown("Compare real CPU/memory usage (from metrics-server) against configured requests and limits.")
+
+ usage_sub = st.radio("View", ["Node Usage", "Pod Usage"], horizontal=True, key="usage_view")
+
+ if usage_sub == "Node Usage":
+ if st.button("Load Node Usage", type="primary", key="load_node_usage"):
+ with st.spinner("Fetching node metrics..."):
+ result = run_kubectl(profile, "top nodes --no-headers", timeout=15)
+ if result.success and result.stdout.strip():
+ import pandas as pd
+ lines = [l for l in result.stdout.strip().split("\n") if l.strip()]
+ node_usage = []
+ for line in lines:
+ parts = line.split()
+ if len(parts) >= 5:
+ node_usage.append({
+ "Node": parts[0],
+ "CPU (cores)": parts[1],
+ "CPU %": parts[2],
+ "Memory": parts[3],
+ "Memory %": parts[4],
+ })
+ if node_usage:
+ st.dataframe(pd.DataFrame(node_usage), use_container_width=True, hide_index=True)
+ # Chart
+ try:
+ import plotly.graph_objects as go
+ fig = go.Figure()
+ names = [n["Node"] for n in node_usage]
+ cpu_pcts = [int(n["CPU %"].replace("%", "")) for n in node_usage]
+ mem_pcts = [int(n["Memory %"].replace("%", "")) for n in node_usage]
+ fig.add_trace(go.Bar(name="CPU %", x=names, y=cpu_pcts, marker_color="#326CE5"))
+ fig.add_trace(go.Bar(name="Memory %", x=names, y=mem_pcts, marker_color="#764ba2"))
+ fig.update_layout(
+ title="Node Resource Utilization",
+ yaxis_title="Utilization %",
+ barmode="group",
+ height=400,
+ )
+ fig.add_hline(y=80, line_dash="dash", line_color="red", annotation_text="80% threshold")
+ st.plotly_chart(fig, use_container_width=True)
+ except ImportError:
+ pass
+ else:
+ st.code(result.stdout, language="text")
+ elif result.success:
+ st.info("No node metrics available. Is metrics-server installed?")
+ else:
+ st.error("Failed to fetch node metrics. Ensure metrics-server is installed.")
+ st.code(result.stderr, language="text")
+ st.info("Install metrics-server via **Monitoring Setup** > **Metrics Components**.")
+
+ elif usage_sub == "Pod Usage":
+ pcol1, pcol2 = st.columns([2, 1])
+ with pcol1:
+ _co_namespaces: list[str] = []
+ if profile.cluster_source == "imported" and profile.kubeconfig_content:
+ _co_namespaces = fetch_namespaces(profile.kubeconfig_content)
+ if _co_namespaces:
+ pod_usage_ns = st.selectbox("Namespace", ["All Namespaces"] + _co_namespaces, key="pod_usage_ns")
+ else:
+ pod_usage_ns = st.text_input("Namespace (blank = all)", value="", key="pod_usage_ns_text")
+ if not pod_usage_ns:
+ pod_usage_ns = "All Namespaces"
+
+ if st.button("Load Pod Usage", type="primary", key="load_pod_usage"):
+ ns_flag = "-A" if pod_usage_ns == "All Namespaces" else f"-n {pod_usage_ns}"
+ with st.spinner("Fetching pod metrics..."):
+ result = run_kubectl(profile, f"top pods {ns_flag} --no-headers", timeout=20)
+ if result.success and result.stdout.strip():
+ import pandas as pd
+ lines = [l for l in result.stdout.strip().split("\n") if l.strip()]
+ pod_usage = []
+ for line in lines:
+ parts = line.split()
+ if pod_usage_ns == "All Namespaces" and len(parts) >= 4:
+ pod_usage.append({
+ "Namespace": parts[0],
+ "Pod": parts[1],
+ "CPU": parts[2],
+ "Memory": parts[3],
+ })
+ elif len(parts) >= 3:
+ pod_usage.append({
+ "Pod": parts[0],
+ "CPU": parts[1],
+ "Memory": parts[2],
+ })
+ if pod_usage:
+ df = pd.DataFrame(pod_usage)
+ st.dataframe(df, use_container_width=True, hide_index=True)
+ st.markdown(f"**Total pods:** {len(df)}")
+ elif result.success:
+ st.info("No pod metrics available.")
+ else:
+ st.error("Failed to fetch pod metrics.")
+ st.code(result.stderr, language="text")
+
+ # ── Right-Sizing ──────────────────────────────────────────────────────
+ with tab_right_size:
+ st.markdown("### Right-Sizing Recommendations")
+ st.markdown(
+ "Compare actual pod usage against configured requests/limits to find "
+ "over-provisioned or under-provisioned workloads."
+ )
+
+ rs_col1, rs_col2 = st.columns([2, 1])
+ with rs_col1:
+ _rs_namespaces: list[str] = []
+ if profile.cluster_source == "imported" and profile.kubeconfig_content:
+ _rs_namespaces = fetch_namespaces(profile.kubeconfig_content)
+ if _rs_namespaces:
+ rs_ns = st.selectbox("Namespace", _rs_namespaces, key="rs_ns")
+ else:
+ rs_ns = st.text_input("Namespace", value="default", key="rs_ns_text")
+
+ if st.button("Analyze Right-Sizing", type="primary", key="analyze_rs"):
+ if not rs_ns:
+ st.warning("Please specify a namespace.")
+ else:
+ with st.spinner("Fetching usage and resource specs..."):
+ # Get actual usage
+ usage_result = run_kubectl(
+ profile,
+ f"top pods -n {rs_ns} --no-headers --containers",
+ timeout=20,
+ )
+ # Get resource specs
+ spec_result = run_kubectl(
+ profile,
+ f"get pods -n {rs_ns} -o json",
+ timeout=20,
+ )
+
+ if usage_result.success and spec_result.success:
+ try:
+ import pandas as pd
+ # Parse usage: POD CONTAINER CPU MEM
+ usage_map = {}
+ for line in (usage_result.stdout or "").strip().split("\n"):
+ parts = line.split()
+ if len(parts) >= 4:
+ key = f"{parts[0]}/{parts[1]}"
+ usage_map[key] = {"cpu_usage": parts[2], "mem_usage": parts[3]}
+
+ # Parse specs
+ pods_json = json.loads(spec_result.stdout)
+ rows = []
+ for pod in pods_json.get("items", []):
+ pod_name = pod.get("metadata", {}).get("name", "?")
+ for container in pod.get("spec", {}).get("containers", []):
+ c_name = container.get("name", "?")
+ res = container.get("resources", {})
+ req_cpu = res.get("requests", {}).get("cpu", "none")
+ req_mem = res.get("requests", {}).get("memory", "none")
+ lim_cpu = res.get("limits", {}).get("cpu", "none")
+ lim_mem = res.get("limits", {}).get("memory", "none")
+ key = f"{pod_name}/{c_name}"
+ usage = usage_map.get(key, {})
+ rows.append({
+ "Pod": pod_name,
+ "Container": c_name,
+ "CPU Usage": usage.get("cpu_usage", "N/A"),
+ "CPU Request": req_cpu,
+ "CPU Limit": lim_cpu,
+ "Mem Usage": usage.get("mem_usage", "N/A"),
+ "Mem Request": req_mem,
+ "Mem Limit": lim_mem,
+ })
+ if rows:
+ df = pd.DataFrame(rows)
+ st.dataframe(df, use_container_width=True, hide_index=True)
+
+ # Recommendations
+ no_req_cpu = sum(1 for r in rows if r["CPU Request"] == "none")
+ no_req_mem = sum(1 for r in rows if r["Mem Request"] == "none")
+ no_lim_cpu = sum(1 for r in rows if r["CPU Limit"] == "none")
+ no_lim_mem = sum(1 for r in rows if r["Mem Limit"] == "none")
+
+ st.markdown("---")
+ st.markdown("#### Recommendations")
+ if no_req_cpu > 0:
+ st.warning(f"{no_req_cpu} container(s) have **no CPU request** — scheduler cannot make optimal placement decisions.")
+ if no_req_mem > 0:
+ st.warning(f"{no_req_mem} container(s) have **no memory request** — pods may be evicted under pressure.")
+ if no_lim_cpu > 0:
+ st.info(f"{no_lim_cpu} container(s) have **no CPU limit** — they can consume all available CPU on the node.")
+ if no_lim_mem > 0:
+ st.warning(f"{no_lim_mem} container(s) have **no memory limit** — they may be OOMKilled or cause node instability.")
+ if no_req_cpu == 0 and no_req_mem == 0 and no_lim_cpu == 0 and no_lim_mem == 0:
+ st.success("All containers have CPU and memory requests and limits set.")
+ else:
+ st.info("No containers found in this namespace.")
+ except (json.JSONDecodeError, KeyError) as e:
+ st.error(f"Failed to parse data: {e}")
+ else:
+ if not usage_result.success:
+ st.error("Failed to fetch pod usage metrics. Is metrics-server installed?")
+ st.code(usage_result.stderr, language="text")
+ if not spec_result.success:
+ st.error("Failed to fetch pod specs.")
+ st.code(spec_result.stderr, language="text")
+
+ # ── Idle Resources ────────────────────────────────────────────────────
+ with tab_idle:
+ st.markdown("### Idle / Unused Resources")
+ st.markdown("Find resources that may be wasting cluster capacity.")
+
+ idle_checks = st.multiselect(
+ "Check for",
+ [
+ "Completed/Failed Jobs",
+ "Deployments scaled to 0",
+ "Orphaned ConfigMaps",
+ "Unbound PVCs",
+ "Empty Namespaces",
+ ],
+ default=["Completed/Failed Jobs", "Deployments scaled to 0", "Unbound PVCs"],
+ key="idle_checks",
+ )
+
+ if st.button("Scan for Idle Resources", type="primary", key="scan_idle"):
+ findings = []
+
+ if "Completed/Failed Jobs" in idle_checks:
+ with st.spinner("Checking completed/failed jobs..."):
+ result = run_kubectl(profile, "get jobs -A -o json", timeout=15)
+ if result.success and result.stdout.strip():
+ try:
+ jobs = json.loads(result.stdout).get("items", [])
+ old_jobs = []
+ for job in jobs:
+ status = job.get("status", {})
+ conditions = status.get("conditions", [])
+ for cond in conditions:
+ if cond.get("type") in ("Complete", "Failed") and cond.get("status") == "True":
+ meta = job.get("metadata", {})
+ old_jobs.append(f" - {meta.get('namespace', '?')}/{meta.get('name', '?')} ({cond['type']})")
+ if old_jobs:
+ findings.append(("warning", f"**{len(old_jobs)} completed/failed job(s)** can be cleaned up:\n" + "\n".join(old_jobs[:20])))
+ else:
+ findings.append(("success", "No completed/failed jobs found."))
+ except (json.JSONDecodeError, KeyError):
+ findings.append(("error", "Failed to parse jobs data."))
+
+ if "Deployments scaled to 0" in idle_checks:
+ with st.spinner("Checking zero-replica deployments..."):
+ result = run_kubectl(profile, "get deployments -A -o json", timeout=15)
+ if result.success and result.stdout.strip():
+ try:
+ deploys = json.loads(result.stdout).get("items", [])
+ zero_deploys = []
+ for dep in deploys:
+ replicas = dep.get("spec", {}).get("replicas", 1)
+ if replicas == 0:
+ meta = dep.get("metadata", {})
+ zero_deploys.append(f" - {meta.get('namespace', '?')}/{meta.get('name', '?')}")
+ if zero_deploys:
+ findings.append(("warning", f"**{len(zero_deploys)} deployment(s) scaled to 0 replicas:**\n" + "\n".join(zero_deploys[:20])))
+ else:
+ findings.append(("success", "No zero-replica deployments found."))
+ except (json.JSONDecodeError, KeyError):
+ findings.append(("error", "Failed to parse deployment data."))
+
+ if "Unbound PVCs" in idle_checks:
+ with st.spinner("Checking unbound PVCs..."):
+ result = run_kubectl(profile, "get pvc -A --no-headers", timeout=15)
+ if result.success and result.stdout.strip():
+ lines = [l for l in result.stdout.strip().split("\n") if l.strip()]
+ pending_pvcs = [l for l in lines if "Pending" in l]
+ if pending_pvcs:
+ findings.append(("warning", f"**{len(pending_pvcs)} PVC(s) in Pending state** (not bound to a PV):\n```\n" + "\n".join(pending_pvcs[:10]) + "\n```"))
+ else:
+ findings.append(("success", "All PVCs are bound."))
+ elif result.success:
+ findings.append(("info", "No PVCs found."))
+
+ if "Empty Namespaces" in idle_checks:
+ with st.spinner("Checking empty namespaces..."):
+ ns_result = run_kubectl(profile, "get namespaces --no-headers", timeout=10)
+ if ns_result.success and ns_result.stdout.strip():
+ ns_lines = [l.split()[0] for l in ns_result.stdout.strip().split("\n") if l.strip()]
+ system_ns = {"kube-system", "kube-public", "kube-node-lease", "default"}
+ empty_ns = []
+ for ns in ns_lines:
+ if ns in system_ns:
+ continue
+ pod_r = run_kubectl(profile, f"get pods -n {ns} --no-headers", timeout=10)
+ if pod_r.success and not pod_r.stdout.strip():
+ empty_ns.append(ns)
+ if empty_ns:
+ findings.append(("info", f"**{len(empty_ns)} namespace(s) with no pods:**\n - " + "\n - ".join(empty_ns[:15])))
+ else:
+ findings.append(("success", "No empty non-system namespaces found."))
+
+ if "Orphaned ConfigMaps" in idle_checks:
+ findings.append(("info", "Orphaned ConfigMap detection requires cross-referencing all pod specs — use the **Resource Viewer** to manually inspect ConfigMaps per namespace."))
+
+ # Display findings
+ st.markdown("---")
+ st.markdown("#### Findings")
+ for level, msg in findings:
+ if level == "warning":
+ st.warning(msg)
+ elif level == "error":
+ st.error(msg)
+ elif level == "success":
+ st.success(msg)
+ else:
+ st.info(msg)
+
+
# ── Helper functions ──────────────────────────────────────────────────────
def _get_active_profile() -> ClusterProfile | None:
@@ -3422,7 +4497,9 @@ def _show_profile_summary(profile: ClusterProfile):
def main():
page = render_sidebar()
- if page == "Profile Manager":
+ if page == "Multi-Cluster Dashboard":
+ page_multi_cluster_dashboard()
+ elif page == "Profile Manager":
page_profile_manager()
elif page == "Cluster Creation":
page_cluster_creation()
@@ -3436,6 +4513,10 @@ def main():
page_log_analysis()
elif page == "Upgrade Planner":
page_upgrade_planner()
+ elif page == "Certificate Manager":
+ page_certificate_manager()
+ elif page == "Cost Optimizer":
+ page_cost_optimizer()
elif page == "AI Assistant":
page_ai_assistant()
From 2df2413ebfaaf7e18deadb6b5b795e8c3cae391e Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Tue, 7 Apr 2026 07:28:18 +0000
Subject: [PATCH 21/31] Add Smart Log Analysis (LogAI-inspired): clustering,
anomaly detection, pattern mining, summarization
---
k8s-agent/app.py | 260 +++++++++++++++++++-
k8s-agent/modules/log_analyzer.py | 383 ++++++++++++++++++++++++++++++
k8s-agent/requirements.txt | 3 +
3 files changed, 645 insertions(+), 1 deletion(-)
diff --git a/k8s-agent/app.py b/k8s-agent/app.py
index aadb5ba..85e3874 100644
--- a/k8s-agent/app.py
+++ b/k8s-agent/app.py
@@ -80,6 +80,11 @@
llm_analyze_logs,
llm_correlate_analysis,
get_pod_list,
+ smart_analyze,
+ cluster_logs,
+ detect_anomalies,
+ mine_log_patterns,
+ summarize_logs,
)
from modules.llm_client import query_llm, stream_llm
@@ -1575,10 +1580,11 @@ def page_log_analysis():
available_log_sources = get_available_log_sources(profile)
- tab_system, tab_pod, tab_correlation, tab_ai = st.tabs([
+ tab_system, tab_pod, tab_correlation, tab_smart, tab_ai = st.tabs([
"System Logs",
"Pod Logs",
"Error Correlation",
+ "Smart Log Analysis",
"AI Log Analysis",
])
@@ -1731,6 +1737,258 @@ def page_log_analysis():
analysis = llm_correlate_analysis(multi_logs)
st.markdown(analysis)
+ # ── Smart Log Analysis (LogAI-inspired) ─────────────────────────────
+ with tab_smart:
+ st.markdown("### Smart Log Analysis (LogAI-inspired)")
+ st.markdown(
+ "ML-powered log analysis using techniques from "
+ "[Salesforce LogAI](https://github.com/salesforce/logai): "
+ "**log clustering** (TF-IDF + DBSCAN), **anomaly detection**, "
+ "**pattern mining** (Drain-style), and **auto-summarization**."
+ )
+
+ smart_mode = st.radio(
+ "Analysis mode",
+ ["Collect from cluster", "Paste logs"],
+ horizontal=True,
+ key="smart_mode",
+ )
+
+ smart_log_text = ""
+
+ if smart_mode == "Collect from cluster":
+ scol1, scol2, scol3 = st.columns(3)
+ with scol1:
+ smart_source = st.selectbox(
+ "Log Source", available_log_sources, key="smart_source",
+ )
+ with scol2:
+ smart_lines = st.number_input(
+ "Lines to fetch", min_value=100, max_value=5000, value=500, key="smart_lines",
+ )
+ with scol3:
+ smart_since_opts = {
+ "Last 15 min": ("15 minutes ago", "15m"),
+ "Last 1 hour": ("1 hour ago", "1h"),
+ "Last 6 hours": ("6 hours ago", "6h"),
+ "Last 24 hours": ("24 hours ago", "24h"),
+ }
+ smart_since_label = st.selectbox(
+ "Time Range", list(smart_since_opts.keys()), index=1, key="smart_since",
+ )
+ smart_since, smart_since_k8s = smart_since_opts[smart_since_label]
+
+ if st.button("Collect & Analyze", type="primary", key="smart_collect"):
+ with st.spinner(f"Collecting {smart_source} logs..."):
+ result = collect_logs(
+ cp_node, smart_source, smart_lines, smart_since, smart_since_k8s, profile=profile,
+ )
+ if result.success and result.stdout.strip():
+ smart_log_text = result.stdout
+ st.session_state["_smart_log_text"] = smart_log_text
+ st.session_state["_smart_source"] = smart_source
+ elif result.success:
+ st.info("No logs returned for the selected source and time range.")
+ else:
+ st.error(f"Failed to collect logs: {result.stderr}")
+
+ # Persist across reruns
+ if "_smart_log_text" in st.session_state and not smart_log_text:
+ smart_log_text = st.session_state["_smart_log_text"]
+
+ else:
+ smart_log_text = st.text_area(
+ "Paste log output",
+ height=200,
+ placeholder="Paste your Kubernetes logs here for smart analysis...",
+ key="smart_paste",
+ )
+ if smart_log_text:
+ st.session_state["_smart_log_text"] = smart_log_text
+ st.session_state["_smart_source"] = "pasted"
+
+ # Run analysis if we have log text
+ if smart_log_text:
+ src_label = st.session_state.get("_smart_source", "")
+ with st.spinner("Running LogAI-inspired analysis pipeline..."):
+ sa_result = smart_analyze(smart_log_text, source=src_label)
+
+ # ── Summary / Health Score ────────────────────────────────
+ st.markdown("---")
+ st.markdown("#### Log Summary & Health Score")
+ summary = sa_result.summary
+ health = summary.get("health_score", 100)
+ health_color = "green" if health >= 80 else ("orange" if health >= 50 else "red")
+ scol1, scol2, scol3, scol4, scol5 = st.columns(5)
+ scol1.metric("Total Lines", summary.get("total_lines", 0))
+ scol2.metric("Errors", summary.get("error_count", 0))
+ scol3.metric("Warnings", summary.get("warning_count", 0))
+ scol4.metric("Unique Templates", summary.get("unique_templates", 0))
+ scol5.metric("Health Score", f"{health}/100")
+
+ if health < 50:
+ st.error(f"Health score is **{health}/100** — significant issues detected in logs.")
+ elif health < 80:
+ st.warning(f"Health score is **{health}/100** — some issues detected.")
+ else:
+ st.success(f"Health score is **{health}/100** — logs look healthy.")
+
+ st.markdown(
+ f"**Time span:** {summary.get('first_timestamp', 'N/A')} → {summary.get('last_timestamp', 'N/A')} | "
+ f"**Template diversity:** {summary.get('template_diversity', 0)}%"
+ )
+
+ # Top errors
+ top_errors = summary.get("top_errors", [])
+ if top_errors:
+ with st.expander(f"Top {len(top_errors)} Error Patterns", expanded=True):
+ for pattern, count in top_errors:
+ st.markdown(f"- **x{count}** — `{pattern[:200]}`")
+
+ # ── Log Clustering ────────────────────────────────────────
+ st.markdown("---")
+ st.markdown("#### Log Clustering (TF-IDF + DBSCAN)")
+ st.markdown(
+ "Groups similar log messages together to reduce noise and highlight distinct message types. "
+ "Uses TF-IDF vectorization and DBSCAN density-based clustering."
+ )
+ if sa_result.clusters:
+ import pandas as pd
+ cluster_data = []
+ for c in sa_result.clusters:
+ label = f"Cluster {c.cluster_id}" if c.cluster_id >= 0 else "Noise (unique)"
+ cluster_data.append({
+ "Cluster": label,
+ "Count": c.count,
+ "Level": c.level,
+ "Template": c.template[:120],
+ "First Seen": c.first_seen or "N/A",
+ "Last Seen": c.last_seen or "N/A",
+ })
+ df_clusters = pd.DataFrame(cluster_data)
+ st.dataframe(df_clusters, use_container_width=True, hide_index=True)
+
+ # Cluster distribution chart
+ try:
+ import plotly.express as px
+ fig = px.pie(
+ df_clusters, names="Cluster", values="Count",
+ title="Log Message Distribution by Cluster",
+ color_discrete_sequence=px.colors.qualitative.Set3,
+ )
+ fig.update_layout(height=400)
+ st.plotly_chart(fig, use_container_width=True)
+ except ImportError:
+ pass
+
+ # Show sample messages per cluster
+ error_clusters = [c for c in sa_result.clusters if c.level == "ERROR"]
+ if error_clusters:
+ with st.expander(f"Error Clusters ({len(error_clusters)})", expanded=True):
+ for c in error_clusters:
+ label = f"Cluster {c.cluster_id}" if c.cluster_id >= 0 else "Noise"
+ st.markdown(f"**{label}** — {c.count} messages")
+ for sample in c.sample_messages[:2]:
+ st.code(sample, language="text")
+ else:
+ st.info("Not enough log lines for clustering (need 3+ lines).")
+
+ # ── Anomaly Detection ─────────────────────────────────────
+ st.markdown("---")
+ st.markdown("#### Anomaly Detection")
+ st.markdown(
+ "Detects unusual log lines using TF-IDF distance from centroid (outlier scoring) "
+ "and frequency-based rare template detection."
+ )
+ if sa_result.anomalies:
+ st.markdown(f"**{len(sa_result.anomalies)} anomalous log line(s) detected**")
+ import pandas as pd
+ anomaly_data = []
+ for a in sa_result.anomalies[:30]:
+ anomaly_data.append({
+ "Score": round(a.score, 2),
+ "Reason": a.reason,
+ "Timestamp": a.timestamp or "N/A",
+ "Message": a.message[:150],
+ })
+ df_anomalies = pd.DataFrame(anomaly_data)
+ st.dataframe(df_anomalies, use_container_width=True, hide_index=True)
+
+ # Show full messages for top anomalies
+ with st.expander("Top Anomaly Details", expanded=False):
+ for i, a in enumerate(sa_result.anomalies[:10]):
+ st.markdown(f"**#{i+1}** (score: {a.score:.2f}) — {a.reason}")
+ st.code(a.message, language="text")
+ else:
+ st.success("No anomalous log lines detected — all messages follow expected patterns.")
+
+ # ── Pattern Mining ────────────────────────────────────────
+ st.markdown("---")
+ st.markdown("#### Pattern Mining (Drain-style)")
+ st.markdown(
+ "Extracts frequent log templates by replacing variable tokens (IPs, IDs, numbers, paths) "
+ "with placeholders — similar to LogAI's Drain parser."
+ )
+ if sa_result.patterns:
+ import pandas as pd
+ pattern_data = []
+ for p in sa_result.patterns[:20]:
+ pattern_data.append({
+ "Template": p["template"][:120],
+ "Count": p["count"],
+ "% of Logs": p["percentage"],
+ "Level": p["level"],
+ })
+ df_patterns = pd.DataFrame(pattern_data)
+ st.dataframe(df_patterns, use_container_width=True, hide_index=True)
+
+ # Bar chart of top patterns
+ try:
+ import plotly.express as px
+ top_10 = sa_result.patterns[:10]
+ fig = px.bar(
+ x=[p["template"][:60] for p in top_10],
+ y=[p["count"] for p in top_10],
+ labels={"x": "Template", "y": "Count"},
+ title="Top 10 Log Templates",
+ color=[p["level"] for p in top_10],
+ color_discrete_map={"ERROR": "#FF4B4B", "WARNING": "#FFA500", "INFO": "#326CE5"},
+ )
+ fig.update_layout(height=400, xaxis_tickangle=-45)
+ st.plotly_chart(fig, use_container_width=True)
+ except ImportError:
+ pass
+ else:
+ st.info("No patterns extracted.")
+
+ # ── Timeline ──────────────────────────────────────────────
+ if sa_result.timeline_buckets and len(sa_result.timeline_buckets) > 1:
+ st.markdown("---")
+ st.markdown("#### Log Volume Timeline")
+ try:
+ import plotly.graph_objects as go
+ import pandas as pd
+ ts_labels = [b["timestamp"] for b in sa_result.timeline_buckets if b["timestamp"] != "unknown"]
+ ts_totals = [b["total"] for b in sa_result.timeline_buckets if b["timestamp"] != "unknown"]
+ ts_errors = [b["errors"] for b in sa_result.timeline_buckets if b["timestamp"] != "unknown"]
+ ts_warnings = [b["warnings"] for b in sa_result.timeline_buckets if b["timestamp"] != "unknown"]
+
+ if ts_labels:
+ fig = go.Figure()
+ fig.add_trace(go.Scatter(x=ts_labels, y=ts_totals, name="Total", mode="lines+markers", line=dict(color="#326CE5")))
+ fig.add_trace(go.Bar(x=ts_labels, y=ts_errors, name="Errors", marker_color="#FF4B4B"))
+ fig.add_trace(go.Bar(x=ts_labels, y=ts_warnings, name="Warnings", marker_color="#FFA500"))
+ fig.update_layout(
+ title="Log Volume Over Time",
+ yaxis_title="Count",
+ xaxis_title="Time",
+ barmode="stack",
+ height=400,
+ )
+ st.plotly_chart(fig, use_container_width=True)
+ except ImportError:
+ pass
+
# ── AI Log Analysis ───────────────────────────────────────────────────
with tab_ai:
st.markdown("### AI-Powered Log Analysis")
diff --git a/k8s-agent/modules/log_analyzer.py b/k8s-agent/modules/log_analyzer.py
index 917bc3e..3f6c6df 100644
--- a/k8s-agent/modules/log_analyzer.py
+++ b/k8s-agent/modules/log_analyzer.py
@@ -453,3 +453,386 @@ def get_pod_list(
ns_flag = f"-n {namespace}" if namespace else "-A"
command = f"kubectl get pods {ns_flag} -o custom-columns='NAMESPACE:.metadata.namespace,NAME:.metadata.name,STATUS:.status.phase,CONTAINERS:.spec.containers[*].name' --no-headers"
return _run_on_cluster(control_plane_node, command, profile=profile, timeout=30)
+
+
+# ══════════════════════════════════════════════════════════════════════════
+# LogAI-inspired Smart Log Analysis
+# Provides: Log Clustering, Anomaly Detection, Pattern Mining, Summarization
+# Uses scikit-learn (TF-IDF + DBSCAN) instead of LogAI directly due to
+# Python 3.12 compatibility issues with the logai package.
+# ══════════════════════════════════════════════════════════════════════════
+
+@dataclass
+class LogCluster:
+ """A cluster of similar log messages."""
+ cluster_id: int
+ template: str
+ count: int
+ level: str # predominant level: ERROR, WARNING, INFO
+ sample_messages: list[str] = field(default_factory=list)
+ first_seen: str = ""
+ last_seen: str = ""
+
+
+@dataclass
+class LogAnomaly:
+ """An anomalous log line or pattern."""
+ message: str
+ score: float # anomaly score (higher = more anomalous)
+ reason: str
+ timestamp: str = ""
+ source: str = ""
+
+
+@dataclass
+class SmartAnalysisResult:
+ """Full result from smart log analysis."""
+ total_lines: int = 0
+ clusters: list[LogCluster] = field(default_factory=list)
+ anomalies: list[LogAnomaly] = field(default_factory=list)
+ patterns: list[dict] = field(default_factory=list)
+ summary: dict = field(default_factory=dict)
+ timeline_buckets: list[dict] = field(default_factory=list)
+
+
+def _tokenize_log(message: str) -> str:
+ """Tokenize a log message by replacing variable parts with placeholders.
+
+ This mimics LogAI's Drain-style log parsing — variable tokens (IPs,
+ hex IDs, numbers, paths, UUIDs) are replaced so that messages with the
+ same *template* look identical after tokenization.
+ """
+ # Remove leading timestamp (various formats)
+ msg = re.sub(r"^\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}:\d{2}[^\s]*\s*", "", message)
+ msg = re.sub(r"^[A-Z][a-z]{2}\s+\d{1,2}\s+\d{2}:\d{2}:\d{2}\s*", "", msg)
+ # Replace UUIDs
+ msg = re.sub(r"[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}", "", msg, flags=re.IGNORECASE)
+ # Replace hex IDs (8+ chars)
+ msg = re.sub(r"\b[0-9a-f]{8,}\b", "", msg)
+ # Replace IPs
+ msg = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}(:\d+)?", "", msg)
+ # Replace pure numbers
+ msg = re.sub(r"\b\d+\b", "", msg)
+ # Replace file paths
+ msg = re.sub(r"/[\w./-]+", "", msg)
+ # Replace pod/container names with common suffixes
+ msg = re.sub(r"\b[\w]+-[0-9a-f]{5,10}\b", "", msg)
+ return msg.strip()
+
+
+def cluster_logs(log_text: str, source: str = "", max_clusters: int = 50, eps: float = 0.5) -> list[LogCluster]:
+ """Cluster log messages using TF-IDF vectorization + DBSCAN.
+
+ Inspired by LogAI's log clustering pipeline:
+ 1. Parse each log line
+ 2. Tokenize to extract log templates
+ 3. Vectorize with TF-IDF
+ 4. Cluster with DBSCAN (density-based — no need to specify k)
+ 5. Return clusters sorted by size
+ """
+ try:
+ from sklearn.feature_extraction.text import TfidfVectorizer
+ from sklearn.cluster import DBSCAN
+ import numpy as np
+ except ImportError:
+ return []
+
+ lines = [l.strip() for l in log_text.strip().split("\n") if l.strip()]
+ if len(lines) < 3:
+ return []
+
+ # Parse and tokenize
+ entries = [parse_log_line(l, source) for l in lines]
+ tokenized = [_tokenize_log(e.message) for e in entries]
+
+ # Filter out empty tokenized lines
+ valid_indices = [i for i, t in enumerate(tokenized) if t.strip()]
+ if len(valid_indices) < 3:
+ return []
+
+ valid_tokenized = [tokenized[i] for i in valid_indices]
+ valid_entries = [entries[i] for i in valid_indices]
+
+ # TF-IDF vectorization
+ try:
+ vectorizer = TfidfVectorizer(max_features=1000, stop_words=None, token_pattern=r"(?u)\b\w+\b")
+ tfidf_matrix = vectorizer.fit_transform(valid_tokenized)
+ except ValueError:
+ return []
+
+ # DBSCAN clustering
+ clustering = DBSCAN(eps=eps, min_samples=2, metric="cosine")
+ labels = clustering.fit_predict(tfidf_matrix)
+
+ # Build clusters
+ cluster_map: dict[int, list[int]] = {}
+ for idx, label in enumerate(labels):
+ cluster_map.setdefault(label, []).append(idx)
+
+ result_clusters = []
+ for cluster_id, member_indices in sorted(cluster_map.items(), key=lambda x: -len(x[1])):
+ members = [valid_entries[i] for i in member_indices]
+ levels = [m.level for m in members]
+ level_counter = Counter(levels)
+ predominant_level = level_counter.most_common(1)[0][0]
+
+ # Use the most common tokenized form as the template
+ templates = [valid_tokenized[i] for i in member_indices]
+ template = Counter(templates).most_common(1)[0][0]
+
+ # Timestamps
+ timestamps = [m.timestamp for m in members if m.timestamp]
+ first_seen = min(timestamps) if timestamps else ""
+ last_seen = max(timestamps) if timestamps else ""
+
+ samples = [members[i].raw for i in range(min(3, len(members)))]
+
+ label_str = "noise" if cluster_id == -1 else str(cluster_id)
+ result_clusters.append(LogCluster(
+ cluster_id=cluster_id,
+ template=template if cluster_id != -1 else "(unclustered / unique messages)",
+ count=len(members),
+ level=predominant_level,
+ sample_messages=samples,
+ first_seen=first_seen,
+ last_seen=last_seen,
+ ))
+
+ # Sort by count descending, but put noise cluster (-1) last
+ result_clusters.sort(key=lambda c: (c.cluster_id == -1, -c.count))
+ return result_clusters[:max_clusters]
+
+
+def detect_anomalies(log_text: str, source: str = "", threshold: float = 2.0) -> list[LogAnomaly]:
+ """Detect anomalous log lines using frequency-based and TF-IDF outlier detection.
+
+ Inspired by LogAI's anomaly detection pipeline:
+ 1. Tokenize messages to get templates
+ 2. Count template frequencies
+ 3. Rare templates (below frequency threshold) are flagged
+ 4. Additionally, use TF-IDF distance from centroid for outlier scoring
+ """
+ try:
+ from sklearn.feature_extraction.text import TfidfVectorizer
+ import numpy as np
+ except ImportError:
+ return []
+
+ lines = [l.strip() for l in log_text.strip().split("\n") if l.strip()]
+ if len(lines) < 5:
+ return []
+
+ entries = [parse_log_line(l, source) for l in lines]
+ tokenized = [_tokenize_log(e.message) for e in entries]
+
+ # Frequency-based anomaly detection
+ template_counts = Counter(tokenized)
+ total = len(tokenized)
+ freq_threshold = max(1, total * 0.01) # templates appearing in < 1% of lines
+
+ anomalies = []
+
+ # TF-IDF outlier detection
+ try:
+ vectorizer = TfidfVectorizer(max_features=500, token_pattern=r"(?u)\b\w+\b")
+ tfidf_matrix = vectorizer.fit_transform(tokenized)
+ centroid = tfidf_matrix.mean(axis=0)
+ centroid = np.asarray(centroid).flatten()
+
+ distances = []
+ for i in range(tfidf_matrix.shape[0]):
+ vec = np.asarray(tfidf_matrix[i].todense()).flatten()
+ dist = np.linalg.norm(vec - centroid)
+ distances.append(dist)
+
+ distances = np.array(distances)
+ mean_dist = distances.mean()
+ std_dist = distances.std() if distances.std() > 0 else 1.0
+
+ for i, (entry, dist) in enumerate(zip(entries, distances)):
+ z_score = (dist - mean_dist) / std_dist
+ reasons = []
+
+ # TF-IDF outlier
+ if z_score > threshold:
+ reasons.append(f"TF-IDF outlier (z-score: {z_score:.2f})")
+
+ # Frequency anomaly
+ if template_counts[tokenized[i]] <= freq_threshold:
+ reasons.append(f"Rare template (seen {template_counts[tokenized[i]]}x out of {total})")
+
+ # Error/critical level
+ if entry.level == "ERROR":
+ reasons.append("Error-level message")
+
+ if reasons:
+ anomalies.append(LogAnomaly(
+ message=entry.raw,
+ score=float(z_score),
+ reason="; ".join(reasons),
+ timestamp=entry.timestamp,
+ source=source,
+ ))
+ except ValueError:
+ # Fallback to frequency-only if TF-IDF fails
+ for i, entry in enumerate(entries):
+ if template_counts[tokenized[i]] <= freq_threshold:
+ anomalies.append(LogAnomaly(
+ message=entry.raw,
+ score=1.0,
+ reason=f"Rare template (seen {template_counts[tokenized[i]]}x out of {total})",
+ timestamp=entry.timestamp,
+ source=source,
+ ))
+
+ # Sort by score descending
+ anomalies.sort(key=lambda a: -a.score)
+ return anomalies[:100] # cap at 100
+
+
+def mine_log_patterns(log_text: str, source: str = "", top_n: int = 30) -> list[dict]:
+ """Mine frequent log patterns/templates from log text.
+
+ Inspired by LogAI's Drain log parser — extracts common templates by
+ tokenizing variable parts and counting occurrences.
+ """
+ lines = [l.strip() for l in log_text.strip().split("\n") if l.strip()]
+ if not lines:
+ return []
+
+ entries = [parse_log_line(l, source) for l in lines]
+ tokenized = [_tokenize_log(e.message) for e in entries]
+
+ # Count templates
+ template_counts = Counter(tokenized)
+
+ # Group by template
+ template_levels: dict[str, Counter] = {}
+ template_samples: dict[str, str] = {}
+ for entry, template in zip(entries, tokenized):
+ if template not in template_levels:
+ template_levels[template] = Counter()
+ template_samples[template] = entry.raw
+ template_levels[template][entry.level] += 1
+
+ patterns = []
+ for template, count in template_counts.most_common(top_n):
+ level_dist = dict(template_levels.get(template, {}))
+ predominant = max(level_dist, key=level_dist.get) if level_dist else "INFO"
+ patterns.append({
+ "template": template,
+ "count": count,
+ "percentage": round(count / len(lines) * 100, 1),
+ "level": predominant,
+ "level_distribution": level_dist,
+ "sample": template_samples.get(template, ""),
+ })
+
+ return patterns
+
+
+def summarize_logs(log_text: str, source: str = "") -> dict:
+ """Generate a comprehensive summary of log data.
+
+ Inspired by LogAI's summarization — provides:
+ - Level distribution (INFO/WARNING/ERROR counts)
+ - Time span
+ - Top error messages
+ - Log velocity (lines per minute)
+ - Health score
+ """
+ lines = [l.strip() for l in log_text.strip().split("\n") if l.strip()]
+ if not lines:
+ return {"total_lines": 0, "health_score": 100}
+
+ entries = [parse_log_line(l, source) for l in lines]
+
+ # Level distribution
+ levels = Counter(e.level for e in entries)
+ error_count = levels.get("ERROR", 0)
+ warning_count = levels.get("WARNING", 0)
+ info_count = levels.get("INFO", 0)
+
+ # Time span
+ timestamps = [e.timestamp for e in entries if e.timestamp]
+ first_ts = min(timestamps) if timestamps else "N/A"
+ last_ts = max(timestamps) if timestamps else "N/A"
+
+ # Top errors
+ error_messages = [_normalize_error(e.message) for e in entries if e.level == "ERROR"]
+ top_errors = Counter(error_messages).most_common(10)
+
+ # Top warnings
+ warning_messages = [_normalize_error(e.message) for e in entries if e.level == "WARNING"]
+ top_warnings = Counter(warning_messages).most_common(5)
+
+ # Unique templates
+ tokenized = [_tokenize_log(e.message) for e in entries]
+ unique_templates = len(set(tokenized))
+
+ # Health score (0-100)
+ # High errors = low score, high warnings = moderate reduction
+ error_ratio = error_count / len(lines) if lines else 0
+ warning_ratio = warning_count / len(lines) if lines else 0
+ health_score = max(0, min(100, int(100 - error_ratio * 300 - warning_ratio * 50)))
+
+ return {
+ "total_lines": len(lines),
+ "error_count": error_count,
+ "warning_count": warning_count,
+ "info_count": info_count,
+ "level_distribution": dict(levels),
+ "first_timestamp": first_ts,
+ "last_timestamp": last_ts,
+ "top_errors": top_errors,
+ "top_warnings": top_warnings,
+ "unique_templates": unique_templates,
+ "template_diversity": round(unique_templates / len(lines) * 100, 1) if lines else 0,
+ "health_score": health_score,
+ }
+
+
+def smart_analyze(log_text: str, source: str = "") -> SmartAnalysisResult:
+ """Run the full LogAI-inspired analysis pipeline.
+
+ Combines: clustering, anomaly detection, pattern mining, and summarization.
+ """
+ result = SmartAnalysisResult()
+ lines = [l.strip() for l in log_text.strip().split("\n") if l.strip()]
+ result.total_lines = len(lines)
+
+ if not lines:
+ return result
+
+ # 1. Clustering
+ result.clusters = cluster_logs(log_text, source)
+
+ # 2. Anomaly detection
+ result.anomalies = detect_anomalies(log_text, source)
+
+ # 3. Pattern mining
+ result.patterns = mine_log_patterns(log_text, source)
+
+ # 4. Summarization
+ result.summary = summarize_logs(log_text, source)
+
+ # 5. Timeline buckets (group by timestamp prefix for timeline view)
+ entries = [parse_log_line(l, source) for l in lines]
+ ts_buckets: dict[str, dict] = {}
+ for entry in entries:
+ if entry.timestamp:
+ # Bucket by minute (first 16 chars: YYYY-MM-DDTHH:MM)
+ bucket_key = entry.timestamp[:16] if len(entry.timestamp) >= 16 else entry.timestamp
+ else:
+ bucket_key = "unknown"
+ if bucket_key not in ts_buckets:
+ ts_buckets[bucket_key] = {"timestamp": bucket_key, "total": 0, "errors": 0, "warnings": 0}
+ ts_buckets[bucket_key]["total"] += 1
+ if entry.level == "ERROR":
+ ts_buckets[bucket_key]["errors"] += 1
+ elif entry.level == "WARNING":
+ ts_buckets[bucket_key]["warnings"] += 1
+
+ result.timeline_buckets = sorted(ts_buckets.values(), key=lambda b: b["timestamp"])
+
+ return result
diff --git a/k8s-agent/requirements.txt b/k8s-agent/requirements.txt
index c9c4741..8667fe8 100644
--- a/k8s-agent/requirements.txt
+++ b/k8s-agent/requirements.txt
@@ -1,3 +1,6 @@
streamlit>=1.32.0
requests>=2.31.0
plotly>=5.18.0
+pandas>=2.0.0
+scikit-learn>=1.3.0
+numpy>=1.24.0
From 85f833a9a366666e7a8c0c3bafd7d99c9a6eb92d Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Tue, 7 Apr 2026 07:42:20 +0000
Subject: [PATCH 22/31] Fix proxy /etc/environment format: use KEY=VALUE for
pam_env, source env in subsequent steps
---
k8s-agent/modules/cluster_creator.py | 62 +++++++++++++++++++++++++---
1 file changed, 56 insertions(+), 6 deletions(-)
diff --git a/k8s-agent/modules/cluster_creator.py b/k8s-agent/modules/cluster_creator.py
index d9f694d..5967556 100644
--- a/k8s-agent/modules/cluster_creator.py
+++ b/k8s-agent/modules/cluster_creator.py
@@ -106,7 +106,12 @@ def test_ssh_connectivity(node: dict) -> SSHResult:
def _proxy_env_block(profile: ClusterProfile) -> str:
- """Generate shell export lines for proxy environment variables."""
+ """Generate shell export lines for proxy environment variables.
+
+ These are valid *shell* statements — use inside scripts for the
+ current session. Do NOT write these to ``/etc/environment``;
+ use :func:`_proxy_env_file_block` for that.
+ """
lines = []
proxy = profile.http_proxy or profile.http_proxy_alt
proxys = profile.https_proxy or profile.https_proxy_alt
@@ -122,9 +127,45 @@ def _proxy_env_block(profile: ClusterProfile) -> str:
return "\n".join(lines)
+def _proxy_env_file_block(profile: ClusterProfile) -> str:
+ """Generate KEY=VALUE lines suitable for ``/etc/environment``.
+
+ ``/etc/environment`` is parsed by ``pam_env.so`` which expects plain
+ ``KEY=VALUE`` lines — the ``export`` keyword is **not** valid there.
+ """
+ lines = []
+ proxy = profile.http_proxy or profile.http_proxy_alt
+ proxys = profile.https_proxy or profile.https_proxy_alt
+ if proxy:
+ lines.append(f'http_proxy="{proxy}"')
+ lines.append(f'HTTP_PROXY="{proxy}"')
+ if proxys:
+ lines.append(f'https_proxy="{proxys}"')
+ lines.append(f'HTTPS_PROXY="{proxys}"')
+ if profile.no_proxy:
+ lines.append(f'no_proxy="{profile.no_proxy}"')
+ lines.append(f'NO_PROXY="{profile.no_proxy}"')
+ return "\n".join(lines)
+
+
+def _source_env_preamble() -> str:
+ """Return a shell snippet that sources /etc/environment.
+
+ Each ``ProvisionStep`` runs in its own SSH session, so environment
+ variables set by a previous step (e.g. proxy settings) are lost.
+ Sourcing ``/etc/environment`` at the top of every network-dependent
+ step ensures the variables are available.
+ """
+ return (
+ "# Source /etc/environment so proxy vars (and others) persist across SSH sessions\n"
+ "set -a; . /etc/environment 2>/dev/null || true; set +a\n"
+ )
+
+
def generate_common_setup_script(profile: ClusterProfile) -> str:
"""Generate the common setup script that runs on ALL nodes (control-plane + workers)."""
proxy_block = _proxy_env_block(profile)
+ proxy_env_file_block = _proxy_env_file_block(profile)
proxy_section = ""
if proxy_block:
proxy_section = f"""
@@ -132,9 +173,9 @@ def generate_common_setup_script(profile: ClusterProfile) -> str:
echo ">> Configuring proxy settings..."
{proxy_block}
-# Persist proxy in /etc/environment for all users
-cat >> /etc/environment <> /etc/environment <<'PROXYEOF'
+{proxy_env_file_block}
PROXYEOF
"""
@@ -569,6 +610,7 @@ def get_common_setup_steps(profile: ClusterProfile) -> List[ProvisionStep]:
steps: List[ProvisionStep] = []
# 0. Proxy (optional)
+ proxy_env_file_block = _proxy_env_file_block(profile)
if proxy_block:
steps.append(ProvisionStep(
name="configure_proxy",
@@ -576,9 +618,9 @@ def get_common_setup_steps(profile: ClusterProfile) -> List[ProvisionStep]:
script=f"""set -euo pipefail
echo '>> Configuring proxy settings...'
{proxy_block}
-# Persist proxy in /etc/environment for all users
+# Persist proxy in /etc/environment for all users (KEY=VALUE format for pam_env)
cat >> /etc/environment <<'PROXYEOF'
-{proxy_block}
+{proxy_env_file_block}
PROXYEOF
echo 'Proxy configured.'
""",
@@ -652,10 +694,12 @@ def get_common_setup_steps(profile: ClusterProfile) -> List[ProvisionStep]:
))
# 3. Install CRI-O
+ _env_preamble = _source_env_preamble()
steps.append(ProvisionStep(
name="install_crio",
title=f"Install CRI-O {profile.crio_version}",
script=f"""set -euo pipefail
+{_env_preamble}
echo '>> Installing CRI-O {profile.crio_version}...'
OS="$(. /etc/os-release && echo "$ID")"
@@ -714,6 +758,7 @@ def get_common_setup_steps(profile: ClusterProfile) -> List[ProvisionStep]:
name="install_k8s",
title=f"Install Kubernetes {profile.kubernetes_version} Components",
script=f"""set -euo pipefail
+{_env_preamble}
echo '>> Installing Kubernetes {profile.kubernetes_version} components...'
OS="$(. /etc/os-release && echo "$ID")"
@@ -762,12 +807,15 @@ def get_control_plane_steps(profile: ClusterProfile) -> List[ProvisionStep]:
steps: List[ProvisionStep] = []
+ _env_preamble = _source_env_preamble()
+
# 0. Proxy on CP (optional)
if proxy_block:
steps.append(ProvisionStep(
name="cp_proxy",
title="Set Proxy Environment for kubeadm",
script=f"""set -euo pipefail
+{_env_preamble}
echo '>> Setting proxy environment for kubeadm...'
{proxy_block}
echo 'Proxy environment set.'
@@ -780,6 +828,7 @@ def get_control_plane_steps(profile: ClusterProfile) -> List[ProvisionStep]:
name="kubeadm_init",
title="Run kubeadm init",
script=f"""set -euo pipefail
+{_env_preamble}
echo '>> Preparing kubeadm config...'
mkdir -p "{audit_log_dir}"
cat > /tmp/kubeadm-config.yaml < List[ProvisionStep]:
name="install_flannel",
title="Install Flannel CNI",
script=f"""set -euo pipefail
+{_env_preamble}
echo '>> Installing Flannel CNI...'
{flannel_apply}
echo '>> Waiting for Flannel pods to be ready...'
From 95c61e326635485e923a8ee9c506f660cc16b2c1 Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Tue, 7 Apr 2026 11:37:53 +0000
Subject: [PATCH 23/31] Fix unquoted paths in rm -rf reset commands and
sanitize profile name in kubeconfig path
---
k8s-agent/modules/cluster_creator.py | 21 +++++++++++++--------
1 file changed, 13 insertions(+), 8 deletions(-)
diff --git a/k8s-agent/modules/cluster_creator.py b/k8s-agent/modules/cluster_creator.py
index 5967556..3fea3c7 100644
--- a/k8s-agent/modules/cluster_creator.py
+++ b/k8s-agent/modules/cluster_creator.py
@@ -1173,7 +1173,7 @@ def get_cluster_reset_steps(profile: ClusterProfile) -> List[ProvisionStep]:
title="Remove kubelet data",
script=f"""set -uo pipefail
echo '>> Removing kubelet data at {kubelet_root}...'
-rm -rf {kubelet_root}/*
+rm -rf "{kubelet_root}"/*
rm -rf /etc/kubernetes/*
rm -rf /tmp/kubeadm-join-command.txt
echo 'Kubelet data removed.'
@@ -1185,7 +1185,7 @@ def get_cluster_reset_steps(profile: ClusterProfile) -> List[ProvisionStep]:
title="Remove CRI-O container data",
script=f"""set -uo pipefail
echo '>> Removing CRI-O storage at {crio_root}...'
-rm -rf {crio_root}/*
+rm -rf "{crio_root}"/*
echo '>> Removing CRI-O run root...'
rm -rf /run/containers/storage/*
echo 'CRI-O data removed.'
@@ -1209,8 +1209,8 @@ def get_cluster_reset_steps(profile: ClusterProfile) -> List[ProvisionStep]:
title="Clean K8s-related logs",
script=f"""set -uo pipefail
echo '>> Cleaning K8s logs at {log_root}...'
-rm -rf {log_root}/pods/*
-rm -rf {log_root}/containers/*
+rm -rf "{log_root}"/pods/*
+rm -rf "{log_root}"/containers/*
rm -rf /var/log/kubernetes/* 2>/dev/null || true
echo 'Logs cleaned.'
""",
@@ -1390,9 +1390,14 @@ def run_kubectl(profile: ClusterProfile, command: str, timeout: int = 30) -> SSH
is_helm = command.strip().startswith("helm ")
if profile.kubeconfig_content:
- # Write kubeconfig to a file and run locally
+ # Write kubeconfig to a file and run locally.
+ # Sanitize the profile name for use as a filename — replace any
+ # non-alphanumeric characters (spaces, shell metacharacters, etc.)
+ # with underscores so the path is always safe for shell interpolation.
+ import re as _re
+ safe_name = _re.sub(r"[^\w.-]", "_", profile.name) or "cluster"
kubeconfig_path = os.path.join(
- config.DATA_DIR, "kubeconfigs", f"{profile.name}.kubeconfig"
+ config.DATA_DIR, "kubeconfigs", f"{safe_name}.kubeconfig"
)
os.makedirs(os.path.dirname(kubeconfig_path), exist_ok=True)
with open(kubeconfig_path, "w") as f:
@@ -1406,7 +1411,7 @@ def run_kubectl(profile: ClusterProfile, command: str, timeout: int = 30) -> SSH
resolved = command.strip()
if resolved.startswith("helm "):
resolved = bin_path + resolved[4:]
- full_cmd = f"KUBECONFIG={kubeconfig_path} {resolved}"
+ full_cmd = f'KUBECONFIG="{kubeconfig_path}" {resolved}'
else:
if not kubectl:
return SSHResult(
@@ -1425,7 +1430,7 @@ def run_kubectl(profile: ClusterProfile, command: str, timeout: int = 30) -> SSH
),
success=False,
)
- full_cmd = f"{kubectl} --kubeconfig={kubeconfig_path} {command}"
+ full_cmd = f'{kubectl} --kubeconfig="{kubeconfig_path}" {command}'
try:
proc = subprocess.run(
full_cmd,
From f91b1294a2b63b3ab5c126658883206ac2b53c67 Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Tue, 7 Apr 2026 11:48:57 +0000
Subject: [PATCH 24/31] Add pod/container dropdowns to Pod Logs tab and fix
fetch button feedback
---
k8s-agent/app.py | 140 ++++++++++++++++++++++++++++++++++++-----------
1 file changed, 107 insertions(+), 33 deletions(-)
diff --git a/k8s-agent/app.py b/k8s-agent/app.py
index 85e3874..861c893 100644
--- a/k8s-agent/app.py
+++ b/k8s-agent/app.py
@@ -1646,6 +1646,8 @@ def page_log_analysis():
# ── Pod Logs ──────────────────────────────────────────────────────────
with tab_pod:
st.markdown("### Pod Logs")
+
+ # --- Namespace selection ---
col1, col2 = st.columns(2)
with col1:
if _cluster_namespaces:
@@ -1654,42 +1656,114 @@ def page_log_analysis():
key="pod_ns")
else:
pod_ns = st.text_input("Namespace", value="default", key="pod_ns")
- pod_name = st.text_input("Pod Name", placeholder="my-pod-xyz", key="pod_name_input")
with col2:
- container = st.text_input("Container (optional)", key="pod_container")
- pod_lines = st.number_input("Lines", min_value=50, max_value=1000, value=200, key="pod_lines")
- pod_previous = st.checkbox("Previous container logs (crash recovery)")
-
- if st.button("Fetch Pod Logs", type="primary", key="fetch_pod") and pod_name:
- with st.spinner(f"Fetching logs for {pod_ns}/{pod_name}..."):
- result = collect_pod_logs(
- cp_node, pod_ns, pod_name, container, pod_lines,
- "1h", pod_previous, profile=profile,
+ pod_lines = st.number_input("Lines", min_value=50, max_value=5000, value=200, key="pod_lines")
+
+ # --- Load pods from the cluster ---
+ if st.button("Load Pods", key="load_pods_btn"):
+ with st.spinner(f"Fetching pods in namespace '{pod_ns}'..."):
+ pod_result = get_pod_list(cp_node, namespace=pod_ns, profile=profile)
+ if pod_result.success and pod_result.stdout.strip():
+ _pods: list[dict] = []
+ for line in pod_result.stdout.strip().split("\n"):
+ parts = line.split()
+ if len(parts) >= 4:
+ _pods.append({
+ "namespace": parts[0],
+ "name": parts[1],
+ "status": parts[2],
+ "containers": parts[3],
+ })
+ elif len(parts) >= 2:
+ _pods.append({
+ "namespace": parts[0],
+ "name": parts[1],
+ "status": parts[2] if len(parts) > 2 else "Unknown",
+ "containers": parts[3] if len(parts) > 3 else "",
+ })
+ st.session_state["_pod_list"] = _pods
+ st.session_state["_pod_list_ns"] = pod_ns
+ st.success(f"Found {len(_pods)} pod(s) in namespace '{pod_ns}'.")
+ elif pod_result.success:
+ st.session_state["_pod_list"] = []
+ st.session_state["_pod_list_ns"] = pod_ns
+ st.warning(f"No pods found in namespace '{pod_ns}'.")
+ else:
+ st.error(f"Failed to fetch pods: {pod_result.stderr}")
+
+ # --- Pod & container dropdowns ---
+ _pods_loaded = st.session_state.get("_pod_list", [])
+ _pods_loaded_ns = st.session_state.get("_pod_list_ns", "")
+
+ col_p1, col_p2 = st.columns(2)
+ with col_p1:
+ if _pods_loaded and _pods_loaded_ns == pod_ns:
+ pod_options = [f"{p['name']} ({p['status']})" for p in _pods_loaded]
+ selected_pod_idx = st.selectbox(
+ "Pod Name", options=range(len(pod_options)),
+ format_func=lambda i: pod_options[i],
+ key="pod_name_select",
)
- if result.success:
- analysis = analyze_logs(result.stdout, f"{pod_ns}/{pod_name}")
- m1, m2, m3 = st.columns(3)
- m1.metric("Total Lines", analysis.total_lines)
- m2.metric("Errors", analysis.error_count)
- m3.metric("Warnings", analysis.warning_count)
-
- if analysis.error_patterns:
- st.markdown("**Error Patterns:**")
- for pattern, count in list(analysis.error_patterns.items())[:10]:
- st.markdown(f"- `{pattern}` (x{count})")
-
- st.code(result.stdout[-5000:], language="text")
-
- if analysis.error_count > 0 and is_llm_configured():
- if st.button("Analyze with AI", key="pod_ai"):
- with st.spinner("AI analyzing pod logs..."):
- ai_analysis = llm_analyze_logs(
- result.stdout, f"{pod_ns}/{pod_name}"
- )
- st.markdown(ai_analysis)
+ pod_name = _pods_loaded[selected_pod_idx]["name"] if selected_pod_idx is not None else ""
+ else:
+ pod_name = st.text_input("Pod Name", placeholder="my-pod-xyz (click Load Pods to get dropdown)", key="pod_name_input")
+
+ with col_p2:
+ if _pods_loaded and _pods_loaded_ns == pod_ns and pod_name:
+ # Find the selected pod's containers
+ _selected_pod = next((p for p in _pods_loaded if p["name"] == pod_name), None)
+ _containers: list[str] = []
+ if _selected_pod and _selected_pod.get("containers"):
+ _containers = [c.strip() for c in _selected_pod["containers"].split(",") if c.strip()]
+ if _containers:
+ container_options = ["(all / default)"] + _containers
+ container_sel = st.selectbox("Container", options=container_options, key="pod_container_select")
+ container = "" if container_sel == "(all / default)" else container_sel
else:
- st.error("Failed to fetch pod logs")
- st.code(result.stderr, language="text")
+ container = st.text_input("Container (optional)", key="pod_container")
+ else:
+ container = st.text_input("Container (optional)", key="pod_container")
+
+ pod_previous = st.checkbox("Previous container logs (crash recovery)")
+
+ # --- Fetch logs ---
+ if st.button("Fetch Pod Logs", type="primary", key="fetch_pod"):
+ if not pod_name:
+ st.warning("Please enter a pod name or click **Load Pods** to select one.")
+ else:
+ with st.spinner(f"Fetching logs for {pod_ns}/{pod_name}..."):
+ result = collect_pod_logs(
+ cp_node, pod_ns, pod_name, container, pod_lines,
+ "1h", pod_previous, profile=profile,
+ )
+ if result.success:
+ if not result.stdout.strip():
+ st.info(f"No log output returned for pod `{pod_ns}/{pod_name}`. "
+ "The pod may have just started or has no recent logs.")
+ else:
+ analysis = analyze_logs(result.stdout, f"{pod_ns}/{pod_name}")
+ m1, m2, m3 = st.columns(3)
+ m1.metric("Total Lines", analysis.total_lines)
+ m2.metric("Errors", analysis.error_count)
+ m3.metric("Warnings", analysis.warning_count)
+
+ if analysis.error_patterns:
+ st.markdown("**Error Patterns:**")
+ for pattern, count in list(analysis.error_patterns.items())[:10]:
+ st.markdown(f"- `{pattern}` (x{count})")
+
+ st.code(result.stdout[-5000:], language="text")
+
+ if analysis.error_count > 0 and is_llm_configured():
+ if st.button("Analyze with AI", key="pod_ai"):
+ with st.spinner("AI analyzing pod logs..."):
+ ai_analysis = llm_analyze_logs(
+ result.stdout, f"{pod_ns}/{pod_name}"
+ )
+ st.markdown(ai_analysis)
+ else:
+ st.error(f"Failed to fetch pod logs for `{pod_ns}/{pod_name}`")
+ st.code(result.stderr, language="text")
# ── Error Correlation ─────────────────────────────────────────────────
with tab_correlation:
From c70cd7fe93bc5b031757be05f71d1e9165f460ed Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Tue, 7 Apr 2026 12:04:45 +0000
Subject: [PATCH 25/31] Add Istio/Envoy access log analysis: response time
analytics, status codes, per-path/upstream breakdowns, slow requests
---
k8s-agent/app.py | 198 ++++++++++++++++++
k8s-agent/modules/log_analyzer.py | 322 ++++++++++++++++++++++++++++++
2 files changed, 520 insertions(+)
diff --git a/k8s-agent/app.py b/k8s-agent/app.py
index 861c893..95462c1 100644
--- a/k8s-agent/app.py
+++ b/k8s-agent/app.py
@@ -85,6 +85,7 @@
detect_anomalies,
mine_log_patterns,
summarize_logs,
+ analyze_istio_access_logs,
)
from modules.llm_client import query_llm, stream_llm
@@ -2063,6 +2064,203 @@ def page_log_analysis():
except ImportError:
pass
+ # ── Istio / Envoy Access Log Analysis ────────────────────────
+ if sa_result.istio:
+ istio = sa_result.istio
+ st.markdown("---")
+ st.markdown("#### Istio / Envoy Access Log Analysis")
+ st.markdown(
+ "Detected **Istio/Envoy access logs** — showing response time analytics, "
+ "status code distribution, per-path and per-upstream breakdowns, and slow requests."
+ )
+
+ # ── Overview metrics ──
+ icol1, icol2, icol3, icol4, icol5, icol6 = st.columns(6)
+ icol1.metric("Total Requests", f"{istio.total_requests:,}")
+ icol2.metric("Avg Latency", f"{istio.avg_ms:.0f} ms")
+ icol3.metric("P50", f"{istio.p50_ms:.0f} ms")
+ icol4.metric("P95", f"{istio.p95_ms:.0f} ms")
+ icol5.metric("P99", f"{istio.p99_ms:.0f} ms")
+ icol6.metric("Error Rate", f"{istio.error_rate:.1f}%")
+
+ if istio.error_rate > 10:
+ st.error(f"High error rate: **{istio.error_rate:.1f}%** of requests returned 4xx/5xx.")
+ elif istio.error_rate > 2:
+ st.warning(f"Elevated error rate: **{istio.error_rate:.1f}%** of requests returned 4xx/5xx.")
+
+ icol7, icol8, icol9 = st.columns(3)
+ icol7.metric("Min Latency", f"{istio.min_ms:.0f} ms")
+ icol8.metric("Max Latency", f"{istio.max_ms:.0f} ms")
+ icol9.metric("P90", f"{istio.p90_ms:.0f} ms")
+
+ # ── Status Code Distribution ──
+ st.markdown("##### Status Code Distribution")
+ if istio.status_distribution:
+ import pandas as pd
+ status_data = [{"Status Code": str(k), "Count": v} for k, v in sorted(istio.status_distribution.items())]
+ df_status = pd.DataFrame(status_data)
+ scol_t, scol_c = st.columns([1, 1])
+ with scol_t:
+ st.dataframe(df_status, use_container_width=True, hide_index=True)
+ with scol_c:
+ try:
+ import plotly.express as px
+ fig = px.pie(
+ df_status, names="Status Code", values="Count",
+ title="Response Status Codes",
+ color="Status Code",
+ color_discrete_map={
+ str(k): ("#2ecc71" if k < 300 else "#f39c12" if k < 400 else "#e67e22" if k < 500 else "#e74c3c")
+ for k in istio.status_distribution
+ },
+ )
+ fig.update_layout(height=350)
+ st.plotly_chart(fig, use_container_width=True)
+ except ImportError:
+ pass
+
+ # ── Status class summary ──
+ if istio.status_class_distribution:
+ class_cols = st.columns(len(istio.status_class_distribution))
+ for idx, (cls, cnt) in enumerate(sorted(istio.status_class_distribution.items())):
+ class_cols[idx].metric(cls, cnt)
+
+ # ── Response Flags ──
+ if istio.response_flags_dist and len(istio.response_flags_dist) > 1:
+ with st.expander("Response Flags (Envoy)", expanded=False):
+ st.markdown(
+ "Envoy response flags indicate special conditions: "
+ "`UF`=upstream failure, `UH`=no healthy upstream, "
+ "`UT`=upstream timeout, `NR`=no route, `DC`=downstream disconnected, etc."
+ )
+ import pandas as pd
+ flags_data = [{"Flag": k, "Count": v} for k, v in istio.response_flags_dist.items()]
+ st.dataframe(pd.DataFrame(flags_data), use_container_width=True, hide_index=True)
+
+ # ── Latency Distribution Histogram ──
+ st.markdown("##### Latency Distribution")
+ try:
+ import plotly.express as px
+ durations = [e.duration_ms for e in istio.parsed_entries]
+ fig = px.histogram(
+ x=durations, nbins=50,
+ labels={"x": "Duration (ms)", "y": "Count"},
+ title="Request Latency Distribution",
+ )
+ fig.add_vline(x=istio.p50_ms, line_dash="dash", line_color="green",
+ annotation_text=f"P50: {istio.p50_ms:.0f}ms")
+ fig.add_vline(x=istio.p95_ms, line_dash="dash", line_color="orange",
+ annotation_text=f"P95: {istio.p95_ms:.0f}ms")
+ fig.add_vline(x=istio.p99_ms, line_dash="dash", line_color="red",
+ annotation_text=f"P99: {istio.p99_ms:.0f}ms")
+ fig.update_layout(height=400)
+ st.plotly_chart(fig, use_container_width=True)
+ except ImportError:
+ pass
+
+ # ── Per-Path Response Time ──
+ if istio.path_stats:
+ st.markdown("##### Per-Path Response Time")
+ import pandas as pd
+ path_data = []
+ for ps in istio.path_stats[:30]:
+ path_data.append({
+ "Path": ps["path"][:80],
+ "Requests": ps["count"],
+ "Avg (ms)": ps["avg_ms"],
+ "P50 (ms)": ps["p50_ms"],
+ "P95 (ms)": ps["p95_ms"],
+ "P99 (ms)": ps["p99_ms"],
+ "Max (ms)": ps["max_ms"],
+ "Errors": ps["error_count"],
+ "Error %": ps["error_rate"],
+ })
+ df_paths = pd.DataFrame(path_data)
+ st.dataframe(df_paths, use_container_width=True, hide_index=True)
+
+ # Bar chart of top paths by P95
+ try:
+ import plotly.express as px
+ top_paths = istio.path_stats[:15]
+ fig = px.bar(
+ x=[p["path"][:50] for p in top_paths],
+ y=[p["p95_ms"] for p in top_paths],
+ labels={"x": "Path", "y": "P95 Latency (ms)"},
+ title="Top Paths by P95 Latency",
+ color=[p["error_rate"] for p in top_paths],
+ color_continuous_scale="RdYlGn_r",
+ )
+ fig.update_layout(height=400, xaxis_tickangle=-45,
+ coloraxis_colorbar_title="Error %")
+ st.plotly_chart(fig, use_container_width=True)
+ except ImportError:
+ pass
+
+ # ── Per-Upstream Service Stats ──
+ if istio.upstream_stats:
+ st.markdown("##### Per-Upstream Service Stats")
+ import pandas as pd
+ up_data = []
+ for us in istio.upstream_stats[:20]:
+ up_data.append({
+ "Upstream": us["upstream"][:60],
+ "Requests": us["count"],
+ "Avg Duration (ms)": us["avg_duration_ms"],
+ "Avg Upstream (ms)": us["avg_upstream_ms"],
+ "P95 Duration (ms)": us["p95_duration_ms"],
+ "P95 Upstream (ms)": us["p95_upstream_ms"],
+ "Errors": us["error_count"],
+ "Error %": us["error_rate"],
+ })
+ st.dataframe(pd.DataFrame(up_data), use_container_width=True, hide_index=True)
+
+ # ── Slow Requests ──
+ if istio.slow_requests:
+ with st.expander(f"Slow Requests (>{istio.p95_ms:.0f}ms — top {len(istio.slow_requests)})", expanded=True):
+ import pandas as pd
+ slow_data = []
+ for sr in istio.slow_requests[:30]:
+ slow_data.append({
+ "Duration (ms)": sr.duration_ms,
+ "Upstream (ms)": sr.upstream_service_time_ms,
+ "Method": sr.method,
+ "Path": sr.path[:80],
+ "Status": sr.response_code,
+ "Flags": sr.response_flags,
+ "Upstream Host": sr.upstream_host[:40],
+ "Timestamp": sr.timestamp[:25] if sr.timestamp else "",
+ })
+ st.dataframe(pd.DataFrame(slow_data), use_container_width=True, hide_index=True)
+
+ # ── Istio Request Timeline ──
+ if istio.timeline_buckets and len(istio.timeline_buckets) > 1:
+ st.markdown("##### Request Timeline")
+ try:
+ import plotly.graph_objects as go
+ ts_labels = [b["timestamp"] for b in istio.timeline_buckets if b["timestamp"] != "unknown"]
+ ts_totals = [b["total"] for b in istio.timeline_buckets if b["timestamp"] != "unknown"]
+ ts_errors = [b["errors"] for b in istio.timeline_buckets if b["timestamp"] != "unknown"]
+ ts_avg_dur = [b["avg_duration"] for b in istio.timeline_buckets if b["timestamp"] != "unknown"]
+
+ if ts_labels:
+ fig = go.Figure()
+ fig.add_trace(go.Bar(x=ts_labels, y=ts_totals, name="Requests", marker_color="#326CE5"))
+ fig.add_trace(go.Bar(x=ts_labels, y=ts_errors, name="Errors (4xx+5xx)", marker_color="#FF4B4B"))
+ fig.add_trace(go.Scatter(
+ x=ts_labels, y=ts_avg_dur, name="Avg Latency (ms)",
+ mode="lines+markers", yaxis="y2", line=dict(color="orange"),
+ ))
+ fig.update_layout(
+ title="Requests & Latency Over Time",
+ yaxis_title="Request Count",
+ yaxis2=dict(title="Avg Latency (ms)", overlaying="y", side="right"),
+ barmode="overlay",
+ height=400,
+ )
+ st.plotly_chart(fig, use_container_width=True)
+ except ImportError:
+ pass
+
# ── AI Log Analysis ───────────────────────────────────────────────────
with tab_ai:
st.markdown("### AI-Powered Log Analysis")
diff --git a/k8s-agent/modules/log_analyzer.py b/k8s-agent/modules/log_analyzer.py
index 3f6c6df..69c4e9d 100644
--- a/k8s-agent/modules/log_analyzer.py
+++ b/k8s-agent/modules/log_analyzer.py
@@ -484,6 +484,59 @@ class LogAnomaly:
source: str = ""
+@dataclass
+class IstioAccessEntry:
+ """A parsed Istio/Envoy access log entry."""
+ timestamp: str = ""
+ method: str = ""
+ path: str = ""
+ protocol: str = ""
+ response_code: int = 0
+ response_flags: str = ""
+ bytes_received: int = 0
+ bytes_sent: int = 0
+ duration_ms: float = 0.0 # total request duration
+ upstream_service_time_ms: float = 0.0 # time spent in upstream
+ upstream_cluster: str = ""
+ upstream_host: str = ""
+ downstream_remote: str = ""
+ downstream_local: str = ""
+ requested_server_name: str = ""
+ authority: str = "" # Host header
+ user_agent: str = ""
+ raw_line: str = ""
+
+
+@dataclass
+class IstioAnalysisResult:
+ """Result from Istio access log analysis."""
+ total_requests: int = 0
+ parsed_entries: list[IstioAccessEntry] = field(default_factory=list)
+ # Latency percentiles
+ p50_ms: float = 0.0
+ p90_ms: float = 0.0
+ p95_ms: float = 0.0
+ p99_ms: float = 0.0
+ avg_ms: float = 0.0
+ max_ms: float = 0.0
+ min_ms: float = 0.0
+ # Status code distribution
+ status_distribution: dict = field(default_factory=dict) # code -> count
+ status_class_distribution: dict = field(default_factory=dict) # "2xx"->count
+ # Error rate
+ error_rate: float = 0.0 # percentage of 4xx+5xx
+ # Slow requests (above p95)
+ slow_requests: list[IstioAccessEntry] = field(default_factory=list)
+ # Per-path stats
+ path_stats: list[dict] = field(default_factory=list)
+ # Per-upstream stats
+ upstream_stats: list[dict] = field(default_factory=list)
+ # Response flags distribution
+ response_flags_dist: dict = field(default_factory=dict)
+ # Timeline buckets (per-minute)
+ timeline_buckets: list[dict] = field(default_factory=list)
+
+
@dataclass
class SmartAnalysisResult:
"""Full result from smart log analysis."""
@@ -493,6 +546,7 @@ class SmartAnalysisResult:
patterns: list[dict] = field(default_factory=list)
summary: dict = field(default_factory=dict)
timeline_buckets: list[dict] = field(default_factory=list)
+ istio: IstioAnalysisResult | None = None # populated when Istio logs detected
def _tokenize_log(message: str) -> str:
@@ -835,4 +889,272 @@ def smart_analyze(log_text: str, source: str = "") -> SmartAnalysisResult:
result.timeline_buckets = sorted(ts_buckets.values(), key=lambda b: b["timestamp"])
+ # 6. Istio / Envoy access log analysis (auto-detected)
+ istio_result = analyze_istio_access_logs(log_text)
+ if istio_result and istio_result.total_requests > 0:
+ result.istio = istio_result
+
+ return result
+
+
+# ══════════════════════════════════════════════════════════════════════════
+# Istio / Envoy Access Log Analysis
+# Parses Envoy access log format used by Istio sidecars and provides
+# response-time analytics, status code distributions, per-path and
+# per-upstream breakdowns, and slow-request detection.
+# ══════════════════════════════════════════════════════════════════════════
+
+# Envoy default access log format (as emitted by Istio):
+# [%START_TIME%] "%REQ(:METHOD)% %REQ(X-ENVOY-ORIGINAL-PATH?:PATH)% %PROTOCOL%"
+# %RESPONSE_CODE% %RESPONSE_FLAGS% %BYTES_RECEIVED% %BYTES_SENT%
+# %DURATION% %RESP(X-ENVOY-UPSTREAM-SERVICE-TIME)%
+# "%REQ(X-FORWARDED-FOR)%" "%REQ(USER-AGENT)%" "%REQ(X-REQUEST-ID)%"
+# "%REQ(:AUTHORITY)%" "%UPSTREAM_HOST%" %UPSTREAM_CLUSTER%
+# %UPSTREAM_LOCAL_ADDRESS% %DOWNSTREAM_LOCAL_ADDRESS%
+# %DOWNSTREAM_REMOTE_ADDRESS% %REQUESTED_SERVER_NAME% %ROUTE_NAME%
+
+_ISTIO_LOG_RE = re.compile(
+ r'\[(?P[^\]]+)\]\s+'
+ r'"(?P\S+)\s+(?P\S+)\s+(?P[^"]*?)"\s+'
+ r'(?P\d+)\s+'
+ r'(?P\S+)\s+'
+ r'(?P\d+)\s+'
+ r'(?P\d+)\s+'
+ r'(?P\d+)\s+'
+ r'(?P\d+|-)\s+'
+ r'"(?P[^"]*)"\s+'
+ r'"(?P[^"]*)"\s+'
+ r'"(?P[^"]*)"\s+'
+ r'"(?P[^"]*)"\s+'
+ r'"(?P[^"]*)"\s*'
+ r'(?P.*)'
+)
+
+# Simpler fallback: JSON-format Istio access logs (structured logging)
+_ISTIO_JSON_KEYS = {
+ "response_code", "duration", "method", "path", "upstream_service_time",
+ "upstream_cluster", "authority", "bytes_received", "bytes_sent",
+}
+
+
+def _parse_istio_line(line: str) -> IstioAccessEntry | None:
+ """Try to parse a single line as an Istio/Envoy access log entry."""
+ import json as _json
+
+ # Try structured JSON format first
+ stripped = line.strip()
+ if stripped.startswith("{"):
+ try:
+ obj = _json.loads(stripped)
+ # Verify it looks like an Istio access log
+ if "response_code" in obj or "method" in obj or "duration" in obj:
+ duration = obj.get("duration", 0)
+ ust = obj.get("upstream_service_time", 0)
+ # Istio JSON logs may use different field names
+ return IstioAccessEntry(
+ timestamp=str(obj.get("start_time", obj.get("timestamp", ""))),
+ method=str(obj.get("method", obj.get("request_method", ""))),
+ path=str(obj.get("path", obj.get("request_path", ""))),
+ protocol=str(obj.get("protocol", "")),
+ response_code=int(obj.get("response_code", 0)),
+ response_flags=str(obj.get("response_flags", "-")),
+ bytes_received=int(obj.get("bytes_received", 0)),
+ bytes_sent=int(obj.get("bytes_sent", 0)),
+ duration_ms=float(duration) if duration not in ("-", "", None) else 0.0,
+ upstream_service_time_ms=float(ust) if ust not in ("-", "", None) else 0.0,
+ upstream_cluster=str(obj.get("upstream_cluster", "")),
+ upstream_host=str(obj.get("upstream_host", "")),
+ authority=str(obj.get("authority", obj.get("host", ""))),
+ user_agent=str(obj.get("user_agent", "")),
+ downstream_remote=str(obj.get("downstream_remote_address", "")),
+ downstream_local=str(obj.get("downstream_local_address", "")),
+ requested_server_name=str(obj.get("requested_server_name", "")),
+ raw_line=line,
+ )
+ except (_json.JSONDecodeError, ValueError, TypeError):
+ pass
+
+ # Try standard Envoy text format
+ m = _ISTIO_LOG_RE.match(stripped)
+ if m:
+ ust = m.group("upstream_service_time")
+ rest = m.group("rest").strip()
+ # Parse remaining fields from rest (upstream_cluster, etc.)
+ rest_parts = rest.split()
+ upstream_cluster = rest_parts[0] if rest_parts else ""
+ return IstioAccessEntry(
+ timestamp=m.group("timestamp"),
+ method=m.group("method"),
+ path=m.group("path"),
+ protocol=m.group("protocol"),
+ response_code=int(m.group("response_code")),
+ response_flags=m.group("response_flags"),
+ bytes_received=int(m.group("bytes_received")),
+ bytes_sent=int(m.group("bytes_sent")),
+ duration_ms=float(m.group("duration")),
+ upstream_service_time_ms=float(ust) if ust != "-" else 0.0,
+ upstream_cluster=upstream_cluster,
+ upstream_host=m.group("upstream_host"),
+ authority=m.group("authority"),
+ user_agent=m.group("user_agent"),
+ downstream_remote=m.group("xff") or "",
+ raw_line=line,
+ )
+
+ return None
+
+
+def _is_likely_istio_log(lines: list[str], sample_size: int = 20) -> bool:
+ """Heuristic: check if a meaningful fraction of lines look like Istio access logs."""
+ sample = lines[:sample_size]
+ parsed = sum(1 for l in sample if _parse_istio_line(l) is not None)
+ return parsed >= max(1, len(sample) * 0.3) # at least 30% parse successfully
+
+
+def analyze_istio_access_logs(log_text: str) -> IstioAnalysisResult | None:
+ """Parse and analyze Istio/Envoy access logs.
+
+ Returns None if the logs don't look like Istio access logs.
+ Returns an IstioAnalysisResult with latency stats, status distribution,
+ per-path breakdowns, per-upstream breakdowns, and slow requests.
+ """
+ lines = [l.strip() for l in log_text.strip().split("\n") if l.strip()]
+ if not lines:
+ return None
+
+ # Quick heuristic — bail early if this doesn't look like Istio logs
+ if not _is_likely_istio_log(lines):
+ return None
+
+ entries: list[IstioAccessEntry] = []
+ for line in lines:
+ entry = _parse_istio_line(line)
+ if entry is not None:
+ entries.append(entry)
+
+ if not entries:
+ return None
+
+ result = IstioAnalysisResult(
+ total_requests=len(entries),
+ parsed_entries=entries,
+ )
+
+ # ── Latency percentiles ──────────────────────────────────────────
+ import numpy as np
+ durations = np.array([e.duration_ms for e in entries])
+ if len(durations) > 0:
+ result.avg_ms = float(np.mean(durations))
+ result.min_ms = float(np.min(durations))
+ result.max_ms = float(np.max(durations))
+ result.p50_ms = float(np.percentile(durations, 50))
+ result.p90_ms = float(np.percentile(durations, 90))
+ result.p95_ms = float(np.percentile(durations, 95))
+ result.p99_ms = float(np.percentile(durations, 99))
+
+ # ── Status code distribution ─────────────────────────────────────
+ status_counter: Counter = Counter()
+ class_counter: Counter = Counter()
+ for e in entries:
+ status_counter[e.response_code] += 1
+ class_label = f"{e.response_code // 100}xx"
+ class_counter[class_label] += 1
+
+ result.status_distribution = dict(status_counter.most_common())
+ result.status_class_distribution = dict(class_counter.most_common())
+
+ # Error rate (4xx + 5xx)
+ error_count = sum(1 for e in entries if e.response_code >= 400)
+ result.error_rate = (error_count / len(entries)) * 100 if entries else 0.0
+
+ # ── Slow requests (above p95) ────────────────────────────────────
+ p95_threshold = result.p95_ms
+ slow = [e for e in entries if e.duration_ms > p95_threshold]
+ # Sort by duration descending, limit to top 50
+ slow.sort(key=lambda e: e.duration_ms, reverse=True)
+ result.slow_requests = slow[:50]
+
+ # ── Per-path stats ───────────────────────────────────────────────
+ path_groups: dict[str, list[IstioAccessEntry]] = {}
+ for e in entries:
+ # Normalize path: strip query params for grouping
+ base_path = e.path.split("?")[0] if e.path else "(unknown)"
+ path_groups.setdefault(base_path, []).append(e)
+
+ path_stats = []
+ for path, group in path_groups.items():
+ durations_g = [e.duration_ms for e in group]
+ errors_g = sum(1 for e in group if e.response_code >= 400)
+ path_stats.append({
+ "path": path,
+ "count": len(group),
+ "avg_ms": round(sum(durations_g) / len(durations_g), 1) if durations_g else 0,
+ "p50_ms": round(float(np.percentile(durations_g, 50)), 1) if durations_g else 0,
+ "p95_ms": round(float(np.percentile(durations_g, 95)), 1) if durations_g else 0,
+ "p99_ms": round(float(np.percentile(durations_g, 99)), 1) if durations_g else 0,
+ "max_ms": round(max(durations_g), 1) if durations_g else 0,
+ "error_count": errors_g,
+ "error_rate": round((errors_g / len(group)) * 100, 1) if group else 0,
+ })
+ path_stats.sort(key=lambda p: p["count"], reverse=True)
+ result.path_stats = path_stats[:50]
+
+ # ── Per-upstream stats ───────────────────────────────────────────
+ upstream_groups: dict[str, list[IstioAccessEntry]] = {}
+ for e in entries:
+ key = e.upstream_cluster or e.upstream_host or "(direct/unknown)"
+ upstream_groups.setdefault(key, []).append(e)
+
+ upstream_stats = []
+ for upstream, group in upstream_groups.items():
+ ust_vals = [e.upstream_service_time_ms for e in group if e.upstream_service_time_ms > 0]
+ dur_vals = [e.duration_ms for e in group]
+ errors_g = sum(1 for e in group if e.response_code >= 400)
+ upstream_stats.append({
+ "upstream": upstream,
+ "count": len(group),
+ "avg_duration_ms": round(sum(dur_vals) / len(dur_vals), 1) if dur_vals else 0,
+ "avg_upstream_ms": round(sum(ust_vals) / len(ust_vals), 1) if ust_vals else 0,
+ "p95_duration_ms": round(float(np.percentile(dur_vals, 95)), 1) if dur_vals else 0,
+ "p95_upstream_ms": round(float(np.percentile(ust_vals, 95)), 1) if ust_vals else 0,
+ "error_count": errors_g,
+ "error_rate": round((errors_g / len(group)) * 100, 1) if group else 0,
+ })
+ upstream_stats.sort(key=lambda u: u["count"], reverse=True)
+ result.upstream_stats = upstream_stats[:30]
+
+ # ── Response flags distribution ──────────────────────────────────
+ flags_counter: Counter = Counter()
+ for e in entries:
+ flag = e.response_flags if e.response_flags and e.response_flags != "-" else "(none)"
+ flags_counter[flag] += 1
+ result.response_flags_dist = dict(flags_counter.most_common())
+
+ # ── Timeline buckets (per-minute) ────────────────────────────────
+ ts_buckets: dict[str, dict] = {}
+ for e in entries:
+ # Try to extract minute-level bucket from timestamp
+ ts = e.timestamp
+ if ts:
+ # Envoy format: 2024-01-15T10:30:45.123Z or similar
+ bucket_key = ts[:16] if len(ts) >= 16 else ts[:10]
+ else:
+ bucket_key = "unknown"
+ if bucket_key not in ts_buckets:
+ ts_buckets[bucket_key] = {
+ "timestamp": bucket_key, "total": 0, "errors": 0,
+ "avg_duration": 0.0, "_durations": [],
+ }
+ ts_buckets[bucket_key]["total"] += 1
+ ts_buckets[bucket_key]["_durations"].append(e.duration_ms)
+ if e.response_code >= 400:
+ ts_buckets[bucket_key]["errors"] += 1
+
+ # Compute avg duration per bucket
+ for bucket in ts_buckets.values():
+ durs = bucket.pop("_durations", [])
+ bucket["avg_duration"] = round(sum(durs) / len(durs), 1) if durs else 0
+
+ result.timeline_buckets = sorted(ts_buckets.values(), key=lambda b: b["timestamp"])
+
return result
From 63ecf080f7b9f2c44bd67c9a188b4799be745b1c Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Tue, 7 Apr 2026 12:11:50 +0000
Subject: [PATCH 26/31] Remove Helm/Network Policy tabs, remove init containers
from Resource Limits, add pod count per node, fix Set Active profile button
---
k8s-agent/app.py | 376 ++++++++---------------------------------------
1 file changed, 61 insertions(+), 315 deletions(-)
diff --git a/k8s-agent/app.py b/k8s-agent/app.py
index 95462c1..39f0be2 100644
--- a/k8s-agent/app.py
+++ b/k8s-agent/app.py
@@ -576,6 +576,7 @@ def page_profile_manager():
st.markdown(f"**Updated:** {profile.updated_at[:10] if profile.updated_at else 'N/A'}")
if st.button("Set Active", key=f"activate_{profile.name}"):
st.session_state.active_profile = profile.name
+ st.session_state.profile_selector = profile.name
st.rerun()
if st.button("Delete", key=f"delete_{profile.name}", type="secondary"):
delete_profile(profile.name)
@@ -2350,8 +2351,8 @@ def page_resource_viewer():
_rv_namespaces = fetch_namespaces(profile.kubeconfig_content)
(tab_resources, tab_scaling, tab_shell, tab_res_limits, tab_crictl,
- tab_node_health, tab_rbac, tab_helm, tab_events,
- tab_restart_tracker, tab_netpol, tab_pvc) = st.tabs([
+ tab_node_health, tab_rbac, tab_events,
+ tab_restart_tracker, tab_pvc) = st.tabs([
"Cluster Resources",
"Scaling",
"Pod Shell",
@@ -2359,10 +2360,8 @@ def page_resource_viewer():
"Node Containers",
"Node Health",
"RBAC Viewer",
- "Helm Releases",
"Events Timeline",
"Pod Restart Tracker",
- "Network Policies",
"PVC / Storage",
])
@@ -2827,7 +2826,6 @@ def page_resource_viewer():
template = spec.get("template", {})
pod_spec = template.get("spec", {})
containers = pod_spec.get("containers", [])
- init_containers = pod_spec.get("initContainers", [])
for ctr in containers:
res = ctr.get("resources", {})
req = res.get("requests", {})
@@ -2836,23 +2834,6 @@ def page_resource_viewer():
"Type": wl_label,
"Workload": workload_name,
"Container": ctr.get("name", "?"),
- "Init": "",
- "CPU Req": req.get("cpu", "-"),
- "CPU Lim": lim.get("cpu", "-"),
- "Mem Req": req.get("memory", "-"),
- "Mem Lim": lim.get("memory", "-"),
- "Eph Req": req.get("ephemeral-storage", "-"),
- "Eph Lim": lim.get("ephemeral-storage", "-"),
- })
- for ctr in init_containers:
- res = ctr.get("resources", {})
- req = res.get("requests", {})
- lim = res.get("limits", {})
- all_rows.append({
- "Type": wl_label,
- "Workload": workload_name,
- "Container": ctr.get("name", "?"),
- "Init": "init",
"CPU Req": req.get("cpu", "-"),
"CPU Lim": lim.get("cpu", "-"),
"Mem Req": req.get("memory", "-"),
@@ -2874,7 +2855,6 @@ def page_resource_viewer():
"Type": st.column_config.TextColumn(width="small"),
"Workload": st.column_config.TextColumn(width="medium"),
"Container": st.column_config.TextColumn(width="medium"),
- "Init": st.column_config.TextColumn(width="small"),
"CPU Req": st.column_config.TextColumn(width="small"),
"CPU Lim": st.column_config.TextColumn(width="small"),
"Mem Req": st.column_config.TextColumn(width="small"),
@@ -2887,20 +2867,20 @@ def page_resource_viewer():
# Summary stats
st.markdown("---")
st.markdown("#### Summary")
- no_cpu_req = sum(1 for r in all_rows if r["CPU Req"] == "-" and r["Init"] == "")
- no_mem_req = sum(1 for r in all_rows if r["Mem Req"] == "-" and r["Init"] == "")
- no_cpu_lim = sum(1 for r in all_rows if r["CPU Lim"] == "-" and r["Init"] == "")
- no_mem_lim = sum(1 for r in all_rows if r["Mem Lim"] == "-" and r["Init"] == "")
- non_init = sum(1 for r in all_rows if r["Init"] == "")
+ total_ctr = len(all_rows)
+ no_cpu_req = sum(1 for r in all_rows if r["CPU Req"] == "-")
+ no_mem_req = sum(1 for r in all_rows if r["Mem Req"] == "-")
+ no_cpu_lim = sum(1 for r in all_rows if r["CPU Lim"] == "-")
+ no_mem_lim = sum(1 for r in all_rows if r["Mem Lim"] == "-")
sc1, sc2, sc3, sc4 = st.columns(4)
with sc1:
- st.metric("No CPU Request", f"{no_cpu_req}/{non_init}")
+ st.metric("No CPU Request", f"{no_cpu_req}/{total_ctr}")
with sc2:
- st.metric("No CPU Limit", f"{no_cpu_lim}/{non_init}")
+ st.metric("No CPU Limit", f"{no_cpu_lim}/{total_ctr}")
with sc3:
- st.metric("No Mem Request", f"{no_mem_req}/{non_init}")
+ st.metric("No Mem Request", f"{no_mem_req}/{total_ctr}")
with sc4:
- st.metric("No Mem Limit", f"{no_mem_lim}/{non_init}")
+ st.metric("No Mem Limit", f"{no_mem_lim}/{total_ctr}")
if no_cpu_req > 0 or no_mem_req > 0:
st.warning(
@@ -2914,10 +2894,10 @@ def page_resource_viewer():
)
# Download as TSV
- tsv_lines = ["Type\tWorkload\tContainer\tInit\tCPU Req\tCPU Lim\tMem Req\tMem Lim\tEph Req\tEph Lim"]
+ tsv_lines = ["Type\tWorkload\tContainer\tCPU Req\tCPU Lim\tMem Req\tMem Lim\tEph Req\tEph Lim"]
for r in all_rows:
tsv_lines.append(
- f"{r['Type']}\t{r['Workload']}\t{r['Container']}\t{r['Init']}\t"
+ f"{r['Type']}\t{r['Workload']}\t{r['Container']}\t"
f"{r['CPU Req']}\t{r['CPU Lim']}\t{r['Mem Req']}\t{r['Mem Lim']}\t"
f"{r['Eph Req']}\t{r['Eph Lim']}"
)
@@ -2959,8 +2939,54 @@ def page_resource_viewer():
)
if node_result.success and node_result.stdout.strip():
node_names = [n.strip() for n in node_result.stdout.strip().split("\n") if n.strip()]
+
+ # Pod count distribution across nodes
+ node_pod_counts: dict[str, int] = {}
+ for node_name in node_names:
+ with st.spinner(f"Fetching pods on {node_name}..."):
+ count_result = run_kubectl(
+ profile,
+ f"get pods -A --field-selector spec.nodeName={node_name} "
+ "--no-headers",
+ timeout=15,
+ )
+ if count_result.success:
+ lines = [l for l in (count_result.stdout or "").strip().split("\n") if l.strip()]
+ node_pod_counts[node_name] = len(lines)
+ else:
+ node_pod_counts[node_name] = 0
+
+ # Show pod distribution summary
+ st.markdown("#### Pod Distribution Across Nodes")
+ dist_cols = st.columns(min(len(node_names), 6))
+ for idx, node_name in enumerate(node_names):
+ with dist_cols[idx % min(len(node_names), 6)]:
+ st.metric(node_name, f"{node_pod_counts.get(node_name, 0)} pods")
+ total_pods = sum(node_pod_counts.values())
+ if total_pods > 0 and len(node_names) > 1:
+ avg_pods = total_pods / len(node_names)
+ max_pods = max(node_pod_counts.values())
+ min_pods = min(node_pod_counts.values())
+ spread = max_pods - min_pods
+ st.markdown(
+ f"**Total:** {total_pods} pods across {len(node_names)} nodes | "
+ f"**Avg:** {avg_pods:.1f} | **Min:** {min_pods} | **Max:** {max_pods} | "
+ f"**Spread:** {spread}"
+ )
+ if spread > avg_pods * 0.5 and avg_pods > 0:
+ st.warning(
+ f"Pod distribution is uneven (spread of {spread}). "
+ "Consider checking node affinity rules or pod topology spread constraints."
+ )
+ else:
+ st.success("Pods are reasonably well-distributed across nodes.")
+
+ st.markdown("---")
+
+ # Detailed per-node pod listing
for node_name in node_names:
- with st.expander(f"Node: **{node_name}**", expanded=True):
+ pod_count = node_pod_counts.get(node_name, 0)
+ with st.expander(f"Node: **{node_name}** ({pod_count} pods)", expanded=True):
with st.spinner(f"Fetching containers on {node_name}..."):
pod_result = run_kubectl(
profile,
@@ -3219,115 +3245,6 @@ def page_resource_viewer():
st.error("Describe failed")
st.code(result.stderr, language="text")
- # ── Helm Releases ────────────────────────────────────────────────────
- with tab_helm:
- st.markdown("### Helm Release Manager")
- st.markdown("List, inspect, and manage Helm releases on your cluster.")
-
- helm_tab_list, helm_tab_install, helm_tab_history = st.tabs([
- "List Releases", "Install Chart", "Release History",
- ])
-
- with helm_tab_list:
- helm_ns_all = st.checkbox("All namespaces", value=True, key="helm_ns_all")
- helm_ns = ""
- if not helm_ns_all:
- if _rv_namespaces:
- helm_ns = st.selectbox("Namespace", options=_rv_namespaces,
- index=_rv_namespaces.index("default") if "default" in _rv_namespaces else 0,
- key="helm_ns")
- else:
- helm_ns = st.text_input("Namespace", value="default", key="helm_ns")
-
- if st.button("List Helm Releases", type="primary", key="helm_list"):
- helm_cmd = "helm list"
- if helm_ns_all:
- helm_cmd += " -A"
- elif helm_ns:
- helm_cmd += f" -n {helm_ns}"
- helm_cmd += " -o table"
-
- with st.spinner("Fetching Helm releases..."):
- result = run_kubectl(profile, helm_cmd.replace("kubectl ", ""), timeout=15)
- if result.success:
- st.code(result.stdout or "(no releases found)", language="text")
- else:
- st.warning("Helm may not be installed on this cluster.")
- st.code(result.stderr, language="text")
-
- with helm_tab_install:
- st.markdown("#### Install a Helm Chart")
- hcol1, hcol2 = st.columns(2)
- with hcol1:
- helm_release_name = st.text_input("Release Name", placeholder="my-release", key="helm_rel")
- helm_chart = st.text_input("Chart", placeholder="prometheus-community/kube-prometheus-stack", key="helm_chart")
- with hcol2:
- if _rv_namespaces:
- helm_install_ns = st.selectbox("Namespace", options=_rv_namespaces,
- index=_rv_namespaces.index("default") if "default" in _rv_namespaces else 0,
- key="helm_install_ns")
- else:
- helm_install_ns = st.text_input("Namespace", value="default", key="helm_install_ns")
- helm_create_ns = st.checkbox("Create namespace if not exists", value=True, key="helm_create_ns")
- helm_values = st.text_area(
- "Values (YAML, optional)",
- placeholder="# Custom values.yaml content here",
- height=150,
- key="helm_values",
- )
-
- if st.button("Install Chart", type="primary", key="helm_install") and helm_release_name and helm_chart:
- install_cmd = f"helm install {helm_release_name} {helm_chart} -n {helm_install_ns}"
- if helm_create_ns:
- install_cmd += " --create-namespace"
- # If user provided values, write to temp file
- if helm_values.strip():
- values_path = os.path.join(config.UPLOADS_DIR, f"helm-values-{helm_release_name}.yaml")
- with open(values_path, "w") as vf:
- vf.write(helm_values)
- install_cmd += f" -f {values_path}"
-
- with st.spinner(f"Installing {helm_chart}..."):
- result = run_kubectl(profile, install_cmd.replace("kubectl ", ""), timeout=120)
- if result.success:
- st.success(f"Release '{helm_release_name}' installed!")
- st.code(result.stdout, language="text")
- else:
- st.error("Helm install failed")
- st.code(result.stderr, language="text")
-
- with helm_tab_history:
- st.markdown("#### Release History")
- hist_name = st.text_input("Release name", placeholder="my-release", key="helm_hist_name")
- hist_ns = st.text_input("Namespace", value="default", key="helm_hist_ns")
-
- if st.button("Get History", key="helm_hist") and hist_name:
- hist_cmd = f"helm history {hist_name} -n {hist_ns}"
- with st.spinner("Fetching history..."):
- result = run_kubectl(profile, hist_cmd.replace("kubectl ", ""), timeout=15)
- if result.success:
- st.code(result.stdout, language="text")
- else:
- st.error("Could not get release history")
- st.code(result.stderr, language="text")
-
- st.markdown("---")
- st.markdown("#### Rollback Release")
- rb_name = st.text_input("Release name", placeholder="my-release", key="helm_rb_name")
- rb_ns = st.text_input("Namespace", value="default", key="helm_rb_ns")
- rb_rev = st.number_input("Revision number", min_value=1, value=1, key="helm_rb_rev")
-
- if st.button("Rollback", key="helm_rollback") and rb_name:
- rb_cmd = f"helm rollback {rb_name} {rb_rev} -n {rb_ns}"
- with st.spinner(f"Rolling back {rb_name} to revision {rb_rev}..."):
- result = run_kubectl(profile, rb_cmd.replace("kubectl ", ""), timeout=60)
- if result.success:
- st.success(f"Rolled back '{rb_name}' to revision {rb_rev}")
- st.code(result.stdout, language="text")
- else:
- st.error("Rollback failed")
- st.code(result.stderr, language="text")
-
# ── Events Timeline ──────────────────────────────────────────────────
with tab_events:
st.markdown("### Cluster Events Timeline")
@@ -3592,177 +3509,6 @@ def page_resource_viewer():
st.error("Failed to fetch pods")
st.code(result.stderr, language="text")
- # ── Network Policy Visualizer ─────────────────────────────────────────
- with tab_netpol:
- st.markdown("### Network Policy Visualizer")
- st.markdown("View and analyze NetworkPolicies to understand pod-to-pod communication rules.")
-
- npcol1, npcol2 = st.columns([2, 1])
- with npcol1:
- if _rv_namespaces:
- np_ns = st.selectbox("Namespace", ["All Namespaces"] + _rv_namespaces, key="netpol_ns")
- else:
- np_ns = st.text_input("Namespace (blank = all)", value="", key="netpol_ns_text")
- if not np_ns:
- np_ns = "All Namespaces"
-
- if st.button("Load Network Policies", type="primary", key="load_netpol"):
- ns_flag = "-A" if np_ns == "All Namespaces" else f"-n {np_ns}"
- cmd = f"get networkpolicies {ns_flag} -o json"
- with st.spinner("Fetching network policies..."):
- result = run_kubectl(profile, cmd, timeout=15)
- if result.success and result.stdout.strip():
- try:
- import pandas as pd
- np_json = json.loads(result.stdout)
- policies = np_json.get("items", [])
- if not policies:
- st.info("No NetworkPolicies found. All pod-to-pod traffic is allowed by default.")
- else:
- st.markdown(f"**Found {len(policies)} NetworkPolicies**")
-
- policy_summary = []
- for pol in policies:
- meta = pol.get("metadata", {})
- spec = pol.get("spec", {})
- pol_name = meta.get("name", "?")
- pol_ns = meta.get("namespace", "?")
- # Pod selector
- pod_sel = spec.get("podSelector", {})
- match_labels = pod_sel.get("matchLabels", {})
- selector_str = ", ".join(f"{k}={v}" for k, v in match_labels.items()) if match_labels else "(all pods)"
- # Policy types
- policy_types = spec.get("policyTypes", [])
- # Ingress rules count
- ingress_rules = spec.get("ingress", [])
- egress_rules = spec.get("egress", [])
-
- policy_summary.append({
- "Namespace": pol_ns,
- "Policy": pol_name,
- "Pod Selector": selector_str,
- "Types": ", ".join(policy_types) if policy_types else "N/A",
- "Ingress Rules": len(ingress_rules),
- "Egress Rules": len(egress_rules),
- })
-
- st.dataframe(pd.DataFrame(policy_summary), use_container_width=True, hide_index=True)
-
- # Detailed view per policy
- for pol in policies:
- meta = pol.get("metadata", {})
- spec = pol.get("spec", {})
- pol_name = meta.get("name", "?")
- pol_ns = meta.get("namespace", "?")
- with st.expander(f"{pol_ns}/{pol_name}", expanded=False):
- # Pod selector
- pod_sel = spec.get("podSelector", {})
- match_labels = pod_sel.get("matchLabels", {})
- if match_labels:
- st.markdown("**Applies to pods matching:** " + ", ".join(f"`{k}={v}`" for k, v in match_labels.items()))
- else:
- st.markdown("**Applies to:** All pods in namespace")
-
- # Ingress
- ingress_rules = spec.get("ingress", [])
- if ingress_rules:
- st.markdown("**Ingress Rules:**")
- for i, rule in enumerate(ingress_rules):
- sources = []
- for fr in rule.get("from", []):
- if "podSelector" in fr:
- labels = fr["podSelector"].get("matchLabels", {})
- sources.append("Pods: " + (", ".join(f"{k}={v}" for k, v in labels.items()) if labels else "all"))
- if "namespaceSelector" in fr:
- labels = fr["namespaceSelector"].get("matchLabels", {})
- sources.append("Namespaces: " + (", ".join(f"{k}={v}" for k, v in labels.items()) if labels else "all"))
- if "ipBlock" in fr:
- sources.append(f"CIDR: {fr['ipBlock'].get('cidr', '?')}")
- ports = []
- for p in rule.get("ports", []):
- ports.append(f"{p.get('protocol', 'TCP')}/{p.get('port', '*')}")
- src_str = ", ".join(sources) if sources else "any"
- port_str = ", ".join(ports) if ports else "all ports"
- st.markdown(f" - Rule {i+1}: Allow from **{src_str}** on **{port_str}**")
- elif "Ingress" in spec.get("policyTypes", []):
- st.warning("Ingress type declared but no rules — all ingress traffic is **denied**.")
-
- # Egress
- egress_rules = spec.get("egress", [])
- if egress_rules:
- st.markdown("**Egress Rules:**")
- for i, rule in enumerate(egress_rules):
- destinations = []
- for to in rule.get("to", []):
- if "podSelector" in to:
- labels = to["podSelector"].get("matchLabels", {})
- destinations.append("Pods: " + (", ".join(f"{k}={v}" for k, v in labels.items()) if labels else "all"))
- if "namespaceSelector" in to:
- labels = to["namespaceSelector"].get("matchLabels", {})
- destinations.append("Namespaces: " + (", ".join(f"{k}={v}" for k, v in labels.items()) if labels else "all"))
- if "ipBlock" in to:
- destinations.append(f"CIDR: {to['ipBlock'].get('cidr', '?')}")
- ports = []
- for p in rule.get("ports", []):
- ports.append(f"{p.get('protocol', 'TCP')}/{p.get('port', '*')}")
- dest_str = ", ".join(destinations) if destinations else "any"
- port_str = ", ".join(ports) if ports else "all ports"
- st.markdown(f" - Rule {i+1}: Allow to **{dest_str}** on **{port_str}**")
- elif "Egress" in spec.get("policyTypes", []):
- st.warning("Egress type declared but no rules — all egress traffic is **denied**.")
-
- st.markdown("---")
- st.markdown("**Raw YAML:**")
- import yaml
- st.code(yaml.dump(pol, default_flow_style=False), language="yaml")
-
- # Coverage check
- st.markdown("---")
- st.markdown("#### Coverage Analysis")
- if st.button("Check Unprotected Pods", key="netpol_coverage"):
- # Get all pods and check which are selected by a policy
- pod_ns_flag = f"-n {np_ns}" if np_ns != "All Namespaces" else "-A"
- pod_cmd = f"get pods {pod_ns_flag} -o json"
- with st.spinner("Analyzing coverage..."):
- pod_result = run_kubectl(profile, pod_cmd, timeout=15)
- if pod_result.success and pod_result.stdout.strip():
- try:
- all_pods = json.loads(pod_result.stdout).get("items", [])
- protected_pods = set()
- for pol in policies:
- pol_ns_name = pol.get("metadata", {}).get("namespace", "")
- pod_sel = pol.get("spec", {}).get("podSelector", {})
- match_labels = pod_sel.get("matchLabels", {})
- for p in all_pods:
- p_ns = p.get("metadata", {}).get("namespace", "")
- p_name = p.get("metadata", {}).get("name", "")
- p_labels = p.get("metadata", {}).get("labels", {})
- if p_ns != pol_ns_name:
- continue
- if not match_labels or all(p_labels.get(k) == v for k, v in match_labels.items()):
- protected_pods.add(f"{p_ns}/{p_name}")
- unprotected = []
- for p in all_pods:
- p_ns = p.get("metadata", {}).get("namespace", "")
- p_name = p.get("metadata", {}).get("name", "")
- if f"{p_ns}/{p_name}" not in protected_pods:
- unprotected.append({"Namespace": p_ns, "Pod": p_name})
- if unprotected:
- st.warning(f"{len(unprotected)} pod(s) are **not covered** by any NetworkPolicy (all traffic allowed):")
- st.dataframe(pd.DataFrame(unprotected), use_container_width=True, hide_index=True)
- else:
- st.success("All pods are covered by at least one NetworkPolicy.")
- except (json.JSONDecodeError, KeyError):
- st.error("Failed to parse pod data for coverage analysis.")
-
- except (json.JSONDecodeError, KeyError) as e:
- st.error(f"Failed to parse network policy data: {e}")
- elif result.success:
- st.info("No NetworkPolicies found. All pod-to-pod traffic is allowed by default.")
- else:
- st.error("Failed to fetch network policies")
- st.code(result.stderr, language="text")
-
# ── PVC / Storage Dashboard ───────────────────────────────────────────
with tab_pvc:
st.markdown("### PVC / Storage Dashboard")
From 368b50dcdd78894704b9ad0209ce43433ca15ca6 Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Wed, 8 Apr 2026 08:17:46 +0000
Subject: [PATCH 27/31] Fix: restrict profile JSON file permissions to 0600 to
protect kubeconfig credentials
---
k8s-agent/modules/profile_manager.py | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/k8s-agent/modules/profile_manager.py b/k8s-agent/modules/profile_manager.py
index d5707cd..0b7eb10 100644
--- a/k8s-agent/modules/profile_manager.py
+++ b/k8s-agent/modules/profile_manager.py
@@ -82,7 +82,8 @@ def save_profile(profile: ClusterProfile) -> str:
profile.updated_at = now
path = _profile_path(profile.name)
- with open(path, "w") as f:
+ fd = os.open(path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
+ with os.fdopen(fd, "w") as f:
json.dump(asdict(profile), f, indent=2)
return path
From b06dbf3e63b77c526ba75ca9d6e7ef45cc52cbf4 Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Fri, 10 Apr 2026 03:00:45 +0000
Subject: [PATCH 28/31] Fix Set Active profile button (delete widget state
before rerun), quote kubeconfig paths in shell commands
---
k8s-agent/app.py | 3 ++-
k8s-agent/config.py | 2 +-
k8s-agent/modules/cluster_debugger.py | 2 +-
3 files changed, 4 insertions(+), 3 deletions(-)
diff --git a/k8s-agent/app.py b/k8s-agent/app.py
index 39f0be2..2f8030b 100644
--- a/k8s-agent/app.py
+++ b/k8s-agent/app.py
@@ -576,7 +576,8 @@ def page_profile_manager():
st.markdown(f"**Updated:** {profile.updated_at[:10] if profile.updated_at else 'N/A'}")
if st.button("Set Active", key=f"activate_{profile.name}"):
st.session_state.active_profile = profile.name
- st.session_state.profile_selector = profile.name
+ if "profile_selector" in st.session_state:
+ del st.session_state["profile_selector"]
st.rerun()
if st.button("Delete", key=f"delete_{profile.name}", type="secondary"):
delete_profile(profile.name)
diff --git a/k8s-agent/config.py b/k8s-agent/config.py
index 90eb95b..6cfe0cf 100644
--- a/k8s-agent/config.py
+++ b/k8s-agent/config.py
@@ -120,7 +120,7 @@ def fetch_namespaces(kubeconfig_content: str) -> list[str]:
f.write(kubeconfig_content)
try:
proc = subprocess.run(
- f"{kubectl} --kubeconfig={kc_path} get namespaces -o jsonpath='{{.items[*].metadata.name}}'",
+ f"{kubectl} --kubeconfig=\"{kc_path}\" get namespaces -o jsonpath='{{.items[*].metadata.name}}'",
shell=True, capture_output=True, text=True, timeout=15,
)
if proc.returncode == 0 and proc.stdout.strip():
diff --git a/k8s-agent/modules/cluster_debugger.py b/k8s-agent/modules/cluster_debugger.py
index 6815a42..df80c9d 100644
--- a/k8s-agent/modules/cluster_debugger.py
+++ b/k8s-agent/modules/cluster_debugger.py
@@ -77,7 +77,7 @@ def _run_local_kubectl(kubeconfig_content: str, kubectl_args: str, timeout: int
kubeconfig_path = config.get_kubeconfig_path("_debug_temp")
with open(kubeconfig_path, "w") as f:
f.write(kubeconfig_content)
- full_cmd = f"{kubectl} --kubeconfig={kubeconfig_path} {kubectl_args}"
+ full_cmd = f"{kubectl} --kubeconfig=\"{kubeconfig_path}\" {kubectl_args}"
try:
proc = subprocess.run(full_cmd, shell=True, capture_output=True, text=True, timeout=timeout)
return SSHResult(
From 0d9734d7b71e8f727c15d4a0ccc85c8ee0ff8c76 Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Fri, 10 Apr 2026 03:05:41 +0000
Subject: [PATCH 29/31] Add 'Collect pod logs' mode to Smart Log Analysis for
Istio access log analysis
---
k8s-agent/app.py | 149 ++++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 147 insertions(+), 2 deletions(-)
diff --git a/k8s-agent/app.py b/k8s-agent/app.py
index 2f8030b..66d12fb 100644
--- a/k8s-agent/app.py
+++ b/k8s-agent/app.py
@@ -1824,9 +1824,16 @@ def page_log_analysis():
"**pattern mining** (Drain-style), and **auto-summarization**."
)
+ st.info(
+ "**Istio access log analysis:** Select **Collect pod logs** and choose an "
+ "application pod with an Istio sidecar (e.g. `istio-proxy` container). "
+ "The pipeline auto-detects Envoy/Istio access logs and shows response time "
+ "analytics, status codes, per-path breakdowns, and slow requests."
+ )
+
smart_mode = st.radio(
"Analysis mode",
- ["Collect from cluster", "Paste logs"],
+ ["Collect from cluster", "Collect pod logs", "Paste logs"],
horizontal=True,
key="smart_mode",
)
@@ -1873,11 +1880,149 @@ def page_log_analysis():
if "_smart_log_text" in st.session_state and not smart_log_text:
smart_log_text = st.session_state["_smart_log_text"]
+ elif smart_mode == "Collect pod logs":
+ st.markdown(
+ "Fetch logs from a specific pod — ideal for **Istio sidecar** access "
+ "logs (`istio-proxy` container) or any application pod."
+ )
+ spcol1, spcol2 = st.columns(2)
+ with spcol1:
+ if _cluster_namespaces:
+ smart_pod_ns = st.selectbox(
+ "Namespace", options=_cluster_namespaces,
+ index=_cluster_namespaces.index("default") if "default" in _cluster_namespaces else 0,
+ key="smart_pod_ns",
+ )
+ else:
+ smart_pod_ns = st.text_input("Namespace", value="default", key="smart_pod_ns")
+ with spcol2:
+ smart_pod_lines = st.number_input(
+ "Lines to fetch", min_value=100, max_value=10000, value=1000, key="smart_pod_lines",
+ )
+
+ # Load pods button
+ if st.button("Load Pods", key="smart_load_pods"):
+ with st.spinner(f"Fetching pods in namespace '{smart_pod_ns}'..."):
+ pod_result = get_pod_list(cp_node, namespace=smart_pod_ns, profile=profile)
+ if pod_result.success and pod_result.stdout.strip():
+ _sp_pods: list[dict] = []
+ for line in pod_result.stdout.strip().split("\n"):
+ parts = line.split()
+ if len(parts) >= 2:
+ _sp_pods.append({
+ "namespace": parts[0],
+ "name": parts[1],
+ "status": parts[2] if len(parts) > 2 else "Unknown",
+ "containers": parts[3] if len(parts) > 3 else "",
+ })
+ st.session_state["_smart_pod_list"] = _sp_pods
+ st.session_state["_smart_pod_list_ns"] = smart_pod_ns
+ st.success(f"Found {len(_sp_pods)} pod(s) in namespace '{smart_pod_ns}'.")
+ elif pod_result.success:
+ st.session_state["_smart_pod_list"] = []
+ st.warning(f"No pods found in namespace '{smart_pod_ns}'.")
+ else:
+ st.error(f"Failed to fetch pods: {pod_result.stderr}")
+
+ # Pod & container selection
+ _sp_pods_loaded = st.session_state.get("_smart_pod_list", [])
+ _sp_pods_ns = st.session_state.get("_smart_pod_list_ns", "")
+
+ sp_col1, sp_col2 = st.columns(2)
+ with sp_col1:
+ if _sp_pods_loaded and _sp_pods_ns == smart_pod_ns:
+ sp_pod_options = [f"{p['name']} ({p['status']})" for p in _sp_pods_loaded]
+ sp_selected_idx = st.selectbox(
+ "Pod Name", options=range(len(sp_pod_options)),
+ format_func=lambda i: sp_pod_options[i],
+ key="smart_pod_select",
+ )
+ smart_pod_name = _sp_pods_loaded[sp_selected_idx]["name"] if sp_selected_idx is not None else ""
+ else:
+ smart_pod_name = st.text_input(
+ "Pod Name", placeholder="Click 'Load Pods' to get dropdown", key="smart_pod_name",
+ )
+ with sp_col2:
+ # Container selection — show istio-proxy hint
+ if _sp_pods_loaded and _sp_pods_ns == smart_pod_ns and smart_pod_name:
+ matching = [p for p in _sp_pods_loaded if p["name"] == smart_pod_name]
+ container_names = []
+ if matching and matching[0].get("containers"):
+ container_names = [c.strip() for c in matching[0]["containers"].split(",") if c.strip()]
+ if container_names:
+ container_names = ["(all / default)"] + container_names
+ # Pre-select istio-proxy if available
+ default_idx = 0
+ for idx, cn in enumerate(container_names):
+ if cn == "istio-proxy":
+ default_idx = idx
+ break
+ smart_pod_container = st.selectbox(
+ "Container (select `istio-proxy` for Istio access logs)",
+ options=container_names,
+ index=default_idx,
+ key="smart_pod_container",
+ )
+ if smart_pod_container == "(all / default)":
+ smart_pod_container = ""
+ else:
+ smart_pod_container = st.text_input(
+ "Container (e.g. istio-proxy)",
+ value="istio-proxy",
+ key="smart_pod_container_text",
+ )
+ else:
+ smart_pod_container = st.text_input(
+ "Container (e.g. istio-proxy for Istio access logs)",
+ value="istio-proxy",
+ key="smart_pod_container_text2",
+ )
+
+ # Fetch & analyze
+ smart_pod_since_opts = {
+ "Last 15 min": "15m",
+ "Last 1 hour": "1h",
+ "Last 6 hours": "6h",
+ "Last 24 hours": "24h",
+ }
+ smart_pod_since_label = st.selectbox(
+ "Time Range", list(smart_pod_since_opts.keys()), index=1, key="smart_pod_since",
+ )
+ smart_pod_since_k8s = smart_pod_since_opts[smart_pod_since_label]
+
+ if st.button("Fetch Pod Logs & Analyze", type="primary", key="smart_pod_collect"):
+ if not smart_pod_name:
+ st.warning("Please select or enter a pod name.")
+ else:
+ with st.spinner(f"Fetching logs from pod '{smart_pod_name}' (container: {smart_pod_container or 'default'})..."):
+ pod_log_result = collect_pod_logs(
+ cp_node,
+ namespace=smart_pod_ns,
+ pod_name=smart_pod_name,
+ container=smart_pod_container,
+ lines=smart_pod_lines,
+ since_k8s=smart_pod_since_k8s,
+ profile=profile,
+ )
+ if pod_log_result.success and pod_log_result.stdout.strip():
+ smart_log_text = pod_log_result.stdout
+ st.session_state["_smart_log_text"] = smart_log_text
+ st.session_state["_smart_source"] = f"pod:{smart_pod_name}/{smart_pod_container or 'default'}"
+ st.success(f"Fetched {len(smart_log_text.splitlines())} log lines from pod '{smart_pod_name}'.")
+ elif pod_log_result.success:
+ st.info(f"No logs returned from pod '{smart_pod_name}' for the selected time range.")
+ else:
+ st.error(f"Failed to fetch pod logs: {pod_log_result.stderr}")
+
+ # Persist across reruns
+ if "_smart_log_text" in st.session_state and not smart_log_text:
+ smart_log_text = st.session_state["_smart_log_text"]
+
else:
smart_log_text = st.text_area(
"Paste log output",
height=200,
- placeholder="Paste your Kubernetes logs here for smart analysis...",
+ placeholder="Paste your Kubernetes / Istio access logs here for smart analysis...",
key="smart_paste",
)
if smart_log_text:
From 45560a6e89f3224d10408f29ffe382820d84d082 Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Fri, 10 Apr 2026 03:16:18 +0000
Subject: [PATCH 30/31] Fix profile switching: sync widget key with
active_profile before selectbox renders, add on_change callback, delete
profile_selector on all profile state changes
---
k8s-agent/app.py | 43 +++++++++++++++++++++++++++++++++++++------
1 file changed, 37 insertions(+), 6 deletions(-)
diff --git a/k8s-agent/app.py b/k8s-agent/app.py
index 66d12fb..d2ab0e5 100644
--- a/k8s-agent/app.py
+++ b/k8s-agent/app.py
@@ -176,16 +176,41 @@ def render_sidebar():
profile_names = [p.name for p in profiles]
if profile_names:
+ _selector_options = ["(none)"] + profile_names
+ _current = st.session_state.get("active_profile")
+ _default_idx = (
+ profile_names.index(_current) + 1
+ if _current and _current in profile_names
+ else 0
+ )
+
+ # Sync widget key with active_profile before widget renders.
+ # Streamlit reads the widget value from session_state[key] when
+ # the key already exists, ignoring ``index``. So we must write
+ # the desired option into session_state["profile_selector"]
+ # *before* the selectbox is instantiated.
+ if "profile_selector" not in st.session_state:
+ # First render or key was deleted — seed from active_profile
+ st.session_state["profile_selector"] = _selector_options[_default_idx]
+ elif st.session_state["profile_selector"] not in _selector_options:
+ # Profile was deleted — reset
+ st.session_state["profile_selector"] = "(none)"
+
+ def _on_profile_change():
+ sel = st.session_state.get("profile_selector", "(none)")
+ if sel != "(none)":
+ st.session_state.active_profile = sel
+ else:
+ st.session_state.active_profile = None
+
selected = st.selectbox(
"Active Profile",
- options=["(none)"] + profile_names,
- index=(
- profile_names.index(st.session_state.active_profile) + 1
- if st.session_state.active_profile in profile_names
- else 0
- ),
+ options=_selector_options,
key="profile_selector",
+ on_change=_on_profile_change,
)
+
+ # Also keep active_profile in sync on this run
if selected != "(none)":
st.session_state.active_profile = selected
profile = load_profile(selected)
@@ -475,6 +500,8 @@ def page_profile_manager():
)
path = save_profile(profile)
st.session_state.active_profile = name
+ if "profile_selector" in st.session_state:
+ del st.session_state["profile_selector"]
st.session_state._flash_message = ("success", f"Profile '{name}' created successfully! Select it from the sidebar to get started.")
st.rerun()
@@ -532,6 +559,8 @@ def page_profile_manager():
try:
save_profile(profile)
st.session_state.active_profile = import_name
+ if "profile_selector" in st.session_state:
+ del st.session_state["profile_selector"]
st.session_state._flash_message = (
"success",
f"Cluster '{import_name}' imported successfully! "
@@ -583,6 +612,8 @@ def page_profile_manager():
delete_profile(profile.name)
if st.session_state.active_profile == profile.name:
st.session_state.active_profile = None
+ if "profile_selector" in st.session_state:
+ del st.session_state["profile_selector"]
st.rerun()
# ── Import / Export ───────────────────────────────────────────────────
From c89f8cc940a919b1c90bcff9c742fd70720fcee2 Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Fri, 10 Apr 2026 15:49:16 +0000
Subject: [PATCH 31/31] Add Ollama LLM support: provider selection
(OpenAI/Ollama), local Ollama connection, model fetching, streaming support
---
k8s-agent/app.py | 107 ++++++++++++----
k8s-agent/config.py | 29 ++++-
k8s-agent/modules/llm_client.py | 211 ++++++++++++++++++++------------
3 files changed, 241 insertions(+), 106 deletions(-)
diff --git a/k8s-agent/app.py b/k8s-agent/app.py
index d2ab0e5..2e06614 100644
--- a/k8s-agent/app.py
+++ b/k8s-agent/app.py
@@ -87,7 +87,7 @@
summarize_logs,
analyze_istio_access_logs,
)
-from modules.llm_client import query_llm, stream_llm
+from modules.llm_client import query_llm, stream_llm, list_ollama_models
# ── Page Configuration ────────────────────────────────────────────────────
@@ -263,27 +263,82 @@ def _on_profile_change():
# ── LLM config ──
with st.expander("LLM Settings"):
- st.text_input(
- "API URL",
- value=config.LLM_API_URL,
- key="llm_api_url",
- help="Endpoint for the LLM API",
- )
- st.text_input(
- "API Key",
- value=config.LLM_API_KEY[:8] + "..." if config.LLM_API_KEY else "",
- type="password",
- key="llm_api_key_display",
- disabled=True,
- help="Set via LLM_API_KEY or INFOSYS_CODER_API_KEY env var",
- )
- st.selectbox(
- "Model",
- options=["gpt-4", "gpt-4o", "gpt-3.5-turbo"],
- index=0,
- key="llm_model_select",
+ provider_options = ["openai", "ollama"]
+ _prov_idx = provider_options.index(config.LLM_PROVIDER) if config.LLM_PROVIDER in provider_options else 0
+ llm_provider = st.selectbox(
+ "Provider",
+ options=provider_options,
+ format_func=lambda p: {"openai": "OpenAI-compatible", "ollama": "Ollama (local)"}[p],
+ index=_prov_idx,
+ key="llm_provider_select",
+ help="Select 'Ollama (local)' to connect to a local Ollama instance",
)
+ if llm_provider == "ollama":
+ ollama_url = st.text_input(
+ "Ollama URL",
+ value=config.OLLAMA_BASE_URL,
+ key="ollama_url_input",
+ help="Base URL for your Ollama instance (e.g. http://10.73.98.113:11434)",
+ )
+ # Fetch models button
+ if st.button("Fetch available models", key="ollama_fetch_models"):
+ with st.spinner("Connecting to Ollama..."):
+ models = list_ollama_models(ollama_url)
+ if models:
+ st.session_state["_ollama_models"] = models
+ st.success(f"Found {len(models)} model(s)")
+ else:
+ st.error(f"Could not connect to Ollama at {ollama_url}")
+ _cached_models = st.session_state.get("_ollama_models", [])
+ if _cached_models:
+ st.selectbox(
+ "Model",
+ options=_cached_models,
+ index=0,
+ key="ollama_model_select",
+ )
+ else:
+ st.text_input(
+ "Model",
+ value=config.OLLAMA_MODEL,
+ key="ollama_model_input",
+ help="Model name (e.g. llama3, mistral, codellama)",
+ )
+
+ # Apply Ollama settings at runtime
+ config.LLM_PROVIDER = "ollama"
+ config.OLLAMA_BASE_URL = ollama_url
+ _sel_model = st.session_state.get("ollama_model_select") or st.session_state.get("ollama_model_input", config.OLLAMA_MODEL)
+ config.OLLAMA_MODEL = _sel_model
+
+ if config.is_llm_configured():
+ st.caption(f"✓ Ollama configured → `{config.OLLAMA_BASE_URL}` / `{config.OLLAMA_MODEL}`")
+ else:
+ st.caption("Enter the Ollama URL above to enable AI features")
+ else:
+ st.text_input(
+ "API URL",
+ value=config.LLM_API_URL,
+ key="llm_api_url",
+ help="Endpoint for the LLM API",
+ )
+ st.text_input(
+ "API Key",
+ value=config.LLM_API_KEY[:8] + "..." if config.LLM_API_KEY else "",
+ type="password",
+ key="llm_api_key_display",
+ disabled=True,
+ help="Set via LLM_API_KEY or INFOSYS_CODER_API_KEY env var",
+ )
+ st.selectbox(
+ "Model",
+ options=["gpt-4", "gpt-4o", "gpt-3.5-turbo"],
+ index=0,
+ key="llm_model_select",
+ )
+ config.LLM_PROVIDER = "openai"
+
return selected_page
@@ -1100,7 +1155,7 @@ def page_cluster_creation():
st.markdown("### AI Cluster Setup Advisor")
if not is_llm_configured():
st.info(
- "LLM is not configured. Set `LLM_API_URL` and `LLM_API_KEY` "
+ "LLM is not configured. Select a provider (OpenAI or Ollama) in the sidebar LLM Settings, or set `LLM_API_URL` and `LLM_API_KEY` "
"environment variables to enable AI-powered recommendations."
)
else:
@@ -1183,7 +1238,7 @@ def page_cluster_debugger():
if st.session_state.debug_results:
if not is_llm_configured():
- st.info("Enable AI analysis by setting `LLM_API_URL` and `LLM_API_KEY` env vars.")
+ st.info("Enable AI analysis by selecting a provider (OpenAI or Ollama) in the sidebar LLM Settings.")
elif st.button("Analyze with AI", type="secondary"):
with st.spinner("AI is analyzing diagnostics..."):
analysis = analyze_diagnostics(
@@ -1237,7 +1292,7 @@ def page_cluster_debugger():
st.markdown("### AI Debug Assistant")
if not is_llm_configured():
st.info(
- "LLM is not configured. Set `LLM_API_URL` and `LLM_API_KEY` "
+ "LLM is not configured. Select a provider (OpenAI or Ollama) in the sidebar LLM Settings, or set `LLM_API_URL` and `LLM_API_KEY` "
"environment variables to enable AI-powered debugging."
)
st.markdown(
@@ -1571,7 +1626,7 @@ def page_monitoring_setup():
st.markdown("### AI Monitoring Advisor")
if not is_llm_configured():
st.info(
- "LLM is not configured. Set `LLM_API_URL` and `LLM_API_KEY` "
+ "LLM is not configured. Select a provider (OpenAI or Ollama) in the sidebar LLM Settings, or set `LLM_API_URL` and `LLM_API_KEY` "
"environment variables to enable AI-powered monitoring advice."
)
else:
@@ -2444,7 +2499,7 @@ def page_log_analysis():
st.markdown("### AI-Powered Log Analysis")
if not is_llm_configured():
st.info(
- "LLM is not configured. Set `LLM_API_URL` and `LLM_API_KEY` "
+ "LLM is not configured. Select a provider (OpenAI or Ollama) in the sidebar LLM Settings, or set `LLM_API_URL` and `LLM_API_KEY` "
"environment variables to enable AI-powered log analysis."
)
st.markdown(
@@ -4150,7 +4205,7 @@ def page_ai_assistant():
if not is_llm_configured():
st.info(
- "LLM is not configured. Set `LLM_API_URL` and `LLM_API_KEY` "
+ "LLM is not configured. Select a provider (OpenAI or Ollama) in the sidebar LLM Settings, or set `LLM_API_URL` and `LLM_API_KEY` "
"environment variables to enable the AI chat assistant."
)
st.markdown(
diff --git a/k8s-agent/config.py b/k8s-agent/config.py
index 6cfe0cf..2951fa1 100644
--- a/k8s-agent/config.py
+++ b/k8s-agent/config.py
@@ -6,6 +6,8 @@
# LLM Configuration
+# Provider: "openai" (OpenAI-compatible endpoint) or "ollama" (local Ollama)
+LLM_PROVIDER = os.getenv("LLM_PROVIDER", "openai")
LLM_API_URL = os.getenv(
"LLM_API_URL",
"https://aigateway-intern.ad.infosys.com/aigateway/chat/completions",
@@ -15,12 +17,37 @@
LLM_TEMPERATURE = float(os.getenv("LLM_TEMPERATURE", "0.3"))
LLM_MAX_TOKENS = int(os.getenv("LLM_MAX_TOKENS", "4096"))
+# Ollama-specific defaults
+OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "http://10.73.98.113:11434")
+OLLAMA_MODEL = os.getenv("OLLAMA_MODEL", "llama3")
+
def is_llm_configured() -> bool:
- """Return True if the LLM endpoint and API key are both set."""
+ """Return True if the LLM is configured.
+
+ For Ollama, only the base URL is required (no API key).
+ For OpenAI-compatible endpoints, both URL and key are required.
+ """
+ if LLM_PROVIDER == "ollama":
+ return bool(OLLAMA_BASE_URL)
return bool(LLM_API_URL and LLM_API_KEY)
+def get_active_llm_url() -> str:
+ """Return the effective chat completions URL based on the active provider."""
+ if LLM_PROVIDER == "ollama":
+ base = OLLAMA_BASE_URL.rstrip("/")
+ return f"{base}/api/chat"
+ return LLM_API_URL
+
+
+def get_active_model() -> str:
+ """Return the effective model name based on the active provider."""
+ if LLM_PROVIDER == "ollama":
+ return OLLAMA_MODEL
+ return LLM_MODEL
+
+
# Application paths
DATA_DIR = os.path.join(os.path.dirname(__file__), "data")
PROFILES_DIR = os.path.join(DATA_DIR, "profiles")
diff --git a/k8s-agent/modules/llm_client.py b/k8s-agent/modules/llm_client.py
index fc77be1..ab8cb18 100644
--- a/k8s-agent/modules/llm_client.py
+++ b/k8s-agent/modules/llm_client.py
@@ -1,8 +1,11 @@
-"""LLM client — optional integration with an OpenAI-compatible endpoint.
+"""LLM client — optional integration with OpenAI-compatible or Ollama endpoints.
+
+Supports two providers:
+ * **openai** — Any OpenAI-compatible chat completions API (default).
+ * **ollama** — Local Ollama instance (no API key required).
All public functions gracefully return a fallback message when the LLM is not
-configured (i.e. ``LLM_API_KEY`` or ``LLM_API_URL`` is empty). The rest of the
-application works without any LLM dependency.
+configured. The rest of the application works without any LLM dependency.
"""
import json
@@ -13,8 +16,9 @@
import config
_NOT_CONFIGURED_MSG = (
- "LLM is not configured. Set the LLM_API_URL and LLM_API_KEY environment "
- "variables to enable AI-powered features."
+ "LLM is not configured. Set the LLM provider and connection details in "
+ "the sidebar LLM Settings panel, or via environment variables "
+ "(LLM_PROVIDER, LLM_API_URL / OLLAMA_BASE_URL)."
)
@@ -32,64 +36,103 @@
the provided information is insufficient."""
-def query_llm(
+def _build_messages(
user_message: str,
system_message: Optional[str] = None,
conversation_history: Optional[list[dict]] = None,
- temperature: Optional[float] = None,
- max_tokens: Optional[int] = None,
-) -> str:
- """Send a query to the LLM and return the response text.
-
- Args:
- user_message: The user's message/query.
- system_message: Optional system prompt override.
- conversation_history: Optional list of prior messages for context.
- temperature: Optional temperature override.
- max_tokens: Optional max tokens override.
-
- Returns:
- The assistant's response text.
- """
+) -> list[dict]:
+ """Assemble the messages list shared by both query and stream."""
messages = []
-
sys_msg = system_message or SYSTEM_PROMPT
messages.append({"role": "system", "content": sys_msg})
-
if conversation_history:
messages.extend(conversation_history)
-
messages.append({"role": "user", "content": user_message})
+ return messages
+
+
+def _build_headers() -> dict:
+ """Return request headers for the active provider."""
+ headers = {"Content-Type": "application/json"}
+ if config.LLM_PROVIDER != "ollama" and config.LLM_API_KEY:
+ headers["Authorization"] = f"Bearer {config.LLM_API_KEY}"
+ return headers
- headers = {
- "Content-Type": "application/json",
- "Authorization": f"Bearer {config.LLM_API_KEY}",
- }
+def _build_payload(
+ messages: list[dict],
+ temperature: Optional[float] = None,
+ max_tokens: Optional[int] = None,
+ stream: bool = False,
+) -> dict:
+ """Return the request payload for the active provider."""
+ temp = temperature if temperature is not None else config.LLM_TEMPERATURE
+ model = config.get_active_model()
+
+ if config.LLM_PROVIDER == "ollama":
+ payload: dict = {
+ "model": model,
+ "messages": messages,
+ "stream": stream,
+ "options": {
+ "temperature": temp,
+ },
+ }
+ if max_tokens is not None or config.LLM_MAX_TOKENS:
+ payload["options"]["num_predict"] = (
+ max_tokens if max_tokens is not None else config.LLM_MAX_TOKENS
+ )
+ return payload
+
+ # OpenAI-compatible
payload = {
- "model": config.LLM_MODEL,
+ "model": model,
"messages": messages,
- "temperature": temperature if temperature is not None else config.LLM_TEMPERATURE,
+ "temperature": temp,
"max_tokens": max_tokens if max_tokens is not None else config.LLM_MAX_TOKENS,
}
+ if stream:
+ payload["stream"] = True
+ return payload
+
+
+def query_llm(
+ user_message: str,
+ system_message: Optional[str] = None,
+ conversation_history: Optional[list[dict]] = None,
+ temperature: Optional[float] = None,
+ max_tokens: Optional[int] = None,
+) -> str:
+ """Send a query to the LLM and return the response text.
+ Supports both OpenAI-compatible and Ollama endpoints.
+ """
if not config.is_llm_configured():
return _NOT_CONFIGURED_MSG
+ messages = _build_messages(user_message, system_message, conversation_history)
+ headers = _build_headers()
+ payload = _build_payload(messages, temperature, max_tokens, stream=False)
+ url = config.get_active_llm_url()
+
try:
- response = requests.post(
- config.LLM_API_URL,
- headers=headers,
- json=payload,
- timeout=120,
- )
+ response = requests.post(url, headers=headers, json=payload, timeout=120)
response.raise_for_status()
data = response.json()
+
+ # Ollama returns {"message": {"content": "..."}}
+ if config.LLM_PROVIDER == "ollama":
+ return data.get("message", {}).get("content", "")
+
+ # OpenAI returns {"choices": [{"message": {"content": "..."}}]}
return data["choices"][0]["message"]["content"]
except requests.exceptions.Timeout:
return "Error: LLM request timed out. Please try again."
except requests.exceptions.ConnectionError:
- return "Error: Could not connect to the LLM endpoint. Please check your network and LLM_API_URL configuration."
+ return (
+ f"Error: Could not connect to the LLM endpoint at {url}. "
+ "Please check your network and LLM configuration."
+ )
except requests.exceptions.HTTPError as exc:
return f"Error: LLM API returned HTTP {exc.response.status_code}: {exc.response.text}"
except (KeyError, IndexError, json.JSONDecodeError) as exc:
@@ -105,58 +148,68 @@ def stream_llm(
) -> Generator[str, None, None]:
"""Stream a response from the LLM token-by-token.
+ Supports both OpenAI-compatible and Ollama endpoints.
Yields chunks of text as they arrive from the API.
"""
- messages = []
-
- sys_msg = system_message or SYSTEM_PROMPT
- messages.append({"role": "system", "content": sys_msg})
-
- if conversation_history:
- messages.extend(conversation_history)
-
- messages.append({"role": "user", "content": user_message})
-
- headers = {
- "Content-Type": "application/json",
- "Authorization": f"Bearer {config.LLM_API_KEY}",
- }
-
- payload = {
- "model": config.LLM_MODEL,
- "messages": messages,
- "temperature": temperature if temperature is not None else config.LLM_TEMPERATURE,
- "max_tokens": max_tokens if max_tokens is not None else config.LLM_MAX_TOKENS,
- "stream": True,
- }
-
if not config.is_llm_configured():
yield _NOT_CONFIGURED_MSG
return
+ messages = _build_messages(user_message, system_message, conversation_history)
+ headers = _build_headers()
+ payload = _build_payload(messages, temperature, max_tokens, stream=True)
+ url = config.get_active_llm_url()
+
try:
response = requests.post(
- config.LLM_API_URL,
- headers=headers,
- json=payload,
- timeout=120,
- stream=True,
+ url, headers=headers, json=payload, timeout=120, stream=True,
)
response.raise_for_status()
- for line in response.iter_lines(decode_unicode=True):
- if not line or not line.startswith("data: "):
- continue
- data_str = line[len("data: "):]
- if data_str.strip() == "[DONE]":
- break
- try:
- chunk = json.loads(data_str)
- delta = chunk.get("choices", [{}])[0].get("delta", {})
- content = delta.get("content", "")
- if content:
- yield content
- except (json.JSONDecodeError, KeyError, IndexError):
- continue
+ if config.LLM_PROVIDER == "ollama":
+ # Ollama streams newline-delimited JSON objects
+ for line in response.iter_lines(decode_unicode=True):
+ if not line:
+ continue
+ try:
+ chunk = json.loads(line)
+ content = chunk.get("message", {}).get("content", "")
+ if content:
+ yield content
+ if chunk.get("done", False):
+ break
+ except json.JSONDecodeError:
+ continue
+ else:
+ # OpenAI SSE format: "data: {...}\n"
+ for line in response.iter_lines(decode_unicode=True):
+ if not line or not line.startswith("data: "):
+ continue
+ data_str = line[len("data: "):]
+ if data_str.strip() == "[DONE]":
+ break
+ try:
+ chunk = json.loads(data_str)
+ delta = chunk.get("choices", [{}])[0].get("delta", {})
+ content = delta.get("content", "")
+ if content:
+ yield content
+ except (json.JSONDecodeError, KeyError, IndexError):
+ continue
except requests.exceptions.RequestException as exc:
yield f"\n\nError during streaming: {exc}"
+
+
+def list_ollama_models(base_url: str = "") -> list[str]:
+ """Fetch available model names from an Ollama instance.
+
+ Returns a list of model name strings, or an empty list on failure.
+ """
+ url = (base_url or config.OLLAMA_BASE_URL).rstrip("/") + "/api/tags"
+ try:
+ resp = requests.get(url, timeout=10)
+ resp.raise_for_status()
+ data = resp.json()
+ return [m["name"] for m in data.get("models", [])]
+ except Exception:
+ return []