diff --git a/.gitignore b/.gitignore index 9359002..dc0bed2 100644 --- a/.gitignore +++ b/.gitignore @@ -22,3 +22,6 @@ charts/*/charts/ *.pem *.key kubeconfig* +k8s-agent/__pycache__/ +k8s-agent/data/profiles/*.json +k8s-agent/modules/__pycache__/ diff --git a/k8s-agent/README.md b/k8s-agent/README.md new file mode 100644 index 0000000..17c03e9 --- /dev/null +++ b/k8s-agent/README.md @@ -0,0 +1,87 @@ +# K8s Agent — On-Prem Kubernetes Cluster Management + +A Streamlit-based UI for managing on-premises Kubernetes clusters with CRI-O container runtime and Flannel CNI. + +## Features + +1. **Profile Manager** — Create and manage profiles for multiple clusters with node definitions (control-plane / worker), SSH credentials, and K8s configuration. + +2. **Cluster Creation** — SSH into nodes and provision a full Kubernetes cluster: + - Installs CRI-O container runtime + - Installs kubeadm, kubelet, kubectl + - Initializes control plane with best-practice kubeadm config + - Deploys Flannel CNI + - Joins worker nodes automatically + - Applies security hardening (NetworkPolicies, RBAC, ResourceQuotas, PodSecurity) + +3. **Cluster Debugger** — Run diagnostic commands and get AI-powered analysis: + - Pre-built checks for nodes, pods, networking, storage, certificates + - Category-based scanning (Cluster Overview, Networking, Security, etc.) + - Custom command execution via SSH + - AI-powered root cause analysis and remediation recommendations + +4. **Monitoring Setup** — Deploy Prometheus + Grafana with production-ready configuration: + - One-click kube-prometheus-stack installation + - Grafana dashboard imports (cluster overview, node exporter, pods, etcd, API server, etc.) + - Alerting rules for node health, pod crashes, disk pressure, etcd latency + - AI-powered monitoring recommendations + +5. 
**Log Analysis** — Collect, parse, and correlate logs across cluster components: + - System component logs (kubelet, CRI-O, API server, etcd, Flannel, CoreDNS) + - Pod-level log collection with previous container support + - Automated error pattern extraction and grouping + - Cross-source error correlation + - AI-powered deep log analysis and root cause identification + +6. **AI Assistant** — Chat interface for Kubernetes questions powered by your LLM. + +## Quick Start + +```bash +cd k8s-agent +pip install -r requirements.txt + +# Set your LLM API key +export LLM_API_KEY="your-api-key" +# Or use the Infosys AI Gateway key +export INFOSYS_CODER_API_KEY="your-key" + +# Run the app +streamlit run app.py +``` + +## Configuration + +Environment variables: + +| Variable | Description | Default | +|----------|-------------|---------| +| `LLM_API_URL` | LLM API endpoint | Infosys AI Gateway | +| `LLM_API_KEY` | LLM API key | Falls back to `INFOSYS_CODER_API_KEY` | +| `LLM_MODEL` | Model name | `gpt-4` | +| `LLM_TEMPERATURE` | Response temperature | `0.3` | +| `LLM_MAX_TOKENS` | Max response tokens | `4096` | + +## Architecture + +``` +k8s-agent/ +├── app.py # Main Streamlit application +├── config.py # Configuration and environment variables +├── requirements.txt # Python dependencies +├── modules/ +│ ├── llm_client.py # LLM API integration (query + streaming) +│ ├── profile_manager.py # Cluster profile CRUD operations +│ ├── cluster_creator.py # SSH-based cluster provisioning +│ ├── cluster_debugger.py # Diagnostic commands and AI analysis +│ ├── monitoring_setup.py # Prometheus/Grafana deployment +│ └── log_analyzer.py # Log collection, parsing, correlation +├── templates/ # Configuration templates +└── data/profiles/ # Stored cluster profiles (JSON) +``` + +## Requirements + +- Python 3.10+ +- SSH access to target nodes (for cluster operations) +- LLM API endpoint (Infosys AI Gateway or compatible OpenAI-style API) diff --git a/k8s-agent/app.py b/k8s-agent/app.py new 
"""K8s Agent — Streamlit-based Kubernetes Cluster Management UI."""

import sys
import os

# Ensure the k8s-agent directory is on the Python path so sibling imports work.
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

import json
import streamlit as st
# Navigation uses native st.radio — no third-party component needed.

import config
from config import is_llm_configured, get_kubectl_path, fetch_namespaces, get_kubeconfig_path
from modules.profile_manager import (
    ClusterProfile,
    save_profile,
    load_profile,
    list_profiles,
    delete_profile,
    update_profile_status,
)
from modules.cluster_creator import (
    test_ssh_connectivity,
    run_ssh_command,
    generate_common_setup_script,
    generate_control_plane_init_script,
    generate_worker_join_script,
    generate_best_practices_script,
    provision_node_common,
    init_control_plane,
    retrieve_join_command,
    join_worker_node,
    apply_best_practices,
    get_cluster_status,
    get_llm_cluster_advice,
    upload_flannel_manifest_to_node,
    run_kubectl,
    ProvisionStep,
    _run_step,
    get_common_setup_steps,
    get_control_plane_steps,
    get_worker_join_steps,
    get_best_practices_steps,
    get_cluster_reset_steps,
)
from modules.cluster_debugger import (
    DIAGNOSTIC_COMMANDS,
    KUBECTL_DIAGNOSTIC_COMMANDS,
    CATEGORY_MAP,
    get_available_commands,
    run_diagnostic,
    run_category_diagnostics,
    run_all_diagnostics,
    run_custom_command,
    analyze_diagnostics,
    get_debug_suggestion,
    check_pod_issues,
)
from modules.monitoring_setup import (
    GRAFANA_DASHBOARDS,
    install_helm,
    install_prometheus_stack,
    install_dashboards,
    install_alert_rules,
    get_monitoring_status,
    get_monitoring_advice,
    generate_prometheus_install_script,
    generate_dashboard_import_script,
    generate_alerting_rules_script,
)
from modules.log_analyzer import (
    LOG_SOURCES,
    get_available_log_sources,
    collect_logs,
    collect_pod_logs,
    collect_multi_source_logs,
    analyze_logs,
    correlate_errors,
    llm_analyze_logs,
    llm_correlate_analysis,
    get_pod_list,
    smart_analyze,
    cluster_logs,
    detect_anomalies,
    mine_log_patterns,
    summarize_logs,
    analyze_istio_access_logs,
)
from modules.llm_client import query_llm, stream_llm, list_ollama_models


# ── Page Configuration ────────────────────────────────────────────────────

st.set_page_config(
    page_title="K8s Agent",
    page_icon="☸",
    layout="wide",
    initial_sidebar_state="expanded",
)

# ── Custom CSS ────────────────────────────────────────────────────────────

st.markdown("""

""", unsafe_allow_html=True)


# ── Session state initialization ──────────────────────────────────────────

def init_session_state() -> None:
    """Seed ``st.session_state`` with the app's default keys.

    Keys that already exist are left untouched, so values survive Streamlit
    reruns within the same session.
    """
    defaults = {
        "active_profile": None,
        "chat_history": [],
        "provisioning_log": [],
        "debug_results": {},
        "log_analysis_results": {},
        "_flash_message": None,
    }
    for key, value in defaults.items():
        if key not in st.session_state:
            st.session_state[key] = value


init_session_state()


# ── Sidebar: Profile Manager + Navigation ─────────────────────────────────

def render_sidebar() -> str:
    """Render the sidebar and return the page chosen in the navigation radio.

    The sidebar contains three sections: the active-profile selector (kept in
    sync with ``st.session_state.active_profile``), the page navigation, and
    the LLM settings expander, whose values are written onto the ``config``
    module at runtime.

    Returns:
        The label of the selected navigation entry.
    """
    with st.sidebar:
        st.markdown('☸ K8s Agent', unsafe_allow_html=True)
        st.markdown('On-Prem Kubernetes Management', unsafe_allow_html=True)

        st.divider()

        # ── Profile selector ──
        st.markdown("### Cluster Profiles")
        profiles = list_profiles()
        profile_names = [p.name for p in profiles]

        if profile_names:
            _selector_options = ["(none)"] + profile_names
            _current = st.session_state.get("active_profile")
            _default_idx = (
                profile_names.index(_current) + 1
                if _current and _current in profile_names
                else 0
            )

            # Sync widget key with active_profile before widget renders.
            # Streamlit reads the widget value from session_state[key] when
            # the key already exists, ignoring ``index``. So we must write
            # the desired option into session_state["profile_selector"]
            # *before* the selectbox is instantiated.
            if "profile_selector" not in st.session_state:
                # First render or key was deleted — seed from active_profile
                st.session_state["profile_selector"] = _selector_options[_default_idx]
            elif st.session_state["profile_selector"] not in _selector_options:
                # Profile was deleted — reset
                st.session_state["profile_selector"] = "(none)"

            def _on_profile_change():
                # Mirror the widget value into active_profile as soon as it changes.
                sel = st.session_state.get("profile_selector", "(none)")
                if sel != "(none)":
                    st.session_state.active_profile = sel
                else:
                    st.session_state.active_profile = None

            selected = st.selectbox(
                "Active Profile",
                options=_selector_options,
                key="profile_selector",
                on_change=_on_profile_change,
            )

            # Also keep active_profile in sync on this run
            if selected != "(none)":
                st.session_state.active_profile = selected
                profile = load_profile(selected)
                if profile:
                    # NOTE(review): status_class is computed but not referenced by
                    # the markdown below — presumably meant as a CSS class for a
                    # status badge; confirm against the intended markup.
                    status_class = f"status-{profile.status}"
                    st.markdown(
                        f"**Status:** {profile.status.upper()}",
                        unsafe_allow_html=True,
                    )
                    if profile.cluster_source == "imported":
                        st.caption(
                            f"K8s {profile.kubernetes_version} | Imported Cluster"
                        )
                    else:
                        st.caption(
                            f"K8s {profile.kubernetes_version} | CRI-O {profile.crio_version} | "
                            f"{len(profile.get_control_plane_nodes())} CP + "
                            f"{len(profile.get_worker_nodes())} Workers"
                        )
            else:
                st.session_state.active_profile = None
        else:
            st.info("No profiles yet. Create one in Profile Manager.")

        st.divider()

        # ── Navigation ──
        st.markdown("### Navigation")
        nav_options = [
            "Multi-Cluster Dashboard",
            "Profile Manager",
            "Cluster Creation",
            "Resource Viewer",
            "Cluster Debugger",
            "Monitoring Setup",
            "Log Analysis",
            "Upgrade Planner",
            "Certificate Manager",
            "Cost Optimizer",
            "AI Assistant",
        ]
        selected_page = st.radio(
            "Go to",
            options=nav_options,
            index=0,
            label_visibility="collapsed",
        )

        st.divider()

        # ── LLM config ──
        with st.expander("LLM Settings"):
            provider_options = ["openai", "ollama"]
            _prov_idx = provider_options.index(config.LLM_PROVIDER) if config.LLM_PROVIDER in provider_options else 0
            llm_provider = st.selectbox(
                "Provider",
                options=provider_options,
                format_func=lambda p: {"openai": "OpenAI-compatible", "ollama": "Ollama (local)"}[p],
                index=_prov_idx,
                key="llm_provider_select",
                help="Select 'Ollama (local)' to connect to a local Ollama instance",
            )

            if llm_provider == "ollama":
                ollama_url = st.text_input(
                    "Ollama URL",
                    value=config.OLLAMA_BASE_URL,
                    key="ollama_url_input",
                    help="Base URL for your Ollama instance (e.g. http://10.73.98.113:11434)",
                )
                # Fetch models button
                if st.button("Fetch available models", key="ollama_fetch_models"):
                    with st.spinner("Connecting to Ollama..."):
                        models = list_ollama_models(ollama_url)
                    if models:
                        st.session_state["_ollama_models"] = models
                        st.success(f"Found {len(models)} model(s)")
                    else:
                        st.error(f"Could not connect to Ollama at {ollama_url}")
                _cached_models = st.session_state.get("_ollama_models", [])
                if _cached_models:
                    st.selectbox(
                        "Model",
                        options=_cached_models,
                        index=0,
                        key="ollama_model_select",
                    )
                else:
                    st.text_input(
                        "Model",
                        value=config.OLLAMA_MODEL,
                        key="ollama_model_input",
                        help="Model name (e.g. llama3, mistral, codellama)",
                    )

                # Apply Ollama settings at runtime
                config.LLM_PROVIDER = "ollama"
                config.OLLAMA_BASE_URL = ollama_url
                _sel_model = st.session_state.get("ollama_model_select") or st.session_state.get("ollama_model_input", config.OLLAMA_MODEL)
                config.OLLAMA_MODEL = _sel_model

                if config.is_llm_configured():
                    st.caption(f"✓ Ollama configured → `{config.OLLAMA_BASE_URL}` / `{config.OLLAMA_MODEL}`")
                else:
                    st.caption("Enter the Ollama URL above to enable AI features")
            else:
                api_url = st.text_input(
                    "API URL",
                    value=config.LLM_API_URL,
                    key="llm_api_url",
                    help="Endpoint for the LLM API",
                )
                # NOTE(review): the first 8 characters of the key are placed in
                # the widget value; consider showing a fixed mask instead.
                st.text_input(
                    "API Key",
                    value=(config.LLM_API_KEY[:8] + "...") if config.LLM_API_KEY else "",
                    type="password",
                    key="llm_api_key_display",
                    disabled=True,
                    help="Set via LLM_API_KEY or INFOSYS_CODER_API_KEY env var",
                )
                # NOTE(review): the chosen model is not written back to config
                # here — the matching config attribute is not visible in this
                # part of the file; confirm and wire it up if intended.
                st.selectbox(
                    "Model",
                    options=["gpt-4", "gpt-4o", "gpt-3.5-turbo"],
                    index=0,
                    key="llm_model_select",
                )
                # Apply OpenAI-compatible settings at runtime, mirroring the
                # Ollama branch; previously the edited URL was discarded.
                config.LLM_PROVIDER = "openai"
                config.LLM_API_URL = api_url

    return selected_page


# ══════════════════════════════════════════════════════════════════════════
# PAGE: Profile Manager
# ══════════════════════════════════════════════════════════════════════════

def page_profile_manager() -> None:
    """Render the Profile Manager page.

    The page has four tabs: create a profile from a form, import an existing
    cluster from an uploaded kubeconfig, list/activate/delete stored profiles,
    and export/import profiles as JSON. Actions that end in ``st.rerun()``
    report their outcome through ``st.session_state._flash_message``, which is
    displayed (and cleared) at the top of the page on the next run.
    """
    st.markdown("## Cluster Profile Manager")
    st.markdown("Create, edit, and manage profiles for your on-prem Kubernetes clusters.")

    # Show any flash message from a previous action (e.g. after st.rerun)
    if st.session_state.get("_flash_message"):
        _flash = st.session_state._flash_message
        if _flash[0] == "success":
            st.success(_flash[1])
        elif _flash[0] == "error":
            st.error(_flash[1])
        elif _flash[0] == "info":
            st.info(_flash[1])
        st.session_state._flash_message = None

    tab_create, tab_import_cluster, tab_list, tab_import = st.tabs([
        "Create Profile", "Import Existing Cluster", "Manage Profiles", "Import / Export",
    ])

    # ── Create Profile ────────────────────────────────────────────────────
    with tab_create:
        with st.form("create_profile_form"):
            st.markdown("### New Cluster Profile")
            col1, col2 = st.columns(2)

            with col1:
                name = st.text_input("Profile Name *", placeholder="production-cluster")
                description = st.text_area("Description", placeholder="Production on-prem cluster")
                k8s_version = st.selectbox(
                    "Kubernetes Version",
                    ["1.35", "1.34", "1.33", "1.32", "1.31", "1.30", "1.29", "1.28", "1.27"],
                    index=0,
                )
                crio_version = st.selectbox(
                    "CRI-O Version",
                    ["1.35", "1.34", "1.33", "1.32", "1.31", "1.30", "1.29", "1.28", "1.27"],
                    index=0,
                )
                pod_security = st.selectbox(
                    "Pod Security Standard",
                    ["restricted", "baseline", "privileged"],
                    index=0,
                    help="Controls what pods are allowed to run in the cluster.",
                )
                # Explain each PSS level
                with st.expander("What do these Pod Security Standards mean?"):
                    st.markdown(
                        "**Restricted** (most secure)\n"
                        "- Heavily restricted policy following Pod hardening best practices.\n"
                        "- Disallows privilege escalation, host namespaces, host paths, and most Linux capabilities.\n"
                        "- Containers must run as non-root with a read-only root filesystem.\n"
                        "- Only allows seccomp profile RuntimeDefault or Localhost.\n"
                        "- Best for: production workloads, multi-tenant clusters, security-sensitive environments.\n\n"
                        "**Baseline** (moderate)\n"
                        "- Minimally restrictive policy that prevents known privilege escalations.\n"
                        "- Allows most default Kubernetes configurations but blocks hostNetwork, hostPID, hostIPC.\n"
                        "- Containers can run as root but cannot use privileged mode.\n"
                        "- Allows all seccomp profiles.\n"
                        "- Best for: general workloads, development/staging, teams new to PSS.\n\n"
                        "**Privileged** (unrestricted)\n"
                        "- Completely unrestricted policy — no security restrictions enforced.\n"
                        "- Allows privileged containers, host namespaces, host paths, any capabilities.\n"
                        "- Containers can run as root with full access to the host.\n"
                        "- Best for: system-level workloads (monitoring agents, CNI plugins, storage drivers), "
                        "trusted single-tenant clusters.\n\n"
                        "**Recommendation:** Start with *Restricted* and relax to *Baseline* only for "
                        "workloads that require it. Avoid *Privileged* unless absolutely necessary."
                    )

            with col2:
                pod_cidr = st.text_input("Pod CIDR", value="10.244.0.0/16")
                service_cidr = st.text_input("Service CIDR", value="10.96.0.0/12")
                dns_domain = st.text_input("DNS Domain", value="cluster.local")

            st.divider()
            st.markdown("### Storage Paths")
            st.markdown(
                "Configure where CRI-O stores container images, pods, and logs. "
                "Change these to use a dedicated disk instead of the default `/var/lib`."
            )
            scol1, scol2 = st.columns(2)
            with scol1:
                crio_root = st.text_input(
                    "CRI-O Storage Root",
                    value="/var/lib/containers/storage",
                    help="Root directory for CRI-O container/image storage (default: /var/lib/containers/storage)",
                )
                crio_runroot = st.text_input(
                    "CRI-O Run Root",
                    value="/run/containers/storage",
                    help="Runtime root for CRI-O (default: /run/containers/storage)",
                )
            with scol2:
                kubelet_root = st.text_input(
                    "Kubelet Data Directory",
                    value="/var/lib/kubelet",
                    help="Kubelet data directory for pods, volumes, etc. (default: /var/lib/kubelet)",
                )
                log_root = st.text_input(
                    "Log Root Directory",
                    value="/var/log",
                    help="Base directory for all logs — CRI-O pod logs, kubernetes audit logs, etc. (default: /var/log)",
                )

            st.divider()
            st.markdown("### Proxy Settings (Master Node)")
            st.markdown(
                "Configure HTTP/HTTPS proxy for the master/control-plane node. "
                "These are used during package installation and cluster initialization."
            )
            pcol1, pcol2 = st.columns(2)
            with pcol1:
                http_proxy = st.text_input(
                    "HTTP Proxy",
                    value="",
                    placeholder="http://proxy.example.com:8080",
                    help="Primary HTTP proxy for outbound connections",
                )
                https_proxy = st.text_input(
                    "HTTPS Proxy",
                    value="",
                    placeholder="http://proxy.example.com:8443",
                    help="Primary HTTPS proxy for outbound connections",
                )
                no_proxy = st.text_input(
                    "No Proxy",
                    value="",
                    placeholder="localhost,127.0.0.1,10.96.0.0/12,10.244.0.0/16",
                    help="Comma-separated list of hosts/CIDRs to bypass proxy",
                )
            with pcol2:
                http_proxy_alt = st.text_input(
                    "Alternate HTTP Proxy",
                    value="",
                    placeholder="http://backup-proxy.example.com:8080",
                    help="Fallback HTTP proxy if the primary is unavailable",
                )
                https_proxy_alt = st.text_input(
                    "Alternate HTTPS Proxy",
                    value="",
                    placeholder="http://backup-proxy.example.com:8443",
                    help="Fallback HTTPS proxy if the primary is unavailable",
                )

            st.divider()
            st.markdown("### Nodes")
            st.markdown("Define your control-plane and worker nodes.")

            num_nodes = st.number_input("Number of Nodes", min_value=1, max_value=50, value=3, step=1)

            nodes = []
            for i in range(int(num_nodes)):
                st.markdown(f"**Node {i + 1}**")
                ncol1, ncol2, ncol3, ncol4, ncol5 = st.columns([2, 2, 1.5, 1, 1.5])
                with ncol1:
                    hostname = st.text_input("Hostname", key=f"host_{i}", placeholder=f"node-{i + 1}")
                with ncol2:
                    ip_addr = st.text_input("IP Address", key=f"ip_{i}", placeholder="192.168.1.x")
                with ncol3:
                    # The first row defaults to control-plane, the rest to worker.
                    role = st.selectbox("Role", ["control-plane", "worker"], key=f"role_{i}",
                                        index=0 if i == 0 else 1)
                with ncol4:
                    ssh_user = st.text_input("SSH User", key=f"user_{i}", value="root")
                with ncol5:
                    ssh_key = st.text_input("SSH Key Path", key=f"key_{i}", value="~/.ssh/id_rsa")

                nodes.append({
                    "hostname": hostname,
                    "ip_address": ip_addr,
                    "role": role,
                    "ssh_user": ssh_user,
                    "ssh_port": 22,
                    "ssh_key_path": ssh_key,
                })

            submitted = st.form_submit_button("Create Profile", type="primary", use_container_width=True)

        if submitted:
            # Only rows with an IP address are persisted, so every check below
            # runs against that set. Validating the raw rows allowed a profile
            # to be saved with no control-plane node when the control-plane
            # row had been left without an IP.
            valid_nodes = [n for n in nodes if n["ip_address"]]
            if not name:
                st.error("Profile name is required.")
            elif not valid_nodes:
                st.error("At least one node must have an IP address.")
            elif not any(n["role"] == "control-plane" for n in valid_nodes):
                st.error("At least one control-plane node with an IP address is required.")
            else:
                profile = ClusterProfile(
                    name=name,
                    description=description,
                    kubernetes_version=k8s_version,
                    crio_version=crio_version,
                    cni_plugin="flannel",
                    pod_cidr=pod_cidr,
                    service_cidr=service_cidr,
                    dns_domain=dns_domain,
                    nodes=valid_nodes,
                    pod_security_standard=pod_security,
                    crio_root=crio_root,
                    crio_runroot=crio_runroot,
                    kubelet_root=kubelet_root,
                    log_root=log_root,
                    http_proxy=http_proxy,
                    https_proxy=https_proxy,
                    no_proxy=no_proxy,
                    http_proxy_alt=http_proxy_alt,
                    https_proxy_alt=https_proxy_alt,
                )
                # Report save failures in the UI, as the cluster-import path does.
                try:
                    save_profile(profile)
                except Exception as e:
                    st.error(f"Failed to create profile: {e}")
                else:
                    st.session_state.active_profile = name
                    # Drop the widget key so the sidebar re-seeds the selector
                    # from active_profile on the next run.
                    if "profile_selector" in st.session_state:
                        del st.session_state["profile_selector"]
                    st.session_state._flash_message = ("success", f"Profile '{name}' created successfully! Select it from the sidebar to get started.")
                    st.rerun()

    # ── Import Existing Cluster ──────────────────────────────────────────
    with tab_import_cluster:
        st.markdown("### Import Existing Kubernetes Cluster")
        st.markdown(
            "Connect to an existing K8s cluster by uploading its **kubeconfig** file. "
            "This lets you use the Debugger, Monitoring, Log Analysis, and Resource Viewer "
            "without provisioning a new cluster."
        )

        # NOTE: file_uploader is kept OUTSIDE st.form because Streamlit
        # resets the uploaded file on form submission, causing the import
        # to silently do nothing.
        import_name = st.text_input(
            "Profile Name *",
            placeholder="my-existing-cluster",
            key="import_cluster_name",
        )
        import_desc = st.text_area(
            "Description",
            placeholder="Production cluster running in datacenter A",
            key="import_cluster_desc",
        )
        kubeconfig_file = st.file_uploader(
            "Upload kubeconfig file",
            type=["yaml", "yml", "conf", "config", "txt"],
            key="kubeconfig_upload",
            help="Usually found at ~/.kube/config on your cluster's control-plane node. "
                 "If your file has no extension, rename it to config.yaml or config.txt before uploading.",
        )
        k8s_ver = st.text_input(
            "Kubernetes Version (optional)",
            placeholder="1.30",
            value="1.30",
            key="import_cluster_k8s_ver",
        )

        if st.button("Import Cluster", type="primary", use_container_width=True, key="import_cluster_btn"):
            if not import_name:
                st.error("Profile name is required.")
            elif not kubeconfig_file:
                st.error("Please upload a kubeconfig file.")
            else:
                try:
                    # Decoding happens inside the try so a non-UTF-8 upload is
                    # reported as an import failure instead of crashing the page.
                    kubeconfig_content = kubeconfig_file.read().decode("utf-8")
                    profile = ClusterProfile(
                        name=import_name,
                        description=import_desc,
                        kubernetes_version=k8s_ver or "1.30",
                        status="imported",
                        cluster_source="imported",
                        kubeconfig_content=kubeconfig_content,
                    )
                    save_profile(profile)
                except Exception as e:
                    st.error(f"Failed to import cluster: {e}")
                else:
                    st.session_state.active_profile = import_name
                    if "profile_selector" in st.session_state:
                        del st.session_state["profile_selector"]
                    st.session_state._flash_message = (
                        "success",
                        f"Cluster '{import_name}' imported successfully! "
                        "It is now the active profile. Use the sidebar navigation to go to "
                        "Cluster Debugger, Resource Viewer, Monitoring Setup, etc."
                    )
                    st.rerun()

    # ── Manage Profiles ───────────────────────────────────────────────────
    with tab_list:
        profiles = list_profiles()
        if not profiles:
            # No early return here: returning would skip the Import / Export
            # tab below, which is needed precisely when no profiles exist yet.
            st.info("No profiles created yet.")

        for profile in profiles:
            with st.expander(f"**{profile.name}** — {profile.status.upper()}", expanded=False):
                col1, col2, col3 = st.columns([2, 2, 1])
                with col1:
                    st.markdown(f"**Description:** {profile.description or 'N/A'}")
                    st.markdown(f"**Kubernetes:** {profile.kubernetes_version} | **CRI-O:** {profile.crio_version}")
                    st.markdown(f"**Pod CIDR:** {profile.pod_cidr} | **Service CIDR:** {profile.service_cidr}")
                    st.markdown(f"**Pod Security:** {profile.pod_security_standard}")
                    st.markdown(f"**CRI-O Root:** `{profile.crio_root}` | **Kubelet Dir:** `{profile.kubelet_root}`")
                    st.markdown(f"**Log Root:** `{profile.log_root}`")
                    if profile.http_proxy or profile.https_proxy:
                        st.markdown(f"**Proxy:** `{profile.http_proxy or profile.https_proxy}`")
                    if profile.http_proxy_alt or profile.https_proxy_alt:
                        st.markdown(f"**Alt Proxy:** `{profile.http_proxy_alt or profile.https_proxy_alt}`")
                with col2:
                    st.markdown("**Nodes:**")
                    for node in profile.nodes:
                        icon = "🔵" if node["role"] == "control-plane" else "🟢"
                        st.markdown(
                            f"{icon} `{node.get('hostname', 'N/A')}` — "
                            f"`{node['ip_address']}` ({node['role']})"
                        )
                with col3:
                    st.markdown(f"**Created:** {profile.created_at[:10] if profile.created_at else 'N/A'}")
                    st.markdown(f"**Updated:** {profile.updated_at[:10] if profile.updated_at else 'N/A'}")
                    if st.button("Set Active", key=f"activate_{profile.name}"):
                        st.session_state.active_profile = profile.name
                        if "profile_selector" in st.session_state:
                            del st.session_state["profile_selector"]
                        st.rerun()
                    if st.button("Delete", key=f"delete_{profile.name}", type="secondary"):
                        delete_profile(profile.name)
                        if st.session_state.active_profile == profile.name:
                            st.session_state.active_profile = None
                        if "profile_selector" in st.session_state:
                            del st.session_state["profile_selector"]
                        st.rerun()

    # ── Import / Export ───────────────────────────────────────────────────
    with tab_import:
        col_export, col_import = st.columns(2)
        with col_export:
            st.markdown("### Export Profile")
            profiles = list_profiles()
            if profiles:
                export_name = st.selectbox("Select profile to export", [p.name for p in profiles])
                if st.button("Export as JSON"):
                    profile = load_profile(export_name)
                    if profile:
                        from dataclasses import asdict
                        st.download_button(
                            label="Download JSON",
                            data=json.dumps(asdict(profile), indent=2),
                            file_name=f"{export_name}.json",
                            mime="application/json",
                        )

        with col_import:
            st.markdown("### Import Profile")
            uploaded = st.file_uploader("Upload profile JSON", type=["json"])
            # The uploader keeps its file across reruns, so saving and calling
            # st.rerun() whenever a file is present repeats on every pass.
            # An explicit button makes the import a one-shot action.
            if uploaded and st.button("Import Profile", key="import_profile_btn"):
                try:
                    data = json.loads(uploaded.read())
                    profile = ClusterProfile(**data)
                    save_profile(profile)
                except Exception as e:
                    st.error(f"Failed to import: {e}")
                else:
                    # A message rendered right before st.rerun() would be lost;
                    # hand it to the flash slot shown at the top of the page.
                    st.session_state._flash_message = ("success", f"Profile '{profile.name}' imported!")
                    st.rerun()


# ══════════════════════════════════════════════════════════════════════════
# PAGE: Cluster Creation
# ══════════════════════════════════════════════════════════════════════════

def page_cluster_creation():
    st.markdown("## Cluster Creation")
    st.markdown("Provision an on-prem K8s cluster via SSH with CRI-O, Flannel CNI, and best practices.")

    profile = _get_active_profile()
    if not profile:
        return

    _show_profile_summary(profile)

    tab_preflight, tab_provision, tab_reset, tab_scripts, tab_manifests, tab_advice = st.tabs([
        "Pre-flight Checks",
        "Provision Cluster",
        "Reset Cluster",
        "View Scripts",
        "Offline Manifests",
        "AI Advice",
    ])

    # ── Pre-flight: SSH connectivity ──────────────────────────────────────
    with tab_preflight:
        st.markdown("### SSH Connectivity Test")
        st.markdown("Test SSH access to all nodes before provisioning.")

if profile.cluster_source == "imported": + st.info( + "SSH connectivity tests are not applicable for imported clusters. " + "Imported clusters connect via kubeconfig — no SSH access is needed. " + "Use the **Cluster Debugger** or **Resource Viewer** to verify connectivity." + ) + elif not profile.nodes: + st.warning("No nodes defined in this profile. Add nodes in the Profile Manager first.") + else: + if st.button("Test All Nodes", type="primary"): + all_ok = True + for node in profile.nodes: + with st.status(f"Testing {node.get('hostname', node['ip_address'])}...", expanded=True): + result = test_ssh_connectivity(node) + if result.success: + st.success(f"Connected to {node['ip_address']}") + st.code(result.stdout, language="text") + else: + all_ok = False + st.error(f"Failed to connect to {node['ip_address']}") + st.code(result.stderr, language="text") + if all_ok: + st.success("All nodes are reachable via SSH. You can proceed to provisioning.") + else: + st.error("Some nodes failed SSH connectivity. Fix the issues above before provisioning.") + + # ── Provision ───────────────────────────────────────────────────────── + with tab_provision: + st.markdown("### Automated Cluster Provisioning") + + if profile.cluster_source == "imported": + st.info( + "Provisioning is not available for imported clusters. " + "This cluster was imported via kubeconfig and is managed externally. " + "Use the **Resource Viewer**, **Cluster Debugger**, or **Monitoring Setup** " + "pages to work with your cluster." + ) + elif not profile.nodes: + st.warning("No nodes defined in this profile. Add nodes in the Profile Manager first.") + else: + st.warning( + "This will SSH into each node and execute every provisioning step " + "automatically. Ensure all nodes are accessible and you have root/sudo access." 
+ ) + + cp_nodes = profile.get_control_plane_nodes() + worker_nodes = profile.get_worker_nodes() + + if profile.cluster_source != "imported" and profile.nodes: + st.markdown(f"**Control Plane:** {len(cp_nodes)} node(s) | **Workers:** {len(worker_nodes)} node(s)") + + col1, col2, col3 = st.columns(3) + with col1: + step1 = st.checkbox("Step 1: Common Setup (all nodes)", value=True) + with col2: + step2 = st.checkbox("Step 2: Init Control Plane", value=True) + with col3: + step3 = st.checkbox("Step 3: Join Workers", value=True) + step4 = st.checkbox("Step 4: Apply Best Practices", value=True) + else: + step1 = step2 = step3 = step4 = False + + if profile.cluster_source == "imported" or not profile.nodes: + pass # messages shown above + elif st.button("Start Provisioning", type="primary", use_container_width=True): + update_profile_status(profile.name, "provisioning") + overall_success = True + + # ── Step 1: Common setup on ALL nodes (granular per-step) ──── + if step1: + st.markdown("---") + st.markdown("### Step 1: Common Node Setup") + common_steps = get_common_setup_steps(profile) + for node in profile.nodes: + node_label = f"{node.get('hostname', node['ip_address'])} ({node['role']})" + st.markdown(f"#### Node: {node_label}") + node_ok = True + progress = st.progress(0, text=f"Starting setup on {node_label}...") + for idx, step in enumerate(common_steps): + pct = int((idx / len(common_steps)) * 100) + progress.progress(pct, text=f"[{idx+1}/{len(common_steps)}] {step.title}") + with st.status(f"{step.title}...", expanded=False) as status: + result = _run_step(node, step) + if result.success: + st.code(result.stdout[-1500:] if result.stdout else "(no output)", language="text") + status.update(label=f"{step.title} — done", state="complete") + else: + st.error(f"FAILED: {step.title}") + st.code(result.stderr or result.stdout, language="text") + status.update(label=f"{step.title} — FAILED", state="error") + node_ok = False + if step.fatal: + overall_success = False + 
break + progress.progress(100, text=f"{'Setup complete' if node_ok else 'Setup FAILED'} on {node_label}") + if node_ok: + st.success(f"Common setup complete on {node['ip_address']}") + else: + st.error(f"Common setup failed on {node['ip_address']}") + + # ── Step 2: Control plane init (granular per-step) ─────────── + if step2 and cp_nodes and overall_success: + st.markdown("---") + st.markdown("### Step 2: Control Plane Initialization") + cp_node = cp_nodes[0] + cp_steps = get_control_plane_steps(profile) + progress = st.progress(0, text="Starting control plane init...") + for idx, step in enumerate(cp_steps): + pct = int((idx / len(cp_steps)) * 100) + progress.progress(pct, text=f"[{idx+1}/{len(cp_steps)}] {step.title}") + with st.status(f"{step.title}...", expanded=False) as status: + result = _run_step(cp_node, step) + if result.success: + st.code(result.stdout[-2000:] if result.stdout else "(no output)", language="text") + status.update(label=f"{step.title} — done", state="complete") + else: + st.error(f"FAILED: {step.title}") + st.code(result.stderr or result.stdout, language="text") + status.update(label=f"{step.title} — FAILED", state="error") + overall_success = False + if step.fatal: + break + progress.progress(100, text="Control plane initialization complete" if overall_success else "Control plane init FAILED") + if overall_success: + st.success("Control plane initialized!") + else: + st.error("Control plane initialization failed!") + + # ── Step 3: Join workers (granular per-step) ───────────────── + if step3 and worker_nodes and cp_nodes and overall_success: + st.markdown("---") + st.markdown("### Step 3: Join Worker Nodes") + join_cmd = retrieve_join_command(cp_nodes[0]) + if join_cmd: + worker_join_steps = get_worker_join_steps(join_cmd) + for node in worker_nodes: + node_label = f"{node.get('hostname', node['ip_address'])}" + st.markdown(f"#### Worker: {node_label}") + for step in worker_join_steps: + with st.status(f"{step.title} on 
{node_label}...", expanded=False) as status: + result = _run_step(node, step) + if result.success: + st.code(result.stdout[-1500:] if result.stdout else "(no output)", language="text") + status.update(label=f"{step.title} — done", state="complete") + st.success(f"Worker {node['ip_address']} joined!") + else: + st.error(f"FAILED to join {node['ip_address']}") + st.code(result.stderr or result.stdout, language="text") + status.update(label=f"{step.title} — FAILED", state="error") + else: + st.error("Could not retrieve join command from control plane.") + + # ── Step 4: Best practices (granular per-step) ─────────────── + if step4 and cp_nodes and overall_success: + st.markdown("---") + st.markdown("### Step 4: Apply Best Practices") + bp_steps = get_best_practices_steps() + progress = st.progress(0, text="Applying best practices...") + for idx, step in enumerate(bp_steps): + pct = int((idx / len(bp_steps)) * 100) + progress.progress(pct, text=f"[{idx+1}/{len(bp_steps)}] {step.title}") + with st.status(f"{step.title}...", expanded=False) as status: + result = _run_step(cp_nodes[0], step) + if result.success: + st.code(result.stdout[-1000:] if result.stdout else "(no output)", language="text") + status.update(label=f"{step.title} — done", state="complete") + else: + st.error(f"FAILED: {step.title}") + st.code(result.stderr or result.stdout, language="text") + status.update(label=f"{step.title} — FAILED", state="error") + if step.fatal: + overall_success = False + break + progress.progress(100, text="Best practices applied" if overall_success else "Best practices FAILED") + if overall_success: + st.success("Best practices applied!") + + # ── Final cluster status ───────────────────────────────────── + st.markdown("---") + st.markdown("### Cluster Status") + if cp_nodes and overall_success: + with st.status("Checking cluster status...", expanded=True) as status: + result = get_cluster_status(cp_nodes[0]) + if result.success: + update_profile_status(profile.name, 
"active") + st.success("Cluster is active!") + st.code(result.stdout, language="text") + status.update(label="Cluster is active", state="complete") + else: + update_profile_status(profile.name, "error") + st.error("Could not verify cluster status") + st.code(result.stderr, language="text") + status.update(label="Status check failed", state="error") + elif not overall_success: + update_profile_status(profile.name, "error") + st.error("Provisioning did not complete successfully. Check the errors above.") + + # ── Reset Cluster ──────────────────────────────────────────────────── + with tab_reset: + st.markdown("### Reset / Tear Down Cluster") + st.markdown( + "Completely reset the Kubernetes cluster on all (or selected) nodes. " + "This will run `kubeadm reset`, stop services, remove CRI-O data, " + "CNI configs, etcd data, and flush iptables — preparing nodes for a " + "fresh cluster installation." + ) + + if profile.cluster_source == "imported": + st.info( + "Cluster reset requires SSH access to each node and is only " + "available for **provisioned** clusters. For imported clusters, " + "run `kubeadm reset` directly on each node." + ) + else: + all_nodes = profile.nodes + if not all_nodes: + st.warning("No nodes defined in this profile.") + else: + st.error( + "**WARNING:** This is a destructive operation. All Kubernetes data, " + "containers, etcd data, and configuration will be permanently deleted " + "from the selected nodes. This cannot be undone." 
+ ) + + # Node selection + reset_node_labels = [ + f"{n.get('hostname', n.get('ip_address', '?'))} ({n.get('ip_address', '?')}) [{n.get('role', '?')}]" + for n in all_nodes + ] + reset_all = st.checkbox("Reset ALL nodes", value=True, key="reset_all_nodes") + + if not reset_all: + reset_idx = st.multiselect( + "Select nodes to reset", + options=list(range(len(all_nodes))), + format_func=lambda i: reset_node_labels[i], + default=list(range(len(all_nodes))), + key="reset_node_select", + ) + reset_nodes = [all_nodes[i] for i in reset_idx] + else: + reset_nodes = all_nodes + + # Options + col_r1, col_r2 = st.columns(2) + with col_r1: + remove_packages = st.checkbox( + "Also remove kubeadm/kubelet/kubectl packages", + value=False, + key="reset_remove_pkgs", + ) + with col_r2: + auto_reprovision = st.checkbox( + "Re-provision cluster after reset", + value=False, + key="reset_reprovision", + help="After reset completes, automatically start fresh provisioning using the Provision Cluster flow.", + ) + + # Confirmation + confirm_text = st.text_input( + 'Type **RESET** to confirm', + key="reset_confirm", + help="Type RESET (all caps) to enable the reset button.", + ) + + reset_enabled = confirm_text.strip() == "RESET" and len(reset_nodes) > 0 + if st.button( + f"Reset {len(reset_nodes)} Node(s)", + type="primary", + disabled=not reset_enabled, + use_container_width=True, + key="reset_go", + ): + update_profile_status(profile.name, "provisioning") + reset_steps = get_cluster_reset_steps(profile) + + # Optionally add package removal step + if remove_packages: + reset_steps.append( + ProvisionStep( + name="remove_packages", + title="Remove kubeadm/kubelet/kubectl packages", + script="""set -uo pipefail +echo '>> Removing Kubernetes packages...' 
+if command -v yum &>/dev/null; then + yum remove -y kubeadm kubelet kubectl cri-o 2>/dev/null || true +elif command -v apt-get &>/dev/null; then + apt-get remove -y --purge kubeadm kubelet kubectl cri-o 2>/dev/null || true +fi +echo 'Packages removed.' +""", + timeout=120, + fatal=False, + ) + ) + + reset_success = True + for node in reset_nodes: + node_label = f"{node.get('hostname', node.get('ip_address', '?'))} ({node.get('ip_address', '')})" + st.markdown(f"---\n#### Resetting: {node_label} [{node.get('role', '')}]") + progress = st.progress(0, text=f"Starting reset on {node_label}...") + node_ok = True + for idx, step in enumerate(reset_steps): + pct = int((idx / len(reset_steps)) * 100) + progress.progress(pct, text=f"[{idx+1}/{len(reset_steps)}] {step.title}") + with st.status(f"{step.title}...", expanded=False) as status: + result = _run_step(node, step) + if result.success: + st.code(result.stdout[-1500:] if result.stdout else "(no output)", language="text") + status.update(label=f"{step.title} — done", state="complete") + else: + st.warning(f"{step.title} — issue encountered") + st.code(result.stderr or result.stdout, language="text") + status.update(label=f"{step.title} — issue", state="error") + node_ok = False + if step.fatal: + reset_success = False + break + progress.progress(100, text=f"{'Reset complete' if node_ok else 'Reset had issues'} on {node_label}") + if node_ok: + st.success(f"Node {node_label} reset successfully.") + else: + st.warning(f"Node {node_label} reset completed with some issues. Check details above.") + + if reset_success: + update_profile_status(profile.name, "draft") + st.success("All selected nodes have been reset. The cluster has been torn down.") + st.info("You can now go to the **Provision Cluster** tab to create a new cluster on these nodes.") + + if auto_reprovision: + st.markdown("---") + st.markdown("### Auto Re-provisioning") + st.info( + "Auto re-provision is enabled. 
Please switch to the **Provision Cluster** tab " + "and click **Start Provisioning** to set up a fresh cluster with the current profile settings." + ) + else: + update_profile_status(profile.name, "error") + st.error("Reset encountered fatal errors on some nodes. Review the output above before re-provisioning.") + + # ── View Scripts ────────────────────────────────────────────────────── + with tab_scripts: + st.markdown("### Generated Scripts") + st.markdown("Review the scripts that will be executed during provisioning.") + + with st.expander("Common Setup Script (all nodes)", expanded=False): + st.code(generate_common_setup_script(profile), language="bash") + + with st.expander("Control Plane Init Script", expanded=False): + st.code(generate_control_plane_init_script(profile), language="bash") + + with st.expander("Worker Join Script", expanded=False): + st.code(generate_worker_join_script(), language="bash") + + with st.expander("Best Practices Script", expanded=False): + st.code(generate_best_practices_script(), language="bash") + + # ── Offline Manifests ─────────────────────────────────────────────────── + with tab_manifests: + st.markdown("### Offline / Custom Manifests") + st.markdown( + "If your environment cannot download manifests directly (air-gapped / proxy-restricted), " + "upload them here. They will be used instead of the default download URLs during provisioning." + ) + + st.markdown("#### Flannel CNI Manifest") + flannel_file = st.file_uploader( + "Upload kube-flannel.yml", + type=["yml", "yaml"], + key="flannel_upload", + help="Download from: https://github.com/flannel-io/flannel/releases/latest/download/kube-flannel.yml", + ) + if flannel_file is not None: + flannel_path = os.path.join(config.UPLOADS_DIR, "kube-flannel.yml") + with open(flannel_path, "wb") as f: + f.write(flannel_file.getvalue()) + profile.flannel_manifest_path = flannel_path + save_profile(profile) + st.success(f"Flannel manifest saved. 
It will be SCP'd to nodes during provisioning.") + + if profile.flannel_manifest_path: + st.info(f"Current Flannel manifest: `{profile.flannel_manifest_path}`") + if st.button("Clear Flannel manifest (use default URL)", key="clear_flannel"): + profile.flannel_manifest_path = "" + save_profile(profile) + st.rerun() + else: + st.info("No custom manifest — Flannel will be downloaded from the official GitHub release URL.") + + st.markdown("---") + st.markdown("#### Other Manifests") + st.markdown( + "You can also upload any additional YAML manifests. They will be stored " + "and can be applied manually via the **Custom Command** feature in the Cluster Debugger." + ) + extra_file = st.file_uploader( + "Upload additional manifest (YAML)", + type=["yml", "yaml"], + key="extra_manifest_upload", + ) + if extra_file is not None: + extra_path = os.path.join(config.UPLOADS_DIR, extra_file.name) + with open(extra_path, "wb") as f: + f.write(extra_file.getvalue()) + st.success(f"Saved `{extra_file.name}` to uploads.") + + # List existing uploaded files + if os.path.exists(config.UPLOADS_DIR): + uploaded_files = [ + f for f in os.listdir(config.UPLOADS_DIR) + if f.endswith((".yml", ".yaml")) + ] + if uploaded_files: + st.markdown("**Uploaded manifests:**") + for fname in sorted(uploaded_files): + st.markdown(f"- `{fname}`") + + # ── AI Advice ───────────────────────────────────────────────────────── + with tab_advice: + st.markdown("### AI Cluster Setup Advisor") + if not is_llm_configured(): + st.info( + "LLM is not configured. Select a provider (OpenAI or Ollama) in the sidebar LLM Settings, or set `LLM_API_URL` and `LLM_API_KEY` " + "environment variables to enable AI-powered recommendations." + ) + else: + context = st.text_area( + "Additional context or questions", + placeholder="e.g., We have 3 nodes with 16GB RAM each. 
Any special considerations?", + ) + if st.button("Get AI Recommendations", type="primary"): + with st.spinner("Analyzing your cluster configuration..."): + advice = get_llm_cluster_advice(profile, context) + st.markdown(advice) + + +# ══════════════════════════════════════════════════════════════════════════ +# PAGE: Cluster Debugger +# ══════════════════════════════════════════════════════════════════════════ + +def page_cluster_debugger(): + st.markdown("## Cluster Debugger") + st.markdown("Diagnose issues and get recommendations.") + + profile = _get_active_profile() + if not profile: + return + + # For imported clusters we don't need a CP node — commands run locally via kubeconfig + cp_node = None + if profile.cluster_source != "imported": + cp_nodes = profile.get_control_plane_nodes() + if not cp_nodes: + st.error("No control-plane node defined in this profile.") + return + cp_node = cp_nodes[0] + + # kubectl availability warning for imported clusters + if profile.cluster_source == "imported" and not get_kubectl_path(): + st.warning( + "kubectl not found on this machine. 
Commands will fail until kubectl is installed.\n\n" + "Install: `curl -LO https://dl.k8s.io/release/$(curl -Ls https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl && chmod +x kubectl && mv kubectl ~/.local/bin/`" + ) + + available_commands = get_available_commands(profile) + + tab_quick, tab_category, tab_custom, tab_ai = st.tabs([ + "Quick Diagnostics", + "Category Scan", + "Custom Command", + "AI Debug Assistant", + ]) + + # ── Quick Diagnostics ───────────────────────────────────────────────── + with tab_quick: + st.markdown("### Quick Diagnostic Checks") + col1, col2 = st.columns(2) + with col1: + selected_checks = st.multiselect( + "Select checks to run", + options=list(available_commands.keys()), + default=["Node Status", "Pod Status (All Namespaces)", "Events (Recent)"], + ) + with col2: + run_all = st.checkbox("Run ALL diagnostics") + + if st.button("Run Diagnostics", type="primary"): + if run_all: + with st.spinner("Running all diagnostics..."): + results = run_all_diagnostics(cp_node, profile=profile) + else: + results = {} + for check in selected_checks: + with st.spinner(f"Running: {check}..."): + results[check] = run_diagnostic(cp_node, check, profile=profile) + + st.session_state.debug_results = results + + for name, result in results.items(): + status_icon = "+" if result.success else "-" + with st.expander(f"{'✅' if result.success else '❌'} {name}", expanded=not result.success): + st.code(result.stdout if result.success else result.stderr, language="text") + + if st.session_state.debug_results: + if not is_llm_configured(): + st.info("Enable AI analysis by selecting a provider (OpenAI or Ollama) in the sidebar LLM Settings.") + elif st.button("Analyze with AI", type="secondary"): + with st.spinner("AI is analyzing diagnostics..."): + analysis = analyze_diagnostics( + st.session_state.debug_results, + profile=profile, + ) + st.markdown(analysis) + + # ── Category Scan ───────────────────────────────────────────────────── + with 
tab_category: + st.markdown("### Category-Based Diagnostics") + category = st.selectbox("Select Category", options=list(CATEGORY_MAP.keys())) + + if st.button("Run Category Scan", type="primary", key="cat_scan"): + with st.spinner(f"Running {category} diagnostics..."): + results = run_category_diagnostics(cp_node, category, profile=profile) + + for name, result in results.items(): + with st.expander(f"{'✅' if result.success else '❌'} {name}"): + st.code(result.stdout if result.success else result.stderr, language="text") + + if is_llm_configured(): + if st.button("Analyze Category with AI", key="cat_ai"): + with st.spinner("Analyzing..."): + analysis = analyze_diagnostics(results, profile=profile) + st.markdown(analysis) + + # ── Custom Command ──────────────────────────────────────────────────── + with tab_custom: + st.markdown("### Run Custom Command") + if profile.cluster_source == "imported": + st.info("Commands run locally via kubectl using the imported kubeconfig.") + else: + st.warning("Commands execute on the control-plane node via SSH.") + custom_cmd = st.text_area( + "Command", + placeholder="kubectl get pods -A -o wide", + height=100, + ) + if st.button("Execute", type="primary", key="exec_custom") and custom_cmd: + with st.spinner("Executing..."): + result = run_custom_command(cp_node, custom_cmd, profile=profile) + if result.success: + st.code(result.stdout, language="text") + else: + st.error("Command failed") + st.code(result.stderr, language="text") + + # ── AI Debug Assistant ──────────────────────────────────────────────── + with tab_ai: + st.markdown("### AI Debug Assistant") + if not is_llm_configured(): + st.info( + "LLM is not configured. Select a provider (OpenAI or Ollama) in the sidebar LLM Settings, or set `LLM_API_URL` and `LLM_API_KEY` " + "environment variables to enable AI-powered debugging." 
+ ) + st.markdown( + "You can still use the **Quick Diagnostics**, **Category Scan**, and " + "**Custom Command** tabs to collect diagnostic data without an LLM." + ) + else: + st.markdown("Describe your issue and get AI-powered debugging help.") + + issue = st.text_area( + "Describe the issue", + placeholder="e.g., Pods are stuck in CrashLoopBackOff in the default namespace", + height=120, + ) + + col1, col2 = st.columns(2) + with col1: + auto_collect = st.checkbox("Auto-collect relevant diagnostics", value=True) + with col2: + check_pods = st.checkbox("Check for problematic pods", value=True) + + if st.button("Debug", type="primary", key="ai_debug") and issue: + collected_data = "" + + if check_pods: + with st.spinner("Checking pod issues..."): + pod_result = check_pod_issues(cp_node, profile=profile) + if pod_result.success and pod_result.stdout.strip(): + collected_data += f"\n\nProblematic Pods:\n{pod_result.stdout}" + with st.expander("Problematic Pods"): + st.code(pod_result.stdout, language="text") + + if auto_collect: + with st.spinner("Collecting diagnostics..."): + diag_results = run_category_diagnostics(cp_node, "Cluster Overview", profile=profile) + for name, result in diag_results.items(): + if result.success: + collected_data += f"\n\n{name}:\n{result.stdout}" + + with st.spinner("AI is analyzing the issue..."): + full_context = f"Issue: {issue}\n\nCollected Data:{collected_data}" + suggestion = get_debug_suggestion(issue, collected_data) + st.markdown("### AI Recommendation") + st.markdown(suggestion) + + +# ══════════════════════════════════════════════════════════════════════════ +# PAGE: Monitoring Setup +# ══════════════════════════════════════════════════════════════════════════ + +def page_monitoring_setup(): + st.markdown("## Monitoring Setup") + st.markdown("Deploy Prometheus, Grafana, dashboards, and alerting rules.") + + profile = _get_active_profile() + if not profile: + return + + # For imported clusters we don't need a CP node + cp_node 
= None + if profile.cluster_source != "imported": + cp_nodes = profile.get_control_plane_nodes() + if not cp_nodes: + st.error("No control-plane node defined in this profile.") + return + cp_node = cp_nodes[0] + + # Namespace selection — auto-fetch from cluster for imported clusters + if profile.cluster_source == "imported" and profile.kubeconfig_content: + cluster_ns = fetch_namespaces(profile.kubeconfig_content) + if cluster_ns: + # Ensure "monitoring" is an option even if it doesn't exist yet + ns_options = cluster_ns if "monitoring" in cluster_ns else cluster_ns + ["monitoring"] + default_idx = ns_options.index("monitoring") if "monitoring" in ns_options else 0 + namespace = st.selectbox("Monitoring Namespace", options=ns_options, index=default_idx, key="mon_ns") + else: + namespace = st.text_input("Monitoring Namespace", value="monitoring", key="mon_ns_txt") + else: + namespace = st.text_input("Monitoring Namespace", value="monitoring", key="mon_ns_txt") + + tab_install, tab_metrics, tab_dashboards, tab_alerts, tab_status, tab_scripts, tab_advice = st.tabs([ + "Install Stack", + "Metrics Components", + "Dashboards", + "Alert Rules", + "Status", + "View Scripts", + "AI Advice", + ]) + + # ── Install ─────────────────────────────────────────────────────────── + with tab_install: + st.markdown("### Install Monitoring Stack") + st.markdown("This installs **kube-prometheus-stack** (Prometheus + Grafana + exporters).") + + col1, col2 = st.columns(2) + with col1: + install_helm_first = st.checkbox("Install Helm (if not present)", value=True) + with col2: + install_alerts_too = st.checkbox("Also install alert rules", value=True) + + if st.button("Install Prometheus + Grafana", type="primary", use_container_width=True): + if install_helm_first: + with st.status("Installing Helm...", expanded=True): + result = install_helm(cp_node, profile=profile) + if result.success: + st.success("Helm ready!") + else: + st.error("Helm installation failed") + st.code(result.stderr, 
language="text") + + with st.status("Installing kube-prometheus-stack (this may take several minutes)...", expanded=True): + result = install_prometheus_stack(cp_node, namespace, profile=profile) + if result.success: + st.success("Prometheus + Grafana installed!") + st.code(result.stdout[-2000:], language="text") + else: + st.error("Installation failed") + st.code(result.stderr, language="text") + + if install_alerts_too: + with st.status("Installing alert rules...", expanded=True): + result = install_alert_rules(cp_node, namespace, profile=profile) + if result.success: + st.success("Alert rules installed!") + else: + st.error("Alert rules installation failed") + st.code(result.stderr, language="text") + + # ── Metrics Components ──────────────────────────────────────────────── + with tab_metrics: + st.markdown("### Metrics Components") + st.markdown( + "Install **metrics-server** (enables `kubectl top`) and/or " + "**kube-state-metrics** (exposes workload/object-level metrics to Prometheus)." + ) + + met_col1, met_col2 = st.columns(2) + + with met_col1: + st.markdown("#### metrics-server") + st.markdown( + "Provides CPU/memory usage for pods and nodes. " + "Required for `kubectl top` and HPA autoscaling." 
+ ) + ms_insecure = st.checkbox( + "Add `--kubelet-insecure-tls` flag (self-signed certs)", + value=True, + key="ms_insecure", + ) + if st.button("Install metrics-server", type="primary", key="install_ms"): + ms_url = ( + "https://github.com/kubernetes-sigs/metrics-server" + "/releases/latest/download/components.yaml" + ) + with st.status("Installing metrics-server...", expanded=True): + # Apply the manifest + apply_result = run_kubectl( + profile, + f"apply -f {ms_url}", + timeout=60, + ) + if apply_result.success: + st.write("Manifest applied successfully.") + st.code(apply_result.stdout, language="text") + # Patch for insecure TLS if requested + if ms_insecure: + patch_cmd = ( + "patch deployment metrics-server -n kube-system " + "--type=json -p=" + "'[{\"op\":\"add\",\"path\":\"/spec/template/spec/containers/0/args/-\"," + "\"value\":\"--kubelet-insecure-tls\"}]'" + ) + patch_result = run_kubectl(profile, patch_cmd, timeout=30) + if patch_result.success: + st.success("metrics-server installed with --kubelet-insecure-tls!") + else: + st.warning("Installed but TLS patch may have failed (already applied?).") + st.code(patch_result.stderr, language="text") + else: + st.success("metrics-server installed!") + else: + st.error("metrics-server installation failed") + st.code(apply_result.stderr, language="text") + + # Check status + if st.button("Check metrics-server status", key="ms_status"): + with st.spinner("Checking..."): + result = run_kubectl( + profile, + "get deployment metrics-server -n kube-system -o wide", + timeout=15, + ) + if result.success: + st.code(result.stdout, language="text") + else: + st.warning("metrics-server not found or not ready.") + st.code(result.stderr, language="text") + + with met_col2: + st.markdown("#### kube-state-metrics") + st.markdown( + "Exposes object-level metrics (Deployments, Pods, Nodes, etc.) " + "to Prometheus for dashboards and alerting." 
+ ) + ksm_ns = namespace # reuse the monitoring namespace + if st.button("Install kube-state-metrics", type="primary", key="install_ksm"): + with st.status("Installing kube-state-metrics...", expanded=True): + # Use helm if available, otherwise apply raw manifest + helm_cmd = ( + f"helm install kube-state-metrics " + f"oci://registry-1.docker.io/bitnamicharts/kube-state-metrics " + f"-n {ksm_ns} --create-namespace" + ) + result = run_kubectl(profile, helm_cmd, timeout=120) + if result.success: + st.success("kube-state-metrics installed via Helm!") + st.code(result.stdout, language="text") + else: + st.warning("Helm install failed, trying kubectl apply...") + st.code(result.stderr, language="text") + # Fallback: direct manifest from GitHub + ksm_url = ( + "https://raw.githubusercontent.com/kubernetes/" + "kube-state-metrics/main/examples/standard/service.yaml" + ) + apply_result = run_kubectl( + profile, + f"apply -f https://raw.githubusercontent.com/kubernetes/kube-state-metrics/main/examples/standard/ 2>/dev/null || echo 'Manual install required'", + timeout=60, + ) + if apply_result.success: + st.success("kube-state-metrics applied!") + st.code(apply_result.stdout, language="text") + else: + st.error( + "Could not install kube-state-metrics automatically.\n\n" + "Manual install:\n" + "```\nhelm repo add prometheus-community " + "https://prometheus-community.github.io/helm-charts\n" + "helm install kube-state-metrics " + f"prometheus-community/kube-state-metrics -n {ksm_ns}\n```" + ) + + if st.button("Check kube-state-metrics status", key="ksm_status"): + with st.spinner("Checking..."): + result = run_kubectl( + profile, + f"get pods -n {ksm_ns} -l app.kubernetes.io/name=kube-state-metrics -o wide", + timeout=15, + ) + if result.success and result.stdout.strip(): + st.code(result.stdout, language="text") + else: + # Try broader search + result2 = run_kubectl( + profile, + "get pods -A -l app.kubernetes.io/name=kube-state-metrics -o wide", + timeout=15, + ) + if 
result2.success and result2.stdout.strip(): + st.code(result2.stdout, language="text") + else: + st.warning("kube-state-metrics not found on the cluster.") + + # ── Dashboards ──────────────────────────────────────────────────────── + with tab_dashboards: + st.markdown("### Grafana Dashboards") + st.markdown("Select dashboards to import into Grafana.") + + selected_dashboards = [] + cols = st.columns(2) + for i, (key, dash) in enumerate(GRAFANA_DASHBOARDS.items()): + with cols[i % 2]: + if st.checkbox(f"**{dash['name']}**\n{dash['description']}", value=True, key=f"dash_{key}"): + selected_dashboards.append(key) + + if st.button("Import Dashboards", type="primary") and selected_dashboards: + with st.status("Importing dashboards...", expanded=True): + result = install_dashboards(cp_node, selected_dashboards, namespace, profile=profile) + if result.success: + st.success(f"Imported {len(selected_dashboards)} dashboards!") + st.code(result.stdout, language="text") + else: + st.error("Dashboard import failed") + st.code(result.stderr, language="text") + + # ── Alert Rules ─────────────────────────────────────────────────────── + with tab_alerts: + st.markdown("### Alerting Rules") + st.markdown("Install production-ready alerting rules for nodes, pods, and etcd.") + + with st.expander("View Alert Rules", expanded=False): + st.code(generate_alerting_rules_script(namespace), language="yaml") + + if st.button("Install Alert Rules", type="primary", key="install_alerts"): + with st.spinner("Installing alert rules..."): + result = install_alert_rules(cp_node, namespace, profile=profile) + if result.success: + st.success("Alert rules installed!") + st.code(result.stdout, language="text") + else: + st.error("Failed to install alert rules") + st.code(result.stderr, language="text") + + # ── Status ──────────────────────────────────────────────────────────── + with tab_status: + st.markdown("### Monitoring Stack Status") + if st.button("Check Status", type="primary", 
key="mon_status"): + with st.spinner("Checking monitoring stack..."): + result = get_monitoring_status(cp_node, namespace, profile=profile) + if result.success: + st.code(result.stdout, language="text") + else: + st.warning("Could not retrieve monitoring status") + st.code(result.stderr, language="text") + + # ── View Scripts ────────────────────────────────────────────────────── + with tab_scripts: + st.markdown("### Generated Scripts") + with st.expander("Prometheus Install Script"): + st.code(generate_prometheus_install_script(namespace), language="bash") + with st.expander("Dashboard Import Script"): + all_keys = list(GRAFANA_DASHBOARDS.keys()) + st.code(generate_dashboard_import_script(all_keys, namespace), language="bash") + with st.expander("Alert Rules Script"): + st.code(generate_alerting_rules_script(namespace), language="bash") + + # ── AI Advice ───────────────────────────────────────────────────────── + with tab_advice: + st.markdown("### AI Monitoring Advisor") + if not is_llm_configured(): + st.info( + "LLM is not configured. Select a provider (OpenAI or Ollama) in the sidebar LLM Settings, or set `LLM_API_URL` and `LLM_API_KEY` " + "environment variables to enable AI-powered monitoring advice." 
# Label -> (since, since_k8s) pair passed to collect_logs(); shared by the
# System Logs and Smart Log Analysis tabs so both offer the same choices.
_LOG_TIME_RANGES = {
    "Last 15 min": ("15 minutes ago", "15m"),
    "Last 1 hour": ("1 hour ago", "1h"),
    "Last 6 hours": ("6 hours ago", "6h"),
    "Last 24 hours": ("24 hours ago", "24h"),
}


def _default_index(options: list[str], preferred: str = "default") -> int:
    """Return the position of *preferred* in *options*, or 0 when it is absent."""
    return options.index(preferred) if preferred in options else 0


def _parse_pod_rows(stdout: str) -> list[dict]:
    """Turn whitespace-separated pod rows into dicts.

    Columns are read positionally as namespace, name, status, containers.
    Missing status/containers fall back to "Unknown" / "", and rows with fewer
    than two columns are ignored.
    """
    pods: list[dict] = []
    for line in stdout.strip().split("\n"):
        parts = line.split()
        if len(parts) < 2:
            continue
        pods.append({
            "namespace": parts[0],
            "name": parts[1],
            "status": parts[2] if len(parts) > 2 else "Unknown",
            "containers": parts[3] if len(parts) > 3 else "",
        })
    return pods


def _load_pods(cp_node, profile, namespace: str, list_key: str, ns_key: str) -> None:
    """Fetch the pods of *namespace* and cache them in session state.

    The parsed list is stored under *list_key* and the namespace it belongs to
    under *ns_key*, so the dropdowns can tell whether the cache is still valid
    for the namespace currently selected.
    """
    with st.spinner(f"Fetching pods in namespace '{namespace}'..."):
        pod_result = get_pod_list(cp_node, namespace=namespace, profile=profile)
    if not pod_result.success:
        st.error(f"Failed to fetch pods: {pod_result.stderr}")
        return
    pods = _parse_pod_rows(pod_result.stdout) if pod_result.stdout.strip() else []
    st.session_state[list_key] = pods
    st.session_state[ns_key] = namespace
    if pods:
        st.success(f"Found {len(pods)} pod(s) in namespace '{namespace}'.")
    else:
        st.warning(f"No pods found in namespace '{namespace}'.")


def _render_log_stats(analysis, heading: str, max_patterns: int) -> None:
    """Show line/error/warning metrics and the first *max_patterns* error patterns."""
    m1, m2, m3 = st.columns(3)
    m1.metric("Total Lines", analysis.total_lines)
    m2.metric("Errors", analysis.error_count)
    m3.metric("Warnings", analysis.warning_count)

    if analysis.error_patterns:
        st.markdown(heading)
        for pattern, count in list(analysis.error_patterns.items())[:max_patterns]:
            st.markdown(f"- `{pattern}` (x{count})")


def _show_table(rows: list[dict]):
    """Render *rows* as a full-width dataframe and return the DataFrame."""
    import pandas as pd  # imported lazily, only when a table is actually shown

    frame = pd.DataFrame(rows)
    st.dataframe(frame, use_container_width=True, hide_index=True)
    return frame


def _optional_plotly():
    """Return (plotly.express, plotly.graph_objects), or (None, None) if not installed."""
    try:
        import plotly.express as px
        import plotly.graph_objects as go
    except ImportError:
        return None, None
    return px, go


def _render_system_logs_tab(cp_node, profile, available_log_sources) -> None:
    """System Logs tab: collect each selected source and show per-source statistics."""
    st.markdown("### System Component Logs")
    col1, col2, col3 = st.columns(3)
    with col1:
        default_sources = [s for s in ["Kubelet", "CRI-O", "Events"] if s in available_log_sources]
        if not default_sources:
            default_sources = available_log_sources[:3] if available_log_sources else []
        sources = st.multiselect(
            "Log Sources",
            options=available_log_sources,
            default=default_sources,
        )
    with col2:
        log_lines = st.number_input("Lines to fetch", min_value=50, max_value=1000, value=200)
    with col3:
        since_label = st.selectbox("Time Range", options=list(_LOG_TIME_RANGES.keys()), index=1)
        since, since_k8s = _LOG_TIME_RANGES[since_label]

    if not st.button("Collect Logs", type="primary", key="collect_sys"):
        return

    log_data = {}
    for source in sources:
        with st.spinner(f"Collecting {source} logs..."):
            result = collect_logs(cp_node, source, log_lines, since, since_k8s, profile=profile)
        if not result.success:
            with st.expander(f"❌ {source} — FAILED"):
                st.code(result.stderr, language="text")
            continue

        log_data[source] = result.stdout
        analysis = analyze_logs(result.stdout, source)
        with st.expander(
            f"{'❌' if analysis.error_count > 0 else '✅'} {source} "
            f"({analysis.error_count} errors, {analysis.warning_count} warnings)",
            expanded=analysis.error_count > 0,
        ):
            _render_log_stats(analysis, "**Top Error Patterns:**", 5)
            # Only the tail is shown to keep the page responsive.
            st.code(result.stdout[-3000:], language="text")

    st.session_state.log_analysis_results = log_data


def _render_pod_logs_tab(cp_node, profile, namespaces: list[str]) -> None:
    """Pod Logs tab: pick a pod/container, fetch its logs and optionally run AI analysis."""
    st.markdown("### Pod Logs")

    # --- Namespace selection ---
    col1, col2 = st.columns(2)
    with col1:
        if namespaces:
            pod_ns = st.selectbox("Namespace", options=namespaces,
                                  index=_default_index(namespaces), key="pod_ns")
        else:
            pod_ns = st.text_input("Namespace", value="default", key="pod_ns")
    with col2:
        pod_lines = st.number_input("Lines", min_value=50, max_value=5000, value=200, key="pod_lines")

    # --- Load pods from the cluster ---
    if st.button("Load Pods", key="load_pods_btn"):
        _load_pods(cp_node, profile, pod_ns, "_pod_list", "_pod_list_ns")

    # --- Pod & container dropdowns ---
    pods_loaded = st.session_state.get("_pod_list", [])
    # The cached list is only usable while the same namespace is selected.
    pods_usable = bool(pods_loaded) and st.session_state.get("_pod_list_ns", "") == pod_ns

    col_p1, col_p2 = st.columns(2)
    with col_p1:
        if pods_usable:
            pod_options = [f"{p['name']} ({p['status']})" for p in pods_loaded]
            selected_pod_idx = st.selectbox(
                "Pod Name", options=range(len(pod_options)),
                format_func=lambda i: pod_options[i],
                key="pod_name_select",
            )
            pod_name = pods_loaded[selected_pod_idx]["name"] if selected_pod_idx is not None else ""
        else:
            pod_name = st.text_input(
                "Pod Name",
                placeholder="my-pod-xyz (click Load Pods to get dropdown)",
                key="pod_name_input",
            )

    with col_p2:
        containers: list[str] = []
        if pods_usable and pod_name:
            selected_pod = next((p for p in pods_loaded if p["name"] == pod_name), None)
            if selected_pod and selected_pod.get("containers"):
                containers = [c.strip() for c in selected_pod["containers"].split(",") if c.strip()]
        if containers:
            container_sel = st.selectbox(
                "Container", options=["(all / default)"] + containers, key="pod_container_select",
            )
            container = "" if container_sel == "(all / default)" else container_sel
        else:
            container = st.text_input("Container (optional)", key="pod_container")

    pod_previous = st.checkbox("Previous container logs (crash recovery)")

    # --- Fetch logs ---
    if st.button("Fetch Pod Logs", type="primary", key="fetch_pod"):
        if not pod_name:
            st.warning("Please enter a pod name or click **Load Pods** to select one.")
        else:
            with st.spinner(f"Fetching logs for {pod_ns}/{pod_name}..."):
                result = collect_pod_logs(
                    cp_node, pod_ns, pod_name, container, pod_lines,
                    "1h", pod_previous, profile=profile,
                )
            # Keep the outcome in session state: the "Analyze with AI" button
            # below triggers a rerun in which this button is no longer pressed,
            # so anything rendered only inside this branch would vanish.
            st.session_state["_pod_log_fetch"] = {
                "target": f"{pod_ns}/{pod_name}",
                "success": result.success,
                "stdout": result.stdout,
                "stderr": result.stderr,
            }

    fetched = st.session_state.get("_pod_log_fetch")
    if not fetched:
        return

    target = fetched["target"]
    if not fetched["success"]:
        st.error(f"Failed to fetch pod logs for `{target}`")
        st.code(fetched["stderr"], language="text")
        return
    if not fetched["stdout"].strip():
        st.info(f"No log output returned for pod `{target}`. "
                "The pod may have just started or has no recent logs.")
        return

    analysis = analyze_logs(fetched["stdout"], target)
    _render_log_stats(analysis, "**Error Patterns:**", 10)
    st.code(fetched["stdout"][-5000:], language="text")

    if analysis.error_count > 0 and is_llm_configured():
        if st.button("Analyze with AI", key="pod_ai"):
            with st.spinner("AI analyzing pod logs..."):
                ai_analysis = llm_analyze_logs(fetched["stdout"], target)
            st.markdown(ai_analysis)


def _render_correlation_tab(cp_node, profile, available_log_sources) -> None:
    """Error Correlation tab: collect several sources and group related errors."""
    st.markdown("### Cross-Source Error Correlation")
    st.markdown("Collect logs from multiple sources and correlate errors across them.")

    default_corr = [s for s in ["Kubelet", "CRI-O", "API Server", "Events"] if s in available_log_sources]
    if not default_corr:
        default_corr = available_log_sources[:4] if available_log_sources else []
    corr_sources = st.multiselect(
        "Sources to correlate",
        options=available_log_sources,
        default=default_corr,
        key="corr_sources",
    )

    if st.button("Collect & Correlate", type="primary", key="correlate"):
        with st.spinner("Collecting logs from multiple sources..."):
            results = collect_multi_source_logs(cp_node, corr_sources, lines=150, profile=profile)
        # Persisted so the deep-analysis button (which forces a rerun) still
        # has the collected logs and the correlation groups available.
        st.session_state["_corr_state"] = {
            "correlated": correlate_errors(results),
            "logs": {src: res.stdout for src, res in results.items() if res.success},
        }

    corr_state = st.session_state.get("_corr_state")
    if corr_state is None:
        return

    correlated = corr_state["correlated"]
    if correlated:
        st.markdown(f"### Found {len(correlated)} correlated error groups")
        for i, group in enumerate(correlated):
            with st.expander(
                f"Correlation #{i + 1}: {', '.join(group['sources_involved'])}",
                expanded=True,
            ):
                st.markdown(f"**Primary Error** ({group['primary']['source']}):")
                st.code(group["primary"]["message"], language="text")
                st.markdown("**Related Errors:**")
                for related in group["related"]:
                    st.markdown(f"- **{related['source']}**: `{related['message'][:200]}`")
    else:
        st.info("No correlated errors found across sources.")

    # LLM correlation analysis
    if is_llm_configured():
        if st.button("Deep AI Correlation Analysis", key="deep_corr"):
            with st.spinner("AI is performing deep correlation analysis..."):
                analysis = llm_correlate_analysis(corr_state["logs"])
            st.markdown(analysis)


def _smart_collect_cluster_logs(cp_node, profile, available_log_sources) -> str:
    """Smart tab, cluster mode: return freshly collected log text ("" if none)."""
    scol1, scol2, scol3 = st.columns(3)
    with scol1:
        smart_source = st.selectbox("Log Source", available_log_sources, key="smart_source")
    with scol2:
        smart_lines = st.number_input(
            "Lines to fetch", min_value=100, max_value=5000, value=500, key="smart_lines",
        )
    with scol3:
        smart_since_label = st.selectbox(
            "Time Range", list(_LOG_TIME_RANGES.keys()), index=1, key="smart_since",
        )
        smart_since, smart_since_k8s = _LOG_TIME_RANGES[smart_since_label]

    if not st.button("Collect & Analyze", type="primary", key="smart_collect"):
        return ""

    with st.spinner(f"Collecting {smart_source} logs..."):
        result = collect_logs(
            cp_node, smart_source, smart_lines, smart_since, smart_since_k8s, profile=profile,
        )
    if not result.success:
        st.error(f"Failed to collect logs: {result.stderr}")
        return ""
    if not result.stdout.strip():
        st.info("No logs returned for the selected source and time range.")
        return ""

    st.session_state["_smart_log_text"] = result.stdout
    st.session_state["_smart_source"] = smart_source
    return result.stdout


def _smart_collect_pod_logs(cp_node, profile, namespaces: list[str]) -> str:
    """Smart tab, pod mode: return freshly fetched pod log text ("" if none)."""
    st.markdown(
        "Fetch logs from a specific pod — ideal for **Istio sidecar** access "
        "logs (`istio-proxy` container) or any application pod."
    )
    spcol1, spcol2 = st.columns(2)
    with spcol1:
        if namespaces:
            smart_pod_ns = st.selectbox(
                "Namespace", options=namespaces,
                index=_default_index(namespaces),
                key="smart_pod_ns",
            )
        else:
            smart_pod_ns = st.text_input("Namespace", value="default", key="smart_pod_ns")
    with spcol2:
        smart_pod_lines = st.number_input(
            "Lines to fetch", min_value=100, max_value=10000, value=1000, key="smart_pod_lines",
        )

    # Load pods button
    if st.button("Load Pods", key="smart_load_pods"):
        _load_pods(cp_node, profile, smart_pod_ns, "_smart_pod_list", "_smart_pod_list_ns")

    # Pod & container selection
    pods_loaded = st.session_state.get("_smart_pod_list", [])
    pods_usable = bool(pods_loaded) and st.session_state.get("_smart_pod_list_ns", "") == smart_pod_ns

    sp_col1, sp_col2 = st.columns(2)
    with sp_col1:
        if pods_usable:
            sp_pod_options = [f"{p['name']} ({p['status']})" for p in pods_loaded]
            sp_selected_idx = st.selectbox(
                "Pod Name", options=range(len(sp_pod_options)),
                format_func=lambda i: sp_pod_options[i],
                key="smart_pod_select",
            )
            smart_pod_name = pods_loaded[sp_selected_idx]["name"] if sp_selected_idx is not None else ""
        else:
            smart_pod_name = st.text_input(
                "Pod Name", placeholder="Click 'Load Pods' to get dropdown", key="smart_pod_name",
            )
    with sp_col2:
        # Container selection — show istio-proxy hint
        if pods_usable and smart_pod_name:
            matching = [p for p in pods_loaded if p["name"] == smart_pod_name]
            container_names: list[str] = []
            if matching and matching[0].get("containers"):
                container_names = [c.strip() for c in matching[0]["containers"].split(",") if c.strip()]
            if container_names:
                container_names = ["(all / default)"] + container_names
                smart_pod_container = st.selectbox(
                    "Container (select `istio-proxy` for Istio access logs)",
                    options=container_names,
                    # Pre-select istio-proxy if available
                    index=_default_index(container_names, "istio-proxy"),
                    key="smart_pod_container",
                )
                if smart_pod_container == "(all / default)":
                    smart_pod_container = ""
            else:
                smart_pod_container = st.text_input(
                    "Container (e.g. istio-proxy)",
                    value="istio-proxy",
                    key="smart_pod_container_text",
                )
        else:
            smart_pod_container = st.text_input(
                "Container (e.g. istio-proxy for Istio access logs)",
                value="istio-proxy",
                key="smart_pod_container_text2",
            )

    # Fetch & analyze
    smart_pod_since_label = st.selectbox(
        "Time Range", list(_LOG_TIME_RANGES.keys()), index=1, key="smart_pod_since",
    )
    smart_pod_since_k8s = _LOG_TIME_RANGES[smart_pod_since_label][1]

    if not st.button("Fetch Pod Logs & Analyze", type="primary", key="smart_pod_collect"):
        return ""
    if not smart_pod_name:
        st.warning("Please select or enter a pod name.")
        return ""

    with st.spinner(
        f"Fetching logs from pod '{smart_pod_name}' (container: {smart_pod_container or 'default'})..."
    ):
        pod_log_result = collect_pod_logs(
            cp_node,
            namespace=smart_pod_ns,
            pod_name=smart_pod_name,
            container=smart_pod_container,
            lines=smart_pod_lines,
            since_k8s=smart_pod_since_k8s,
            profile=profile,
        )
    if not pod_log_result.success:
        st.error(f"Failed to fetch pod logs: {pod_log_result.stderr}")
        return ""
    if not pod_log_result.stdout.strip():
        st.info(f"No logs returned from pod '{smart_pod_name}' for the selected time range.")
        return ""

    log_text = pod_log_result.stdout
    st.session_state["_smart_log_text"] = log_text
    st.session_state["_smart_source"] = f"pod:{smart_pod_name}/{smart_pod_container or 'default'}"
    st.success(f"Fetched {len(log_text.splitlines())} log lines from pod '{smart_pod_name}'.")
    return log_text


def _render_istio_analysis(istio) -> None:
    """Render latency, status-code, path, upstream and timeline views for access logs."""
    px, go = _optional_plotly()

    st.markdown("---")
    st.markdown("#### Istio / Envoy Access Log Analysis")
    st.markdown(
        "Detected **Istio/Envoy access logs** — showing response time analytics, "
        "status code distribution, per-path and per-upstream breakdowns, and slow requests."
    )

    # ── Overview metrics ──
    icol1, icol2, icol3, icol4, icol5, icol6 = st.columns(6)
    icol1.metric("Total Requests", f"{istio.total_requests:,}")
    icol2.metric("Avg Latency", f"{istio.avg_ms:.0f} ms")
    icol3.metric("P50", f"{istio.p50_ms:.0f} ms")
    icol4.metric("P95", f"{istio.p95_ms:.0f} ms")
    icol5.metric("P99", f"{istio.p99_ms:.0f} ms")
    icol6.metric("Error Rate", f"{istio.error_rate:.1f}%")

    if istio.error_rate > 10:
        st.error(f"High error rate: **{istio.error_rate:.1f}%** of requests returned 4xx/5xx.")
    elif istio.error_rate > 2:
        st.warning(f"Elevated error rate: **{istio.error_rate:.1f}%** of requests returned 4xx/5xx.")

    icol7, icol8, icol9 = st.columns(3)
    icol7.metric("Min Latency", f"{istio.min_ms:.0f} ms")
    icol8.metric("Max Latency", f"{istio.max_ms:.0f} ms")
    icol9.metric("P90", f"{istio.p90_ms:.0f} ms")

    # ── Status Code Distribution ──
    st.markdown("##### Status Code Distribution")
    if istio.status_distribution:
        status_data = [
            {"Status Code": str(k), "Count": v}
            for k, v in sorted(istio.status_distribution.items())
        ]
        scol_t, scol_c = st.columns([1, 1])
        with scol_t:
            df_status = _show_table(status_data)
        with scol_c:
            if px is not None:
                fig = px.pie(
                    df_status, names="Status Code", values="Count",
                    title="Response Status Codes",
                    color="Status Code",
                    color_discrete_map={
                        str(k): ("#2ecc71" if k < 300 else "#f39c12" if k < 400
                                 else "#e67e22" if k < 500 else "#e74c3c")
                        for k in istio.status_distribution
                    },
                )
                fig.update_layout(height=350)
                st.plotly_chart(fig, use_container_width=True)

    # ── Status class summary ──
    if istio.status_class_distribution:
        class_cols = st.columns(len(istio.status_class_distribution))
        for idx, (cls, cnt) in enumerate(sorted(istio.status_class_distribution.items())):
            class_cols[idx].metric(cls, cnt)

    # ── Response Flags ──
    if istio.response_flags_dist and len(istio.response_flags_dist) > 1:
        with st.expander("Response Flags (Envoy)", expanded=False):
            st.markdown(
                "Envoy response flags indicate special conditions: "
                "`UF`=upstream failure, `UH`=no healthy upstream, "
                "`UT`=upstream timeout, `NR`=no route, `DC`=downstream disconnected, etc."
            )
            _show_table([{"Flag": k, "Count": v} for k, v in istio.response_flags_dist.items()])

    # ── Latency Distribution Histogram ──
    st.markdown("##### Latency Distribution")
    if px is not None:
        durations = [e.duration_ms for e in istio.parsed_entries]
        fig = px.histogram(
            x=durations, nbins=50,
            labels={"x": "Duration (ms)", "y": "Count"},
            title="Request Latency Distribution",
        )
        fig.add_vline(x=istio.p50_ms, line_dash="dash", line_color="green",
                      annotation_text=f"P50: {istio.p50_ms:.0f}ms")
        fig.add_vline(x=istio.p95_ms, line_dash="dash", line_color="orange",
                      annotation_text=f"P95: {istio.p95_ms:.0f}ms")
        fig.add_vline(x=istio.p99_ms, line_dash="dash", line_color="red",
                      annotation_text=f"P99: {istio.p99_ms:.0f}ms")
        fig.update_layout(height=400)
        st.plotly_chart(fig, use_container_width=True)

    # ── Per-Path Response Time ──
    if istio.path_stats:
        st.markdown("##### Per-Path Response Time")
        _show_table([
            {
                "Path": ps["path"][:80],
                "Requests": ps["count"],
                "Avg (ms)": ps["avg_ms"],
                "P50 (ms)": ps["p50_ms"],
                "P95 (ms)": ps["p95_ms"],
                "P99 (ms)": ps["p99_ms"],
                "Max (ms)": ps["max_ms"],
                "Errors": ps["error_count"],
                "Error %": ps["error_rate"],
            }
            for ps in istio.path_stats[:30]
        ])

        # Bar chart of top paths by P95
        if px is not None:
            top_paths = istio.path_stats[:15]
            fig = px.bar(
                x=[p["path"][:50] for p in top_paths],
                y=[p["p95_ms"] for p in top_paths],
                labels={"x": "Path", "y": "P95 Latency (ms)"},
                title="Top Paths by P95 Latency",
                color=[p["error_rate"] for p in top_paths],
                color_continuous_scale="RdYlGn_r",
            )
            fig.update_layout(height=400, xaxis_tickangle=-45,
                              coloraxis_colorbar_title="Error %")
            st.plotly_chart(fig, use_container_width=True)

    # ── Per-Upstream Service Stats ──
    if istio.upstream_stats:
        st.markdown("##### Per-Upstream Service Stats")
        _show_table([
            {
                "Upstream": us["upstream"][:60],
                "Requests": us["count"],
                "Avg Duration (ms)": us["avg_duration_ms"],
                "Avg Upstream (ms)": us["avg_upstream_ms"],
                "P95 Duration (ms)": us["p95_duration_ms"],
                "P95 Upstream (ms)": us["p95_upstream_ms"],
                "Errors": us["error_count"],
                "Error %": us["error_rate"],
            }
            for us in istio.upstream_stats[:20]
        ])

    # ── Slow Requests ──
    if istio.slow_requests:
        with st.expander(
            f"Slow Requests (>{istio.p95_ms:.0f}ms — top {len(istio.slow_requests)})",
            expanded=True,
        ):
            _show_table([
                {
                    "Duration (ms)": sr.duration_ms,
                    "Upstream (ms)": sr.upstream_service_time_ms,
                    "Method": sr.method,
                    "Path": sr.path[:80],
                    "Status": sr.response_code,
                    "Flags": sr.response_flags,
                    "Upstream Host": sr.upstream_host[:40],
                    "Timestamp": sr.timestamp[:25] if sr.timestamp else "",
                }
                for sr in istio.slow_requests[:30]
            ])

    # ── Istio Request Timeline ──
    if istio.timeline_buckets and len(istio.timeline_buckets) > 1:
        st.markdown("##### Request Timeline")
        # Buckets without a parsable timestamp cannot be placed on the time axis.
        known = [b for b in istio.timeline_buckets if b["timestamp"] != "unknown"]
        if go is not None and known:
            ts_labels = [b["timestamp"] for b in known]
            fig = go.Figure()
            fig.add_trace(go.Bar(x=ts_labels, y=[b["total"] for b in known],
                                 name="Requests", marker_color="#326CE5"))
            fig.add_trace(go.Bar(x=ts_labels, y=[b["errors"] for b in known],
                                 name="Errors (4xx+5xx)", marker_color="#FF4B4B"))
            fig.add_trace(go.Scatter(
                x=ts_labels, y=[b["avg_duration"] for b in known], name="Avg Latency (ms)",
                mode="lines+markers", yaxis="y2", line=dict(color="orange"),
            ))
            fig.update_layout(
                title="Requests & Latency Over Time",
                yaxis_title="Request Count",
                yaxis2=dict(title="Avg Latency (ms)", overlaying="y", side="right"),
                barmode="overlay",
                height=400,
            )
            st.plotly_chart(fig, use_container_width=True)


def _render_smart_analysis(sa_result) -> None:
    """Render summary, clustering, anomaly, pattern, timeline and Istio sections."""
    px, go = _optional_plotly()

    # ── Summary / Health Score ────────────────────────────────
    st.markdown("---")
    st.markdown("#### Log Summary & Health Score")
    summary = sa_result.summary
    health = summary.get("health_score", 100)
    scol1, scol2, scol3, scol4, scol5 = st.columns(5)
    scol1.metric("Total Lines", summary.get("total_lines", 0))
    scol2.metric("Errors", summary.get("error_count", 0))
    scol3.metric("Warnings", summary.get("warning_count", 0))
    scol4.metric("Unique Templates", summary.get("unique_templates", 0))
    scol5.metric("Health Score", f"{health}/100")

    if health < 50:
        st.error(f"Health score is **{health}/100** — significant issues detected in logs.")
    elif health < 80:
        st.warning(f"Health score is **{health}/100** — some issues detected.")
    else:
        st.success(f"Health score is **{health}/100** — logs look healthy.")

    st.markdown(
        f"**Time span:** {summary.get('first_timestamp', 'N/A')} → {summary.get('last_timestamp', 'N/A')} | "
        f"**Template diversity:** {summary.get('template_diversity', 0)}%"
    )

    # Top errors
    top_errors = summary.get("top_errors", [])
    if top_errors:
        with st.expander(f"Top {len(top_errors)} Error Patterns", expanded=True):
            for pattern, count in top_errors:
                st.markdown(f"- **x{count}** — `{pattern[:200]}`")

    # ── Log Clustering ────────────────────────────────────────
    st.markdown("---")
    st.markdown("#### Log Clustering (TF-IDF + DBSCAN)")
    st.markdown(
        "Groups similar log messages together to reduce noise and highlight distinct message types. "
        "Uses TF-IDF vectorization and DBSCAN density-based clustering."
    )
    if sa_result.clusters:
        # A negative cluster id is displayed as the "noise" bucket.
        df_clusters = _show_table([
            {
                "Cluster": f"Cluster {c.cluster_id}" if c.cluster_id >= 0 else "Noise (unique)",
                "Count": c.count,
                "Level": c.level,
                "Template": c.template[:120],
                "First Seen": c.first_seen or "N/A",
                "Last Seen": c.last_seen or "N/A",
            }
            for c in sa_result.clusters
        ])

        # Cluster distribution chart
        if px is not None:
            fig = px.pie(
                df_clusters, names="Cluster", values="Count",
                title="Log Message Distribution by Cluster",
                color_discrete_sequence=px.colors.qualitative.Set3,
            )
            fig.update_layout(height=400)
            st.plotly_chart(fig, use_container_width=True)

        # Show sample messages per cluster
        error_clusters = [c for c in sa_result.clusters if c.level == "ERROR"]
        if error_clusters:
            with st.expander(f"Error Clusters ({len(error_clusters)})", expanded=True):
                for c in error_clusters:
                    label = f"Cluster {c.cluster_id}" if c.cluster_id >= 0 else "Noise"
                    st.markdown(f"**{label}** — {c.count} messages")
                    for sample in c.sample_messages[:2]:
                        st.code(sample, language="text")
    else:
        st.info("Not enough log lines for clustering (need 3+ lines).")

    # ── Anomaly Detection ─────────────────────────────────────
    st.markdown("---")
    st.markdown("#### Anomaly Detection")
    st.markdown(
        "Detects unusual log lines using TF-IDF distance from centroid (outlier scoring) "
        "and frequency-based rare template detection."
    )
    if sa_result.anomalies:
        st.markdown(f"**{len(sa_result.anomalies)} anomalous log line(s) detected**")
        _show_table([
            {
                "Score": round(a.score, 2),
                "Reason": a.reason,
                "Timestamp": a.timestamp or "N/A",
                "Message": a.message[:150],
            }
            for a in sa_result.anomalies[:30]
        ])

        # Show full messages for top anomalies
        with st.expander("Top Anomaly Details", expanded=False):
            for i, a in enumerate(sa_result.anomalies[:10]):
                st.markdown(f"**#{i+1}** (score: {a.score:.2f}) — {a.reason}")
                st.code(a.message, language="text")
    else:
        st.success("No anomalous log lines detected — all messages follow expected patterns.")

    # ── Pattern Mining ────────────────────────────────────────
    st.markdown("---")
    st.markdown("#### Pattern Mining (Drain-style)")
    st.markdown(
        "Extracts frequent log templates by replacing variable tokens (IPs, IDs, numbers, paths) "
        "with placeholders — similar to LogAI's Drain parser."
    )
    if sa_result.patterns:
        _show_table([
            {
                "Template": p["template"][:120],
                "Count": p["count"],
                "% of Logs": p["percentage"],
                "Level": p["level"],
            }
            for p in sa_result.patterns[:20]
        ])

        # Bar chart of top patterns
        if px is not None:
            top_10 = sa_result.patterns[:10]
            fig = px.bar(
                x=[p["template"][:60] for p in top_10],
                y=[p["count"] for p in top_10],
                labels={"x": "Template", "y": "Count"},
                title="Top 10 Log Templates",
                color=[p["level"] for p in top_10],
                color_discrete_map={"ERROR": "#FF4B4B", "WARNING": "#FFA500", "INFO": "#326CE5"},
            )
            fig.update_layout(height=400, xaxis_tickangle=-45)
            st.plotly_chart(fig, use_container_width=True)
    else:
        st.info("No patterns extracted.")

    # ── Timeline ──────────────────────────────────────────────
    if sa_result.timeline_buckets and len(sa_result.timeline_buckets) > 1:
        st.markdown("---")
        st.markdown("#### Log Volume Timeline")
        known = [b for b in sa_result.timeline_buckets if b["timestamp"] != "unknown"]
        if go is not None and known:
            ts_labels = [b["timestamp"] for b in known]
            fig = go.Figure()
            fig.add_trace(go.Scatter(x=ts_labels, y=[b["total"] for b in known], name="Total",
                                     mode="lines+markers", line=dict(color="#326CE5")))
            fig.add_trace(go.Bar(x=ts_labels, y=[b["errors"] for b in known],
                                 name="Errors", marker_color="#FF4B4B"))
            fig.add_trace(go.Bar(x=ts_labels, y=[b["warnings"] for b in known],
                                 name="Warnings", marker_color="#FFA500"))
            fig.update_layout(
                title="Log Volume Over Time",
                yaxis_title="Count",
                xaxis_title="Time",
                barmode="stack",
                height=400,
            )
            st.plotly_chart(fig, use_container_width=True)

    # ── Istio / Envoy Access Log Analysis ────────────────────────
    if sa_result.istio:
        _render_istio_analysis(sa_result.istio)


def _render_smart_tab(cp_node, profile, namespaces: list[str], available_log_sources) -> None:
    """Smart Log Analysis tab: obtain log text in one of three modes and analyze it."""
    st.markdown("### Smart Log Analysis (LogAI-inspired)")
    st.markdown(
        "ML-powered log analysis using techniques from "
        "[Salesforce LogAI](https://github.com/salesforce/logai): "
        "**log clustering** (TF-IDF + DBSCAN), **anomaly detection**, "
        "**pattern mining** (Drain-style), and **auto-summarization**."
    )

    st.info(
        "**Istio access log analysis:** Select **Collect pod logs** and choose an "
        "application pod with an Istio sidecar (e.g. `istio-proxy` container). "
        "The pipeline auto-detects Envoy/Istio access logs and shows response time "
        "analytics, status codes, per-path breakdowns, and slow requests."
    )

    smart_mode = st.radio(
        "Analysis mode",
        ["Collect from cluster", "Collect pod logs", "Paste logs"],
        horizontal=True,
        key="smart_mode",
    )

    if smart_mode == "Paste logs":
        smart_log_text = st.text_area(
            "Paste log output",
            height=200,
            placeholder="Paste your Kubernetes / Istio access logs here for smart analysis...",
            key="smart_paste",
        )
        if smart_log_text:
            st.session_state["_smart_log_text"] = smart_log_text
            st.session_state["_smart_source"] = "pasted"
    else:
        if smart_mode == "Collect from cluster":
            smart_log_text = _smart_collect_cluster_logs(cp_node, profile, available_log_sources)
        else:
            smart_log_text = _smart_collect_pod_logs(cp_node, profile, namespaces)
        # Persist across reruns: fall back to the last collected text when
        # nothing new was fetched in this run.
        if "_smart_log_text" in st.session_state and not smart_log_text:
            smart_log_text = st.session_state["_smart_log_text"]

    # Run analysis if we have log text
    if smart_log_text:
        src_label = st.session_state.get("_smart_source", "")
        with st.spinner("Running LogAI-inspired analysis pipeline..."):
            sa_result = smart_analyze(smart_log_text, source=src_label)
        _render_smart_analysis(sa_result)


def _render_ai_tab() -> None:
    """AI Log Analysis tab: send pasted logs plus optional context to the LLM."""
    st.markdown("### AI-Powered Log Analysis")
    if not is_llm_configured():
        st.info(
            "LLM is not configured. Select a provider (OpenAI or Ollama) in the sidebar LLM Settings, or set `LLM_API_URL` and `LLM_API_KEY` "
            "environment variables to enable AI-powered log analysis."
        )
        st.markdown(
            "You can still use the **System Logs**, **Pod Logs**, and "
            "**Error Correlation** tabs — they work without an LLM and provide "
            "automated pattern matching and error grouping."
        )
        return

    st.markdown("Paste logs or describe an issue for AI analysis.")

    log_input = st.text_area(
        "Paste log output",
        height=200,
        placeholder="Paste your Kubernetes logs here...",
    )
    context_input = st.text_input(
        "Additional context",
        placeholder="e.g., This started happening after we upgraded to K8s 1.30",
    )

    if st.button("Analyze Logs", type="primary", key="ai_log_analyze") and log_input:
        with st.spinner("AI is analyzing logs..."):
            analysis = llm_analyze_logs(log_input, context=context_input)
        st.markdown(analysis)


def page_log_analysis():
    """Render the "Log Analysis & Error Correlation" page.

    Resolves the active profile and, for non-imported clusters, its first
    control-plane node, then draws five tabs: System Logs, Pod Logs, Error
    Correlation, Smart Log Analysis and AI Log Analysis. Returns early when no
    profile is active or a non-imported profile has no control-plane node.

    Results that are followed by another button (pod logs, correlation groups)
    are kept in ``st.session_state`` so they survive the rerun that the second
    button click triggers.
    """
    st.markdown("## Log Analysis & Error Correlation")
    st.markdown("Collect, parse, and analyze logs from your cluster components.")

    profile = _get_active_profile()
    if not profile:
        return

    # For imported clusters we don't need a CP node
    cp_node = None
    if profile.cluster_source != "imported":
        cp_nodes = profile.get_control_plane_nodes()
        if not cp_nodes:
            st.error("No control-plane node defined in this profile.")
            return
        cp_node = cp_nodes[0]

    # Pre-fetch namespaces for imported clusters (used by the pod pickers)
    cluster_namespaces: list[str] = []
    if profile.cluster_source == "imported" and profile.kubeconfig_content:
        cluster_namespaces = fetch_namespaces(profile.kubeconfig_content)

    available_log_sources = get_available_log_sources(profile)

    tab_system, tab_pod, tab_correlation, tab_smart, tab_ai = st.tabs([
        "System Logs",
        "Pod Logs",
        "Error Correlation",
        "Smart Log Analysis",
        "AI Log Analysis",
    ])

    with tab_system:
        _render_system_logs_tab(cp_node, profile, available_log_sources)
    with tab_pod:
        _render_pod_logs_tab(cp_node, profile, cluster_namespaces)
    with tab_correlation:
        _render_correlation_tab(cp_node, profile, available_log_sources)
    with tab_smart:
        _render_smart_tab(cp_node, profile, cluster_namespaces, available_log_sources)
    with tab_ai:
        _render_ai_tab()
endpoints", True), +} + + +def page_resource_viewer(): + st.markdown("## Resource Viewer") + st.markdown("Browse live Kubernetes resources from your cluster.") + + profile = _get_active_profile() + if not profile: + return + + if not profile.kubeconfig_content and not profile.get_control_plane_nodes(): + st.error( + "This profile has no kubeconfig and no control-plane node. " + "Import a kubeconfig or add nodes in the Profile Manager." + ) + return + + # Pre-fetch namespaces for imported clusters + _rv_namespaces: list[str] = [] + if profile.cluster_source == "imported" and profile.kubeconfig_content: + _rv_namespaces = fetch_namespaces(profile.kubeconfig_content) + + (tab_resources, tab_scaling, tab_shell, tab_res_limits, tab_crictl, + tab_node_health, tab_rbac, tab_events, + tab_restart_tracker, tab_pvc) = st.tabs([ + "Cluster Resources", + "Scaling", + "Pod Shell", + "Resource Requests/Limits", + "Node Containers", + "Node Health", + "RBAC Viewer", + "Events Timeline", + "Pod Restart Tracker", + "PVC / Storage", + ]) + + # ── Cluster Resources ──────────────────────────────────────────────── + with tab_resources: + st.markdown("### Browse Cluster Resources") + + col1, col2, col3 = st.columns([2, 2, 1]) + with col1: + resource_type = st.selectbox( + "Resource Type", + options=list(_RESOURCE_TYPES.keys()), + index=0, + ) + with col2: + cmd_base, ns_supported = _RESOURCE_TYPES[resource_type] + if ns_supported: + ns_choice = st.radio( + "Namespace", + ["All Namespaces", "Specific"], + horizontal=True, + key="res_ns_choice", + ) + if ns_choice == "Specific": + if _rv_namespaces: + namespace = st.selectbox( + "Namespace", options=_rv_namespaces, + index=_rv_namespaces.index("default") if "default" in _rv_namespaces else 0, + key="res_ns", + ) + else: + namespace = st.text_input("Namespace", value="default", key="res_ns") + else: + namespace = "" + else: + namespace = "" + st.info(f"{resource_type} is a cluster-scoped resource.") + with col3: + output_format = 
st.selectbox( + "Output", + ["wide", "yaml", "json", "name"], + index=0, + key="res_output", + ) + + if st.button("Fetch Resources", type="primary", key="fetch_res"): + kubectl_cmd = cmd_base + if ns_supported and not namespace: + kubectl_cmd += " -A" + elif ns_supported and namespace: + kubectl_cmd += f" -n {namespace}" + kubectl_cmd += f" -o {output_format}" + + with st.spinner(f"Fetching {resource_type}..."): + result = run_kubectl(profile, kubectl_cmd, timeout=30) + if result.success: + st.code(result.stdout or "(no resources found)", language="text") + else: + if "the server doesn't have a resource type" in result.stderr: + st.warning( + f"{resource_type} is not available on this cluster " + "(CRD may not be installed)." + ) + else: + st.error("Failed to fetch resources") + st.code(result.stderr, language="text") + + # Describe a specific resource + st.markdown("---") + st.markdown("#### Describe a Resource") + desc_col1, desc_col2, desc_col3 = st.columns([2, 2, 1]) + with desc_col2: + if _rv_namespaces: + desc_ns = st.selectbox( + "Namespace", + options=_rv_namespaces, + index=_rv_namespaces.index("default") if "default" in _rv_namespaces else 0, + key="desc_ns", + ) + else: + desc_ns = st.text_input( + "Namespace", + value="default", + key="desc_ns", + ) + with desc_col3: + desc_refresh = st.button("Load names", key="desc_load_names") + + # Fetch resource names for the dropdown + _desc_resource_names: list[str] = [] + if desc_refresh or st.session_state.get("_desc_cached_names"): + if desc_refresh: + # Determine the kubectl get command for names + _desc_cmd_base, _desc_ns_supported = _RESOURCE_TYPES[resource_type] + _names_cmd = f"{_desc_cmd_base} -o name" + if _desc_ns_supported and desc_ns: + _names_cmd += f" -n {desc_ns}" + elif _desc_ns_supported: + _names_cmd += " -A" + _names_result = run_kubectl(profile, _names_cmd, timeout=15) + if _names_result.success and _names_result.stdout.strip(): + raw_names = _names_result.stdout.strip().split("\n") + # Strip 
resource type prefix (e.g. "pod/my-pod" -> "my-pod") + _desc_resource_names = [ + n.split("/", 1)[-1] if "/" in n else n + for n in raw_names if n.strip() + ] + st.session_state["_desc_cached_names"] = _desc_resource_names + st.session_state["_desc_cached_type"] = resource_type + else: + _desc_resource_names = [] + st.session_state["_desc_cached_names"] = [] + else: + # Use cached names if resource type matches + if st.session_state.get("_desc_cached_type") == resource_type: + _desc_resource_names = st.session_state.get("_desc_cached_names", []) + + with desc_col1: + if _desc_resource_names: + desc_name = st.selectbox( + "Resource name", + options=_desc_resource_names, + key="desc_name_select", + ) + else: + desc_name = st.text_input( + "Resource name", + placeholder="Click 'Load names' or type a name", + key="desc_name", + ) + + if st.button("Describe", key="describe_res") and desc_name: + # Determine the singular resource type for describe + res_singular = resource_type.rstrip("s") + if resource_type == "Ingresses": + res_singular = "ingress" + elif resource_type == "Namespaces": + res_singular = "namespace" + elif resource_type == "StorageClasses": + res_singular = "storageclass" + elif resource_type == "Endpoints": + res_singular = "endpoints" + + desc_cmd = f"describe {res_singular.lower()} {desc_name}" + if ns_supported and desc_ns: + desc_cmd += f" -n {desc_ns}" + + with st.spinner(f"Describing {desc_name}..."): + result = run_kubectl(profile, desc_cmd, timeout=30) + if result.success: + st.code(result.stdout, language="yaml") + else: + st.error("Describe failed") + st.code(result.stderr, language="text") + + # ── Scaling ────────────────────────────────────────────────────────── + with tab_scaling: + st.markdown("### Deployment Scaling") + st.markdown("Scale deployment replicas up or down.") + + sc_col1, sc_col2 = st.columns(2) + with sc_col1: + if _rv_namespaces: + sc_ns = st.selectbox( + "Namespace", + options=_rv_namespaces, + 
index=_rv_namespaces.index("default") if "default" in _rv_namespaces else 0, + key="sc_ns", + ) + else: + sc_ns = st.text_input("Namespace", value="default", key="sc_ns") + with sc_col2: + sc_load = st.button("Load Deployments", key="sc_load") + + # Fetch deployments for the dropdown + _sc_deployments: list[str] = [] + _sc_dep_info: dict[str, str] = {} + if sc_load or st.session_state.get("_sc_cached_deps"): + if sc_load: + dep_result = run_kubectl( + profile, + f"get deployments -n {sc_ns} -o custom-columns=NAME:.metadata.name,REPLICAS:.spec.replicas,AVAILABLE:.status.availableReplicas --no-headers", + timeout=15, + ) + if dep_result.success and dep_result.stdout.strip(): + for line in dep_result.stdout.strip().split("\n"): + parts = line.split() + if parts: + dep_name = parts[0] + _sc_deployments.append(dep_name) + replicas = parts[1] if len(parts) > 1 else "?" + available = parts[2] if len(parts) > 2 else "?" + _sc_dep_info[dep_name] = f"{replicas} replicas ({available} available)" + st.session_state["_sc_cached_deps"] = _sc_deployments + st.session_state["_sc_cached_dep_info"] = _sc_dep_info + st.session_state["_sc_cached_ns"] = sc_ns + else: + st.session_state["_sc_cached_deps"] = [] + st.session_state["_sc_cached_dep_info"] = {} + if dep_result.success: + st.info(f"No deployments found in namespace '{sc_ns}'.") + else: + st.error("Failed to list deployments") + st.code(dep_result.stderr, language="text") + else: + if st.session_state.get("_sc_cached_ns") == sc_ns: + _sc_deployments = st.session_state.get("_sc_cached_deps", []) + _sc_dep_info = st.session_state.get("_sc_cached_dep_info", {}) + + if _sc_deployments: + sc_dep_col1, sc_dep_col2, sc_dep_col3 = st.columns([3, 1, 1]) + with sc_dep_col1: + sc_selected = st.selectbox( + "Deployment", + options=_sc_deployments, + format_func=lambda d: f"{d} ({_sc_dep_info.get(d, '')})", + key="sc_selected", + ) + with sc_dep_col2: + sc_replicas = st.number_input( + "Target replicas", + min_value=0, + max_value=100, + 
value=1, + key="sc_replicas", + ) + with sc_dep_col3: + st.markdown("
", unsafe_allow_html=True) + if st.button("Scale", type="primary", key="sc_apply"): + scale_cmd = f"scale deployment {sc_selected} --replicas={sc_replicas} -n {sc_ns}" + with st.spinner(f"Scaling {sc_selected} to {sc_replicas} replicas..."): + result = run_kubectl(profile, scale_cmd, timeout=30) + if result.success: + st.success(f"Scaled **{sc_selected}** to **{sc_replicas}** replicas!") + st.code(result.stdout, language="text") + # Refresh to show updated state + verify = run_kubectl( + profile, + f"get deployment {sc_selected} -n {sc_ns} -o wide", + timeout=15, + ) + if verify.success: + st.code(verify.stdout, language="text") + else: + st.error("Scaling failed") + st.code(result.stderr, language="text") + + # Quick scale buttons + st.markdown("---") + st.markdown("#### Quick Actions") + qa_col1, qa_col2, qa_col3, qa_col4 = st.columns(4) + with qa_col1: + if st.button("Scale to 0 (stop)", key="sc_0"): + with st.spinner("Scaling to 0..."): + result = run_kubectl(profile, f"scale deployment {sc_selected} --replicas=0 -n {sc_ns}", timeout=30) + st.success("Scaled to 0") if result.success else st.error(result.stderr) + with qa_col2: + if st.button("Scale to 1", key="sc_1"): + with st.spinner("Scaling to 1..."): + result = run_kubectl(profile, f"scale deployment {sc_selected} --replicas=1 -n {sc_ns}", timeout=30) + st.success("Scaled to 1") if result.success else st.error(result.stderr) + with qa_col3: + if st.button("Scale to 3", key="sc_3"): + with st.spinner("Scaling to 3..."): + result = run_kubectl(profile, f"scale deployment {sc_selected} --replicas=3 -n {sc_ns}", timeout=30) + st.success("Scaled to 3") if result.success else st.error(result.stderr) + with qa_col4: + if st.button("Scale to 5", key="sc_5"): + with st.spinner("Scaling to 5..."): + result = run_kubectl(profile, f"scale deployment {sc_selected} --replicas=5 -n {sc_ns}", timeout=30) + st.success("Scaled to 5") if result.success else st.error(result.stderr) + elif not sc_load: + st.info("Click **Load 
Deployments** to see deployments in the selected namespace.") + + # ── Pod Shell ──────────────────────────────────────────────────────── + with tab_shell: + st.markdown("### Pod Shell (Exec)") + st.markdown("Execute commands inside a running pod/container.") + + sh_col1, sh_col2 = st.columns(2) + with sh_col1: + if _rv_namespaces: + sh_ns = st.selectbox( + "Namespace", + options=_rv_namespaces, + index=_rv_namespaces.index("default") if "default" in _rv_namespaces else 0, + key="sh_ns", + ) + else: + sh_ns = st.text_input("Namespace", value="default", key="sh_ns") + with sh_col2: + sh_load = st.button("Load Pods", key="sh_load") + + # Fetch running pods + _sh_pods: list[str] = [] + _sh_containers: dict[str, list[str]] = {} + if sh_load or st.session_state.get("_sh_cached_pods"): + if sh_load: + pod_result = run_kubectl( + profile, + f"get pods -n {sh_ns} --field-selector=status.phase=Running -o jsonpath=" + "'{range .items[*]}{.metadata.name}{\"\\n\"}{end}'", + timeout=15, + ) + if pod_result.success and pod_result.stdout.strip(): + _sh_pods = [p.strip() for p in pod_result.stdout.strip().split("\n") if p.strip()] + st.session_state["_sh_cached_pods"] = _sh_pods + st.session_state["_sh_cached_ns"] = sh_ns + # Fetch container names for each pod + _sh_containers = {} + for pod_name in _sh_pods[:20]: # limit to first 20 for perf + ctr_result = run_kubectl( + profile, + f"get pod {pod_name} -n {sh_ns} -o jsonpath=" + "'{range .spec.containers[*]}{.name}{\"\\n\"}{end}'", + timeout=10, + ) + if ctr_result.success and ctr_result.stdout.strip(): + _sh_containers[pod_name] = [ + c.strip() for c in ctr_result.stdout.strip().split("\n") if c.strip() + ] + else: + _sh_containers[pod_name] = [] + st.session_state["_sh_cached_containers"] = _sh_containers + else: + st.session_state["_sh_cached_pods"] = [] + st.session_state["_sh_cached_containers"] = {} + if pod_result.success: + st.info(f"No running pods found in namespace '{sh_ns}'.") + else: + st.error("Failed to list pods") 
+ st.code(pod_result.stderr, language="text") + else: + if st.session_state.get("_sh_cached_ns") == sh_ns: + _sh_pods = st.session_state.get("_sh_cached_pods", []) + _sh_containers = st.session_state.get("_sh_cached_containers", {}) + + if _sh_pods: + sh_pod_col1, sh_pod_col2 = st.columns(2) + with sh_pod_col1: + sh_selected_pod = st.selectbox("Pod", options=_sh_pods, key="sh_pod") + with sh_pod_col2: + containers = _sh_containers.get(sh_selected_pod, []) + if containers: + sh_selected_ctr = st.selectbox("Container", options=containers, key="sh_ctr") + else: + sh_selected_ctr = st.text_input("Container (optional)", key="sh_ctr") + + st.info( + "**Note:** This runs non-interactive commands via `kubectl exec`. " + "For a fully interactive shell, use your terminal:\n\n" + f"`kubectl exec -it {sh_selected_pod} -n {sh_ns}" + f"{' -c ' + sh_selected_ctr if sh_selected_ctr else ''} -- /bin/sh`" + ) + + sh_cmd = st.text_input( + "Command to execute", + value="sh -c 'hostname && cat /etc/os-release && df -h'", + key="sh_cmd", + help="Enter the command to run inside the container", + ) + + sh_preset_col1, sh_preset_col2, sh_preset_col3, sh_preset_col4 = st.columns(4) + with sh_preset_col1: + if st.button("env", key="sh_p_env"): + sh_cmd = "env" + with sh_preset_col2: + if st.button("ps aux", key="sh_p_ps"): + sh_cmd = "ps aux" + with sh_preset_col3: + if st.button("df -h", key="sh_p_df"): + sh_cmd = "df -h" + with sh_preset_col4: + if st.button("cat /etc/resolv.conf", key="sh_p_dns"): + sh_cmd = "cat /etc/resolv.conf" + + if st.button("Execute", type="primary", key="sh_exec") and sh_cmd: + ctr_flag = f" -c {sh_selected_ctr}" if sh_selected_ctr else "" + exec_cmd = f"exec {sh_selected_pod} -n {sh_ns}{ctr_flag} -- {sh_cmd}" + with st.spinner(f"Executing in {sh_selected_pod}..."): + result = run_kubectl(profile, exec_cmd, timeout=30) + if result.success: + st.code(result.stdout or "(no output)", language="text") + else: + st.error("Exec failed") + st.code(result.stderr, 
language="text") + + # Pod logs quick access + st.markdown("---") + st.markdown("#### Quick Pod Logs") + log_lines = st.number_input("Tail lines", min_value=10, max_value=500, value=50, key="sh_log_lines") + if st.button("View Logs", key="sh_logs"): + ctr_flag = f" -c {sh_selected_ctr}" if sh_selected_ctr else "" + log_cmd = f"logs {sh_selected_pod} -n {sh_ns}{ctr_flag} --tail={log_lines}" + with st.spinner("Fetching logs..."): + result = run_kubectl(profile, log_cmd, timeout=30) + if result.success: + st.code(result.stdout or "(no logs)", language="text") + else: + st.error("Failed to fetch logs") + st.code(result.stderr, language="text") + elif not sh_load: + st.info("Click **Load Pods** to see running pods in the selected namespace.") + + # ── Resource Requests / Limits ─────────────────────────────────────── + with tab_res_limits: + st.markdown("### Container Resource Requests & Limits") + st.markdown( + "View CPU, memory, and ephemeral-storage requests and limits for all " + "containers in a namespace (from Deployments, StatefulSets, DaemonSets, and Jobs)." 
+ ) + + rl_col1, rl_col2 = st.columns(2) + with rl_col1: + if _rv_namespaces: + rl_ns = st.selectbox( + "Namespace", + options=_rv_namespaces, + index=_rv_namespaces.index("default") if "default" in _rv_namespaces else 0, + key="rl_ns", + ) + else: + rl_ns = st.text_input("Namespace", value="default", key="rl_ns") + with rl_col2: + rl_workload = st.selectbox( + "Workload Type", + options=["Deployments", "StatefulSets", "DaemonSets", "Jobs", "All"], + index=0, + key="rl_workload", + ) + + if st.button("Fetch Resource Requests/Limits", type="primary", key="rl_fetch"): + import json as _json + + workload_map = { + "Deployments": "deploy", + "StatefulSets": "statefulsets", + "DaemonSets": "daemonsets", + "Jobs": "jobs", + } + if rl_workload == "All": + types_to_fetch = list(workload_map.items()) + else: + types_to_fetch = [(rl_workload, workload_map[rl_workload])] + + all_rows: list[dict] = [] + for wl_label, wl_cmd in types_to_fetch: + with st.spinner(f"Fetching {wl_label}..."): + result = run_kubectl( + profile, + f"get {wl_cmd} -n {rl_ns} -o json", + timeout=30, + ) + if result.success and result.stdout.strip(): + try: + data = _json.loads(result.stdout) + for item in data.get("items", []): + workload_name = item.get("metadata", {}).get("name", "?") + spec = item.get("spec", {}) + # For Jobs the pod template is at spec.template, + # for Deployments/StatefulSets/DaemonSets it's spec.template + template = spec.get("template", {}) + pod_spec = template.get("spec", {}) + containers = pod_spec.get("containers", []) + for ctr in containers: + res = ctr.get("resources", {}) + req = res.get("requests", {}) + lim = res.get("limits", {}) + all_rows.append({ + "Type": wl_label, + "Workload": workload_name, + "Container": ctr.get("name", "?"), + "CPU Req": req.get("cpu", "-"), + "CPU Lim": lim.get("cpu", "-"), + "Mem Req": req.get("memory", "-"), + "Mem Lim": lim.get("memory", "-"), + "Eph Req": req.get("ephemeral-storage", "-"), + "Eph Lim": lim.get("ephemeral-storage", "-"), 
+ }) + except _json.JSONDecodeError: + st.warning(f"Could not parse JSON for {wl_label}") + elif not result.success: + st.warning(f"Failed to fetch {wl_label}: {result.stderr}") + + if all_rows: + st.markdown(f"**{len(all_rows)} container(s)** found in namespace `{rl_ns}`:") + st.dataframe( + all_rows, + use_container_width=True, + column_config={ + "Type": st.column_config.TextColumn(width="small"), + "Workload": st.column_config.TextColumn(width="medium"), + "Container": st.column_config.TextColumn(width="medium"), + "CPU Req": st.column_config.TextColumn(width="small"), + "CPU Lim": st.column_config.TextColumn(width="small"), + "Mem Req": st.column_config.TextColumn(width="small"), + "Mem Lim": st.column_config.TextColumn(width="small"), + "Eph Req": st.column_config.TextColumn(width="small"), + "Eph Lim": st.column_config.TextColumn(width="small"), + }, + ) + + # Summary stats + st.markdown("---") + st.markdown("#### Summary") + total_ctr = len(all_rows) + no_cpu_req = sum(1 for r in all_rows if r["CPU Req"] == "-") + no_mem_req = sum(1 for r in all_rows if r["Mem Req"] == "-") + no_cpu_lim = sum(1 for r in all_rows if r["CPU Lim"] == "-") + no_mem_lim = sum(1 for r in all_rows if r["Mem Lim"] == "-") + sc1, sc2, sc3, sc4 = st.columns(4) + with sc1: + st.metric("No CPU Request", f"{no_cpu_req}/{total_ctr}") + with sc2: + st.metric("No CPU Limit", f"{no_cpu_lim}/{total_ctr}") + with sc3: + st.metric("No Mem Request", f"{no_mem_req}/{total_ctr}") + with sc4: + st.metric("No Mem Limit", f"{no_mem_lim}/{total_ctr}") + + if no_cpu_req > 0 or no_mem_req > 0: + st.warning( + f"{no_cpu_req + no_mem_req} container(s) are missing resource requests. " + "This can affect scheduling and QoS class assignment." + ) + if no_cpu_lim > 0 or no_mem_lim > 0: + st.info( + f"{no_cpu_lim + no_mem_lim} container(s) are missing resource limits. " + "Consider setting limits to prevent resource contention." 
+ ) + + # Download as TSV + tsv_lines = ["Type\tWorkload\tContainer\tCPU Req\tCPU Lim\tMem Req\tMem Lim\tEph Req\tEph Lim"] + for r in all_rows: + tsv_lines.append( + f"{r['Type']}\t{r['Workload']}\t{r['Container']}\t" + f"{r['CPU Req']}\t{r['CPU Lim']}\t{r['Mem Req']}\t{r['Mem Lim']}\t" + f"{r['Eph Req']}\t{r['Eph Lim']}" + ) + st.download_button( + "Download as TSV", + data="\n".join(tsv_lines), + file_name=f"resource_limits_{rl_ns}.tsv", + mime="text/tab-separated-values", + key="rl_download", + ) + else: + st.info(f"No containers found in namespace `{rl_ns}` for the selected workload type(s).") + + # ── Node Containers (crictl) ──────────────────────────────────────── + with tab_crictl: + st.markdown("### Node Containers (crictl)") + st.markdown("View containers running on each node using `crictl ps -a`.") + + if profile.cluster_source == "imported": + # Imported clusters — no SSH, but we can still get node list and show + # container info via kubectl debug or just list pods per node + st.info( + "**crictl** requires SSH access to each node and is only available for " + "provisioned clusters. For imported clusters, pod and container " + "information per node is shown via `kubectl` below." + ) + st.markdown( + "This view uses `kubectl get pods --field-selector spec.nodeName=` " + "to list pods/containers on each node. For full container-level details " + "(container IDs, image digests, runtime state), SSH into the node and run " + "`sudo crictl ps -a` directly." 
+ ) + if st.button("Show containers per node (kubectl)", type="primary", key="crictl_kubectl"): + with st.spinner("Fetching node list..."): + node_result = run_kubectl( + profile, + "get nodes -o jsonpath='{range .items[*]}{.metadata.name}{\"\\n\"}{end}'", + timeout=15, + ) + if node_result.success and node_result.stdout.strip(): + node_names = [n.strip() for n in node_result.stdout.strip().split("\n") if n.strip()] + + # Pod count distribution across nodes + node_pod_counts: dict[str, int] = {} + for node_name in node_names: + with st.spinner(f"Fetching pods on {node_name}..."): + count_result = run_kubectl( + profile, + f"get pods -A --field-selector spec.nodeName={node_name} " + "--no-headers", + timeout=15, + ) + if count_result.success: + lines = [l for l in (count_result.stdout or "").strip().split("\n") if l.strip()] + node_pod_counts[node_name] = len(lines) + else: + node_pod_counts[node_name] = 0 + + # Show pod distribution summary + st.markdown("#### Pod Distribution Across Nodes") + dist_cols = st.columns(min(len(node_names), 6)) + for idx, node_name in enumerate(node_names): + with dist_cols[idx % min(len(node_names), 6)]: + st.metric(node_name, f"{node_pod_counts.get(node_name, 0)} pods") + total_pods = sum(node_pod_counts.values()) + if total_pods > 0 and len(node_names) > 1: + avg_pods = total_pods / len(node_names) + max_pods = max(node_pod_counts.values()) + min_pods = min(node_pod_counts.values()) + spread = max_pods - min_pods + st.markdown( + f"**Total:** {total_pods} pods across {len(node_names)} nodes | " + f"**Avg:** {avg_pods:.1f} | **Min:** {min_pods} | **Max:** {max_pods} | " + f"**Spread:** {spread}" + ) + if spread > avg_pods * 0.5 and avg_pods > 0: + st.warning( + f"Pod distribution is uneven (spread of {spread}). " + "Consider checking node affinity rules or pod topology spread constraints." 
+ ) + else: + st.success("Pods are reasonably well-distributed across nodes.") + + st.markdown("---") + + # Detailed per-node pod listing + for node_name in node_names: + pod_count = node_pod_counts.get(node_name, 0) + with st.expander(f"Node: **{node_name}** ({pod_count} pods)", expanded=True): + with st.spinner(f"Fetching containers on {node_name}..."): + pod_result = run_kubectl( + profile, + f"get pods -A --field-selector spec.nodeName={node_name} " + "-o custom-columns=" + "'NAMESPACE:.metadata.namespace," + "POD:.metadata.name," + "CONTAINERS:.spec.containers[*].name," + "STATUS:.status.phase," + "RESTARTS:.status.containerStatuses[0].restartCount," + "NODE:.spec.nodeName'", + timeout=15, + ) + if pod_result.success: + st.code(pod_result.stdout or "(no pods on this node)", language="text") + else: + st.error(f"Failed to get pods on {node_name}") + st.code(pod_result.stderr, language="text") + else: + st.error("Failed to list nodes") + if node_result.stderr: + st.code(node_result.stderr, language="text") + else: + # Provisioned clusters — SSH into each node and run crictl + all_nodes = profile.nodes + if not all_nodes: + st.warning("No nodes defined in this profile.") + else: + st.markdown( + "> **Note:** `crictl` typically requires **root/sudo** access. " + "If your SSH user is not root, the command will be prefixed with `sudo`." + ) + use_sudo = st.checkbox( + "Run with sudo (required if SSH user is not root)", + value=True, + key="crictl_sudo", + help="Prefix the command with 'sudo' for non-root SSH users.", + ) + crictl_cmd = st.text_input( + "CRI command", + value="crictl ps -a", + key="crictl_cmd", + help="Command to run on each node (e.g. 
crictl ps -a, crictl images, crictl stats)", + ) + + crictl_presets = st.columns(5) + with crictl_presets[0]: + if st.button("crictl ps -a", key="cp_ps"): + crictl_cmd = "crictl ps -a" + with crictl_presets[1]: + if st.button("crictl images", key="cp_img"): + crictl_cmd = "crictl images" + with crictl_presets[2]: + if st.button("crictl stats", key="cp_stats"): + crictl_cmd = "crictl stats" + with crictl_presets[3]: + if st.button("crictl pods", key="cp_pods"): + crictl_cmd = "crictl pods" + with crictl_presets[4]: + if st.button("crictl info", key="cp_info"): + crictl_cmd = "crictl info" + + # Node selection + node_labels = [ + f"{n.get('hostname', n.get('ip_address', '?'))} ({n.get('ip_address', '?')}) [{n.get('role', '?')}]" + for n in all_nodes + ] + cr_select_all = st.checkbox("Run on all nodes", value=True, key="cr_all") + + if not cr_select_all: + selected_nodes_idx = st.multiselect( + "Select nodes", + options=list(range(len(all_nodes))), + format_func=lambda i: node_labels[i], + default=list(range(len(all_nodes))), + key="cr_nodes", + ) + selected_nodes = [all_nodes[i] for i in selected_nodes_idx] + else: + selected_nodes = all_nodes + + if st.button("Run on selected nodes", type="primary", key="crictl_run"): + if not selected_nodes: + st.warning("No nodes selected. 
Please select at least one node.") + else: + actual_cmd = f"sudo {crictl_cmd}" if use_sudo and not crictl_cmd.strip().startswith("sudo") else crictl_cmd + all_success = True + for node in selected_nodes: + node_label = f"{node.get('hostname', node.get('ip_address', '?'))} ({node.get('ip_address', '')})" + with st.expander(f"Node: **{node_label}** [{node.get('role', '')}]", expanded=True): + with st.spinner(f"Running `{actual_cmd}` on {node_label}..."): + result = run_ssh_command( + ip_address=node["ip_address"], + command=actual_cmd, + ssh_user=node.get("ssh_user", "root"), + ssh_port=node.get("ssh_port", 22), + ssh_key_path=node.get("ssh_key_path", "~/.ssh/id_rsa"), + timeout=30, + ) + if result.success: + st.code(result.stdout or "(no output)", language="text") + else: + all_success = False + st.error(f"Command failed on {node_label}") + st.code(result.stderr, language="text") + if "permission denied" in (result.stderr or "").lower(): + st.info("Tip: Enable the 'Run with sudo' checkbox above if your SSH user needs elevated privileges.") + if all_success: + st.success(f"Command completed successfully on {len(selected_nodes)} node(s).") + else: + st.warning("Command failed on some nodes. 
Check the details above.") + + # ── Node Health ────────────────────────────────────────────────────── + with tab_node_health: + st.markdown("### Node Health Overview") + st.markdown("View node status, resource usage, and conditions.") + + if st.button("Refresh Node Health", type="primary", key="node_health"): + col_status, col_top = st.columns(2) + + with col_status: + st.markdown("#### Node Status") + with st.spinner("Fetching nodes..."): + result = run_kubectl(profile, "get nodes -o wide", timeout=15) + if result.success: + st.code(result.stdout, language="text") + else: + st.error("Failed to get nodes") + st.code(result.stderr, language="text") + + with col_top: + st.markdown("#### Resource Usage") + with st.spinner("Fetching node metrics..."): + result = run_kubectl(profile, "top nodes", timeout=15) + if result.success: + st.code(result.stdout, language="text") + else: + st.warning("kubectl top requires metrics-server to be installed.") + st.code(result.stderr, language="text") + + st.markdown("---") + st.markdown("#### Node Conditions") + with st.spinner("Checking node conditions..."): + result = run_kubectl( + profile, + 'get nodes -o custom-columns=' + '"NAME:.metadata.name,' + 'READY:.status.conditions[?(@.type==\\"Ready\\")].status,' + 'DISK:.status.conditions[?(@.type==\\"DiskPressure\\")].status,' + 'MEMORY:.status.conditions[?(@.type==\\"MemoryPressure\\")].status,' + 'PID:.status.conditions[?(@.type==\\"PIDPressure\\")].status"', + timeout=15, + ) + if result.success: + st.code(result.stdout, language="text") + else: + st.code(result.stderr, language="text") + + st.markdown("#### Pod Distribution per Node") + with st.spinner("Fetching pod distribution..."): + result = run_kubectl( + profile, + 'get pods -A -o custom-columns=' + '"NODE:.spec.nodeName,NAMESPACE:.metadata.namespace,' + 'POD:.metadata.name,STATUS:.status.phase" ' + '--sort-by=.spec.nodeName', + timeout=15, + ) + if result.success: + st.code(result.stdout, language="text") + else: + 
st.code(result.stderr, language="text") + + # ── RBAC Viewer ────────────────────────────────────────────────────── + with tab_rbac: + st.markdown("### RBAC Viewer") + st.markdown("Browse Roles, ClusterRoles, Bindings, and ServiceAccounts.") + + rbac_type = st.selectbox( + "RBAC Resource", + [ + "ClusterRoles", + "ClusterRoleBindings", + "Roles (namespaced)", + "RoleBindings (namespaced)", + "ServiceAccounts", + ], + key="rbac_type", + ) + + rbac_ns = "" + if "(namespaced)" in rbac_type or rbac_type == "ServiceAccounts": + if _rv_namespaces: + rbac_ns = st.selectbox( + "Namespace (blank = all)", + options=[""] + _rv_namespaces, + index=0, + key="rbac_ns", + format_func=lambda x: "All Namespaces" if x == "" else x, + ) + else: + rbac_ns = st.text_input( + "Namespace", + value="default", + key="rbac_ns", + help="Leave blank for all namespaces", + ) + + if st.button("Fetch RBAC Resources", type="primary", key="fetch_rbac"): + cmd_map = { + "ClusterRoles": "get clusterroles", + "ClusterRoleBindings": "get clusterrolebindings", + "Roles (namespaced)": "get roles", + "RoleBindings (namespaced)": "get rolebindings", + "ServiceAccounts": "get serviceaccounts", + } + rbac_cmd = cmd_map[rbac_type] + if rbac_ns: + rbac_cmd += f" -n {rbac_ns}" + elif "(namespaced)" in rbac_type or rbac_type == "ServiceAccounts": + rbac_cmd += " -A" + + with st.spinner(f"Fetching {rbac_type}..."): + result = run_kubectl(profile, rbac_cmd, timeout=15) + if result.success: + st.code(result.stdout or "(none found)", language="text") + else: + st.error("Failed to fetch RBAC resources") + st.code(result.stderr, language="text") + + # Describe a specific RBAC resource + st.markdown("---") + st.markdown("#### Inspect RBAC Resource") + rbac_name = st.text_input( + "Resource name to describe", + placeholder="e.g., cluster-admin", + key="rbac_desc_name", + ) + if st.button("Describe RBAC", key="desc_rbac") and rbac_name: + type_map = { + "ClusterRoles": "clusterrole", + "ClusterRoleBindings": 
"clusterrolebinding", + "Roles (namespaced)": "role", + "RoleBindings (namespaced)": "rolebinding", + "ServiceAccounts": "serviceaccount", + } + desc_cmd = f"describe {type_map[rbac_type]} {rbac_name}" + if rbac_ns: + desc_cmd += f" -n {rbac_ns}" + + with st.spinner(f"Describing {rbac_name}..."): + result = run_kubectl(profile, desc_cmd, timeout=15) + if result.success: + st.code(result.stdout, language="yaml") + else: + st.error("Describe failed") + st.code(result.stderr, language="text") + + # ── Events Timeline ────────────────────────────────────────────────── + with tab_events: + st.markdown("### Cluster Events Timeline") + st.markdown("View recent Kubernetes events with graphical analysis.") + + ev_col1, ev_col2, ev_col3 = st.columns(3) + with ev_col1: + ev_ns_all = st.checkbox("All namespaces", value=True, key="ev_ns_all") + ev_ns = "" + if not ev_ns_all: + ev_ns = st.text_input("Namespace", value="default", key="ev_ns") + with ev_col2: + ev_type = st.selectbox( + "Event Type", + ["All", "Normal", "Warning"], + key="ev_type", + ) + with ev_col3: + ev_sort = st.selectbox( + "Sort by", + ["Last Timestamp", "First Timestamp", "Count"], + key="ev_sort", + ) + + if st.button("Fetch Events", type="primary", key="fetch_events"): + # Fetch events in JSON for graphical display + ev_json_cmd = "get events" + if ev_ns_all: + ev_json_cmd += " -A" + elif ev_ns: + ev_json_cmd += f" -n {ev_ns}" + if ev_type != "All": + ev_json_cmd += f" --field-selector type={ev_type}" + ev_json_cmd += " -o json" + + with st.spinner("Fetching events..."): + result = run_kubectl(profile, ev_json_cmd, timeout=15) + + if result.success and result.stdout.strip(): + try: + events_data = json.loads(result.stdout) + items = events_data.get("items", []) + + if not items: + st.info("No events found.") + else: + # Parse events into structured data + ev_records = [] + for item in items: + ev_records.append({ + "Namespace": item.get("metadata", {}).get("namespace", ""), + "Type": item.get("type", ""), 
+ "Reason": item.get("reason", ""), + "Object": item.get("involvedObject", {}).get("name", ""), + "Kind": item.get("involvedObject", {}).get("kind", ""), + "Message": (item.get("message", "") or "")[:120], + "Count": item.get("count", 1), + "Last Seen": item.get("lastTimestamp", item.get("eventTime", "")), + }) + + import pandas as pd + + df = pd.DataFrame(ev_records) + + # ── Graphical Summary ──────────────────────── + st.markdown("#### Event Summary Charts") + + chart_col1, chart_col2 = st.columns(2) + + with chart_col1: + st.markdown("**Events by Type**") + type_counts = df["Type"].value_counts().reset_index() + type_counts.columns = ["Type", "Count"] + st.bar_chart(type_counts.set_index("Type")) + + with chart_col2: + st.markdown("**Events by Reason (Top 10)**") + reason_counts = df["Reason"].value_counts().head(10).reset_index() + reason_counts.columns = ["Reason", "Count"] + st.bar_chart(reason_counts.set_index("Reason")) + + chart_col3, chart_col4 = st.columns(2) + + with chart_col3: + st.markdown("**Events by Namespace (Top 10)**") + ns_counts = df["Namespace"].value_counts().head(10).reset_index() + ns_counts.columns = ["Namespace", "Count"] + st.bar_chart(ns_counts.set_index("Namespace")) + + with chart_col4: + st.markdown("**Events by Object Kind**") + kind_counts = df["Kind"].value_counts().reset_index() + kind_counts.columns = ["Kind", "Count"] + st.bar_chart(kind_counts.set_index("Kind")) + + # ── Timeline Chart ──────────────────────────── + st.markdown("---") + st.markdown("#### Event Timeline") + if df["Last Seen"].notna().any() and df["Last Seen"].str.strip().any(): + try: + df["Timestamp"] = pd.to_datetime( + df["Last Seen"], errors="coerce", utc=True, + ) + ts_df = df.dropna(subset=["Timestamp"]) + if not ts_df.empty: + ts_df = ts_df.set_index("Timestamp") + # Events over time grouped by type + timeline = ts_df.groupby( + [pd.Grouper(freq="1min"), "Type"] + ).size().unstack(fill_value=0) + if not timeline.empty: + st.line_chart(timeline) + 
else: + st.info("Not enough timestamp data for timeline chart.") + else: + st.info("Could not parse event timestamps for timeline.") + except Exception: + st.info("Could not render timeline chart from event data.") + else: + st.info("No timestamp data available for timeline chart.") + + # ── High-Count Events ───────────────────────── + st.markdown("---") + st.markdown("#### High-Frequency Events") + high_count = df[df["Count"] > 1].sort_values("Count", ascending=False).head(20) + if not high_count.empty: + st.dataframe( + high_count[["Namespace", "Type", "Reason", "Object", "Count", "Message"]], + use_container_width=True, + hide_index=True, + ) + else: + st.info("No repeated events found.") + + # ── Full Events Table ───────────────────────── + st.markdown("---") + st.markdown("#### All Events") + st.dataframe(df, use_container_width=True, hide_index=True) + + except (json.JSONDecodeError, KeyError): + # Fallback to text display + st.code(result.stdout, language="text") + elif result.success: + st.info("No events found.") + else: + st.error("Failed to fetch events") + st.code(result.stderr, language="text") + + # Warning events summary + st.markdown("---") + st.markdown("#### Warning Events Summary") + if st.button("Show Warning Events", key="warn_events"): + warn_cmd = ( + "get events -A --field-selector type=Warning " + "-o custom-columns=" + "'NAMESPACE:.metadata.namespace," + "LAST_SEEN:.lastTimestamp," + "COUNT:.count," + "REASON:.reason," + "OBJECT:.involvedObject.name," + "MESSAGE:.message' " + "--sort-by=.lastTimestamp" + ) + with st.spinner("Fetching warning events..."): + result = run_kubectl(profile, warn_cmd, timeout=15) + if result.success: + st.code(result.stdout or "(no warning events)", language="text") + else: + st.code(result.stderr, language="text") + + # ── Pod Restart Tracker ─────────────────────────────────────────────── + with tab_restart_tracker: + st.markdown("### Pod Restart Tracker") + st.markdown("Identify pods with frequent restarts, 
OOMKilled containers, and CrashLoopBackOff issues.") + + rcol1, rcol2 = st.columns([2, 1]) + with rcol1: + if _rv_namespaces: + restart_ns = st.selectbox("Namespace", ["All Namespaces"] + _rv_namespaces, key="restart_ns") + else: + restart_ns = st.text_input("Namespace (blank = all)", value="", key="restart_ns_text") + if not restart_ns: + restart_ns = "All Namespaces" + with rcol2: + min_restarts = st.number_input("Min restarts to show", min_value=0, value=1, key="min_restarts") + + if st.button("Load Pod Restarts", type="primary", key="load_restarts"): + ns_flag = "-A" if restart_ns == "All Namespaces" else f"-n {restart_ns}" + cmd = f"get pods {ns_flag} -o json" + with st.spinner("Fetching pod data..."): + result = run_kubectl(profile, cmd, timeout=30) + if result.success and result.stdout.strip(): + try: + import pandas as pd + pods_json = json.loads(result.stdout) + restart_data = [] + for pod in pods_json.get("items", []): + pod_name = pod.get("metadata", {}).get("name", "?") + pod_ns = pod.get("metadata", {}).get("namespace", "?") + for cs in pod.get("status", {}).get("containerStatuses", []): + restarts = cs.get("restartCount", 0) + if restarts < min_restarts: + continue + container_name = cs.get("name", "?") + ready = cs.get("ready", False) + # Detect OOMKilled + last_state = cs.get("lastState", {}) + terminated = last_state.get("terminated", {}) + reason = terminated.get("reason", "") + exit_code = terminated.get("exitCode", "") + # Current state + state = cs.get("state", {}) + if "running" in state: + current_state = "Running" + elif "waiting" in state: + current_state = state["waiting"].get("reason", "Waiting") + elif "terminated" in state: + current_state = state["terminated"].get("reason", "Terminated") + else: + current_state = "Unknown" + restart_data.append({ + "Namespace": pod_ns, + "Pod": pod_name, + "Container": container_name, + "Restarts": restarts, + "Ready": ready, + "State": current_state, + "Last Termination": reason or "N/A", + "Exit 
Code": str(exit_code) if exit_code != "" else "N/A", + }) + if restart_data: + df = pd.DataFrame(restart_data).sort_values("Restarts", ascending=False) + # Summary metrics + total_restarts = df["Restarts"].sum() + oom_count = len(df[df["Last Termination"] == "OOMKilled"]) + crash_count = len(df[df["State"] == "CrashLoopBackOff"]) + mcol1, mcol2, mcol3, mcol4 = st.columns(4) + mcol1.metric("Containers with Restarts", len(df)) + mcol2.metric("Total Restarts", int(total_restarts)) + mcol3.metric("OOMKilled", oom_count) + mcol4.metric("CrashLoopBackOff", crash_count) + st.dataframe(df, use_container_width=True, hide_index=True) + # Highlight problematic pods + if oom_count > 0: + st.warning( + f"{oom_count} container(s) were terminated due to **OOMKilled** — " + "consider increasing memory limits for those workloads." + ) + if crash_count > 0: + st.error( + f"{crash_count} container(s) are in **CrashLoopBackOff** — " + "check logs with `kubectl logs -c --previous`." + ) + else: + st.success(f"No containers found with {min_restarts}+ restarts. 
Cluster looks healthy!") + except (json.JSONDecodeError, KeyError) as e: + st.error(f"Failed to parse pod data: {e}") + st.code(result.stdout[:2000], language="text") + elif result.success: + st.info("No pods found.") + else: + st.error("Failed to fetch pods") + st.code(result.stderr, language="text") + + # ── PVC / Storage Dashboard ─────────────────────────────────────────── + with tab_pvc: + st.markdown("### PVC / Storage Dashboard") + st.markdown("View PersistentVolumeClaims, PersistentVolumes, and StorageClasses.") + + pvc_sub = st.radio( + "View", + ["PVCs", "PersistentVolumes", "StorageClasses"], + horizontal=True, + key="pvc_view", + ) + + if pvc_sub == "PVCs": + pcol1, pcol2 = st.columns([2, 1]) + with pcol1: + if _rv_namespaces: + pvc_ns = st.selectbox("Namespace", ["All Namespaces"] + _rv_namespaces, key="pvc_ns") + else: + pvc_ns = st.text_input("Namespace (blank = all)", value="", key="pvc_ns_text") + if not pvc_ns: + pvc_ns = "All Namespaces" + + if st.button("Load PVCs", type="primary", key="load_pvcs"): + ns_flag = "-A" if pvc_ns == "All Namespaces" else f"-n {pvc_ns}" + cmd = f"get pvc {ns_flag} -o json" + with st.spinner("Fetching PVCs..."): + result = run_kubectl(profile, cmd, timeout=15) + if result.success and result.stdout.strip(): + try: + import pandas as pd + pvc_json = json.loads(result.stdout) + pvcs = pvc_json.get("items", []) + if not pvcs: + st.info("No PVCs found.") + else: + pvc_data = [] + for pvc in pvcs: + meta = pvc.get("metadata", {}) + spec = pvc.get("spec", {}) + status = pvc.get("status", {}) + capacity = status.get("capacity", {}).get("storage", "N/A") + requested = spec.get("resources", {}).get("requests", {}).get("storage", "N/A") + pvc_data.append({ + "Namespace": meta.get("namespace", "?"), + "Name": meta.get("name", "?"), + "Status": status.get("phase", "?"), + "Volume": spec.get("volumeName", "N/A"), + "Capacity": capacity, + "Requested": requested, + "Access Modes": ", ".join(spec.get("accessModes", [])), + "Storage 
Class": spec.get("storageClassName", "N/A"), + }) + df = pd.DataFrame(pvc_data) + # Summary + bound = len(df[df["Status"] == "Bound"]) + pending = len(df[df["Status"] == "Pending"]) + lost = len(df[df["Status"] == "Lost"]) + scol1, scol2, scol3, scol4 = st.columns(4) + scol1.metric("Total PVCs", len(df)) + scol2.metric("Bound", bound) + scol3.metric("Pending", pending) + scol4.metric("Lost", lost) + if pending > 0: + st.warning(f"{pending} PVC(s) are **Pending** — check StorageClass availability and provisioner status.") + if lost > 0: + st.error(f"{lost} PVC(s) are **Lost** — the bound PV has been deleted. Data may be lost.") + st.dataframe(df, use_container_width=True, hide_index=True) + except (json.JSONDecodeError, KeyError) as e: + st.error(f"Failed to parse PVC data: {e}") + elif result.success: + st.info("No PVCs found.") + else: + st.error("Failed to fetch PVCs") + st.code(result.stderr, language="text") + + elif pvc_sub == "PersistentVolumes": + if st.button("Load PVs", type="primary", key="load_pvs"): + cmd = "get pv -o json" + with st.spinner("Fetching PersistentVolumes..."): + result = run_kubectl(profile, cmd, timeout=15) + if result.success and result.stdout.strip(): + try: + import pandas as pd + pv_json = json.loads(result.stdout) + pvs = pv_json.get("items", []) + if not pvs: + st.info("No PersistentVolumes found.") + else: + pv_data = [] + for pv in pvs: + meta = pv.get("metadata", {}) + spec = pv.get("spec", {}) + status = pv.get("status", {}) + claim_ref = spec.get("claimRef", {}) + claim = f"{claim_ref.get('namespace', '')}/{claim_ref.get('name', '')}" if claim_ref else "Unbound" + pv_data.append({ + "Name": meta.get("name", "?"), + "Capacity": spec.get("capacity", {}).get("storage", "N/A"), + "Access Modes": ", ".join(spec.get("accessModes", [])), + "Reclaim Policy": spec.get("persistentVolumeReclaimPolicy", "N/A"), + "Status": status.get("phase", "?"), + "Claim": claim, + "Storage Class": spec.get("storageClassName", "N/A"), + "Volume Mode": 
spec.get("volumeMode", "N/A"), + }) + df = pd.DataFrame(pv_data) + avail = len(df[df["Status"] == "Available"]) + bound = len(df[df["Status"] == "Bound"]) + released = len(df[df["Status"] == "Released"]) + scol1, scol2, scol3, scol4 = st.columns(4) + scol1.metric("Total PVs", len(df)) + scol2.metric("Bound", bound) + scol3.metric("Available", avail) + scol4.metric("Released", released) + st.dataframe(df, use_container_width=True, hide_index=True) + except (json.JSONDecodeError, KeyError) as e: + st.error(f"Failed to parse PV data: {e}") + elif result.success: + st.info("No PersistentVolumes found.") + else: + st.error("Failed to fetch PVs") + st.code(result.stderr, language="text") + + elif pvc_sub == "StorageClasses": + if st.button("Load Storage Classes", type="primary", key="load_sc"): + cmd = "get storageclasses -o json" + with st.spinner("Fetching StorageClasses..."): + result = run_kubectl(profile, cmd, timeout=15) + if result.success and result.stdout.strip(): + try: + import pandas as pd + sc_json = json.loads(result.stdout) + scs = sc_json.get("items", []) + if not scs: + st.info("No StorageClasses found.") + else: + sc_data = [] + for sc in scs: + meta = sc.get("metadata", {}) + annotations = meta.get("annotations", {}) + is_default = annotations.get("storageclass.kubernetes.io/is-default-class", "false") == "true" + sc_data.append({ + "Name": meta.get("name", "?"), + "Provisioner": sc.get("provisioner", "N/A"), + "Reclaim Policy": sc.get("reclaimPolicy", "N/A"), + "Volume Binding": sc.get("volumeBindingMode", "N/A"), + "Allow Expansion": sc.get("allowVolumeExpansion", False), + "Default": is_default, + }) + st.dataframe(pd.DataFrame(sc_data), use_container_width=True, hide_index=True) + except (json.JSONDecodeError, KeyError) as e: + st.error(f"Failed to parse StorageClass data: {e}") + elif result.success: + st.info("No StorageClasses found.") + else: + st.error("Failed to fetch StorageClasses") + st.code(result.stderr, language="text") + + +# 
# ══════════════════════════════════════════════════════════════════════════
# PAGE: Upgrade Planner
# ══════════════════════════════════════════════════════════════════════════

# Kubernetes minor releases shown in the Upgrade Planner, newest first.
# "release" / "end_of_life" are YYYY-MM strings following the upstream release
# history (https://kubernetes.io/releases/); "highlights" is free text for the UI.
_K8S_VERSIONS_DETAIL = [
    {
        "version": "1.35",
        "release": "2025-12",
        "end_of_life": "2027-02",
        "highlights": "Sidecar containers GA, improved pod lifecycle management, dynamic resource allocation enhancements.",
    },
    {
        "version": "1.34",
        "release": "2025-08",
        "end_of_life": "2026-10",
        "highlights": "Structured authorization config GA, recursive read-only mounts, traffic distribution improvements.",
    },
    {
        "version": "1.33",
        "release": "2025-04",
        "end_of_life": "2026-06",
        "highlights": "In-place pod resize beta, multi-network pods alpha, nftables kube-proxy backend.",
    },
    {
        "version": "1.32",
        "release": "2024-12",
        "end_of_life": "2026-02",
        "highlights": "Dynamic resource allocation (DRA) beta, auto-remove PV claims, job success policy GA.",
    },
    {
        "version": "1.31",
        "release": "2024-08",
        "end_of_life": "2025-10",
        "highlights": "AppArmor GA, nftables proxy GA, improved ingress connectivity reliability, cgroup v2 enhancements.",
    },
    {
        "version": "1.30",
        "release": "2024-04",
        "end_of_life": "2025-06",
        "highlights": "Contextual logging GA, CEL admission improvements, pod scheduling readiness.",
    },
    {
        "version": "1.29",
        "release": "2023-12",
        "end_of_life": "2025-02",
        "highlights": "KMS v2 GA, ReadWriteOncePod GA, networking improvements, node memory manager.",
    },
    {
        "version": "1.28",
        "release": "2023-08",
        "end_of_life": "2024-10",
        "highlights": "Sidecar containers alpha, recovery from non-graceful node shutdown, mixed version proxy.",
    },
    {
        "version": "1.27",
        "release": "2023-04",
        "end_of_life": "2024-06",
        "highlights": "In-place pod resize alpha, VPA improvements, SeccompDefault GA.",
    },
]


def _minor_version_key(version) -> tuple[int, int]:
    """Return ``(major, minor)`` as ints for a version such as ``"1.31"`` or ``"v1.31.2"``.

    Versions must be compared numerically: as plain strings ``"1.9"`` sorts
    above ``"1.35"``. The patch component and any suffix are ignored. An
    empty or unparsable value yields ``(0, 0)``, i.e. older than every known
    release.
    """
    import re  # local import: the file's top-level import block is not touched

    match = re.match(r"\s*v?(\d+)\.(\d+)", str(version or ""), re.IGNORECASE)
    if match is None:
        return (0, 0)
    return int(match.group(1)), int(match.group(2))


def page_upgrade_planner():
    """Render the Upgrade Planner page for the active cluster profile.

    Builds four tabs: a version matrix with an upgrade-path preview,
    pre-flight checks executed through ``run_kubectl``, a step-by-step
    upgrade plan for a selected target version, and per-version changelog
    and compatibility notes. Returns without rendering the tabs when
    ``_get_active_profile()`` yields no profile.
    """
    st.markdown("## Upgrade Planner")
    st.markdown("Plan and prepare Kubernetes version upgrades for your cluster.")

    profile = _get_active_profile()
    if not profile:
        return

    current_ver = profile.kubernetes_version
    current_key = _minor_version_key(current_ver)

    # Releases newer than the current one, oldest first (the order in which
    # they have to be applied). Sorted explicitly so the result does not
    # depend on the literal order of _K8S_VERSIONS_DETAIL.
    newer_ascending = sorted(
        (v["version"] for v in _K8S_VERSIONS_DETAIL if _minor_version_key(v["version"]) > current_key),
        key=_minor_version_key,
    )
    # Select boxes list the newest release first.
    upgrade_targets = list(reversed(newer_ascending))

    tab_overview, tab_preflight, tab_plan, tab_changelog = st.tabs([
        "Version Overview",
        "Pre-flight Checks",
        "Upgrade Steps",
        "Changelog & Compatibility",
    ])

    # ── Version Overview ─────────────────────────────────────────────────
    with tab_overview:
        st.markdown("### Kubernetes Version Matrix")
        st.info(f"Your current cluster version: **{current_ver}**")

        rows = []
        for v in _K8S_VERSIONS_DETAIL:
            version_key = _minor_version_key(v["version"])
            if version_key == current_key:
                status_label = "CURRENT"
            elif version_key > current_key:
                status_label = "UPGRADE AVAILABLE"
            else:
                status_label = "OLDER"
            rows.append({
                "Version": v["version"],
                "Status": status_label,
                "Release Date": v["release"],
                "End of Life": v["end_of_life"],
                "Highlights": v["highlights"],
            })

        st.dataframe(rows, use_container_width=True, hide_index=True)

        # Upgrade target selection
        st.markdown("---")
        if upgrade_targets:
            target_version = st.selectbox(
                "Select target upgrade version",
                upgrade_targets,
                key="upgrade_target",
            )
            target_key = _minor_version_key(target_version)
            # Every minor release that has to be passed through, oldest first.
            path_steps = [v for v in newer_ascending if _minor_version_key(v) <= target_key]
            if len(path_steps) > 1:
                st.warning(
                    f"You are skipping {len(path_steps) - 1} minor version(s). "
                    "Kubernetes supports upgrading one minor version at a time. "
                    "Plan incremental upgrades for production clusters."
                )
            st.markdown("#### Upgrade Path")
            path_versions = [current_ver] + path_steps
            st.markdown(" → ".join(f"**{v}**" for v in path_versions))
        else:
            st.success("You are running the latest version!")

    # ── Pre-flight Checks ────────────────────────────────────────────────
    with tab_preflight:
        st.markdown("### Pre-Upgrade Checks")
        st.markdown("Run these checks before starting the upgrade process.")

        # (label, kubectl arguments) pairs, executed in order.
        checks = [
            ("Cluster Health", "get nodes -o wide"),
            ("All Pods Running", "get pods -A --field-selector 'status.phase!=Running,status.phase!=Succeeded'"),
            ("etcd Health", "get --raw=/healthz"),
            ("API Server Version", "version"),
            ("PodDisruptionBudgets", "get pdb -A"),
            ("Deprecated APIs", "api-resources --api-group=extensions"),
            ("Persistent Volumes", "get pv"),
            ("Component Statuses", "get cs 2>/dev/null || echo 'Deprecated in newer versions'"),
        ]

        if st.button("Run All Pre-flight Checks", type="primary", key="preflight"):
            all_ok = True
            for name, cmd in checks:
                with st.status(f"Checking: {name}...", expanded=False) as check_status:
                    result = run_kubectl(profile, cmd, timeout=15)
                    if result.success:
                        st.code(result.stdout or "(no output)", language="text")
                        check_status.update(label=f"{name} — OK", state="complete")
                    else:
                        st.code(result.stderr, language="text")
                        check_status.update(label=f"{name} — ISSUE", state="error")
                        all_ok = False

            if all_ok:
                st.success("All pre-flight checks passed! The cluster looks ready for upgrade.")
            else:
                st.warning(
                    "Some checks reported issues. Review the output above before proceeding."
                )

        st.markdown("---")
        st.markdown("#### Backup Checklist")
        # `kubeadm config view` no longer exists in current kubeadm releases;
        # the cluster configuration is read from the kubeadm-config ConfigMap.
        st.markdown(
            "Before upgrading, ensure you have:\n\n"
            "- [ ] **etcd snapshot backup**: `ETCDCTL_API=3 etcdctl snapshot save /backup/etcd-snapshot.db`\n"
            "- [ ] **Cluster state export**: `kubectl get all -A -o yaml > cluster-backup.yaml`\n"
            "- [ ] **PV/PVC data backed up** (if applicable)\n"
            "- [ ] **CNI configuration backed up**: `/etc/cni/net.d/`\n"
            "- [ ] **kubeadm config backed up**: "
            "`kubectl -n kube-system get configmap kubeadm-config -o yaml > kubeadm-config.yaml`\n"
            "- [ ] **VM/node snapshots taken** (if running on VMs)\n"
        )

    # ── Upgrade Steps ────────────────────────────────────────────────────
    with tab_plan:
        st.markdown("### Step-by-Step Upgrade Plan")

        target = st.selectbox(
            "Target Version",
            upgrade_targets or [current_ver],
            key="upgrade_plan_target",
        )

        st.markdown(f"#### Upgrading from {current_ver} → {target}")

        # Node names are placeholders the operator fills in; version globs are
        # quoted so the shell hands them to apt unexpanded.
        st.markdown(
            f"""
**Phase 1: Prepare (Control Plane)**
```bash
# 1. Update package repositories
#    (first point /etc/apt/sources.list.d/kubernetes.list at the v{target} pkgs.k8s.io repo)
sudo apt-get update

# 2. Check available kubeadm versions
apt-cache madison kubeadm | grep {target}

# 3. Upgrade kubeadm
sudo apt-mark unhold kubeadm
sudo apt-get install -y kubeadm='{target}.*'
sudo apt-mark hold kubeadm

# 4. Verify kubeadm version
kubeadm version

# 5. Check upgrade plan
sudo kubeadm upgrade plan
```

**Phase 2: Upgrade Control Plane**
```bash
# 1. Drain the control-plane node
kubectl drain <control-plane-node> --ignore-daemonsets --delete-emptydir-data

# 2. Apply the upgrade
sudo kubeadm upgrade apply v{target}.0

# 3. Upgrade kubelet & kubectl
sudo apt-mark unhold kubelet kubectl
sudo apt-get install -y kubelet='{target}.*' kubectl='{target}.*'
sudo apt-mark hold kubelet kubectl

# 4. Restart kubelet
sudo systemctl daemon-reload
sudo systemctl restart kubelet

# 5. Uncordon the node
kubectl uncordon <control-plane-node>
```

**Phase 3: Upgrade Worker Nodes** (repeat for each worker)
```bash
# On each worker node:
# 1. Drain the worker
kubectl drain <worker-node> --ignore-daemonsets --delete-emptydir-data

# 2. Upgrade kubeadm, kubelet, kubectl
sudo apt-mark unhold kubeadm kubelet kubectl
sudo apt-get install -y kubeadm='{target}.*' kubelet='{target}.*' kubectl='{target}.*'
sudo apt-mark hold kubeadm kubelet kubectl

# 3. Upgrade node config
sudo kubeadm upgrade node

# 4. Restart kubelet
sudo systemctl daemon-reload
sudo systemctl restart kubelet

# 5. Uncordon
kubectl uncordon <worker-node>
```

**Phase 4: Upgrade CRI-O** (on each node)
```bash
# Update CRI-O to match the K8s version
sudo apt-get install -y cri-o='{target}.*'
sudo systemctl restart crio
sudo systemctl restart kubelet
```

**Phase 5: Verify**
```bash
kubectl get nodes -o wide
kubectl get pods -A
kubectl version
```
"""
        )

    # ── Changelog & Compatibility ────────────────────────────────────────
    with tab_changelog:
        st.markdown("### Version Changelog & Compatibility Notes")

        for v in _K8S_VERSIONS_DETAIL:
            is_current = _minor_version_key(v["version"]) == current_key
            marker = " ← CURRENT" if is_current else ""
            with st.expander(f"Kubernetes {v['version']}{marker}", expanded=is_current):
                st.markdown(f"**Release Date:** {v['release']}")
                st.markdown(f"**End of Life:** {v['end_of_life']}")
                st.markdown(f"**Key Highlights:** {v['highlights']}")
                st.markdown("---")
                st.markdown(
                    "**Compatibility:**\n"
                    f"- CRI-O: {v['version']}.x\n"
                    "- Flannel: Compatible (check release notes for CNI spec changes)\n"
                    "- etcd: 3.5.x+ recommended\n"
                    "- CoreDNS: 1.11.x+ recommended\n"
                )
                st.markdown(
                    "**Upgrade Notes:**\n"
                    "- Always upgrade one minor version at a time\n"
                    "- Check deprecated API versions before upgrading\n"
                    "- Run `kubeadm upgrade plan` to verify compatibility\n"
                    "- Back up etcd before starting\n"
                )
def page_ai_assistant():
    """Render the AI chat page.

    When no LLM is configured, only an explanatory notice is shown.
    Otherwise the stored conversation is replayed, and a newly submitted
    prompt is answered by streaming chunks from ``stream_llm`` into a
    placeholder; both the prompt and the final answer are appended to
    ``st.session_state.chat_history``.
    """
    st.markdown("## AI Kubernetes Assistant")

    llm_ready = is_llm_configured()
    if not llm_ready:
        st.info(
            "LLM is not configured. Select a provider (OpenAI or Ollama) in the sidebar LLM Settings, "
            "or set `LLM_API_URL` and `LLM_API_KEY` environment variables to enable the AI chat assistant."
        )
        st.markdown(
            "All other features (Cluster Creation, Debugging, Monitoring, Log Analysis) work without an LLM. "
            "Only the AI-powered analysis and chat features require it."
        )
        return

    st.markdown("Chat with the AI about any Kubernetes topic.")

    # Replay the conversation so far.
    history = st.session_state.chat_history
    for turn in history:
        bubble = st.chat_message(turn["role"])
        with bubble:
            st.markdown(turn["content"])

    # Nothing more to do until the user submits a prompt.
    prompt = st.chat_input("Ask about Kubernetes...")
    if not prompt:
        return

    # Snapshot the earlier turns before recording the new prompt: the model
    # receives the prompt itself separately from the history.
    earlier_turns = history[:]
    history.append({"role": "user", "content": prompt})
    with st.chat_message("user"):
        st.markdown(prompt)

    with st.chat_message("assistant"):
        slot = st.empty()
        reply = ""
        for piece in stream_llm(prompt, conversation_history=earlier_turns):
            reply = reply + piece
            # Trailing block character acts as a typing cursor while streaming.
            slot.markdown(reply + "▌")
        slot.markdown(reply)

    history.append({"role": "assistant", "content": reply})
# ══════════════════════════════════════════════════════════════════════════
# PAGE: Multi-Cluster Dashboard
# ══════════════════════════════════════════════════════════════════════════

# Substrings of the pod STATUS column that mark a pod as failed.
_FAILED_POD_MARKERS = ("Error", "CrashLoopBackOff", "ImagePullBackOff")


def _count_ready_nodes(node_lines) -> int:
    """Count Ready nodes in ``kubectl get nodes --no-headers`` output lines.

    The second column is the node STATUS. It can be compound
    (``Ready,SchedulingDisabled``), so it is split on commas and matched
    exactly; a substring test would also count ``NotReady``. Lines with
    fewer than two columns are ignored.
    """
    ready = 0
    for line in node_lines:
        columns = line.split()
        if len(columns) < 2:
            continue
        if "Ready" in columns[1].split(","):
            ready += 1
    return ready


def _summarize_pod_lines(pod_lines) -> tuple[int, int]:
    """Return ``(running, failed)`` for ``kubectl get pods -A --no-headers`` lines.

    Only the STATUS column (fourth with ``-A``: NAMESPACE NAME READY STATUS)
    is inspected, so a pod or namespace whose *name* contains "Running" or
    "Error" is not miscounted. Lines with fewer than four columns are ignored.
    """
    running = 0
    failed = 0
    for line in pod_lines:
        columns = line.split()
        if len(columns) < 4:
            continue
        pod_status = columns[3]
        if pod_status == "Running":
            running += 1
        elif any(marker in pod_status for marker in _FAILED_POD_MARKERS):
            # Substring match keeps prefixed states such as "Init:Error".
            failed += 1
    return running, failed


def page_multi_cluster_dashboard():
    """Render an overview of every registered cluster profile.

    Shows fleet-wide counters followed by one expander per profile. For
    imported profiles that carry kubeconfig content, a live health check and
    three quick ``kubectl`` actions are offered.
    """
    st.markdown("## Multi-Cluster Dashboard")
    st.markdown("Overview of all registered cluster profiles at a glance.")

    profiles = list_profiles()
    if not profiles:
        st.info("No cluster profiles yet. Create one in the **Profile Manager** or import a cluster via kubeconfig.")
        return

    # Summary metrics
    total = len(profiles)
    imported = sum(1 for p in profiles if p.cluster_source == "imported")
    provisioned = total - imported
    active_count = sum(1 for p in profiles if p.status == "active")
    error_count = sum(1 for p in profiles if p.status == "error")

    mcol1, mcol2, mcol3, mcol4, mcol5 = st.columns(5)
    mcol1.metric("Total Clusters", total)
    mcol2.metric("Provisioned", provisioned)
    mcol3.metric("Imported", imported)
    mcol4.metric("Active", active_count)
    mcol5.metric("Errors", error_count)

    st.markdown("---")

    status_icons = {"active": "🟢", "error": "🔴", "draft": "⚪", "provisioning": "🟡"}

    # Cluster cards
    for profile in profiles:
        is_imported = profile.cluster_source == "imported"
        # Live kubectl actions need an imported cluster with kubeconfig content.
        can_query = is_imported and bool(profile.kubeconfig_content)
        status_icon = status_icons.get(profile.status, "⚪")
        source_label = "Imported" if is_imported else "Provisioned"

        with st.expander(
            f"{status_icon} **{profile.name}** — {source_label} | {profile.status.upper()}",
            expanded=(profile.status == "error"),
        ):
            col1, col2, col3 = st.columns(3)
            with col1:
                st.markdown(f"**K8s Version:** {profile.kubernetes_version}")
                st.markdown(f"**Source:** {source_label}")
                st.markdown(f"**Status:** {profile.status.upper()}")
            with col2:
                if is_imported:
                    st.markdown(f"**Kubeconfig:** {'Loaded' if profile.kubeconfig_content else 'Not loaded'}")
                else:
                    cp = len(profile.get_control_plane_nodes())
                    wk = len(profile.get_worker_nodes())
                    st.markdown(f"**Nodes:** {cp} control-plane + {wk} worker")
                    st.markdown(f"**CRI-O:** {profile.crio_version}")
                    st.markdown("**CNI:** Flannel")
            with col3:
                if profile.description:
                    st.markdown(f"**Description:** {profile.description}")

            if not can_query:
                continue

            # Live cluster health check for imported clusters
            if st.button("Check Health", key=f"health_{profile.name}"):
                with st.spinner("Checking cluster health..."):
                    node_result = run_kubectl(profile, "get nodes --no-headers", timeout=10)
                    if node_result.success and node_result.stdout.strip():
                        node_lines = [ln for ln in node_result.stdout.strip().split("\n") if ln.strip()]
                        total_nodes = len(node_lines)
                        ready_nodes = _count_ready_nodes(node_lines)
                        not_ready = total_nodes - ready_nodes
                        hcol1, hcol2, hcol3 = st.columns(3)
                        hcol1.metric("Nodes", total_nodes)
                        hcol2.metric("Ready", ready_nodes)
                        hcol3.metric("Not Ready", not_ready)
                        if not_ready > 0:
                            st.warning(f"{not_ready} node(s) are not Ready.")
                        else:
                            st.success("All nodes are Ready.")
                        # Pod summary
                        pod_result = run_kubectl(profile, "get pods -A --no-headers", timeout=15)
                        if pod_result.success and pod_result.stdout.strip():
                            pod_lines = [ln for ln in pod_result.stdout.strip().split("\n") if ln.strip()]
                            running_pods, failed_pods = _summarize_pod_lines(pod_lines)
                            pcol1, pcol2, pcol3 = st.columns(3)
                            pcol1.metric("Total Pods", len(pod_lines))
                            pcol2.metric("Running", running_pods)
                            pcol3.metric("Failed/Error", failed_pods)
                        elif not pod_result.success:
                            # Surface the failure instead of silently showing no pod metrics.
                            st.warning(f"Could not list pods: {pod_result.stderr or 'kubectl failed'}")
                    elif node_result.success:
                        st.info("Connected but no nodes found.")
                    else:
                        st.error(f"Could not connect: {node_result.stderr or 'kubectl failed'}")

            # Quick actions: (button label, key prefix, kubectl args, timeout, text for empty output)
            quick_actions = [
                ("View Nodes", "qnodes", "get nodes -o wide", 10, "(no output)"),
                ("View Namespaces", "qns", "get namespaces", 10, "(no output)"),
                (
                    "Warning Events",
                    "qevents",
                    "get events -A --field-selector type=Warning --sort-by=.lastTimestamp",
                    15,
                    "(no warning events)",
                ),
            ]
            action_columns = st.columns(len(quick_actions))
            for column, (label, key_prefix, command, timeout, empty_text) in zip(action_columns, quick_actions):
                with column:
                    if st.button(label, key=f"{key_prefix}_{profile.name}"):
                        result = run_kubectl(profile, command, timeout=timeout)
                        if result.success:
                            st.code(result.stdout or empty_text, language="text")
                        else:
                            st.error(result.stderr or "Failed")
# ══════════════════════════════════════════════════════════════════════════
# PAGE: Certificate Manager
# ══════════════════════════════════════════════════════════════════════════

# Certificates with less residual lifetime than this many days are flagged.
_CERT_EXPIRY_WARN_DAYS = 30


def _find_certificate_issues(report: str, warn_days: int = _CERT_EXPIRY_WARN_DAYS) -> list[tuple[str, str]]:
    """Scan ``kubeadm certs check-expiration`` output for problem certificates.

    Returns ``(level, line)`` tuples in input order. ``level`` is ``"error"``
    for lines mentioning "invalid" or "expired", and ``"warning"`` for lines
    whose residual-time token (a number followed by ``y``/``d``/``h``/``m``/``s``,
    e.g. ``364d``) is below ``warn_days`` days. Lines without such a token
    are ignored.
    """
    findings = []
    for raw_line in report.splitlines():
        line = raw_line.strip()
        if not line:
            continue
        lowered = line.lower()
        if "invalid" in lowered or "expired" in lowered:
            findings.append(("error", line))
            continue
        for token in line.split():
            amount, unit = token[:-1], token[-1:]
            if not amount.isdigit() or unit not in ("y", "d", "h", "m", "s"):
                continue
            # Hours/minutes/seconds are always under a day; days are compared directly.
            if unit in ("h", "m", "s") or (unit == "d" and int(amount) < warn_days):
                findings.append(("warning", line))
            break  # only the first duration-shaped token is the residual time
    return findings


def page_certificate_manager():
    """Render the Certificate Manager page for the active cluster profile.

    Three tabs: kubeadm certificate expiration (over SSH for provisioned
    clusters, an API reachability check for imported ones), TLS secrets plus
    cert-manager status via ``run_kubectl``, and a static renewal guide.
    Returns without rendering the tabs when ``_get_active_profile()`` yields
    no profile.
    """
    st.markdown("## Certificate Manager")
    st.markdown("View cluster certificate expiration dates, TLS secrets, and plan renewals.")

    profile = _get_active_profile()
    if not profile:
        return

    _show_profile_summary(profile)

    tab_certs, tab_tls, tab_renew = st.tabs([
        "Cluster Certificates",
        "TLS Secrets",
        "Renewal Guide",
    ])

    # ── Cluster Certificates (kubeadm) ────────────────────────────────────
    with tab_certs:
        st.markdown("### Cluster Certificates (kubeadm)")

        if profile.cluster_source == "imported":
            st.info(
                "Certificate inspection via `kubeadm certs check-expiration` requires SSH access "
                "to control-plane nodes. For imported clusters, use the **TLS Secrets** tab to view "
                "TLS certificates stored in the cluster."
            )
            # Still try to get API server cert info
            if st.button("Check API Server Certificate", key="api_cert_check"):
                with st.spinner("Checking API server certificate..."):
                    # Plain `version` is used because `--short` was removed in
                    # kubectl 1.28 and makes the command fail there.
                    result = run_kubectl(profile, "version", timeout=10)
                if result.success:
                    st.success("API server is reachable and serving valid TLS.")
                    st.code(result.stdout, language="text")
                elif "certificate" in (result.stderr or "").lower():
                    st.error("Certificate issue detected:")
                    st.code(result.stderr, language="text")
                else:
                    st.warning(f"Could not check: {result.stderr}")
        else:
            cp_nodes = profile.get_control_plane_nodes()
            if not cp_nodes:
                st.warning("No control-plane nodes defined.")
            else:
                st.markdown(
                    "Runs `kubeadm certs check-expiration` on control-plane nodes via SSH "
                    "to show certificate validity and expiration dates."
                )
                if st.button("Check Certificate Expiration", type="primary", key="check_certs"):
                    for node in cp_nodes:
                        node_label = f"{node.get('hostname', node.get('ip_address', '?'))} ({node.get('ip_address', '')})"
                        with st.expander(f"Node: {node_label}", expanded=True):
                            with st.spinner(f"Checking certificates on {node_label}..."):
                                result = run_ssh_command(
                                    ip_address=node["ip_address"],
                                    command="sudo kubeadm certs check-expiration 2>/dev/null || echo 'kubeadm certs command not available'",
                                    ssh_user=node.get("ssh_user", "root"),
                                    ssh_port=node.get("ssh_port", 22),
                                    ssh_key_path=node.get("ssh_key_path", "~/.ssh/id_rsa"),
                                    timeout=30,
                                )
                            if result.success and result.stdout.strip():
                                st.code(result.stdout, language="text")
                                # Only parse real kubeadm reports (identified by the table header).
                                if "RESIDUAL TIME" in result.stdout:
                                    for level, line in _find_certificate_issues(result.stdout):
                                        if level == "error":
                                            st.error(f"Certificate issue: {line}")
                                        else:
                                            st.warning(
                                                f"Certificate expires in under {_CERT_EXPIRY_WARN_DAYS} days: {line}"
                                            )
                            else:
                                st.error(f"Failed: {result.stderr or 'No output'}")

    # ── TLS Secrets ───────────────────────────────────────────────────────
    with tab_tls:
        st.markdown("### TLS Secrets")
        st.markdown("View Kubernetes TLS secrets and their certificate details.")

        if st.button("Load TLS Secrets", type="primary", key="load_tls"):
            # Filter server-side so unrelated Secrets (and their data) are
            # never transferred; the type is re-checked below as a safeguard.
            cmd = "get secrets -A --field-selector type=kubernetes.io/tls -o json"
            with st.spinner("Fetching secrets..."):
                result = run_kubectl(profile, cmd, timeout=20)
            if result.success and result.stdout.strip():
                try:
                    import pandas as pd
                    secrets_json = json.loads(result.stdout)
                    tls_secrets = []
                    for secret in secrets_json.get("items", []):
                        if secret.get("type") != "kubernetes.io/tls":
                            continue
                        meta = secret.get("metadata", {})
                        annotations = meta.get("annotations") or {}
                        data = secret.get("data") or {}
                        tls_secrets.append({
                            "Namespace": meta.get("namespace", "?"),
                            "Name": meta.get("name", "?"),
                            "Type": "kubernetes.io/tls",
                            "Created": meta.get("creationTimestamp", "N/A"),
                            "Issuer": annotations.get(
                                "cert-manager.io/issuer-name",
                                annotations.get("cert-manager.io/cluster-issuer", "N/A"),
                            ),
                            "Has cert": "tls.crt" in data,
                            "Has key": "tls.key" in data,
                        })
                    if tls_secrets:
                        st.markdown(f"**Found {len(tls_secrets)} TLS secret(s)**")
                        st.dataframe(pd.DataFrame(tls_secrets), use_container_width=True, hide_index=True)
                    else:
                        st.info("No TLS secrets found in the cluster.")
                except (json.JSONDecodeError, KeyError) as e:
                    st.error(f"Failed to parse secrets: {e}")
            elif result.success:
                st.info("No secrets found.")
            else:
                st.error("Failed to fetch secrets")
                st.code(result.stderr, language="text")

        # cert-manager status
        st.markdown("---")
        st.markdown("#### cert-manager Status")
        if st.button("Check cert-manager", key="check_certmanager"):
            with st.spinner("Checking cert-manager..."):
                result = run_kubectl(profile, "get pods -n cert-manager --no-headers", timeout=10)
            if result.success and result.stdout.strip():
                st.success("cert-manager is installed:")
                st.code(result.stdout, language="text")
                # Check certificates
                cert_result = run_kubectl(profile, "get certificates -A --no-headers", timeout=10)
                if cert_result.success and cert_result.stdout.strip():
                    st.markdown("**Managed Certificates:**")
                    st.code(cert_result.stdout, language="text")
            elif result.success:
                # An empty pod list does not prove the namespace exists, so
                # do not claim that it does.
                st.info("No pods found in the `cert-manager` namespace; cert-manager does not appear to be running.")
            else:
                st.info("cert-manager does not appear to be installed.")

    # ── Renewal Guide ─────────────────────────────────────────────────────
    with tab_renew:
        st.markdown("### Certificate Renewal Guide")

        st.markdown("""
#### Automatic Renewal (kubeadm)

kubeadm automatically renews certificates during `kubeadm upgrade`. For manual renewal:

```bash
# Renew all certificates
sudo kubeadm certs renew all

# Renew specific certificate
sudo kubeadm certs renew apiserver
sudo kubeadm certs renew apiserver-kubelet-client
sudo kubeadm certs renew front-proxy-client
sudo kubeadm certs renew etcd-server
sudo kubeadm certs renew etcd-peer
sudo kubeadm certs renew etcd-healthcheck-client

# After renewal, restart control plane components
sudo systemctl restart kubelet
```

#### Certificate Authority (CA) Rotation

CA rotation is more complex and requires:
1. Generate new CA certificate and key
2. Distribute to all nodes
3. Re-sign all component certificates
4. Rolling restart of all components

#### cert-manager Renewal

If using cert-manager, certificates are automatically renewed before expiration.
Check cert-manager logs for renewal status:

```bash
kubectl logs -n cert-manager deploy/cert-manager -f
```

#### Best Practices
- Monitor certificate expiration dates regularly
- Set up alerts for certificates expiring within 30 days
- Keep kubeadm version aligned with cluster version for smooth renewals
- Back up `/etc/kubernetes/pki/` before any certificate operations
- Test renewal in a staging environment first
        """)
def _co_percent(value: str) -> int | None:
    """Convert a ``kubectl top`` percentage such as ``"42%"`` to an int.

    Returns ``None`` when the value is not a plain integer percentage;
    kubectl prints ``<unknown>`` for nodes that have no metrics yet, and the
    chart must not crash on that.
    """
    try:
        return int(value.strip().rstrip("%"))
    except ValueError:
        return None


def _co_valid_namespace(name: str) -> bool:
    """Return True if *name* is a syntactically valid namespace (DNS-1123 label).

    Namespaces typed by the user are spliced into a kubectl command string,
    so anything that is not a plain label is rejected up front.
    """
    import re

    return len(name) <= 63 and re.fullmatch(r"[a-z0-9](?:[-a-z0-9]*[a-z0-9])?", name) is not None


def _co_parse_top_nodes(stdout: str) -> list[dict]:
    """Parse ``kubectl top nodes --no-headers`` output into table rows."""
    rows = []
    for line in stdout.splitlines():
        parts = line.split()
        if len(parts) >= 5:
            rows.append({
                "Node": parts[0],
                "CPU (cores)": parts[1],
                "CPU %": parts[2],
                "Memory": parts[3],
                "Memory %": parts[4],
            })
    return rows


def _co_parse_top_pods(stdout: str, all_namespaces: bool) -> list[dict]:
    """Parse ``kubectl top pods --no-headers`` output into table rows.

    With ``-A`` kubectl prepends a namespace column, hence the two layouts.
    """
    rows = []
    for line in stdout.splitlines():
        parts = line.split()
        if all_namespaces and len(parts) >= 4:
            rows.append({
                "Namespace": parts[0],
                "Pod": parts[1],
                "CPU": parts[2],
                "Memory": parts[3],
            })
        elif len(parts) >= 3:
            rows.append({
                "Pod": parts[0],
                "CPU": parts[1],
                "Memory": parts[2],
            })
    return rows


def _co_finished_jobs(jobs: list[dict]) -> list[str]:
    """Return one bullet line per Job whose Complete/Failed condition is True."""
    finished = []
    for job in jobs:
        for cond in job.get("status", {}).get("conditions", []) or []:
            if cond.get("type") in ("Complete", "Failed") and cond.get("status") == "True":
                meta = job.get("metadata", {})
                finished.append(f" - {meta.get('namespace', '?')}/{meta.get('name', '?')} ({cond['type']})")
                break  # list each job once even if several conditions match
    return finished


def _co_zero_replica_deployments(deployments: list[dict]) -> list[str]:
    """Return one bullet line per Deployment whose ``spec.replicas`` is 0."""
    idle = []
    for dep in deployments:
        if dep.get("spec", {}).get("replicas", 1) == 0:
            meta = dep.get("metadata", {})
            idle.append(f" - {meta.get('namespace', '?')}/{meta.get('name', '?')}")
    return idle


def _co_render_node_usage(profile) -> None:
    """Render the node-level usage table and utilisation chart."""
    if not st.button("Load Node Usage", type="primary", key="load_node_usage"):
        return
    with st.spinner("Fetching node metrics..."):
        result = run_kubectl(profile, "top nodes --no-headers", timeout=15)

    if not result.success:
        st.error("Failed to fetch node metrics. Ensure metrics-server is installed.")
        st.code(result.stderr, language="text")
        st.info("Install metrics-server via **Monitoring Setup** > **Metrics Components**.")
        return
    if not result.stdout.strip():
        st.info("No node metrics available. Is metrics-server installed?")
        return

    import pandas as pd

    node_usage = _co_parse_top_nodes(result.stdout)
    if not node_usage:
        # Unrecognised layout: show the raw output rather than nothing.
        st.code(result.stdout, language="text")
        return

    st.dataframe(pd.DataFrame(node_usage), use_container_width=True, hide_index=True)
    try:
        import plotly.graph_objects as go
    except ImportError:
        return  # the chart is optional; the table above is enough

    names = [n["Node"] for n in node_usage]
    # None (from "<unknown>") is plotted as a gap instead of raising.
    cpu_pcts = [_co_percent(n["CPU %"]) for n in node_usage]
    mem_pcts = [_co_percent(n["Memory %"]) for n in node_usage]
    fig = go.Figure()
    fig.add_trace(go.Bar(name="CPU %", x=names, y=cpu_pcts, marker_color="#326CE5"))
    fig.add_trace(go.Bar(name="Memory %", x=names, y=mem_pcts, marker_color="#764ba2"))
    fig.update_layout(
        title="Node Resource Utilization",
        yaxis_title="Utilization %",
        barmode="group",
        height=400,
    )
    fig.add_hline(y=80, line_dash="dash", line_color="red", annotation_text="80% threshold")
    st.plotly_chart(fig, use_container_width=True)


def _co_render_pod_usage(profile) -> None:
    """Render the pod-level usage table for one namespace or all of them."""
    pcol1, _pcol2 = st.columns([2, 1])
    with pcol1:
        namespaces: list[str] = []
        if profile.cluster_source == "imported" and profile.kubeconfig_content:
            namespaces = fetch_namespaces(profile.kubeconfig_content)
        if namespaces:
            pod_usage_ns = st.selectbox("Namespace", ["All Namespaces"] + namespaces, key="pod_usage_ns")
        else:
            pod_usage_ns = st.text_input("Namespace (blank = all)", value="", key="pod_usage_ns_text")
        if not pod_usage_ns:
            pod_usage_ns = "All Namespaces"

    if not st.button("Load Pod Usage", type="primary", key="load_pod_usage"):
        return

    all_namespaces = pod_usage_ns == "All Namespaces"
    if not all_namespaces:
        pod_usage_ns = pod_usage_ns.strip()
        if not _co_valid_namespace(pod_usage_ns):
            st.error(f"'{pod_usage_ns}' is not a valid namespace name.")
            return
    ns_flag = "-A" if all_namespaces else f"-n {pod_usage_ns}"

    with st.spinner("Fetching pod metrics..."):
        result = run_kubectl(profile, f"top pods {ns_flag} --no-headers", timeout=20)

    if not result.success:
        st.error("Failed to fetch pod metrics.")
        st.code(result.stderr, language="text")
        return
    if not result.stdout.strip():
        st.info("No pod metrics available.")
        return

    import pandas as pd

    pod_usage = _co_parse_top_pods(result.stdout, all_namespaces)
    if pod_usage:
        df = pd.DataFrame(pod_usage)
        st.dataframe(df, use_container_width=True, hide_index=True)
        st.markdown(f"**Total pods:** {len(df)}")
    else:
        # Mirror the node view: fall back to the raw output.
        st.code(result.stdout, language="text")


def _co_render_right_sizing(profile) -> None:
    """Render the requests/limits vs. actual usage comparison for a namespace."""
    st.markdown("### Right-Sizing Recommendations")
    st.markdown(
        "Compare actual pod usage against configured requests/limits to find "
        "over-provisioned or under-provisioned workloads."
    )

    rs_col1, _rs_col2 = st.columns([2, 1])
    with rs_col1:
        namespaces: list[str] = []
        if profile.cluster_source == "imported" and profile.kubeconfig_content:
            namespaces = fetch_namespaces(profile.kubeconfig_content)
        if namespaces:
            rs_ns = st.selectbox("Namespace", namespaces, key="rs_ns")
        else:
            rs_ns = st.text_input("Namespace", value="default", key="rs_ns_text")

    if not st.button("Analyze Right-Sizing", type="primary", key="analyze_rs"):
        return
    rs_ns = (rs_ns or "").strip()
    if not rs_ns:
        st.warning("Please specify a namespace.")
        return
    if not _co_valid_namespace(rs_ns):
        st.error(f"'{rs_ns}' is not a valid namespace name.")
        return

    with st.spinner("Fetching usage and resource specs..."):
        usage_result = run_kubectl(
            profile,
            f"top pods -n {rs_ns} --no-headers --containers",
            timeout=20,
        )
        spec_result = run_kubectl(
            profile,
            f"get pods -n {rs_ns} -o json",
            timeout=20,
        )

    if not (usage_result.success and spec_result.success):
        if not usage_result.success:
            st.error("Failed to fetch pod usage metrics. Is metrics-server installed?")
            st.code(usage_result.stderr, language="text")
        if not spec_result.success:
            st.error("Failed to fetch pod specs.")
            st.code(spec_result.stderr, language="text")
        return

    try:
        import pandas as pd

        # Usage lines are: POD CONTAINER CPU MEM
        usage_map = {}
        for line in (usage_result.stdout or "").splitlines():
            parts = line.split()
            if len(parts) >= 4:
                usage_map[f"{parts[0]}/{parts[1]}"] = {"cpu_usage": parts[2], "mem_usage": parts[3]}

        rows = []
        for pod in json.loads(spec_result.stdout).get("items", []):
            pod_name = pod.get("metadata", {}).get("name", "?")
            for container in pod.get("spec", {}).get("containers", []):
                c_name = container.get("name", "?")
                res = container.get("resources", {})
                requests = res.get("requests", {})
                limits = res.get("limits", {})
                usage = usage_map.get(f"{pod_name}/{c_name}", {})
                rows.append({
                    "Pod": pod_name,
                    "Container": c_name,
                    "CPU Usage": usage.get("cpu_usage", "N/A"),
                    "CPU Request": requests.get("cpu", "none"),
                    "CPU Limit": limits.get("cpu", "none"),
                    "Mem Usage": usage.get("mem_usage", "N/A"),
                    "Mem Request": requests.get("memory", "none"),
                    "Mem Limit": limits.get("memory", "none"),
                })
    except (json.JSONDecodeError, KeyError) as e:
        st.error(f"Failed to parse data: {e}")
        return

    if not rows:
        st.info("No containers found in this namespace.")
        return

    st.dataframe(pd.DataFrame(rows), use_container_width=True, hide_index=True)

    no_req_cpu = sum(1 for r in rows if r["CPU Request"] == "none")
    no_req_mem = sum(1 for r in rows if r["Mem Request"] == "none")
    no_lim_cpu = sum(1 for r in rows if r["CPU Limit"] == "none")
    no_lim_mem = sum(1 for r in rows if r["Mem Limit"] == "none")

    st.markdown("---")
    st.markdown("#### Recommendations")
    if no_req_cpu > 0:
        st.warning(f"{no_req_cpu} container(s) have **no CPU request** — scheduler cannot make optimal placement decisions.")
    if no_req_mem > 0:
        st.warning(f"{no_req_mem} container(s) have **no memory request** — pods may be evicted under pressure.")
    if no_lim_cpu > 0:
        st.info(f"{no_lim_cpu} container(s) have **no CPU limit** — they can consume all available CPU on the node.")
    if no_lim_mem > 0:
        st.warning(f"{no_lim_mem} container(s) have **no memory limit** — they may be OOMKilled or cause node instability.")
    if no_req_cpu == 0 and no_req_mem == 0 and no_lim_cpu == 0 and no_lim_mem == 0:
        st.success("All containers have CPU and memory requests and limits set.")


def _co_idle_jobs(profile) -> tuple[str, str] | None:
    """Check for Jobs that finished (Complete or Failed) and can be removed."""
    with st.spinner("Checking completed/failed jobs..."):
        result = run_kubectl(profile, "get jobs -A -o json", timeout=15)
    if not result.success:
        return ("error", f"Failed to list jobs: {result.stderr or 'kubectl command failed'}")
    if not result.stdout.strip():
        return None
    try:
        old_jobs = _co_finished_jobs(json.loads(result.stdout).get("items", []))
    except (json.JSONDecodeError, KeyError):
        return ("error", "Failed to parse jobs data.")
    if old_jobs:
        return ("warning", f"**{len(old_jobs)} completed/failed job(s)** can be cleaned up:\n" + "\n".join(old_jobs[:20]))
    return ("success", "No completed/failed jobs found.")


def _co_idle_deployments(profile) -> tuple[str, str] | None:
    """Check for Deployments that are scaled to zero replicas."""
    with st.spinner("Checking zero-replica deployments..."):
        result = run_kubectl(profile, "get deployments -A -o json", timeout=15)
    if not result.success:
        return ("error", f"Failed to list deployments: {result.stderr or 'kubectl command failed'}")
    if not result.stdout.strip():
        return None
    try:
        zero_deploys = _co_zero_replica_deployments(json.loads(result.stdout).get("items", []))
    except (json.JSONDecodeError, KeyError):
        return ("error", "Failed to parse deployment data.")
    if zero_deploys:
        return ("warning", f"**{len(zero_deploys)} deployment(s) scaled to 0 replicas:**\n" + "\n".join(zero_deploys[:20]))
    return ("success", "No zero-replica deployments found.")


def _co_idle_pvcs(profile) -> tuple[str, str] | None:
    """Check for PersistentVolumeClaims stuck in the Pending phase."""
    with st.spinner("Checking unbound PVCs..."):
        result = run_kubectl(profile, "get pvc -A --no-headers", timeout=15)
    if not result.success:
        return ("error", f"Failed to list PVCs: {result.stderr or 'kubectl command failed'}")
    lines = [line for line in result.stdout.splitlines() if line.strip()]
    if not lines:
        return ("info", "No PVCs found.")
    # With -A the columns are NAMESPACE NAME STATUS ...; compare the STATUS
    # column so a PVC merely *named* "...Pending..." is not reported.
    pending_pvcs = [line for line in lines if len(line.split()) > 2 and line.split()[2] == "Pending"]
    if pending_pvcs:
        return ("warning", f"**{len(pending_pvcs)} PVC(s) in Pending state** (not bound to a PV):\n```\n" + "\n".join(pending_pvcs[:10]) + "\n```")
    return ("success", "All PVCs are bound.")


def _co_idle_namespaces(profile) -> tuple[str, str] | None:
    """Check for non-system namespaces that contain no pods.

    Uses a single cluster-wide pod listing instead of one kubectl call per
    namespace.
    """
    system_ns = {"kube-system", "kube-public", "kube-node-lease", "default"}
    with st.spinner("Checking empty namespaces..."):
        ns_result = run_kubectl(profile, "get namespaces --no-headers", timeout=10)
        if not ns_result.success:
            return ("error", f"Failed to list namespaces: {ns_result.stderr or 'kubectl command failed'}")
        if not ns_result.stdout.strip():
            return None
        pods_result = run_kubectl(profile, "get pods -A --no-headers", timeout=15)
    if not pods_result.success:
        return ("error", f"Failed to list pods: {pods_result.stderr or 'kubectl command failed'}")

    populated = {line.split()[0] for line in pods_result.stdout.splitlines() if line.strip()}
    names = [line.split()[0] for line in ns_result.stdout.splitlines() if line.strip()]
    empty_ns = [ns for ns in names if ns not in system_ns and ns not in populated]
    if empty_ns:
        return ("info", f"**{len(empty_ns)} namespace(s) with no pods:**\n - " + "\n - ".join(empty_ns[:15]))
    return ("success", "No empty non-system namespaces found.")


def _co_render_idle(profile) -> None:
    """Render the idle-resource scanner and its findings."""
    st.markdown("### Idle / Unused Resources")
    st.markdown("Find resources that may be wasting cluster capacity.")

    idle_checks = st.multiselect(
        "Check for",
        [
            "Completed/Failed Jobs",
            "Deployments scaled to 0",
            "Orphaned ConfigMaps",
            "Unbound PVCs",
            "Empty Namespaces",
        ],
        default=["Completed/Failed Jobs", "Deployments scaled to 0", "Unbound PVCs"],
        key="idle_checks",
    )

    if not st.button("Scan for Idle Resources", type="primary", key="scan_idle"):
        return

    # Order here is the order findings are displayed in.
    checks = (
        ("Completed/Failed Jobs", _co_idle_jobs),
        ("Deployments scaled to 0", _co_idle_deployments),
        ("Unbound PVCs", _co_idle_pvcs),
        ("Empty Namespaces", _co_idle_namespaces),
    )
    findings = []
    for label, check in checks:
        if label in idle_checks:
            finding = check(profile)
            if finding is not None:
                findings.append(finding)
    if "Orphaned ConfigMaps" in idle_checks:
        findings.append(("info", "Orphaned ConfigMap detection requires cross-referencing all pod specs — use the **Resource Viewer** to manually inspect ConfigMaps per namespace."))

    st.markdown("---")
    st.markdown("#### Findings")
    if not findings:
        st.info("No checks were selected.")
    for level, msg in findings:
        if level == "warning":
            st.warning(msg)
        elif level == "error":
            st.error(msg)
        elif level == "success":
            st.success(msg)
        else:
            st.info(msg)


def page_cost_optimizer():
    """Render the Cost Estimator / Resource Optimizer page.

    Shows three tabs for the active cluster profile: live usage from
    metrics-server, a requests/limits comparison per namespace, and a scan
    for idle resources. Returns early when no profile is active.
    """
    st.markdown("## Cost Estimator / Resource Optimizer")
    st.markdown("Analyze resource usage vs requests/limits and identify optimization opportunities.")

    profile = _get_active_profile()
    if not profile:
        return

    _show_profile_summary(profile)

    tab_usage, tab_right_size, tab_idle = st.tabs([
        "Resource Usage",
        "Right-Sizing",
        "Idle Resources",
    ])

    with tab_usage:
        st.markdown("### Actual Resource Usage vs Requests")
        st.markdown("Compare real CPU/memory usage (from metrics-server) against configured requests and limits.")

        usage_sub = st.radio("View", ["Node Usage", "Pod Usage"], horizontal=True, key="usage_view")
        if usage_sub == "Node Usage":
            _co_render_node_usage(profile)
        elif usage_sub == "Pod Usage":
            _co_render_pod_usage(profile)

    with tab_right_size:
        _co_render_right_sizing(profile)

    with tab_idle:
        _co_render_idle(profile)
# ── Helper functions ──────────────────────────────────────────────────────

def _get_active_profile() -> ClusterProfile | None:
    """Return the active cluster profile, or ``None`` after telling the user why.

    Shows a warning when no profile is selected and an error when the
    selected profile name cannot be loaded.
    """
    # .get() so a session without the key behaves like "nothing selected".
    active_name = st.session_state.get("active_profile")
    if not active_name:
        st.warning("No active cluster profile selected. Please create or select one in the Profile Manager.")
        return None
    profile = load_profile(active_name)
    if not profile:
        st.error(f"Profile '{active_name}' not found.")
        return None
    return profile


def _parse_node_rows(stdout: str) -> list[dict[str, str]]:
    """Parse ``kubectl get nodes -o wide --no-headers`` output into table rows.

    kubectl pads columns with at least two spaces while the OS-IMAGE value
    contains single spaces, so columns are split on runs of 2+ whitespace
    first. If that does not yield the 10 expected columns, the line is split
    on any whitespace and the OS image is a best-effort guess.
    """
    import re

    rows = []
    for line in stdout.splitlines():
        if not line.strip():
            continue
        columns = re.split(r"\s{2,}", line.strip())
        if len(columns) == 10:
            internal_ip, os_image, runtime = columns[5], columns[7], columns[9]
        else:
            columns = line.split()
            if len(columns) < 5:
                continue
            internal_ip = columns[5] if len(columns) > 5 else "N/A"
            if len(columns) > 8:
                os_image = " ".join(columns[7:9])
            elif len(columns) > 7:
                os_image = columns[7]
            else:
                os_image = "N/A"
            runtime = columns[-1] if len(columns) > 9 else "N/A"
        roles = columns[2]
        rows.append({
            "Name": columns[0],
            "Status": columns[1],
            # kubectl prints "<none>" for nodes without a role label.
            "Roles": "worker" if roles in ("", "<none>") else roles,
            "Age": columns[3],
            "Kubelet Version": columns[4],
            "Internal IP": internal_ip,
            "OS Image": os_image,
            "Container Runtime": runtime,
        })
    return rows


def _count_ready_nodes(rows: list[dict[str, str]]) -> int:
    """Count rows whose status list contains exactly ``Ready``.

    A substring test would also match ``NotReady``; the status column can be
    a comma-separated list such as ``Ready,SchedulingDisabled``.
    """
    return sum(1 for row in rows if "Ready" in row["Status"].split(","))


def _render_imported_cluster_details(profile: ClusterProfile) -> None:
    """Render live node and cluster info for a kubeconfig-imported profile."""
    st.markdown(f"**Description:** {profile.description or 'N/A'}")
    st.markdown(f"**Kubeconfig:** {'Loaded' if profile.kubeconfig_content else 'Not loaded'}")
    if not profile.kubeconfig_content:
        return

    node_result = run_kubectl(
        profile,
        "get nodes -o wide --no-headers",
        timeout=10,
    )
    if not node_result.success:
        st.warning(
            f"Could not fetch cluster details: {node_result.stderr or 'kubectl command failed'}. "
            "Verify that kubectl is installed and the kubeconfig is valid."
        )
        return
    if not node_result.stdout.strip():
        st.info("Connected to cluster but no nodes found.")
        return

    st.markdown("---")
    st.markdown("**Cluster Nodes:**")
    node_data = _parse_node_rows(node_result.stdout)
    if node_data:
        import pandas as pd

        st.dataframe(
            pd.DataFrame(node_data),
            use_container_width=True,
            hide_index=True,
        )
        cp_count = sum(1 for n in node_data if "control-plane" in n["Roles"] or "master" in n["Roles"])
        worker_count = len(node_data) - cp_count
        ready_count = _count_ready_nodes(node_data)
        st.markdown(
            f"**Total:** {len(node_data)} node(s) — "
            f"{cp_count} control-plane, {worker_count} worker | "
            f"**Ready:** {ready_count}/{len(node_data)}"
        )
    else:
        st.code(node_result.stdout, language="text")

    # Cluster info (API server endpoint)
    info_result = run_kubectl(profile, "cluster-info", timeout=10)
    if info_result.success and info_result.stdout.strip():
        import re

        st.markdown("---")
        st.markdown("**Cluster Info:**")
        # Strip ANSI colour codes for clean display.
        clean_info = re.sub(r'\x1b\[[0-9;]*m', '', info_result.stdout)
        st.code(clean_info.strip(), language="text")


def _show_profile_summary(profile: ClusterProfile):
    """Display a compact summary of *profile*.

    Imported (kubeconfig) profiles get headline metrics plus an expander
    with live node data; other profiles get headline metrics plus their
    storage and proxy settings.
    """
    if profile.cluster_source == "imported":
        cols = st.columns(4)
        cols[0].metric("Profile", profile.name)
        cols[1].metric("K8s Version", profile.kubernetes_version)
        cols[2].metric("Source", "Imported (kubeconfig)")
        cols[3].metric("Status", profile.status.upper())
        with st.expander("Cluster Details", expanded=False):
            _render_imported_cluster_details(profile)
        return

    cols = st.columns(5)
    cols[0].metric("Profile", profile.name)
    cols[1].metric("K8s Version", profile.kubernetes_version)
    cols[2].metric("Runtime", f"CRI-O {profile.crio_version}")
    cols[3].metric("CNI", "Flannel")
    cols[4].metric("Nodes", f"{len(profile.get_control_plane_nodes())} CP + {len(profile.get_worker_nodes())} W")

    with st.expander("Storage & Proxy Details", expanded=False):
        scol1, scol2, scol3 = st.columns(3)
        with scol1:
            st.markdown(f"**CRI-O Root:** `{profile.crio_root}`")
            st.markdown(f"**CRI-O RunRoot:** `{profile.crio_runroot}`")
        with scol2:
            st.markdown(f"**Kubelet Dir:** `{profile.kubelet_root}`")
            st.markdown(f"**Log Root:** `{profile.log_root}`")
        with scol3:
            has_primary = bool(profile.http_proxy or profile.https_proxy)
            has_alt = bool(profile.http_proxy_alt or profile.https_proxy_alt)
            if has_primary:
                st.markdown(f"**HTTP Proxy:** `{profile.http_proxy or 'N/A'}`")
                st.markdown(f"**HTTPS Proxy:** `{profile.https_proxy or 'N/A'}`")
            if profile.no_proxy:
                st.markdown(f"**No Proxy:** `{profile.no_proxy}`")
            if has_alt:
                st.markdown(f"**Alt HTTP Proxy:** `{profile.http_proxy_alt or 'N/A'}`")
                st.markdown(f"**Alt HTTPS Proxy:** `{profile.https_proxy_alt or 'N/A'}`")
            if not (has_primary or has_alt):
                st.markdown("**Proxy:** Not configured")


# ── Main Router ───────────────────────────────────────────────────────────

def main():
    """Render the sidebar and dispatch to the page it selected.

    An unrecognised page name renders nothing.
    """
    page = render_sidebar()

    # Built per call so page functions are looked up at dispatch time.
    pages = {
        "Multi-Cluster Dashboard": page_multi_cluster_dashboard,
        "Profile Manager": page_profile_manager,
        "Cluster Creation": page_cluster_creation,
        "Resource Viewer": page_resource_viewer,
        "Cluster Debugger": page_cluster_debugger,
        "Monitoring Setup": page_monitoring_setup,
        "Log Analysis": page_log_analysis,
        "Upgrade Planner": page_upgrade_planner,
        "Certificate Manager": page_certificate_manager,
        "Cost Optimizer": page_cost_optimizer,
        "AI Assistant": page_ai_assistant,
    } if page else {}
    handler = pages.get(page)
    if handler is not None:
        handler()


if __name__ == "__main__":
    main()
"""Configuration for the K8s Agent application."""

import os
import shutil
import subprocess


# LLM Configuration
# Provider: "openai" (OpenAI-compatible endpoint) or "ollama" (local Ollama)
LLM_PROVIDER = os.getenv("LLM_PROVIDER", "openai")
LLM_API_URL = os.getenv(
    "LLM_API_URL",
    "https://aigateway-intern.ad.infosys.com/aigateway/chat/completions",
)
LLM_API_KEY = os.getenv("LLM_API_KEY", os.getenv("INFOSYS_CODER_API_KEY", ""))
LLM_MODEL = os.getenv("LLM_MODEL", "gpt-4")
LLM_TEMPERATURE = float(os.getenv("LLM_TEMPERATURE", "0.3"))
LLM_MAX_TOKENS = int(os.getenv("LLM_MAX_TOKENS", "4096"))

# Ollama-specific defaults
OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "http://10.73.98.113:11434")
OLLAMA_MODEL = os.getenv("OLLAMA_MODEL", "llama3")


def is_llm_configured() -> bool:
    """Return True if the LLM is configured.

    For Ollama, only the base URL is required (no API key).
    For OpenAI-compatible endpoints, both URL and key are required.
    """
    if LLM_PROVIDER == "ollama":
        return bool(OLLAMA_BASE_URL)
    return bool(LLM_API_URL and LLM_API_KEY)


def get_active_llm_url() -> str:
    """Return the effective chat completions URL based on the active provider."""
    if LLM_PROVIDER == "ollama":
        base = OLLAMA_BASE_URL.rstrip("/")
        return f"{base}/api/chat"
    return LLM_API_URL


def get_active_model() -> str:
    """Return the effective model name based on the active provider."""
    if LLM_PROVIDER == "ollama":
        return OLLAMA_MODEL
    return LLM_MODEL


# Application paths
DATA_DIR = os.path.join(os.path.dirname(__file__), "data")
PROFILES_DIR = os.path.join(DATA_DIR, "profiles")
TEMPLATES_DIR = os.path.join(os.path.dirname(__file__), "templates")
UPLOADS_DIR = os.path.join(DATA_DIR, "uploads")

# Ensure directories exist
os.makedirs(PROFILES_DIR, exist_ok=True)
os.makedirs(UPLOADS_DIR, exist_ok=True)


# ── kubectl / helm path detection ─────────────────────────────────────────

# Common install locations to check when kubectl/helm are not in PATH
_KUBECTL_SEARCH_PATHS = [
    "/usr/local/bin/kubectl",
    "/usr/bin/kubectl",
    "/snap/bin/kubectl",
    os.path.expanduser("~/.local/bin/kubectl"),
    os.path.expanduser("~/bin/kubectl"),
    "/opt/bin/kubectl",
]

_HELM_SEARCH_PATHS = [
    "/usr/local/bin/helm",
    "/usr/bin/helm",
    "/snap/bin/helm",
    os.path.expanduser("~/.local/bin/helm"),
    os.path.expanduser("~/bin/helm"),
    "/opt/bin/helm",
]


def _find_binary(name: str, search_paths: list[str]) -> str:
    """Find a binary by name, checking PATH first then common locations.

    Strategy:
      1. ``shutil.which`` — honours $PATH as seen by the Python process.
      2. Probe the given install locations with ``os.path.isfile``.
         (The ``os.access`` X_OK check is skipped because some SELinux /
         mount configurations report False even though the file *is*
         executable.)
      3. Last resort: run the system ``which`` utility.

    Args:
        name: Executable name, e.g. ``"kubectl"``.
        search_paths: Absolute candidate paths tried in order in step 2.

    Returns:
        The path of the first match, or ``""`` if nothing was found.
    """
    found = shutil.which(name)
    if found:
        return found

    for path in search_paths:
        if os.path.isfile(path):
            return path

    # Argument list instead of a shell string: nothing in *name* can be
    # interpreted by a shell.
    for which_cmd in ("which", "/usr/bin/which", "/bin/which"):
        try:
            proc = subprocess.run(
                [which_cmd, name],
                capture_output=True, text=True, timeout=5,
            )
        except (OSError, subprocess.SubprocessError):
            # This ``which`` is missing or hung; try the next candidate.
            continue
        lines = proc.stdout.strip().splitlines()
        if proc.returncode == 0 and lines and os.path.isfile(lines[0]):
            return lines[0]
    return ""


def get_kubectl_path() -> str:
    """Return the full path to kubectl, or empty string if not found."""
    return _find_binary("kubectl", _KUBECTL_SEARCH_PATHS)


def get_helm_path() -> str:
    """Return the full path to helm, or empty string if not found."""
    return _find_binary("helm", _HELM_SEARCH_PATHS)


def get_kubeconfig_path(profile_name: str = "_temp") -> str:
    """Return the path where a kubeconfig file should be written for local commands.

    The containing ``kubeconfigs`` directory under ``DATA_DIR`` is created
    if it does not exist.
    """
    kc_dir = os.path.join(DATA_DIR, "kubeconfigs")
    os.makedirs(kc_dir, exist_ok=True)
    return os.path.join(kc_dir, f"{profile_name}.kubeconfig")


def fetch_namespaces(kubeconfig_content: str) -> list[str]:
    """Fetch all namespaces from a cluster using kubectl with the given kubeconfig.

    The kubeconfig is written to a private (owner-only) temporary file that
    is removed again afterwards, because it contains cluster credentials and
    several UI sessions may call this concurrently.

    Returns a sorted list of namespace names, or an empty list on failure
    (kubectl missing, file cannot be written, kubectl error or timeout).
    """
    import tempfile

    kubectl = get_kubectl_path()
    if not kubectl:
        return []

    kc_dir = os.path.dirname(get_kubeconfig_path("_ns_fetch"))
    kc_path = ""
    try:
        # mkstemp creates the file with mode 0600 and a unique name.
        fd, kc_path = tempfile.mkstemp(prefix="_ns_fetch-", suffix=".kubeconfig", dir=kc_dir)
        with os.fdopen(fd, "w", encoding="utf-8") as handle:
            handle.write(kubeconfig_content)
        proc = subprocess.run(
            [
                kubectl,
                f"--kubeconfig={kc_path}",
                "get", "namespaces",
                "-o", "jsonpath={.items[*].metadata.name}",
            ],
            capture_output=True, text=True, timeout=15,
        )
    except (OSError, subprocess.SubprocessError):
        return []
    finally:
        if kc_path:
            try:
                os.remove(kc_path)
            except OSError:
                pass  # best effort: nothing useful to do if cleanup fails

    if proc.returncode == 0 and proc.stdout.strip():
        return sorted(proc.stdout.split())
    return []
+ """ + kubectl = get_kubectl_path() + if not kubectl: + return [] + kc_path = get_kubeconfig_path("_ns_fetch") + os.makedirs(os.path.dirname(kc_path), exist_ok=True) + with open(kc_path, "w") as f: + f.write(kubeconfig_content) + try: + proc = subprocess.run( + f"{kubectl} --kubeconfig=\"{kc_path}\" get namespaces -o jsonpath='{{.items[*].metadata.name}}'", + shell=True, capture_output=True, text=True, timeout=15, + ) + if proc.returncode == 0 and proc.stdout.strip(): + return sorted(proc.stdout.strip().split()) + return [] + except Exception: + return [] diff --git a/k8s-agent/data/profiles/.gitkeep b/k8s-agent/data/profiles/.gitkeep new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/k8s-agent/data/profiles/.gitkeep @@ -0,0 +1 @@ + diff --git a/k8s-agent/modules/__init__.py b/k8s-agent/modules/__init__.py new file mode 100644 index 0000000..fc1c144 --- /dev/null +++ b/k8s-agent/modules/__init__.py @@ -0,0 +1 @@ +"""K8s Agent modules.""" diff --git a/k8s-agent/modules/cluster_creator.py b/k8s-agent/modules/cluster_creator.py new file mode 100644 index 0000000..3fea3c7 --- /dev/null +++ b/k8s-agent/modules/cluster_creator.py @@ -0,0 +1,1535 @@ +"""Cluster Creator — SSH-based K8s cluster provisioning with CRI-O + Flannel.""" + +import os +import subprocess +import time +from dataclasses import dataclass, field +from typing import List, Optional + +import config +from modules.profile_manager import ClusterProfile + +# Default Flannel manifest URL — can be overridden by user-uploaded file +FLANNEL_MANIFEST_URL = "https://github.com/flannel-io/flannel/releases/latest/download/kube-flannel.yml" + + +@dataclass +class SSHResult: + """Result of an SSH command execution.""" + + hostname: str + command: str + return_code: int + stdout: str + stderr: str + success: bool + + +def run_ssh_command( + ip_address: str, + command: str, + ssh_user: str = "root", + ssh_port: int = 22, + ssh_key_path: str = "~/.ssh/id_rsa", + timeout: int = 600, +) -> SSHResult: + 
"""Execute a command on a remote node via SSH. + + Args: + ip_address: Target node IP. + command: Shell command to execute remotely. + ssh_user: SSH username. + ssh_port: SSH port number. + ssh_key_path: Path to SSH private key. + timeout: Command timeout in seconds. + + Returns: + SSHResult with command output and status. + """ + ssh_cmd = [ + "ssh", + "-o", "StrictHostKeyChecking=no", + "-o", "UserKnownHostsFile=/dev/null", + "-o", "ConnectTimeout=10", + "-o", "BatchMode=yes", + "-i", ssh_key_path, + "-p", str(ssh_port), + f"{ssh_user}@{ip_address}", + command, + ] + + try: + result = subprocess.run( + ssh_cmd, + capture_output=True, + text=True, + timeout=timeout, + ) + return SSHResult( + hostname=ip_address, + command=command, + return_code=result.returncode, + stdout=result.stdout, + stderr=result.stderr, + success=result.returncode == 0, + ) + except subprocess.TimeoutExpired: + return SSHResult( + hostname=ip_address, + command=command, + return_code=-1, + stdout="", + stderr=f"Command timed out after {timeout}s", + success=False, + ) + except Exception as exc: + return SSHResult( + hostname=ip_address, + command=command, + return_code=-1, + stdout="", + stderr=str(exc), + success=False, + ) + + +def test_ssh_connectivity(node: dict) -> SSHResult: + """Test SSH connectivity to a node.""" + return run_ssh_command( + ip_address=node["ip_address"], + command="echo 'SSH connection successful' && hostname && uname -r", + ssh_user=node.get("ssh_user", "root"), + ssh_port=node.get("ssh_port", 22), + ssh_key_path=node.get("ssh_key_path", "~/.ssh/id_rsa"), + timeout=15, + ) + + +def _proxy_env_block(profile: ClusterProfile) -> str: + """Generate shell export lines for proxy environment variables. + + These are valid *shell* statements — use inside scripts for the + current session. Do NOT write these to ``/etc/environment``; + use :func:`_proxy_env_file_block` for that. 
+ """ + lines = [] + proxy = profile.http_proxy or profile.http_proxy_alt + proxys = profile.https_proxy or profile.https_proxy_alt + if proxy: + lines.append(f'export http_proxy="{proxy}"') + lines.append(f'export HTTP_PROXY="{proxy}"') + if proxys: + lines.append(f'export https_proxy="{proxys}"') + lines.append(f'export HTTPS_PROXY="{proxys}"') + if profile.no_proxy: + lines.append(f'export no_proxy="{profile.no_proxy}"') + lines.append(f'export NO_PROXY="{profile.no_proxy}"') + return "\n".join(lines) + + +def _proxy_env_file_block(profile: ClusterProfile) -> str: + """Generate KEY=VALUE lines suitable for ``/etc/environment``. + + ``/etc/environment`` is parsed by ``pam_env.so`` which expects plain + ``KEY=VALUE`` lines — the ``export`` keyword is **not** valid there. + """ + lines = [] + proxy = profile.http_proxy or profile.http_proxy_alt + proxys = profile.https_proxy or profile.https_proxy_alt + if proxy: + lines.append(f'http_proxy="{proxy}"') + lines.append(f'HTTP_PROXY="{proxy}"') + if proxys: + lines.append(f'https_proxy="{proxys}"') + lines.append(f'HTTPS_PROXY="{proxys}"') + if profile.no_proxy: + lines.append(f'no_proxy="{profile.no_proxy}"') + lines.append(f'NO_PROXY="{profile.no_proxy}"') + return "\n".join(lines) + + +def _source_env_preamble() -> str: + """Return a shell snippet that sources /etc/environment. + + Each ``ProvisionStep`` runs in its own SSH session, so environment + variables set by a previous step (e.g. proxy settings) are lost. + Sourcing ``/etc/environment`` at the top of every network-dependent + step ensures the variables are available. + """ + return ( + "# Source /etc/environment so proxy vars (and others) persist across SSH sessions\n" + "set -a; . 
/etc/environment 2>/dev/null || true; set +a\n" + ) + + +def generate_common_setup_script(profile: ClusterProfile) -> str: + """Generate the common setup script that runs on ALL nodes (control-plane + workers).""" + proxy_block = _proxy_env_block(profile) + proxy_env_file_block = _proxy_env_file_block(profile) + proxy_section = "" + if proxy_block: + proxy_section = f""" +# ── 0. Proxy configuration ─────────────────────────────────────────────── +echo ">> Configuring proxy settings..." +{proxy_block} + +# Persist proxy in /etc/environment for all users (KEY=VALUE format for pam_env) +cat >> /etc/environment <<'PROXYEOF' +{proxy_env_file_block} +PROXYEOF +""" + + crio_storage_section = "" + if profile.crio_root != "/var/lib/containers/storage": + crio_storage_section = f""" +# ── Custom CRI-O storage paths ─────────────────────────────────────────── +echo ">> Configuring CRI-O custom storage root: {profile.crio_root}" +mkdir -p "{profile.crio_root}" +mkdir -p "{profile.crio_runroot}" +""" + + kubelet_section = "" + if profile.kubelet_root != "/var/lib/kubelet": + kubelet_section = f""" +# ── Custom kubelet data directory ──────────────────────────────────────── +echo ">> Configuring kubelet data directory: {profile.kubelet_root}" +mkdir -p "{profile.kubelet_root}" +""" + + log_section = "" + if profile.log_root != "/var/log": + log_section = f""" +# ── Custom log directory ───────────────────────────────────────────────── +echo ">> Configuring custom log root: {profile.log_root}" +mkdir -p "{profile.log_root}/pods" +mkdir -p "{profile.log_root}/containers" +""" + + return f"""#!/bin/bash +set -euo pipefail + +echo "=== K8s Node Common Setup ===" +echo "Kubernetes Version: {profile.kubernetes_version}" +echo "CRI-O Version: {profile.crio_version}" +echo "CRI-O Storage Root: {profile.crio_root}" +echo "Kubelet Data Dir: {profile.kubelet_root}" +echo "Log Root: {profile.log_root}" +echo "Timestamp: $(date -u)" 
+{proxy_section}{crio_storage_section}{kubelet_section}{log_section} +# ── 1. System prerequisites ────────────────────────────────────────────── +echo ">> Disabling swap..." +swapoff -a +sed -i '/\\bswap\\b/d' /etc/fstab + +echo ">> Loading kernel modules..." +cat > /etc/modules-load.d/k8s.conf <> Setting sysctl parameters..." +cat > /etc/sysctl.d/99-kubernetes.conf <> Disabling SELinux (if present)..." +if command -v setenforce &>/dev/null; then + setenforce 0 || true + sed -i 's/^SELINUX=enforcing/SELINUX=permissive/' /etc/selinux/config 2>/dev/null || true +fi + +echo ">> Configuring firewalld (if present)..." +if systemctl is-active --quiet firewalld; then + firewall-cmd --permanent --add-port=6443/tcp # API server + firewall-cmd --permanent --add-port=2379-2380/tcp # etcd + firewall-cmd --permanent --add-port=10250/tcp # Kubelet API + firewall-cmd --permanent --add-port=10259/tcp # kube-scheduler + firewall-cmd --permanent --add-port=10257/tcp # kube-controller-manager + firewall-cmd --permanent --add-port=30000-32767/tcp # NodePort + firewall-cmd --permanent --add-port=8472/udp # Flannel VXLAN + firewall-cmd --reload +fi + +# ── 2. Install CRI-O ───────────────────────────────────────────────────── +echo ">> Installing CRI-O {profile.crio_version}..." + +OS="$(. /etc/os-release && echo "$ID")" +VERSION_ID="$(. 
/etc/os-release && echo "$VERSION_ID")" + +if [[ "$OS" == "ubuntu" || "$OS" == "debian" ]]; then + apt-get update -y + apt-get install -y software-properties-common curl gnupg2 + + CRIO_VERSION="{profile.crio_version}" + curl -fsSL "https://pkgs.k8s.io/addons:/cri-o:/stable:/v$CRIO_VERSION/deb/Release.key" | \\ + gpg --dearmor -o /etc/apt/keyrings/cri-o-apt-keyring.gpg + echo "deb [signed-by=/etc/apt/keyrings/cri-o-apt-keyring.gpg] https://pkgs.k8s.io/addons:/cri-o:/stable:/v$CRIO_VERSION/deb/ /" | \\ + tee /etc/apt/sources.list.d/cri-o.list + + apt-get update -y + apt-get install -y cri-o +elif [[ "$OS" == "rhel" || "$OS" == "centos" || "$OS" == "rocky" || "$OS" == "almalinux" ]]; then + CRIO_VERSION="{profile.crio_version}" + cat > /etc/yum.repos.d/cri-o.repo <> Configuring CRI-O storage to {profile.crio_root}..." +mkdir -p /etc/crio/crio.conf.d +cat > /etc/crio/crio.conf.d/01-storage.conf <> CRI-O installed and configured (storage: {profile.crio_root})." + +# ── 3. Install kubeadm, kubelet, kubectl ────────────────────────────────── +echo ">> Installing Kubernetes {profile.kubernetes_version} components..." + +K8S_VERSION="{profile.kubernetes_version}" + +if [[ "$OS" == "ubuntu" || "$OS" == "debian" ]]; then + curl -fsSL "https://pkgs.k8s.io/core:/stable:/v$K8S_VERSION/deb/Release.key" | \\ + gpg --dearmor -o /etc/apt/keyrings/kubernetes-apt-keyring.gpg + echo "deb [signed-by=/etc/apt/keyrings/kubernetes-apt-keyring.gpg] https://pkgs.k8s.io/core:/stable:/v$K8S_VERSION/deb/ /" | \\ + tee /etc/apt/sources.list.d/kubernetes.list + + apt-get update -y + apt-get install -y kubelet kubeadm kubectl + apt-mark hold kubelet kubeadm kubectl +elif [[ "$OS" == "rhel" || "$OS" == "centos" || "$OS" == "rocky" || "$OS" == "almalinux" ]]; then + cat > /etc/yum.repos.d/kubernetes.repo <> Kubernetes components installed." 
+ +echo "=== Common setup complete ===" +""" + + +def generate_control_plane_init_script(profile: ClusterProfile) -> str: + """Generate the kubeadm init script for the control-plane node.""" + cp_nodes = profile.get_control_plane_nodes() + cp_ip = cp_nodes[0]["ip_address"] if cp_nodes else "CONTROL_PLANE_IP" + + # Build proxy environment block for the control-plane + proxy_block = _proxy_env_block(profile) + proxy_section = "" + if proxy_block: + proxy_section = f""" +# ── Proxy configuration (master node) ─────────────────────────────────── +echo ">> Setting proxy environment for kubeadm..." +{proxy_block} +""" + + # Audit log path respects custom log_root + audit_log_dir = f"{profile.log_root}/kubernetes" + + # Extra kubelet args for custom root dir + kubelet_extra = ' container-runtime-endpoint: "unix:///var/run/crio/crio.sock"' + if profile.kubelet_root != "/var/lib/kubelet": + kubelet_extra += f'\n root-dir: "{profile.kubelet_root}"' + + return f"""#!/bin/bash +set -euo pipefail + +echo "=== Initializing Kubernetes Control Plane ===" +{proxy_section} +# ── kubeadm init ────────────────────────────────────────────────────────── +mkdir -p "{audit_log_dir}" +cat > /tmp/kubeadm-config.yaml <> Running kubeadm init..." +kubeadm init --config=/tmp/kubeadm-config.yaml --upload-certs | tee /tmp/kubeadm-init.log + +# ── Configure kubectl for root ──────────────────────────────────────────── +echo ">> Configuring kubectl..." +mkdir -p /root/.kube +cp /etc/kubernetes/admin.conf /root/.kube/config +chown root:root /root/.kube/config + +# ── Install Flannel CNI ─────────────────────────────────────────────────── +echo ">> Installing Flannel CNI..." +if [ -f /tmp/kube-flannel-custom.yml ]; then + echo ">> Using user-provided Flannel manifest..." + kubectl apply -f /tmp/kube-flannel-custom.yml +else + kubectl apply -f {FLANNEL_MANIFEST_URL} +fi + +# Wait for Flannel to be ready +echo ">> Waiting for Flannel pods to be ready..." 
+kubectl -n kube-flannel wait --for=condition=ready pod -l app=flannel --timeout=120s || true + +# ── Apply Pod Security Standards ────────────────────────────────────────── +echo ">> Applying Pod Security Standards ({profile.pod_security_standard})..." +kubectl label namespace default \\ + pod-security.kubernetes.io/enforce={profile.pod_security_standard} \\ + pod-security.kubernetes.io/warn={profile.pod_security_standard} \\ + pod-security.kubernetes.io/audit={profile.pod_security_standard} \\ + --overwrite + +# ── Generate join command ───────────────────────────────────────────────── +echo ">> Generating worker join command..." +kubeadm token create --print-join-command > /tmp/kubeadm-join-command.txt +echo "Join command saved to /tmp/kubeadm-join-command.txt" + +echo "" +echo "=== Control Plane initialization complete ===" +echo "Join command:" +cat /tmp/kubeadm-join-command.txt +""" + + +def generate_worker_join_script() -> str: + """Generate the script that runs on worker nodes to join the cluster.""" + return """#!/bin/bash +set -euo pipefail + +echo "=== Joining Worker Node to Cluster ===" + +JOIN_COMMAND="$1" + +if [ -z "$JOIN_COMMAND" ]; then + echo "ERROR: Join command not provided." + echo "Usage: $0 ''" + exit 1 +fi + +echo ">> Executing join command..." +eval "$JOIN_COMMAND --cri-socket unix:///var/run/crio/crio.sock" + +echo "=== Worker node joined successfully ===" +""" + + +def generate_best_practices_script() -> str: + """Generate a post-install best practices hardening script.""" + return """#!/bin/bash +set -euo pipefail + +echo "=== Applying Kubernetes Best Practices ===" + +# ── Default Network Policy (deny-all) ──────────────────────────────────── +echo ">> Creating default-deny network policy for default namespace..." +cat <> Setting resource quotas..." +cat <> Setting limit ranges..." +cat <> Creating read-only ClusterRole..." +cat <> Ensuring audit log directory exists..." 
+mkdir -p /var/log/kubernetes + +echo "=== Best practices applied ===" +echo "" +echo "Summary of applied best practices:" +echo " - Default-deny NetworkPolicy in default namespace" +echo " - ResourceQuota for default namespace (CPU: 4/8, Memory: 8/16Gi)" +echo " - LimitRange with default container limits" +echo " - Read-only ClusterRole (cluster-reader)" +echo " - Audit logging directory configured" +""" + + +# ══════════════════════════════════════════════════════════════════════════ +# Step-based provisioning — granular SSH execution with per-step progress +# ══════════════════════════════════════════════════════════════════════════ + + +@dataclass +class ProvisionStep: + """A single discrete provisioning step to be executed over SSH.""" + + name: str # short identifier, e.g. "disable_swap" + title: str # human-readable label for the UI + script: str # shell snippet to execute + timeout: int = 300 # per-step timeout in seconds + fatal: bool = True # if True, abort provisioning on failure + + +def _run_step(node: dict, step: ProvisionStep) -> SSHResult: + """Execute a single ProvisionStep on a node via SSH.""" + return run_ssh_command( + ip_address=node["ip_address"], + command=step.script, + ssh_user=node.get("ssh_user", "root"), + ssh_port=node.get("ssh_port", 22), + ssh_key_path=node.get("ssh_key_path", "~/.ssh/id_rsa"), + timeout=step.timeout, + ) + + +def get_common_setup_steps(profile: ClusterProfile) -> List[ProvisionStep]: + """Return the ordered list of discrete steps for common node setup.""" + proxy_block = _proxy_env_block(profile) + steps: List[ProvisionStep] = [] + + # 0. Proxy (optional) + proxy_env_file_block = _proxy_env_file_block(profile) + if proxy_block: + steps.append(ProvisionStep( + name="configure_proxy", + title="Configure Proxy Settings", + script=f"""set -euo pipefail +echo '>> Configuring proxy settings...' 
+{proxy_block} +# Persist proxy in /etc/environment for all users (KEY=VALUE format for pam_env) +cat >> /etc/environment <<'PROXYEOF' +{proxy_env_file_block} +PROXYEOF +echo 'Proxy configured.' +""", + timeout=30, + )) + + # 1. System prerequisites + steps.append(ProvisionStep( + name="system_prerequisites", + title="System Prerequisites (swap, modules, sysctl, firewall)", + script="""set -euo pipefail +echo '>> Disabling swap...' +swapoff -a +sed -i '/\\bswap\\b/d' /etc/fstab + +echo '>> Loading kernel modules...' +cat > /etc/modules-load.d/k8s.conf <> Setting sysctl parameters...' +cat > /etc/sysctl.d/99-kubernetes.conf <> Disabling SELinux (if present)...' +if command -v setenforce &>/dev/null; then + setenforce 0 || true + sed -i 's/^SELINUX=enforcing/SELINUX=permissive/' /etc/selinux/config 2>/dev/null || true +fi + +echo '>> Configuring firewalld (if present)...' +if systemctl is-active --quiet firewalld; then + firewall-cmd --permanent --add-port=6443/tcp + firewall-cmd --permanent --add-port=2379-2380/tcp + firewall-cmd --permanent --add-port=10250/tcp + firewall-cmd --permanent --add-port=10259/tcp + firewall-cmd --permanent --add-port=10257/tcp + firewall-cmd --permanent --add-port=30000-32767/tcp + firewall-cmd --permanent --add-port=8472/udp + firewall-cmd --reload +fi +echo 'System prerequisites configured.' +""", + timeout=120, + )) + + # 2. 
Custom storage directories (optional) + dir_cmds = [] + if profile.crio_root != "/var/lib/containers/storage": + dir_cmds.append(f'mkdir -p "{profile.crio_root}"') + dir_cmds.append(f'mkdir -p "{profile.crio_runroot}"') + if profile.kubelet_root != "/var/lib/kubelet": + dir_cmds.append(f'mkdir -p "{profile.kubelet_root}"') + if profile.log_root != "/var/log": + dir_cmds.append(f'mkdir -p "{profile.log_root}/pods"') + dir_cmds.append(f'mkdir -p "{profile.log_root}/containers"') + if dir_cmds: + steps.append(ProvisionStep( + name="create_custom_dirs", + title="Create Custom Storage Directories", + script="set -euo pipefail\necho '>> Creating custom storage directories...'\n" + + "\n".join(dir_cmds) + "\necho 'Custom directories created.'", + timeout=30, + )) + + # 3. Install CRI-O + _env_preamble = _source_env_preamble() + steps.append(ProvisionStep( + name="install_crio", + title=f"Install CRI-O {profile.crio_version}", + script=f"""set -euo pipefail +{_env_preamble} +echo '>> Installing CRI-O {profile.crio_version}...' + +OS="$(. /etc/os-release && echo "$ID")" +VERSION_ID="$(. /etc/os-release && echo "$VERSION_ID")" + +if [[ "$OS" == "ubuntu" || "$OS" == "debian" ]]; then + apt-get update -y + apt-get install -y software-properties-common curl gnupg2 + CRIO_VERSION="{profile.crio_version}" + curl -fsSL "https://pkgs.k8s.io/addons:/cri-o:/stable:/v$CRIO_VERSION/deb/Release.key" | \\ + gpg --dearmor -o /etc/apt/keyrings/cri-o-apt-keyring.gpg + echo "deb [signed-by=/etc/apt/keyrings/cri-o-apt-keyring.gpg] https://pkgs.k8s.io/addons:/cri-o:/stable:/v$CRIO_VERSION/deb/ /" | \\ + tee /etc/apt/sources.list.d/cri-o.list + apt-get update -y + apt-get install -y cri-o +elif [[ "$OS" == "rhel" || "$OS" == "centos" || "$OS" == "rocky" || "$OS" == "almalinux" ]]; then + CRIO_VERSION="{profile.crio_version}" + cat > /etc/yum.repos.d/cri-o.repo <> Configuring CRI-O storage to {profile.crio_root}...' 
+systemctl daemon-reload +mkdir -p /etc/crio/crio.conf.d +cat > /etc/crio/crio.conf.d/01-storage.conf <> Installing Kubernetes {profile.kubernetes_version} components...' + +OS="$(. /etc/os-release && echo "$ID")" +K8S_VERSION="{profile.kubernetes_version}" + +if [[ "$OS" == "ubuntu" || "$OS" == "debian" ]]; then + curl -fsSL "https://pkgs.k8s.io/core:/stable:/v$K8S_VERSION/deb/Release.key" | \\ + gpg --dearmor -o /etc/apt/keyrings/kubernetes-apt-keyring.gpg + echo "deb [signed-by=/etc/apt/keyrings/kubernetes-apt-keyring.gpg] https://pkgs.k8s.io/core:/stable:/v$K8S_VERSION/deb/ /" | \\ + tee /etc/apt/sources.list.d/kubernetes.list + apt-get update -y + apt-get install -y kubelet kubeadm kubectl + apt-mark hold kubelet kubeadm kubectl +elif [[ "$OS" == "rhel" || "$OS" == "centos" || "$OS" == "rocky" || "$OS" == "almalinux" ]]; then + cat > /etc/yum.repos.d/kubernetes.repo < List[ProvisionStep]: + """Return the ordered list of discrete steps for control-plane init.""" + cp_nodes = profile.get_control_plane_nodes() + cp_ip = cp_nodes[0]["ip_address"] if cp_nodes else "CONTROL_PLANE_IP" + + proxy_block = _proxy_env_block(profile) + audit_log_dir = f"{profile.log_root}/kubernetes" + + kubelet_extra = ' container-runtime-endpoint: "unix:///var/run/crio/crio.sock"' + if profile.kubelet_root != "/var/lib/kubelet": + kubelet_extra += f'\n root-dir: "{profile.kubelet_root}"' + + steps: List[ProvisionStep] = [] + + _env_preamble = _source_env_preamble() + + # 0. Proxy on CP (optional) + if proxy_block: + steps.append(ProvisionStep( + name="cp_proxy", + title="Set Proxy Environment for kubeadm", + script=f"""set -euo pipefail +{_env_preamble} +echo '>> Setting proxy environment for kubeadm...' +{proxy_block} +echo 'Proxy environment set.' +""", + timeout=15, + )) + + # 1. kubeadm init + steps.append(ProvisionStep( + name="kubeadm_init", + title="Run kubeadm init", + script=f"""set -euo pipefail +{_env_preamble} +echo '>> Preparing kubeadm config...' 
+mkdir -p "{audit_log_dir}" +cat > /tmp/kubeadm-config.yaml <> Running kubeadm init (this may take a few minutes)...' +kubeadm init --config=/tmp/kubeadm-config.yaml --upload-certs | tee /tmp/kubeadm-init.log +echo 'kubeadm init complete.' +""", + timeout=600, + )) + + # 2. Configure kubectl + steps.append(ProvisionStep( + name="configure_kubectl", + title="Configure kubectl for root user", + script="""set -euo pipefail +echo '>> Configuring kubectl...' +mkdir -p /root/.kube +cp /etc/kubernetes/admin.conf /root/.kube/config +chown root:root /root/.kube/config +kubectl get nodes +echo 'kubectl configured.' +""", + timeout=30, + )) + + # 3. Install Flannel CNI + flannel_manifest = profile.flannel_manifest_path or FLANNEL_MANIFEST_URL + # If the user uploaded a local file we SCP it first; otherwise download URL + if profile.flannel_manifest_path: + flannel_apply = ( + "echo '>> Using user-provided Flannel manifest...'\n" + "kubectl apply -f /tmp/kube-flannel-custom.yml" + ) + else: + flannel_apply = f"kubectl apply -f {FLANNEL_MANIFEST_URL}" + + steps.append(ProvisionStep( + name="install_flannel", + title="Install Flannel CNI", + script=f"""set -euo pipefail +{_env_preamble} +echo '>> Installing Flannel CNI...' +{flannel_apply} +echo '>> Waiting for Flannel pods to be ready...' +kubectl -n kube-flannel wait --for=condition=ready pod -l app=flannel --timeout=120s || true +echo 'Flannel CNI installed.' +""", + timeout=180, + )) + + # 4. Pod Security Standards + steps.append(ProvisionStep( + name="pod_security", + title=f"Apply Pod Security Standards ({profile.pod_security_standard})", + script=f"""set -euo pipefail +echo '>> Applying Pod Security Standards ({profile.pod_security_standard})...' 
+kubectl label namespace default \\ + pod-security.kubernetes.io/enforce={profile.pod_security_standard} \\ + pod-security.kubernetes.io/warn={profile.pod_security_standard} \\ + pod-security.kubernetes.io/audit={profile.pod_security_standard} \\ + --overwrite +echo 'Pod Security Standards applied.' +""", + timeout=30, + )) + + # 5. Generate join command + steps.append(ProvisionStep( + name="generate_join_cmd", + title="Generate Worker Join Command", + script="""set -euo pipefail +echo '>> Generating worker join command...' +kubeadm token create --print-join-command > /tmp/kubeadm-join-command.txt +echo 'Join command:' +cat /tmp/kubeadm-join-command.txt +""", + timeout=30, + )) + + return steps + + +def get_worker_join_steps(join_command: str) -> List[ProvisionStep]: + """Return the step(s) to join a worker node to the cluster.""" + return [ + ProvisionStep( + name="join_cluster", + title="Join Cluster", + script=f"""set -euo pipefail +echo '>> Joining cluster...' +{join_command} --cri-socket unix:///var/run/crio/crio.sock +echo 'Successfully joined the cluster.' +""", + timeout=300, + ), + ] + + +def get_best_practices_steps() -> List[ProvisionStep]: + """Return the ordered list of best-practices hardening steps.""" + return [ + ProvisionStep( + name="network_policy", + title="Apply Default-Deny NetworkPolicy", + script="""set -euo pipefail +echo '>> Creating default-deny network policy for default namespace...' +cat <> Setting resource quotas...' +cat <> Setting limit ranges...' +cat <> Creating read-only ClusterRole...' +cat <> Ensuring audit log directory exists...' +mkdir -p /var/log/kubernetes +echo 'Audit log directory ready.' +""", + timeout=15, + fatal=False, + ), + ] + + +def get_cluster_reset_steps(profile: ClusterProfile) -> List[ProvisionStep]: + """Return the ordered list of steps to fully reset/teardown a K8s node. 
+ + This runs kubeadm reset, stops services, removes CRI-O data, CNI configs, + and cleans up iptables — preparing the node for a fresh cluster install. + """ + crio_root = profile.crio_root or "/var/lib/containers/storage" + kubelet_root = profile.kubelet_root or "/var/lib/kubelet" + log_root = profile.log_root or "/var/log" + + return [ + ProvisionStep( + name="drain_node", + title="Drain & Cordon Node (best effort)", + script="""set -uo pipefail +echo '>> Attempting to drain this node (best effort)...' +HOSTNAME=$(hostname) +kubectl drain "$HOSTNAME" --ignore-daemonsets --delete-emptydir-data --force --timeout=60s 2>/dev/null || true +kubectl cordon "$HOSTNAME" 2>/dev/null || true +echo 'Drain/cordon complete (or skipped if kubectl not available).' +""", + timeout=90, + fatal=False, + ), + ProvisionStep( + name="kubeadm_reset", + title="Run kubeadm reset", + script="""set -uo pipefail +echo '>> Running kubeadm reset...' +kubeadm reset -f --cri-socket unix:///var/run/crio/crio.sock 2>/dev/null || \ +kubeadm reset -f 2>/dev/null || \ +echo 'kubeadm reset returned non-zero (may already be reset)' +echo 'kubeadm reset complete.' +""", + timeout=120, + ), + ProvisionStep( + name="stop_services", + title="Stop kubelet & CRI-O services", + script="""set -uo pipefail +echo '>> Stopping kubelet...' +systemctl stop kubelet 2>/dev/null || true +systemctl disable kubelet 2>/dev/null || true +echo '>> Stopping CRI-O...' +systemctl stop crio 2>/dev/null || true +systemctl disable crio 2>/dev/null || true +echo 'Services stopped.' +""", + timeout=60, + ), + ProvisionStep( + name="clean_cni", + title="Remove CNI configuration & network interfaces", + script="""set -uo pipefail +echo '>> Removing CNI configs...' +rm -rf /etc/cni/net.d/* +echo '>> Removing flannel interface...' +ip link delete flannel.1 2>/dev/null || true +ip link delete cni0 2>/dev/null || true +ip link delete flannel-wg 2>/dev/null || true +echo 'CNI cleanup complete.' 
+""", + timeout=30, + ), + ProvisionStep( + name="clean_iptables", + title="Flush iptables rules", + script="""set -uo pipefail +echo '>> Flushing iptables...' +iptables -F && iptables -t nat -F && iptables -t mangle -F && iptables -X +ip6tables -F && ip6tables -t nat -F && ip6tables -t mangle -F && ip6tables -X 2>/dev/null || true +echo 'iptables flushed.' +""", + timeout=30, + ), + ProvisionStep( + name="clean_kubelet_data", + title="Remove kubelet data", + script=f"""set -uo pipefail +echo '>> Removing kubelet data at {kubelet_root}...' +rm -rf "{kubelet_root}"/* +rm -rf /etc/kubernetes/* +rm -rf /tmp/kubeadm-join-command.txt +echo 'Kubelet data removed.' +""", + timeout=60, + ), + ProvisionStep( + name="clean_crio_data", + title="Remove CRI-O container data", + script=f"""set -uo pipefail +echo '>> Removing CRI-O storage at {crio_root}...' +rm -rf "{crio_root}"/* +echo '>> Removing CRI-O run root...' +rm -rf /run/containers/storage/* +echo 'CRI-O data removed.' +""", + timeout=60, + fatal=False, + ), + ProvisionStep( + name="clean_etcd", + title="Remove etcd data (control-plane only, best effort)", + script="""set -uo pipefail +echo '>> Removing etcd data...' +rm -rf /var/lib/etcd/* +echo 'etcd data removed (if present).' +""", + timeout=30, + fatal=False, + ), + ProvisionStep( + name="clean_logs", + title="Clean K8s-related logs", + script=f"""set -uo pipefail +echo '>> Cleaning K8s logs at {log_root}...' +rm -rf "{log_root}"/pods/* +rm -rf "{log_root}"/containers/* +rm -rf /var/log/kubernetes/* 2>/dev/null || true +echo 'Logs cleaned.' +""", + timeout=30, + fatal=False, + ), + ProvisionStep( + name="verify_clean", + title="Verify cleanup", + script="""set -uo pipefail +echo '>> Verifying cleanup...' 
+echo "kubelet active: $(systemctl is-active kubelet 2>/dev/null || echo 'not found')" +echo "crio active: $(systemctl is-active crio 2>/dev/null || echo 'not found')" +echo "kubeadm present: $(which kubeadm 2>/dev/null || echo 'not found')" +echo "kubectl present: $(which kubectl 2>/dev/null || echo 'not found')" +echo "CNI configs: $(ls /etc/cni/net.d/ 2>/dev/null || echo 'empty/missing')" +echo "" +echo "Node is ready for a fresh cluster installation." +""", + timeout=30, + fatal=False, + ), + ] + + +def execute_provision_steps( + node: dict, + steps: List[ProvisionStep], +) -> List[tuple]: + """Execute a list of provision steps on a node. + + Returns a list of (ProvisionStep, SSHResult) tuples. + Stops at the first fatal failure. + """ + results: List[tuple] = [] + for step in steps: + result = _run_step(node, step) + results.append((step, result)) + if not result.success and step.fatal: + break + return results + + +# ── Legacy wrapper functions (kept for backward compatibility) ──────────── + + +def provision_node_common(node: dict, profile: ClusterProfile) -> SSHResult: + """Run the common setup script on a single node via SSH.""" + script = generate_common_setup_script(profile) + return run_ssh_command( + ip_address=node["ip_address"], + command=script, + ssh_user=node.get("ssh_user", "root"), + ssh_port=node.get("ssh_port", 22), + ssh_key_path=node.get("ssh_key_path", "~/.ssh/id_rsa"), + timeout=600, + ) + + +def init_control_plane(node: dict, profile: ClusterProfile) -> SSHResult: + """Initialize the control plane on the given node.""" + script = generate_control_plane_init_script(profile) + return run_ssh_command( + ip_address=node["ip_address"], + command=script, + ssh_user=node.get("ssh_user", "root"), + ssh_port=node.get("ssh_port", 22), + ssh_key_path=node.get("ssh_key_path", "~/.ssh/id_rsa"), + timeout=600, + ) + + +def retrieve_join_command(control_plane_node: dict) -> Optional[str]: + """Retrieve the kubeadm join command from the control-plane 
node.""" + result = run_ssh_command( + ip_address=control_plane_node["ip_address"], + command="cat /tmp/kubeadm-join-command.txt", + ssh_user=control_plane_node.get("ssh_user", "root"), + ssh_port=control_plane_node.get("ssh_port", 22), + ssh_key_path=control_plane_node.get("ssh_key_path", "~/.ssh/id_rsa"), + timeout=30, + ) + if result.success: + return result.stdout.strip() + return None + + +def join_worker_node(node: dict, join_command: str) -> SSHResult: + """Join a worker node to the cluster.""" + full_command = f"{join_command} --cri-socket unix:///var/run/crio/crio.sock" + return run_ssh_command( + ip_address=node["ip_address"], + command=full_command, + ssh_user=node.get("ssh_user", "root"), + ssh_port=node.get("ssh_port", 22), + ssh_key_path=node.get("ssh_key_path", "~/.ssh/id_rsa"), + timeout=300, + ) + + +def apply_best_practices(control_plane_node: dict) -> SSHResult: + """Apply best practices hardening on the cluster via the control-plane.""" + script = generate_best_practices_script() + return run_ssh_command( + ip_address=control_plane_node["ip_address"], + command=script, + ssh_user=control_plane_node.get("ssh_user", "root"), + ssh_port=control_plane_node.get("ssh_port", 22), + ssh_key_path=control_plane_node.get("ssh_key_path", "~/.ssh/id_rsa"), + timeout=120, + ) + + +def get_cluster_status(control_plane_node: dict) -> SSHResult: + """Get the cluster node status from the control-plane.""" + return run_ssh_command( + ip_address=control_plane_node["ip_address"], + command="kubectl get nodes -o wide && echo '---' && kubectl get pods -A", + ssh_user=control_plane_node.get("ssh_user", "root"), + ssh_port=control_plane_node.get("ssh_port", 22), + ssh_key_path=control_plane_node.get("ssh_key_path", "~/.ssh/id_rsa"), + timeout=30, + ) + + +def get_llm_cluster_advice(profile: ClusterProfile, context: str = "") -> str: + """Ask the LLM for cluster setup advice based on the profile. + + Returns a graceful message when the LLM is not configured. 
+ """ + from modules.llm_client import query_llm # lazy import — LLM is optional + + nodes_desc = [] + for n in profile.nodes: + nodes_desc.append(f" - {n.get('hostname', 'unknown')} ({n['ip_address']}) — role: {n['role']}") + nodes_str = "\n".join(nodes_desc) + + prompt = f"""I am setting up an on-premises Kubernetes cluster with the following configuration: + +- Kubernetes Version: {profile.kubernetes_version} +- Container Runtime: CRI-O {profile.crio_version} +- CNI Plugin: Flannel +- Pod CIDR: {profile.pod_cidr} +- Service CIDR: {profile.service_cidr} +- Pod Security Standard: {profile.pod_security_standard} +- CRI-O Storage Root: {profile.crio_root} +- Kubelet Data Dir: {profile.kubelet_root} +- Log Root: {profile.log_root} +- HTTP Proxy: {profile.http_proxy or 'none'} +- HTTPS Proxy: {profile.https_proxy or 'none'} +- Alternate HTTP Proxy: {profile.http_proxy_alt or 'none'} +- Alternate HTTPS Proxy: {profile.https_proxy_alt or 'none'} + +Nodes: +{nodes_str} + +{context} + +Please review this configuration and provide: +1. Any potential issues or conflicts +2. Recommended optimizations +3. Security hardening recommendations specific to this setup +4. Network configuration tips for Flannel with CRI-O +""" + return query_llm(prompt) + + +def run_kubectl(profile: ClusterProfile, command: str, timeout: int = 30) -> SSHResult: + """Run a kubectl (or helm) command against the cluster. + + For imported clusters (with kubeconfig), commands run locally. + For provisioned clusters, commands run via SSH on the control-plane node. + + If `command` starts with 'helm ', it is treated as a helm command and + the KUBECONFIG env var is set instead of prefixing with 'kubectl'. + """ + is_helm = command.strip().startswith("helm ") + + if profile.kubeconfig_content: + # Write kubeconfig to a file and run locally. + # Sanitize the profile name for use as a filename — replace any + # non-alphanumeric characters (spaces, shell metacharacters, etc.) 
+ # with underscores so the path is always safe for shell interpolation. + import re as _re + safe_name = _re.sub(r"[^\w.-]", "_", profile.name) or "cluster" + kubeconfig_path = os.path.join( + config.DATA_DIR, "kubeconfigs", f"{safe_name}.kubeconfig" + ) + os.makedirs(os.path.dirname(kubeconfig_path), exist_ok=True) + with open(kubeconfig_path, "w") as f: + f.write(profile.kubeconfig_content) + + kubectl = config.get_kubectl_path() + helm = config.get_helm_path() + + if is_helm: + bin_path = helm or "helm" + resolved = command.strip() + if resolved.startswith("helm "): + resolved = bin_path + resolved[4:] + full_cmd = f'KUBECONFIG="{kubeconfig_path}" {resolved}' + else: + if not kubectl: + return SSHResult( + hostname="local (kubeconfig)", + command=command, + return_code=1, + stdout="", + stderr=( + "kubectl not found on this machine.\n\n" + "Install kubectl:\n" + " curl -LO https://dl.k8s.io/release/" + "$(curl -Ls https://dl.k8s.io/release/stable.txt)" + "/bin/linux/amd64/kubectl\n" + " chmod +x kubectl && mv kubectl ~/.local/bin/\n\n" + "Or see: https://kubernetes.io/docs/tasks/tools/" + ), + success=False, + ) + full_cmd = f'{kubectl} --kubeconfig="{kubeconfig_path}" {command}' + try: + proc = subprocess.run( + full_cmd, + shell=True, + capture_output=True, + text=True, + timeout=timeout, + ) + return SSHResult( + hostname="local (kubeconfig)", + command=full_cmd, + return_code=proc.returncode, + stdout=proc.stdout, + stderr=proc.stderr, + success=proc.returncode == 0, + ) + except subprocess.TimeoutExpired: + return SSHResult( + hostname="local (kubeconfig)", + command=full_cmd, + return_code=-1, + stdout="", + stderr=f"Command timed out after {timeout}s", + success=False, + ) + except Exception as exc: + return SSHResult( + hostname="local (kubeconfig)", + command=full_cmd, + return_code=-1, + stdout="", + stderr=str(exc), + success=False, + ) + else: + # Provisioned cluster — SSH to control-plane + cp_nodes = profile.get_control_plane_nodes() + if not 
cp_nodes: + return SSHResult( + hostname="N/A", + command=command, + return_code=-1, + stdout="", + stderr="No control-plane node defined", + success=False, + ) + cp = cp_nodes[0] + remote_cmd = command if is_helm else f"kubectl {command}" + return run_ssh_command( + ip_address=cp["ip_address"], + command=remote_cmd, + ssh_user=cp.get("ssh_user", "root"), + ssh_port=cp.get("ssh_port", 22), + ssh_key_path=cp.get("ssh_key_path", "~/.ssh/id_rsa"), + timeout=timeout, + ) + + +def upload_flannel_manifest_to_node(node: dict, local_path: str) -> SSHResult: + """SCP a user-provided Flannel manifest to a node as /tmp/kube-flannel-custom.yml.""" + scp_cmd = [ + "scp", + "-o", "StrictHostKeyChecking=no", + "-o", "UserKnownHostsFile=/dev/null", + "-P", str(node.get("ssh_port", 22)), + "-i", node.get("ssh_key_path", "~/.ssh/id_rsa"), + local_path, + f"{node.get('ssh_user', 'root')}@{node['ip_address']}:/tmp/kube-flannel-custom.yml", + ] + try: + proc = subprocess.run( + scp_cmd, + capture_output=True, + text=True, + timeout=60, + ) + return SSHResult( + hostname=node["ip_address"], + command="scp flannel manifest", + return_code=proc.returncode, + stdout=proc.stdout, + stderr=proc.stderr, + success=proc.returncode == 0, + ) + except subprocess.TimeoutExpired: + return SSHResult( + hostname=node["ip_address"], + command="scp flannel manifest", + return_code=-1, + stdout="", + stderr="SCP timed out after 60 seconds", + success=False, + ) + except Exception as exc: + return SSHResult( + hostname=node["ip_address"], + command="scp flannel manifest", + return_code=-1, + stdout="", + stderr=str(exc), + success=False, + ) diff --git a/k8s-agent/modules/cluster_debugger.py b/k8s-agent/modules/cluster_debugger.py new file mode 100644 index 0000000..df80c9d --- /dev/null +++ b/k8s-agent/modules/cluster_debugger.py @@ -0,0 +1,368 @@ +"""Cluster Debugger — Diagnose K8s issues and provide LLM-powered recommendations. 
+ +Supports both provisioned clusters (SSH-based) and imported clusters (kubeconfig-based). +""" + +import os +import subprocess + +from modules.cluster_creator import run_ssh_command, SSHResult +from modules.profile_manager import ClusterProfile +import config + + +# ── Diagnostic command definitions ──────────────────────────────────────── + +# kubectl-only commands (work for both imported and provisioned clusters) +KUBECTL_DIAGNOSTIC_COMMANDS = { + "Node Status": "get nodes -o wide", + "Pod Status (All Namespaces)": "get pods -A -o wide", + "Events (Recent)": "get events -A --sort-by=.lastTimestamp", + "Component Status": "get componentstatuses", + "System Pods": "-n kube-system get pods -o wide", + "Node Resources": "top nodes", + "Pod Resources": "top pods -A", + "Cluster Info": "cluster-info", + "Flannel Status": "-n kube-flannel get pods -o wide", + "Network Policies": "get networkpolicies -A", + "Services": "get svc -A", + "PVCs": "get pvc -A", + "Ingresses": "get ingress -A", + "Disk Usage": "top nodes", +} + +# Full SSH commands (backward-compat for provisioned clusters) +DIAGNOSTIC_COMMANDS = { + "Node Status": "kubectl get nodes -o wide", + "Pod Status (All Namespaces)": "kubectl get pods -A -o wide", + "Events (Recent)": "kubectl get events -A --sort-by='.lastTimestamp' | tail -50", + "Component Status": "kubectl get componentstatuses 2>/dev/null; kubectl get --raw='/healthz?verbose' 2>/dev/null || true", + "System Pods": "kubectl -n kube-system get pods -o wide", + "Node Resources": "kubectl top nodes 2>/dev/null || echo 'metrics-server not installed'", + "Pod Resources": "kubectl top pods -A 2>/dev/null || echo 'metrics-server not installed'", + "Cluster Info": "kubectl cluster-info", + "CRI-O Status": "systemctl status crio --no-pager -l", + "Kubelet Status": "systemctl status kubelet --no-pager -l", + "Kubelet Logs (Recent)": "journalctl -u kubelet --no-pager -n 50", + "CRI-O Logs (Recent)": "journalctl -u crio --no-pager -n 50", + "Flannel 
def _run_local_kubectl(kubeconfig_content: str, kubectl_args: str, timeout: int = 60) -> SSHResult:
    """Run a kubectl command on this machine using the given kubeconfig text.

    The kubeconfig is written to the path returned by
    ``config.get_kubeconfig_path("_debug_temp")`` and passed to kubectl via
    ``--kubeconfig``. Because that file holds cluster credentials it is
    created owner-only (mode 0600).

    Args:
        kubeconfig_content: Full text of the kubeconfig to use.
        kubectl_args: Everything that follows ``kubectl`` on the command line.
            It is passed to the shell as-is.
        timeout: Seconds to wait before the command is abandoned.

    Returns:
        An ``SSHResult`` with ``hostname="local"``. A missing kubectl binary,
        a failure to write the kubeconfig, a timeout, or any other error while
        running the command is reported through ``stderr``/``success=False``
        instead of being raised.
    """
    kubectl = config.get_kubectl_path()
    if not kubectl:
        return SSHResult(
            hostname="local", command="kubectl " + kubectl_args, return_code=1,
            stdout="",
            stderr=(
                "kubectl not found on this machine.\n\n"
                "Install kubectl:\n"
                " curl -LO https://dl.k8s.io/release/$(curl -Ls https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl\n"
                " chmod +x kubectl && sudo mv kubectl /usr/local/bin/\n\n"
                "Or on macOS: brew install kubectl\n"
                "Or see: https://kubernetes.io/docs/tasks/tools/"
            ),
            success=False,
        )

    kubeconfig_path = config.get_kubeconfig_path("_debug_temp")
    full_cmd = f"{kubectl} --kubeconfig=\"{kubeconfig_path}\" {kubectl_args}"
    try:
        # Create the credentials file owner-only instead of with the process
        # umask; writing inside the try keeps I/O errors in the result object.
        fd = os.open(kubeconfig_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
        with os.fdopen(fd, "w") as f:
            f.write(kubeconfig_content)
        try:
            # The mode passed to os.open only applies on creation; tighten a
            # file left behind by an earlier run. Best effort only.
            os.chmod(kubeconfig_path, 0o600)
        except OSError:
            pass

        proc = subprocess.run(full_cmd, shell=True, capture_output=True, text=True, timeout=timeout)
        return SSHResult(
            hostname="local", command=full_cmd, return_code=proc.returncode,
            stdout=proc.stdout, stderr=proc.stderr, success=proc.returncode == 0,
        )
    except subprocess.TimeoutExpired:
        return SSHResult(
            hostname="local", command=full_cmd, return_code=-1,
            stdout="", stderr=f"Command timed out after {timeout}s", success=False,
        )
    except Exception as e:
        return SSHResult(
            hostname="local", command=full_cmd, return_code=-1,
            stdout="", stderr=str(e), success=False,
        )
def run_category_diagnostics(
    control_plane_node: dict | None,
    category: str,
    profile: ClusterProfile | None = None,
) -> dict[str, SSHResult]:
    """Run every diagnostic listed under ``category`` in ``CATEGORY_MAP``.

    Args:
        control_plane_node: Node dict forwarded to ``run_diagnostic``.
        category: Key into ``CATEGORY_MAP``; an unknown category yields ``{}``.
        profile: Optional cluster profile forwarded to ``run_diagnostic``.

    Returns:
        Mapping of diagnostic name to its ``SSHResult``, in category order.
    """
    return {
        check_name: run_diagnostic(control_plane_node, check_name, profile=profile)
        for check_name in CATEGORY_MAP.get(category, [])
    }
def format_diagnostics_for_llm(results: dict[str, SSHResult]) -> str:
    """Render diagnostic results as Markdown sections for the LLM prompt.

    Each result becomes a ``### <name> [OK|FAILED]`` heading followed by a
    fenced block. Successful commands show their stdout. Failed commands show
    stderr followed by stdout (whichever are non-empty), because some commands
    exit non-zero while printing their useful output on stdout only.

    Args:
        results: Mapping of diagnostic name to a result object exposing
            ``success``, ``stdout`` and ``stderr``.

    Returns:
        The sections joined by blank lines; an empty string for no results.
    """
    sections = []
    for name, result in results.items():
        # Treat a missing stream as empty rather than crashing on None.strip().
        stdout = (result.stdout or "").strip()
        stderr = (result.stderr or "").strip()
        if result.success:
            status, output = "OK", stdout
        else:
            status = "FAILED"
            output = "\n".join(part for part in (stderr, stdout) if part)
        sections.append(
            f"### {name} [{status}]\n"
            f"```\n{output}\n```\n"
        )
    return "\n".join(sections)
+ """ + from modules.llm_client import query_llm # lazy import — LLM is optional + + diag_text = format_diagnostics_for_llm(results) + + cluster_info = "" + if profile: + cluster_info = f""" +Cluster Configuration: +- Kubernetes: {profile.kubernetes_version} +- Runtime: CRI-O {profile.crio_version} +- CNI: Flannel +- Pod CIDR: {profile.pod_cidr} +- Service CIDR: {profile.service_cidr} +""" + + prompt = f"""Analyze the following Kubernetes cluster diagnostic output and provide a detailed assessment. +{cluster_info} + +User's issue description: {user_description or 'General health check'} + +== Diagnostic Output == +{diag_text} +== End Diagnostic Output == + +Please provide: +1. **Health Summary**: Overall cluster health status (Healthy / Degraded / Critical) +2. **Issues Found**: List each issue with severity (Critical / Warning / Info) +3. **Root Cause Analysis**: For each issue, explain the likely root cause +4. **Remediation Steps**: Specific commands or actions to fix each issue +5. **Preventive Recommendations**: Steps to prevent these issues in the future + +Format your response with clear headings and actionable commands where applicable. +""" + return query_llm(prompt) + + +def get_debug_suggestion( + error_message: str, + context: str = "", +) -> str: + """Get a quick debugging suggestion from the LLM for a specific error. + + Returns a graceful message when the LLM is not configured. + """ + from modules.llm_client import query_llm # lazy import — LLM is optional + + prompt = f"""I encountered the following error in my Kubernetes cluster (CRI-O + Flannel): + +Error: {error_message} + +Additional context: {context or 'None'} + +Provide a concise diagnosis and the exact commands to fix this issue. 
def check_pod_issues(
    control_plane_node: dict | None,
    namespace: str = "",
    profile: ClusterProfile | None = None,
) -> SSHResult:
    """List pods whose phase is neither Running nor Succeeded.

    For imported clusters (profile with kubeconfig content) only the pod
    listing is fetched through local kubectl. Otherwise the command runs over
    SSH on the control-plane node and additionally prints the last 20 lines of
    ``kubectl describe`` for each such pod after a ``---DESCRIBE---`` marker.

    Args:
        control_plane_node: Node dict with ``ip_address`` and optional
            ``ssh_user`` / ``ssh_port`` / ``ssh_key_path``; may be ``None``
            for imported clusters.
        namespace: Namespace to inspect; empty means all namespaces.
        profile: Optional cluster profile used to detect imported clusters.

    Returns:
        The ``SSHResult`` of the command, or a failed result when no
        control-plane node is available for the SSH path.
    """
    import shlex  # local import: only needed to quote the namespace

    # The namespace ends up inside a shell command line (local shell or remote
    # SSH), so quote it; valid namespace names pass through unchanged.
    ns_flag = f"-n {shlex.quote(namespace)}" if namespace else "-A"

    if profile and profile.cluster_source == "imported" and profile.kubeconfig_content:
        kubectl_args = (
            f"get pods {ns_flag} "
            "--field-selector=status.phase!=Running,status.phase!=Succeeded -o wide"
        )
        return _run_local_kubectl(profile.kubeconfig_content, kubectl_args, timeout=60)

    if not control_plane_node:
        return SSHResult(
            hostname="unknown", command="check_pod_issues", return_code=1,
            stdout="", stderr="No control-plane node available.", success=False,
        )

    command = (
        f"kubectl get pods {ns_flag} --field-selector="
        "'status.phase!=Running,status.phase!=Succeeded' -o wide 2>/dev/null; "
        "echo '---DESCRIBE---'; "
        f"for pod in $(kubectl get pods {ns_flag} --field-selector="
        "'status.phase!=Running,status.phase!=Succeeded' "
        "-o jsonpath='{range .items[*]}{.metadata.namespace}/{.metadata.name} {end}' 2>/dev/null); do "
        "ns=$(echo $pod | cut -d/ -f1); "
        "name=$(echo $pod | cut -d/ -f2); "
        "echo \"=== $ns/$name ===\"; "
        "kubectl describe pod $name -n $ns 2>/dev/null | tail -20; "
        "done"
    )
    return run_ssh_command(
        ip_address=control_plane_node["ip_address"],
        command=command,
        ssh_user=control_plane_node.get("ssh_user", "root"),
        ssh_port=control_plane_node.get("ssh_port", 22),
        ssh_key_path=control_plane_node.get("ssh_key_path", "~/.ssh/id_rsa"),
        timeout=60,
    )
+ * **ollama** — Local Ollama instance (no API key required). + +All public functions gracefully return a fallback message when the LLM is not +configured. The rest of the application works without any LLM dependency. +""" + +import json +from typing import Generator, Optional + +import requests + +import config + +_NOT_CONFIGURED_MSG = ( + "LLM is not configured. Set the LLM provider and connection details in " + "the sidebar LLM Settings panel, or via environment variables " + "(LLM_PROVIDER, LLM_API_URL / OLLAMA_BASE_URL)." +) + + +SYSTEM_PROMPT = """You are an expert Kubernetes platform engineer specializing in on-premises +cluster administration. You have deep knowledge of: +- Kubernetes cluster setup with CRI-O container runtime and Flannel CNI +- kubeadm-based cluster bootstrapping and lifecycle management +- Cluster debugging, troubleshooting, and remediation +- Prometheus and Grafana monitoring stack setup and dashboard design +- Kubernetes log analysis, error correlation, and root cause analysis +- Security best practices including RBAC, network policies, and pod security standards + +Always provide actionable, production-ready advice. When generating scripts, include +error handling and idempotency. 
When diagnosing issues, ask clarifying questions if +the provided information is insufficient.""" + + +def _build_messages( + user_message: str, + system_message: Optional[str] = None, + conversation_history: Optional[list[dict]] = None, +) -> list[dict]: + """Assemble the messages list shared by both query and stream.""" + messages = [] + sys_msg = system_message or SYSTEM_PROMPT + messages.append({"role": "system", "content": sys_msg}) + if conversation_history: + messages.extend(conversation_history) + messages.append({"role": "user", "content": user_message}) + return messages + + +def _build_headers() -> dict: + """Return request headers for the active provider.""" + headers = {"Content-Type": "application/json"} + if config.LLM_PROVIDER != "ollama" and config.LLM_API_KEY: + headers["Authorization"] = f"Bearer {config.LLM_API_KEY}" + return headers + + +def _build_payload( + messages: list[dict], + temperature: Optional[float] = None, + max_tokens: Optional[int] = None, + stream: bool = False, +) -> dict: + """Return the request payload for the active provider.""" + temp = temperature if temperature is not None else config.LLM_TEMPERATURE + model = config.get_active_model() + + if config.LLM_PROVIDER == "ollama": + payload: dict = { + "model": model, + "messages": messages, + "stream": stream, + "options": { + "temperature": temp, + }, + } + if max_tokens is not None or config.LLM_MAX_TOKENS: + payload["options"]["num_predict"] = ( + max_tokens if max_tokens is not None else config.LLM_MAX_TOKENS + ) + return payload + + # OpenAI-compatible + payload = { + "model": model, + "messages": messages, + "temperature": temp, + "max_tokens": max_tokens if max_tokens is not None else config.LLM_MAX_TOKENS, + } + if stream: + payload["stream"] = True + return payload + + +def query_llm( + user_message: str, + system_message: Optional[str] = None, + conversation_history: Optional[list[dict]] = None, + temperature: Optional[float] = None, + max_tokens: Optional[int] = 
def stream_llm(
    user_message: str,
    system_message: Optional[str] = None,
    conversation_history: Optional[list[dict]] = None,
    temperature: Optional[float] = None,
    max_tokens: Optional[int] = None,
) -> Generator[str, None, None]:
    """Stream a response from the LLM chunk by chunk.

    Supports both OpenAI-compatible (SSE ``data: {...}`` lines) and Ollama
    (newline-delimited JSON) endpoints.

    Args:
        user_message: The new user prompt.
        system_message: Optional override for the default system prompt.
        conversation_history: Optional prior chat turns.
        temperature: Optional sampling temperature override.
        max_tokens: Optional response-length override.

    Yields:
        Text fragments as they arrive. When the LLM is not configured the
        not-configured message is yielded once; a request-level failure is
        yielded as a trailing ``Error during streaming`` message.
    """
    if not config.is_llm_configured():
        yield _NOT_CONFIGURED_MSG
        return

    messages = _build_messages(user_message, system_message, conversation_history)
    headers = _build_headers()
    payload = _build_payload(messages, temperature, max_tokens, stream=True)
    url = config.get_active_llm_url()

    try:
        # The context manager releases the connection even when the consumer
        # stops iterating early or the loop breaks on the end-of-stream marker.
        with requests.post(
            url, headers=headers, json=payload, timeout=120, stream=True,
        ) as response:
            response.raise_for_status()

            # Without a declared charset, requests leaves encoding as None and
            # iter_lines(decode_unicode=True) yields bytes, which would break
            # the str prefix checks below.
            if response.encoding is None:
                response.encoding = "utf-8"

            if config.LLM_PROVIDER == "ollama":
                # Ollama streams newline-delimited JSON objects
                for line in response.iter_lines(decode_unicode=True):
                    if not line:
                        continue
                    try:
                        chunk = json.loads(line)
                        content = chunk.get("message", {}).get("content", "")
                        if content:
                            yield content
                        if chunk.get("done", False):
                            break
                    except json.JSONDecodeError:
                        continue
            else:
                # OpenAI SSE format: "data: {...}\n"
                for line in response.iter_lines(decode_unicode=True):
                    if not line or not line.startswith("data: "):
                        continue
                    data_str = line[len("data: "):]
                    if data_str.strip() == "[DONE]":
                        break
                    try:
                        chunk = json.loads(data_str)
                        delta = chunk.get("choices", [{}])[0].get("delta", {})
                        content = delta.get("content", "")
                        if content:
                            yield content
                    except (json.JSONDecodeError, KeyError, IndexError):
                        continue
    except requests.exceptions.RequestException as exc:
        yield f"\n\nError during streaming: {exc}"
def _run_local_shell(kubeconfig_content: str, command: str, timeout: int = 60) -> SSHResult:
    """Run a shell command locally with KUBECONFIG set from profile content.

    Bare ``kubectl `` and ``helm `` references in ``command`` are replaced
    with the paths reported by ``config`` so the command works even when
    these binaries are not in $PATH. The kubeconfig is written to
    ``config.get_kubeconfig_path("_log_temp")`` and, because it holds cluster
    credentials, is created owner-only (mode 0600).

    Args:
        kubeconfig_content: Full text of the kubeconfig to use.
        command: Shell command line to execute.
        timeout: Seconds to wait before the command is abandoned.

    Returns:
        An ``SSHResult`` with ``hostname="local"`` and the original
        ``command``. A missing kubectl binary, a failure to write the
        kubeconfig, a timeout, or any other error while running the command
        is reported through ``stderr``/``success=False`` instead of raised.
    """
    kubectl = config.get_kubectl_path()
    helm = config.get_helm_path()
    if not kubectl:
        return SSHResult(
            hostname="local", command=command, return_code=1,
            stdout="",
            stderr=(
                "kubectl not found on this machine.\n\n"
                "Install kubectl:\n"
                " curl -LO https://dl.k8s.io/release/$(curl -Ls https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl\n"
                " chmod +x kubectl && sudo mv kubectl /usr/local/bin/\n\n"
                "Or on macOS: brew install kubectl\n"
                "Or see: https://kubernetes.io/docs/tasks/tools/"
            ),
            success=False,
        )
    # Replace bare kubectl/helm with full paths
    resolved_cmd = command.replace("kubectl ", f"{kubectl} ").replace("helm ", f"{helm} " if helm else "helm ")
    kubeconfig_path = config.get_kubeconfig_path("_log_temp")
    env = dict(os.environ, KUBECONFIG=kubeconfig_path)
    try:
        # Create the credentials file owner-only instead of with the process
        # umask; writing inside the try keeps I/O errors in the result object.
        fd = os.open(kubeconfig_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
        with os.fdopen(fd, "w") as f:
            f.write(kubeconfig_content)
        try:
            # The mode passed to os.open only applies on creation; tighten a
            # file left behind by an earlier run. Best effort only.
            os.chmod(kubeconfig_path, 0o600)
        except OSError:
            pass

        proc = subprocess.run(resolved_cmd, shell=True, capture_output=True, text=True, timeout=timeout, env=env)
        return SSHResult(
            hostname="local", command=command, return_code=proc.returncode,
            stdout=proc.stdout, stderr=proc.stderr, success=proc.returncode == 0,
        )
    except subprocess.TimeoutExpired:
        return SSHResult(
            hostname="local", command=command, return_code=-1,
            stdout="", stderr=f"Command timed out after {timeout}s", success=False,
        )
    except Exception as e:
        return SSHResult(
            hostname="local", command=command, return_code=-1,
            stdout="", stderr=str(e), success=False,
        )
stderr="No control-plane node available.", success=False, + ) + return run_ssh_command( + ip_address=control_plane_node["ip_address"], + command=command, + ssh_user=control_plane_node.get("ssh_user", "root"), + ssh_port=control_plane_node.get("ssh_port", 22), + ssh_key_path=control_plane_node.get("ssh_key_path", "~/.ssh/id_rsa"), + timeout=timeout, + ) + + +@dataclass +class LogEntry: + """Represents a parsed log line.""" + + timestamp: str = "" + level: str = "INFO" + source: str = "" + message: str = "" + raw: str = "" + + +@dataclass +class LogAnalysisResult: + """Results from log analysis.""" + + total_lines: int = 0 + error_count: int = 0 + warning_count: int = 0 + error_patterns: dict[str, int] = field(default_factory=dict) + warning_patterns: dict[str, int] = field(default_factory=dict) + timeline: list[dict] = field(default_factory=list) + correlated_errors: list[dict] = field(default_factory=list) + + +# ── Log collection commands ─────────────────────────────────────────────── + +# SSH-only sources (journalctl requires node access) +SSH_ONLY_LOG_SOURCES = {"Kubelet", "CRI-O"} + +LOG_SOURCES = { + "Kubelet": "journalctl -u kubelet --no-pager -n {lines} --since '{since}'", + "CRI-O": "journalctl -u crio --no-pager -n {lines} --since '{since}'", + "API Server": "kubectl logs -n kube-system -l component=kube-apiserver --tail={lines} --since={since_k8s}", + "Controller Manager": "kubectl logs -n kube-system -l component=kube-controller-manager --tail={lines} --since={since_k8s}", + "Scheduler": "kubectl logs -n kube-system -l component=kube-scheduler --tail={lines} --since={since_k8s}", + "CoreDNS": "kubectl logs -n kube-system -l k8s-app=kube-dns --tail={lines} --since={since_k8s}", + "Flannel": "kubectl logs -n kube-flannel -l app=flannel --tail={lines} --since={since_k8s} 2>/dev/null || kubectl logs -n kube-system -l app=flannel --tail={lines} --since={since_k8s} 2>/dev/null || echo 'Flannel logs not found'", + "etcd": "kubectl logs -n kube-system -l 
def collect_logs(
    control_plane_node: dict | None,
    source: str,
    lines: int = 200,
    since: str = "1 hour ago",
    since_k8s: str = "1h",
    profile: ClusterProfile | None = None,
) -> SSHResult:
    """Fetch logs for one named source from ``LOG_SOURCES``.

    Args:
        control_plane_node: Node dict used for the SSH path; may be ``None``.
        source: Key into ``LOG_SOURCES``.
        lines: Number of log lines to request.
        since: Time window substituted into ``{since}`` in the template.
        since_k8s: Time window substituted into ``{since_k8s}``.
        profile: Optional cluster profile; decides local vs. SSH execution.

    Returns:
        The command's ``SSHResult``. A failed result is returned without
        running anything when the source needs SSH on an imported cluster or
        when the source name is unknown.
    """
    imported = bool(profile) and profile.cluster_source == "imported"

    # journalctl-backed sources cannot be reached through a kubeconfig alone.
    if imported and source in SSH_ONLY_LOG_SOURCES:
        return SSHResult(
            hostname="local", command=source, return_code=1,
            stdout="",
            stderr=f"'{source}' logs require SSH access (not available for imported clusters).",
            success=False,
        )

    template = LOG_SOURCES.get(source)
    if template:
        rendered = template.format(lines=lines, since=since, since_k8s=since_k8s)
        return _run_on_cluster(control_plane_node, rendered, profile=profile, timeout=60)

    origin = control_plane_node["ip_address"] if control_plane_node else "local"
    return SSHResult(
        hostname=origin,
        command=source,
        return_code=1,
        stdout="",
        stderr=f"Unknown log source: {source}",
        success=False,
    )
def parse_log_line(line: str, source: str = "") -> LogEntry:
    """Turn one raw log line into a ``LogEntry``.

    The timestamp is the first capture from the first pattern in
    ``TIMESTAMP_PATTERNS`` that matches. The level is ``ERROR`` if any
    ``ERROR_PATTERNS`` entry matches, otherwise ``WARNING`` if any
    ``WARNING_PATTERNS`` entry matches, otherwise the ``LogEntry`` default.
    The message is the line with surrounding whitespace removed.
    """
    parsed = LogEntry(raw=line, source=source)

    # First timestamp pattern that hits wins; later ones are not tried.
    stamp = next(
        (hit for hit in (rx.search(line) for rx in TIMESTAMP_PATTERNS) if hit),
        None,
    )
    if stamp:
        parsed.timestamp = stamp.group(1)

    # Error keywords take precedence over warning keywords.
    if any(rx.search(line) for rx in ERROR_PATTERNS):
        parsed.level = "ERROR"
    elif any(rx.search(line) for rx in WARNING_PATTERNS):
        parsed.level = "WARNING"

    parsed.message = line.strip()
    return parsed
and extract patterns.""" + result = LogAnalysisResult() + lines = log_text.strip().split("\n") + result.total_lines = len(lines) + + error_messages = [] + warning_messages = [] + + for line in lines: + if not line.strip(): + continue + entry = parse_log_line(line, source) + + if entry.level == "ERROR": + result.error_count += 1 + normalized = _normalize_error(entry.message) + error_messages.append(normalized) + elif entry.level == "WARNING": + result.warning_count += 1 + normalized = _normalize_error(entry.message) + warning_messages.append(normalized) + + result.error_patterns = dict(Counter(error_messages).most_common(20)) + result.warning_patterns = dict(Counter(warning_messages).most_common(20)) + + return result + + +def _normalize_error(message: str) -> str: + """Normalize an error message by removing variable parts for grouping.""" + normalized = re.sub(r"\b[0-9a-f]{8,}\b", "", message) + normalized = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}(:\d+)?", "", normalized) + normalized = re.sub(r"\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}:\d{2}[^\s]*", "", normalized) + normalized = re.sub(r"pod/[\w-]+", "pod/", normalized) + normalized = re.sub(r"node/[\w.-]+", "node/", normalized) + if len(normalized) > 150: + normalized = normalized[:150] + "..." 
+ return normalized + + +def correlate_errors( + multi_source_results: dict[str, SSHResult], +) -> list[dict]: + """Correlate errors across multiple log sources to find related issues.""" + all_errors = [] + + for source, result in multi_source_results.items(): + if not result.success: + continue + for line in result.stdout.split("\n"): + entry = parse_log_line(line, source) + if entry.level == "ERROR": + all_errors.append({ + "source": source, + "timestamp": entry.timestamp, + "message": entry.message, + }) + + all_errors.sort(key=lambda e: e.get("timestamp", "")) + + def _parse_ts(ts_str: str): + """Try to parse a timestamp string into a datetime object.""" + from datetime import datetime + for fmt in ( + "%Y-%m-%dT%H:%M:%S", + "%Y-%m-%dT%H:%M:%S.%f", + "%Y-%m-%dT%H:%M:%SZ", + "%Y-%m-%dT%H:%M:%S.%fZ", + "%b %d %H:%M:%S", + "%Y-%m-%d %H:%M:%S", + "%Y-%m-%d %H:%M:%S.%f", + ): + try: + return datetime.strptime(ts_str.strip(), fmt) + except (ValueError, AttributeError): + continue + return None + + correlated = [] + window_seconds = 30 + used: set[int] = set() + + for i, err in enumerate(all_errors): + if i in used: + continue + group = [err] + used.add(i) + err_ts = _parse_ts(err.get("timestamp", "")) + + for j in range(i + 1, len(all_errors)): + if j in used: + continue + other = all_errors[j] + if other.get("source") == err.get("source"): + continue + # If both timestamps are parseable, enforce the time window + other_ts = _parse_ts(other.get("timestamp", "")) + if err_ts and other_ts: + diff = abs((other_ts - err_ts).total_seconds()) + if diff > window_seconds: + continue + group.append(other) + used.add(j) + + if len(group) > 1: + correlated.append({ + "primary": err, + "related": group[1:], + "sources_involved": list({e["source"] for e in group}), + }) + + return correlated + + +# ── LLM-powered analysis ───────────────────────────────────────────────── + +def llm_analyze_logs( + log_text: str, + source: str = "", + context: str = "", +) -> str: + """Send log 
output to the LLM for deep analysis. + + Returns a graceful message when the LLM is not configured. + """ + from modules.llm_client import query_llm # lazy import — LLM is optional + + truncated = log_text[-8000:] if len(log_text) > 8000 else log_text + + prompt = f"""Analyze the following Kubernetes logs and provide a detailed assessment. + +Log Source: {source or 'Multiple sources'} +Context: {context or 'General analysis'} + +== Log Output == +{truncated} +== End Log Output == + +Please provide: +1. **Error Summary**: List all distinct errors found with frequency +2. **Root Cause Analysis**: For each error pattern, explain the likely root cause +3. **Error Correlation**: Identify errors that are likely related / cascading +4. **Impact Assessment**: What is the impact of these errors on the cluster? +5. **Remediation Steps**: Specific commands to fix each issue +6. **Patterns & Trends**: Any concerning patterns (increasing errors, recurring issues) +""" + return query_llm(prompt) + + +def llm_correlate_analysis( + multi_source_logs: dict[str, str], + issue_description: str = "", +) -> str: + """Send logs from multiple sources to the LLM for cross-source correlation. + + Returns a graceful message when the LLM is not configured. + """ + from modules.llm_client import query_llm # lazy import — LLM is optional + + log_sections = [] + for source, log_text in multi_source_logs.items(): + truncated = log_text[-3000:] if len(log_text) > 3000 else log_text + log_sections.append(f"### {source}\n```\n{truncated}\n```\n") + + all_logs = "\n".join(log_sections) + + prompt = f"""Perform a cross-source correlation analysis on these Kubernetes cluster logs. + +Issue Description: {issue_description or 'General health analysis'} + +== Multi-Source Logs == +{all_logs} +== End Logs == + +Please provide: +1. **Cross-Source Correlation**: Identify errors that appear related across different components +2. **Causal Chain**: Determine the sequence of events / root cause chain +3. 
def get_pod_list(
    control_plane_node: dict | None,
    namespace: str = "",
    profile: ClusterProfile | None = None,
) -> SSHResult:
    """Get the list of pods for the log analysis UI.

    Runs ``kubectl get pods`` with custom columns (namespace, name, phase,
    container names) and no header row, through ``_run_on_cluster``.

    Args:
        control_plane_node: Node dict used for the SSH path; may be ``None``.
        namespace: Namespace to list; empty means all namespaces.
        profile: Optional cluster profile; decides local vs. SSH execution.

    Returns:
        The ``SSHResult`` of the listing command (30 second timeout).
    """
    import shlex  # local import: only needed to quote the namespace

    # The namespace is placed on a shell command line, so quote it; valid
    # namespace names pass through unchanged.
    ns_flag = f"-n {shlex.quote(namespace)}" if namespace else "-A"
    command = (
        f"kubectl get pods {ns_flag} -o custom-columns="
        "'NAMESPACE:.metadata.namespace,NAME:.metadata.name,"
        "STATUS:.status.phase,CONTAINERS:.spec.containers[*].name'"
        " --no-headers"
    )
    return _run_on_cluster(control_plane_node, command, profile=profile, timeout=30)
+# ══════════════════════════════════════════════════════════════════════════ + +@dataclass +class LogCluster: + """A cluster of similar log messages.""" + cluster_id: int + template: str + count: int + level: str # predominant level: ERROR, WARNING, INFO + sample_messages: list[str] = field(default_factory=list) + first_seen: str = "" + last_seen: str = "" + + +@dataclass +class LogAnomaly: + """An anomalous log line or pattern.""" + message: str + score: float # anomaly score (higher = more anomalous) + reason: str + timestamp: str = "" + source: str = "" + + +@dataclass +class IstioAccessEntry: + """A parsed Istio/Envoy access log entry.""" + timestamp: str = "" + method: str = "" + path: str = "" + protocol: str = "" + response_code: int = 0 + response_flags: str = "" + bytes_received: int = 0 + bytes_sent: int = 0 + duration_ms: float = 0.0 # total request duration + upstream_service_time_ms: float = 0.0 # time spent in upstream + upstream_cluster: str = "" + upstream_host: str = "" + downstream_remote: str = "" + downstream_local: str = "" + requested_server_name: str = "" + authority: str = "" # Host header + user_agent: str = "" + raw_line: str = "" + + +@dataclass +class IstioAnalysisResult: + """Result from Istio access log analysis.""" + total_requests: int = 0 + parsed_entries: list[IstioAccessEntry] = field(default_factory=list) + # Latency percentiles + p50_ms: float = 0.0 + p90_ms: float = 0.0 + p95_ms: float = 0.0 + p99_ms: float = 0.0 + avg_ms: float = 0.0 + max_ms: float = 0.0 + min_ms: float = 0.0 + # Status code distribution + status_distribution: dict = field(default_factory=dict) # code -> count + status_class_distribution: dict = field(default_factory=dict) # "2xx"->count + # Error rate + error_rate: float = 0.0 # percentage of 4xx+5xx + # Slow requests (above p95) + slow_requests: list[IstioAccessEntry] = field(default_factory=list) + # Per-path stats + path_stats: list[dict] = field(default_factory=list) + # Per-upstream stats + 
upstream_stats: list[dict] = field(default_factory=list) + # Response flags distribution + response_flags_dist: dict = field(default_factory=dict) + # Timeline buckets (per-minute) + timeline_buckets: list[dict] = field(default_factory=list) + + +@dataclass +class SmartAnalysisResult: + """Full result from smart log analysis.""" + total_lines: int = 0 + clusters: list[LogCluster] = field(default_factory=list) + anomalies: list[LogAnomaly] = field(default_factory=list) + patterns: list[dict] = field(default_factory=list) + summary: dict = field(default_factory=dict) + timeline_buckets: list[dict] = field(default_factory=list) + istio: IstioAnalysisResult | None = None # populated when Istio logs detected + + +def _tokenize_log(message: str) -> str: + """Tokenize a log message by replacing variable parts with placeholders. + + This mimics LogAI's Drain-style log parsing — variable tokens (IPs, + hex IDs, numbers, paths, UUIDs) are replaced so that messages with the + same *template* look identical after tokenization. + """ + # Remove leading timestamp (various formats) + msg = re.sub(r"^\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}:\d{2}[^\s]*\s*", "", message) + msg = re.sub(r"^[A-Z][a-z]{2}\s+\d{1,2}\s+\d{2}:\d{2}:\d{2}\s*", "", msg) + # Replace UUIDs + msg = re.sub(r"[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}", "", msg, flags=re.IGNORECASE) + # Replace hex IDs (8+ chars) + msg = re.sub(r"\b[0-9a-f]{8,}\b", "", msg) + # Replace IPs + msg = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}(:\d+)?", "", msg) + # Replace pure numbers + msg = re.sub(r"\b\d+\b", "", msg) + # Replace file paths + msg = re.sub(r"/[\w./-]+", "", msg) + # Replace pod/container names with common suffixes + msg = re.sub(r"\b[\w]+-[0-9a-f]{5,10}\b", "", msg) + return msg.strip() + + +def cluster_logs(log_text: str, source: str = "", max_clusters: int = 50, eps: float = 0.5) -> list[LogCluster]: + """Cluster log messages using TF-IDF vectorization + DBSCAN. 
def cluster_logs(log_text: str, source: str = "", max_clusters: int = 50, eps: float = 0.5) -> list[LogCluster]:
    """Cluster log messages using TF-IDF vectorization + DBSCAN.

    Pipeline:
      1. Parse each non-blank log line.
      2. Tokenize each message into a template.
      3. Vectorize the templates with TF-IDF.
      4. Cluster with DBSCAN on cosine distance (no need to choose k).
      5. Return clusters sorted by size, with the noise group last.

    Args:
        log_text: Raw log text, one entry per line.
        source: Label passed through to ``parse_log_line``.
        max_clusters: Maximum number of clusters returned.
        eps: DBSCAN neighbourhood radius (cosine distance).

    Returns:
        Up to ``max_clusters`` clusters, largest first; the noise group
        (``cluster_id == -1``) is always placed last. An empty list is
        returned when scikit-learn is unavailable, when fewer than three
        usable lines exist, or when TF-IDF vectorization fails.
    """
    try:
        from sklearn.cluster import DBSCAN
        from sklearn.feature_extraction.text import TfidfVectorizer
    except ImportError:
        return []

    lines = [l.strip() for l in log_text.strip().split("\n") if l.strip()]
    if len(lines) < 3:
        return []

    # Parse and tokenize, keeping only entries whose template is non-empty.
    entries = [parse_log_line(line, source) for line in lines]
    templates = [_tokenize_log(entry.message) for entry in entries]
    usable = [(entry, tpl) for entry, tpl in zip(entries, templates) if tpl.strip()]
    if len(usable) < 3:
        return []

    valid_entries = [entry for entry, _ in usable]
    valid_templates = [tpl for _, tpl in usable]

    # TF-IDF vectorization
    try:
        vectorizer = TfidfVectorizer(max_features=1000, stop_words=None, token_pattern=r"(?u)\b\w+\b")
        tfidf_matrix = vectorizer.fit_transform(valid_templates)
    except ValueError:
        return []

    # DBSCAN clustering
    labels = DBSCAN(eps=eps, min_samples=2, metric="cosine").fit_predict(tfidf_matrix)

    # Group member indices by label. Labels are converted to plain ints so
    # LogCluster.cluster_id matches its annotation instead of carrying a
    # NumPy integer.
    members_by_label: dict[int, list[int]] = {}
    for idx, label in enumerate(labels):
        members_by_label.setdefault(int(label), []).append(idx)

    result_clusters = []
    for cluster_id, member_indices in members_by_label.items():
        members = [valid_entries[i] for i in member_indices]
        is_noise = cluster_id == -1

        predominant_level = Counter(m.level for m in members).most_common(1)[0][0]
        # Use the most common tokenized form as the template.
        template = Counter(valid_templates[i] for i in member_indices).most_common(1)[0][0]
        timestamps = [m.timestamp for m in members if m.timestamp]

        result_clusters.append(LogCluster(
            cluster_id=cluster_id,
            template="(unclustered / unique messages)" if is_noise else template,
            count=len(members),
            level=predominant_level,
            sample_messages=[m.raw for m in members[:3]],
            first_seen=min(timestamps) if timestamps else "",
            last_seen=max(timestamps) if timestamps else "",
        ))

    # Sort by count descending, but put the noise cluster (-1) last. The sort
    # is stable, so equally sized clusters keep first-appearance order.
    result_clusters.sort(key=lambda c: (c.cluster_id == -1, -c.count))
    return result_clusters[:max_clusters]
def detect_anomalies(log_text: str, source: str = "", threshold: float = 2.0) -> list[LogAnomaly]:
    """Flag unusual log lines by template rarity and TF-IDF distance.

    Each line is parsed and tokenized into a template. A line is reported
    when at least one of these holds:
      * its TF-IDF vector lies more than ``threshold`` standard deviations
        further from the corpus centroid than average,
      * its template occurs in at most 1% of the lines (minimum one line),
      * its level is ``ERROR``.

    If TF-IDF vectorization raises ``ValueError`` only the rarity rule is
    applied and every hit gets a score of 1.0.

    Args:
        log_text: Raw log text, one entry per line.
        source: Label passed to ``parse_log_line`` and stored on each anomaly.
        threshold: z-score above which a line counts as a TF-IDF outlier.

    Returns:
        At most 100 anomalies, highest score first. Empty when scikit-learn
        or NumPy is unavailable or when fewer than five lines are supplied.
    """
    try:
        from sklearn.feature_extraction.text import TfidfVectorizer
        import numpy as np
    except ImportError:
        return []

    log_lines = []
    for raw_line in log_text.strip().split("\n"):
        cleaned = raw_line.strip()
        if cleaned:
            log_lines.append(cleaned)
    if len(log_lines) < 5:
        return []

    parsed = [parse_log_line(text, source) for text in log_lines]
    templates = [_tokenize_log(item.message) for item in parsed]

    # Rarity rule: templates appearing in < 1% of lines (at least one line).
    occurrences = Counter(templates)
    line_total = len(templates)
    rare_cutoff = max(1, line_total * 0.01)

    def rarity_reason(template: str) -> str:
        return f"Rare template (seen {occurrences[template]}x out of {line_total})"

    found = []
    try:
        matrix = TfidfVectorizer(max_features=500, token_pattern=r"(?u)\b\w+\b").fit_transform(templates)
        center = np.asarray(matrix.mean(axis=0)).flatten()

        # Euclidean distance of every row from the centroid, one row at a time.
        distances = np.array([
            np.linalg.norm(np.asarray(matrix[row].todense()).flatten() - center)
            for row in range(matrix.shape[0])
        ])
        baseline = distances.mean()
        spread = distances.std()
        if not spread > 0:
            # Avoid dividing by zero when every line is equally distant.
            spread = 1.0

        for item, template, distance in zip(parsed, templates, distances):
            z_score = (distance - baseline) / spread
            reasons = []
            if z_score > threshold:
                reasons.append(f"TF-IDF outlier (z-score: {z_score:.2f})")
            if occurrences[template] <= rare_cutoff:
                reasons.append(rarity_reason(template))
            if item.level == "ERROR":
                reasons.append("Error-level message")
            if not reasons:
                continue
            found.append(LogAnomaly(
                message=item.raw,
                score=float(z_score),
                reason="; ".join(reasons),
                timestamp=item.timestamp,
                source=source,
            ))
    except ValueError:
        # Fallback to frequency-only detection if TF-IDF fails.
        for item, template in zip(parsed, templates):
            if occurrences[template] <= rare_cutoff:
                found.append(LogAnomaly(
                    message=item.raw,
                    score=1.0,
                    reason=rarity_reason(template),
                    timestamp=item.timestamp,
                    source=source,
                ))

    # Highest score first; cap the report at 100 entries.
    found.sort(key=lambda anomaly: -anomaly.score)
    return found[:100]
def mine_log_patterns(log_text: str, source: str = "", top_n: int = 30) -> list[dict]:
    """Extract the most frequent log templates from ``log_text``.

    Every non-blank line is parsed and tokenized; lines sharing a template
    are counted together.

    Args:
        log_text: Raw log text, one entry per line.
        source: Label passed through to ``parse_log_line``.
        top_n: Number of most frequent templates to report.

    Returns:
        One dict per template, most frequent first, with the keys
        ``template``, ``count``, ``percentage`` (of all non-blank lines, one
        decimal), ``level`` (most frequent level for the template),
        ``level_distribution`` and ``sample`` (first raw line seen with that
        template). Empty when there are no non-blank lines.
    """
    log_lines = [piece.strip() for piece in log_text.strip().split("\n") if piece.strip()]
    if not log_lines:
        return []

    parsed = [parse_log_line(text, source) for text in log_lines]
    templates = [_tokenize_log(item.message) for item in parsed]
    frequency = Counter(templates)

    # Per-template level histogram plus the first raw line seen for it.
    levels_by_template: dict[str, Counter] = {}
    first_sample: dict[str, str] = {}
    for item, template in zip(parsed, templates):
        first_sample.setdefault(template, item.raw)
        levels_by_template.setdefault(template, Counter())[item.level] += 1

    line_total = len(log_lines)
    report = []
    for template, hits in frequency.most_common(top_n):
        histogram = levels_by_template.get(template)
        if histogram:
            # First level reaching the highest count wins ties.
            dominant = histogram.most_common(1)[0][0]
            distribution = dict(histogram)
        else:
            dominant = "INFO"
            distribution = {}
        report.append({
            "template": template,
            "count": hits,
            "percentage": round(hits / line_total * 100, 1),
            "level": dominant,
            "level_distribution": distribution,
            "sample": first_sample.get(template, ""),
        })
    return report
def summarize_logs(log_text: str, source: str = "") -> dict:
    """Generate a summary of log data.

    The summary always contains the same keys, including for empty input,
    so consumers can index it without checking which fields are present:

    - ``total_lines``: number of non-blank lines
    - ``error_count`` / ``warning_count`` / ``info_count``
    - ``level_distribution``: level -> count
    - ``first_timestamp`` / ``last_timestamp``: smallest / largest timestamp
      string, or ``"N/A"`` when no line carried one
    - ``top_errors`` / ``top_warnings``: (normalized message, count) pairs,
      at most 10 and 5 respectively
    - ``unique_templates``: number of distinct tokenized templates
    - ``template_diversity``: unique templates as a percentage of lines
    - ``health_score``: 0-100; each percent of ERROR lines costs 3 points,
      each percent of WARNING lines costs 0.5

    Args:
        log_text: Raw log text, one entry per line.
        source: Label passed through to ``parse_log_line``.

    Returns:
        The summary dict described above.
    """
    lines = [l.strip() for l in log_text.strip().split("\n") if l.strip()]
    if not lines:
        # Same schema as the populated result; nothing logged counts as healthy.
        return {
            "total_lines": 0,
            "error_count": 0,
            "warning_count": 0,
            "info_count": 0,
            "level_distribution": {},
            "first_timestamp": "N/A",
            "last_timestamp": "N/A",
            "top_errors": [],
            "top_warnings": [],
            "unique_templates": 0,
            "template_diversity": 0,
            "health_score": 100,
        }

    entries = [parse_log_line(line, source) for line in lines]
    total = len(lines)

    # Level distribution
    levels = Counter(e.level for e in entries)
    error_count = levels.get("ERROR", 0)
    warning_count = levels.get("WARNING", 0)
    info_count = levels.get("INFO", 0)

    # Time span (timestamps are compared as strings)
    timestamps = [e.timestamp for e in entries if e.timestamp]
    first_ts = min(timestamps) if timestamps else "N/A"
    last_ts = max(timestamps) if timestamps else "N/A"

    # Most frequent error and warning messages after normalization
    top_errors = Counter(
        _normalize_error(e.message) for e in entries if e.level == "ERROR"
    ).most_common(10)
    top_warnings = Counter(
        _normalize_error(e.message) for e in entries if e.level == "WARNING"
    ).most_common(5)

    # Unique templates
    unique_templates = len({_tokenize_log(e.message) for e in entries})

    # Health score (0-100): errors weigh heavily, warnings moderately.
    error_ratio = error_count / total
    warning_ratio = warning_count / total
    health_score = max(0, min(100, int(100 - error_ratio * 300 - warning_ratio * 50)))

    return {
        "total_lines": total,
        "error_count": error_count,
        "warning_count": warning_count,
        "info_count": info_count,
        "level_distribution": dict(levels),
        "first_timestamp": first_ts,
        "last_timestamp": last_ts,
        "top_errors": top_errors,
        "top_warnings": top_warnings,
        "unique_templates": unique_templates,
        "template_diversity": round(unique_templates / total * 100, 1),
        "health_score": health_score,
    }
def smart_analyze(log_text: str, source: str = "") -> SmartAnalysisResult:
    """Run every smart-analysis stage over ``log_text`` and bundle the output.

    Stages, in order: clustering, anomaly detection, pattern mining,
    summarization, a per-minute timeline, and Istio/Envoy access-log
    analysis (attached only when access-log entries were recognised).

    Args:
        log_text: Raw log text, one entry per line.
        source: Label passed through to the individual stages.

    Returns:
        A ``SmartAnalysisResult``. For input with no non-blank lines only
        ``total_lines`` (0) is set and all other fields keep their defaults.
    """
    log_lines = [piece.strip() for piece in log_text.strip().split("\n") if piece.strip()]
    outcome = SmartAnalysisResult()
    outcome.total_lines = len(log_lines)
    if not log_lines:
        return outcome

    # Stages 1-4 each work from the raw text.
    outcome.clusters = cluster_logs(log_text, source)
    outcome.anomalies = detect_anomalies(log_text, source)
    outcome.patterns = mine_log_patterns(log_text, source)
    outcome.summary = summarize_logs(log_text, source)

    # Stage 5: timeline. The first 16 characters of a timestamp
    # (YYYY-MM-DDTHH:MM) identify the minute; shorter timestamps are used
    # whole and lines without one land in the "unknown" bucket.
    parsed = [parse_log_line(text, source) for text in log_lines]
    per_minute: dict[str, dict] = {}
    for item in parsed:
        minute = item.timestamp[:16] if item.timestamp else "unknown"
        slot = per_minute.setdefault(
            minute,
            {"timestamp": minute, "total": 0, "errors": 0, "warnings": 0},
        )
        slot["total"] += 1
        if item.level == "ERROR":
            slot["errors"] += 1
        elif item.level == "WARNING":
            slot["warnings"] += 1
    outcome.timeline_buckets = sorted(per_minute.values(), key=lambda slot: slot["timestamp"])

    # Stage 6: Istio / Envoy access log analysis (auto-detected).
    access_stats = analyze_istio_access_logs(log_text)
    if access_stats is not None and access_stats.total_requests > 0:
        outcome.istio = access_stats

    return outcome
+# ══════════════════════════════════════════════════════════════════════════ + +# Envoy default access log format (as emitted by Istio): +# [%START_TIME%] "%REQ(:METHOD)% %REQ(X-ENVOY-ORIGINAL-PATH?:PATH)% %PROTOCOL%" +# %RESPONSE_CODE% %RESPONSE_FLAGS% %BYTES_RECEIVED% %BYTES_SENT% +# %DURATION% %RESP(X-ENVOY-UPSTREAM-SERVICE-TIME)% +# "%REQ(X-FORWARDED-FOR)%" "%REQ(USER-AGENT)%" "%REQ(X-REQUEST-ID)%" +# "%REQ(:AUTHORITY)%" "%UPSTREAM_HOST%" %UPSTREAM_CLUSTER% +# %UPSTREAM_LOCAL_ADDRESS% %DOWNSTREAM_LOCAL_ADDRESS% +# %DOWNSTREAM_REMOTE_ADDRESS% %REQUESTED_SERVER_NAME% %ROUTE_NAME% + +_ISTIO_LOG_RE = re.compile( + r'\[(?P[^\]]+)\]\s+' + r'"(?P\S+)\s+(?P\S+)\s+(?P[^"]*?)"\s+' + r'(?P\d+)\s+' + r'(?P\S+)\s+' + r'(?P\d+)\s+' + r'(?P\d+)\s+' + r'(?P\d+)\s+' + r'(?P\d+|-)\s+' + r'"(?P[^"]*)"\s+' + r'"(?P[^"]*)"\s+' + r'"(?P[^"]*)"\s+' + r'"(?P[^"]*)"\s+' + r'"(?P[^"]*)"\s*' + r'(?P.*)' +) + +# Simpler fallback: JSON-format Istio access logs (structured logging) +_ISTIO_JSON_KEYS = { + "response_code", "duration", "method", "path", "upstream_service_time", + "upstream_cluster", "authority", "bytes_received", "bytes_sent", +} + + +def _parse_istio_line(line: str) -> IstioAccessEntry | None: + """Try to parse a single line as an Istio/Envoy access log entry.""" + import json as _json + + # Try structured JSON format first + stripped = line.strip() + if stripped.startswith("{"): + try: + obj = _json.loads(stripped) + # Verify it looks like an Istio access log + if "response_code" in obj or "method" in obj or "duration" in obj: + duration = obj.get("duration", 0) + ust = obj.get("upstream_service_time", 0) + # Istio JSON logs may use different field names + return IstioAccessEntry( + timestamp=str(obj.get("start_time", obj.get("timestamp", ""))), + method=str(obj.get("method", obj.get("request_method", ""))), + path=str(obj.get("path", obj.get("request_path", ""))), + protocol=str(obj.get("protocol", "")), + response_code=int(obj.get("response_code", 0)), + 
def _parse_istio_line(line: str) -> IstioAccessEntry | None:
    """Try to parse a single line as an Istio/Envoy access log entry.

    Two formats are attempted in order:

    1. Structured JSON: the line starts with ``{`` and the decoded object has
       at least one of ``response_code``, ``method`` or ``duration``.
    2. Envoy text format, matched with ``_ISTIO_LOG_RE``. The unquoted fields
       after the upstream host are read positionally in the order
       upstream cluster, upstream local address, downstream local address,
       downstream remote address, requested server name.

    Args:
        line: One raw log line.

    Returns:
        The parsed entry, or ``None`` when the line matches neither format.
    """
    import json as _json

    def _number(value, cast, default):
        # Envoy writes "-" for values it does not have; treat that, an empty
        # string and a JSON null as "missing" instead of failing the line.
        return default if value in ("-", "", None) else cast(value)

    # Try structured JSON format first
    stripped = line.strip()
    if stripped.startswith("{"):
        try:
            obj = _json.loads(stripped)
            # Verify it looks like an Istio access log
            if "response_code" in obj or "method" in obj or "duration" in obj:
                # Istio JSON logs may use different field names
                return IstioAccessEntry(
                    timestamp=str(obj.get("start_time", obj.get("timestamp", ""))),
                    method=str(obj.get("method", obj.get("request_method", ""))),
                    path=str(obj.get("path", obj.get("request_path", ""))),
                    protocol=str(obj.get("protocol", "")),
                    response_code=_number(obj.get("response_code", 0), int, 0),
                    response_flags=str(obj.get("response_flags", "-")),
                    bytes_received=_number(obj.get("bytes_received", 0), int, 0),
                    bytes_sent=_number(obj.get("bytes_sent", 0), int, 0),
                    duration_ms=_number(obj.get("duration", 0), float, 0.0),
                    upstream_service_time_ms=_number(obj.get("upstream_service_time", 0), float, 0.0),
                    upstream_cluster=str(obj.get("upstream_cluster", "")),
                    upstream_host=str(obj.get("upstream_host", "")),
                    authority=str(obj.get("authority", obj.get("host", ""))),
                    user_agent=str(obj.get("user_agent", "")),
                    downstream_remote=str(obj.get("downstream_remote_address", "")),
                    downstream_local=str(obj.get("downstream_local_address", "")),
                    requested_server_name=str(obj.get("requested_server_name", "")),
                    raw_line=line,
                )
        except (_json.JSONDecodeError, ValueError, TypeError):
            # Not valid JSON, or a field could not be converted: fall through
            # and give the text format a chance.
            pass

    # Try standard Envoy text format
    m = _ISTIO_LOG_RE.match(stripped)
    if m is None:
        return None

    # Trailing unquoted fields, read by position; absent ones become "".
    rest_parts = m.group("rest").split()

    def _rest(index: int) -> str:
        return rest_parts[index] if len(rest_parts) > index else ""

    return IstioAccessEntry(
        timestamp=m.group("timestamp"),
        method=m.group("method"),
        path=m.group("path"),
        protocol=m.group("protocol"),
        response_code=int(m.group("response_code")),
        response_flags=m.group("response_flags"),
        bytes_received=int(m.group("bytes_received")),
        bytes_sent=int(m.group("bytes_sent")),
        duration_ms=float(m.group("duration")),
        upstream_service_time_ms=_number(m.group("upstream_service_time"), float, 0.0),
        upstream_cluster=_rest(0),
        upstream_host=m.group("upstream_host"),
        authority=m.group("authority"),
        user_agent=m.group("user_agent"),
        downstream_local=_rest(2),
        # Prefer the logged downstream address; fall back to X-Forwarded-For
        # when the line is truncated before that field.
        downstream_remote=_rest(3) or (m.group("xff") or ""),
        requested_server_name=_rest(4),
        raw_line=line,
    )
+ rest_parts = rest.split() + upstream_cluster = rest_parts[0] if rest_parts else "" + return IstioAccessEntry( + timestamp=m.group("timestamp"), + method=m.group("method"), + path=m.group("path"), + protocol=m.group("protocol"), + response_code=int(m.group("response_code")), + response_flags=m.group("response_flags"), + bytes_received=int(m.group("bytes_received")), + bytes_sent=int(m.group("bytes_sent")), + duration_ms=float(m.group("duration")), + upstream_service_time_ms=float(ust) if ust != "-" else 0.0, + upstream_cluster=upstream_cluster, + upstream_host=m.group("upstream_host"), + authority=m.group("authority"), + user_agent=m.group("user_agent"), + downstream_remote=m.group("xff") or "", + raw_line=line, + ) + + return None + + +def _is_likely_istio_log(lines: list[str], sample_size: int = 20) -> bool: + """Heuristic: check if a meaningful fraction of lines look like Istio access logs.""" + sample = lines[:sample_size] + parsed = sum(1 for l in sample if _parse_istio_line(l) is not None) + return parsed >= max(1, len(sample) * 0.3) # at least 30% parse successfully + + +def analyze_istio_access_logs(log_text: str) -> IstioAnalysisResult | None: + """Parse and analyze Istio/Envoy access logs. + + Returns None if the logs don't look like Istio access logs. + Returns an IstioAnalysisResult with latency stats, status distribution, + per-path breakdowns, per-upstream breakdowns, and slow requests. 
def analyze_istio_access_logs(log_text: str) -> IstioAnalysisResult | None:
    """Parse and analyze Istio/Envoy access logs.

    Lines that fail to parse are skipped; all statistics are computed over
    the successfully parsed entries only.

    Args:
        log_text: Raw log text, one entry per line.

    Returns:
        ``None`` if there are no non-blank lines, if the leading sample does
        not look like Istio access logs, or if no line parses. Otherwise an
        ``IstioAnalysisResult`` with latency statistics, status distribution,
        error rate, slow requests, per-path and per-upstream breakdowns,
        response-flag distribution and timeline buckets.
    """
    lines = [l.strip() for l in log_text.strip().split("\n") if l.strip()]
    if not lines:
        return None

    # Quick heuristic — bail early if this doesn't look like Istio logs
    if not _is_likely_istio_log(lines):
        return None

    entries: list[IstioAccessEntry] = []
    for line in lines:
        entry = _parse_istio_line(line)
        if entry is not None:
            entries.append(entry)

    if not entries:
        return None

    result = IstioAnalysisResult(
        total_requests=len(entries),
        parsed_entries=entries,
    )

    # ── Latency percentiles ──────────────────────────────────────────
    # NumPy is imported here rather than relying on a module-level import.
    import numpy as np
    durations = np.array([e.duration_ms for e in entries])
    if len(durations) > 0:
        result.avg_ms = float(np.mean(durations))
        result.min_ms = float(np.min(durations))
        result.max_ms = float(np.max(durations))
        result.p50_ms = float(np.percentile(durations, 50))
        result.p90_ms = float(np.percentile(durations, 90))
        result.p95_ms = float(np.percentile(durations, 95))
        result.p99_ms = float(np.percentile(durations, 99))

    # ── Status code distribution ─────────────────────────────────────
    status_counter: Counter = Counter()
    class_counter: Counter = Counter()
    for e in entries:
        status_counter[e.response_code] += 1
        # Group by hundreds: 200 -> "2xx", 503 -> "5xx".
        class_label = f"{e.response_code // 100}xx"
        class_counter[class_label] += 1

    # most_common() orders the resulting dicts from most to least frequent.
    result.status_distribution = dict(status_counter.most_common())
    result.status_class_distribution = dict(class_counter.most_common())

    # Error rate (4xx + 5xx), as a percentage of parsed requests
    error_count = sum(1 for e in entries if e.response_code >= 400)
    result.error_rate = (error_count / len(entries)) * 100 if entries else 0.0

    # ── Slow requests (above p95) ────────────────────────────────────
    p95_threshold = result.p95_ms
    slow = [e for e in entries if e.duration_ms > p95_threshold]
    # Sort by duration descending, limit to top 50
    slow.sort(key=lambda e: e.duration_ms, reverse=True)
    result.slow_requests = slow[:50]

    # ── Per-path stats ───────────────────────────────────────────────
    path_groups: dict[str, list[IstioAccessEntry]] = {}
    for e in entries:
        # Normalize path: strip query params for grouping
        base_path = e.path.split("?")[0] if e.path else "(unknown)"
        path_groups.setdefault(base_path, []).append(e)

    path_stats = []
    for path, group in path_groups.items():
        durations_g = [e.duration_ms for e in group]
        errors_g = sum(1 for e in group if e.response_code >= 400)
        path_stats.append({
            "path": path,
            "count": len(group),
            "avg_ms": round(sum(durations_g) / len(durations_g), 1) if durations_g else 0,
            "p50_ms": round(float(np.percentile(durations_g, 50)), 1) if durations_g else 0,
            "p95_ms": round(float(np.percentile(durations_g, 95)), 1) if durations_g else 0,
            "p99_ms": round(float(np.percentile(durations_g, 99)), 1) if durations_g else 0,
            "max_ms": round(max(durations_g), 1) if durations_g else 0,
            "error_count": errors_g,
            "error_rate": round((errors_g / len(group)) * 100, 1) if group else 0,
        })
    # Busiest paths first; keep the top 50.
    path_stats.sort(key=lambda p: p["count"], reverse=True)
    result.path_stats = path_stats[:50]

    # ── Per-upstream stats ───────────────────────────────────────────
    upstream_groups: dict[str, list[IstioAccessEntry]] = {}
    for e in entries:
        # Prefer the cluster name, then the host, then a fixed placeholder.
        key = e.upstream_cluster or e.upstream_host or "(direct/unknown)"
        upstream_groups.setdefault(key, []).append(e)

    upstream_stats = []
    for upstream, group in upstream_groups.items():
        # Upstream times of 0 (the parser's value for "not reported") are
        # excluded so they do not drag the averages down.
        ust_vals = [e.upstream_service_time_ms for e in group if e.upstream_service_time_ms > 0]
        dur_vals = [e.duration_ms for e in group]
        errors_g = sum(1 for e in group if e.response_code >= 400)
        upstream_stats.append({
            "upstream": upstream,
            "count": len(group),
            "avg_duration_ms": round(sum(dur_vals) / len(dur_vals), 1) if dur_vals else 0,
            "avg_upstream_ms": round(sum(ust_vals) / len(ust_vals), 1) if ust_vals else 0,
            "p95_duration_ms": round(float(np.percentile(dur_vals, 95)), 1) if dur_vals else 0,
            "p95_upstream_ms": round(float(np.percentile(ust_vals, 95)), 1) if ust_vals else 0,
            "error_count": errors_g,
            "error_rate": round((errors_g / len(group)) * 100, 1) if group else 0,
        })
    # Busiest upstreams first; keep the top 30.
    upstream_stats.sort(key=lambda u: u["count"], reverse=True)
    result.upstream_stats = upstream_stats[:30]

    # ── Response flags distribution ──────────────────────────────────
    flags_counter: Counter = Counter()
    for e in entries:
        # An empty value or "-" means no flag was set.
        flag = e.response_flags if e.response_flags and e.response_flags != "-" else "(none)"
        flags_counter[flag] += 1
    result.response_flags_dist = dict(flags_counter.most_common())

    # ── Timeline buckets (per-minute) ────────────────────────────────
    ts_buckets: dict[str, dict] = {}
    for e in entries:
        # Try to extract minute-level bucket from timestamp
        ts = e.timestamp
        if ts:
            # Envoy format: 2024-01-15T10:30:45.123Z or similar.
            # 16 characters reach the minute; shorter values are cut to at
            # most 10 characters.
            bucket_key = ts[:16] if len(ts) >= 16 else ts[:10]
        else:
            bucket_key = "unknown"
        if bucket_key not in ts_buckets:
            ts_buckets[bucket_key] = {
                "timestamp": bucket_key, "total": 0, "errors": 0,
                "avg_duration": 0.0, "_durations": [],
            }
        ts_buckets[bucket_key]["total"] += 1
        ts_buckets[bucket_key]["_durations"].append(e.duration_ms)
        if e.response_code >= 400:
            ts_buckets[bucket_key]["errors"] += 1

    # Compute avg duration per bucket; the temporary "_durations" list is
    # removed so it does not appear in the returned buckets.
    for bucket in ts_buckets.values():
        durs = bucket.pop("_durations", [])
        bucket["avg_duration"] = round(sum(durs) / len(durs), 1) if durs else 0

    result.timeline_buckets = sorted(ts_buckets.values(), key=lambda b: b["timestamp"])

    return result
def _run_local_shell(kubeconfig_content: str, command: str, timeout: int = 120) -> SSHResult:
    """Run a shell command locally with KUBECONFIG set from profile content.

    Bare ``kubectl`` and ``helm`` references in ``command`` are replaced with
    the paths reported by ``config`` so the command works even when these
    binaries are not in ``$PATH``. The kubeconfig content is written to the
    path returned by ``config.get_kubeconfig_path("_monitor_temp")`` and
    exported to the child process as ``KUBECONFIG``.

    Args:
        kubeconfig_content: Full text of the kubeconfig to use.
        command: Shell command line to execute.
        timeout: Seconds to wait before the command is abandoned.

    Returns:
        An ``SSHResult`` with ``hostname="local"`` and the original
        ``command``. ``return_code`` is the process exit status, ``1`` when
        kubectl cannot be located, and ``-1`` on timeout or any other
        failure (including failure to write the kubeconfig); ``stderr``
        then carries the reason.
    """
    kubectl = config.get_kubectl_path()
    helm = config.get_helm_path()
    if not kubectl:
        return SSHResult(
            hostname="local", command=command, return_code=1,
            stdout="",
            stderr=(
                "kubectl not found on this machine.\n\n"
                "Install kubectl:\n"
                " curl -LO https://dl.k8s.io/release/$(curl -Ls https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl\n"
                " chmod +x kubectl && sudo mv kubectl /usr/local/bin/\n\n"
                "Or on macOS: brew install kubectl\n"
                "Or see: https://kubernetes.io/docs/tasks/tools/"
            ),
            success=False,
        )

    # Replace bare kubectl/helm with full paths; helm is left untouched when
    # no helm binary was found.
    resolved_cmd = command.replace("kubectl ", f"{kubectl} ")
    if helm:
        resolved_cmd = resolved_cmd.replace("helm ", f"{helm} ")

    try:
        # A kubeconfig holds cluster credentials: create it owner-only
        # instead of with the process default mode. The chmod also tightens
        # a file left behind by an earlier run, since the mode passed to
        # os.open only applies when the file is created.
        kubeconfig_path = config.get_kubeconfig_path("_monitor_temp")
        fd = os.open(kubeconfig_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
        with os.fdopen(fd, "w", encoding="utf-8") as f:
            os.chmod(kubeconfig_path, 0o600)
            f.write(kubeconfig_content)

        env = dict(os.environ, KUBECONFIG=kubeconfig_path)
        # shell=True is required: callers pass full shell command lines.
        proc = subprocess.run(resolved_cmd, shell=True, capture_output=True, text=True, timeout=timeout, env=env)
        return SSHResult(
            hostname="local", command=command, return_code=proc.returncode,
            stdout=proc.stdout, stderr=proc.stderr, success=proc.returncode == 0,
        )
    except subprocess.TimeoutExpired:
        return SSHResult(
            hostname="local", command=command, return_code=-1,
            stdout="", stderr=f"Command timed out after {timeout}s", success=False,
        )
    except Exception as e:
        # Boundary handler: every failure is reported to the caller as a
        # failed result rather than raised.
        return SSHResult(
            hostname="local", command=command, return_code=-1,
            stdout="", stderr=str(e), success=False,
        )
def _run_on_cluster(control_plane_node: dict | None, command: str, profile: ClusterProfile | None = None, timeout: int = 120) -> SSHResult:
    """Execute ``command`` against the cluster by the appropriate transport.

    Args:
        control_plane_node: Node definition used for SSH. ``ip_address`` is
            required; ``ssh_user``, ``ssh_port`` and ``ssh_key_path`` fall
            back to ``root``, ``22`` and ``~/.ssh/id_rsa``.
        command: Command line to execute.
        profile: Optional cluster profile. When its source is ``"imported"``
            and it carries kubeconfig content, the command runs locally.
        timeout: Seconds allowed for the command.

    Returns:
        The ``SSHResult`` of the local or SSH execution, or a failed result
        (return code 1) when neither an imported profile nor a control-plane
        node is available.
    """
    imported = profile and profile.cluster_source == "imported"
    if imported and profile.kubeconfig_content:
        # Imported clusters are reached with the local kubectl/helm binaries.
        return _run_local_shell(profile.kubeconfig_content, command, timeout=timeout)

    if control_plane_node:
        node = control_plane_node
        return run_ssh_command(
            ip_address=node["ip_address"],
            command=command,
            ssh_user=node.get("ssh_user", "root"),
            ssh_port=node.get("ssh_port", 22),
            ssh_key_path=node.get("ssh_key_path", "~/.ssh/id_rsa"),
            timeout=timeout,
        )

    return SSHResult(
        hostname="unknown", command=command, return_code=1,
        stdout="", stderr="No control-plane node available.", success=False,
    )
prometheus.prometheusSpec.resources.requests.memory=512Mi \\ + --set prometheus.prometheusSpec.resources.requests.cpu=250m \\ + --set prometheus.prometheusSpec.resources.limits.memory=2Gi \\ + --set prometheus.prometheusSpec.resources.limits.cpu=1000m \\ + --set prometheus.prometheusSpec.storageSpec.volumeClaimTemplate.spec.accessModes[0]=ReadWriteOnce \\ + --set prometheus.prometheusSpec.storageSpec.volumeClaimTemplate.spec.resources.requests.storage=50Gi \\ + --set alertmanager.alertmanagerSpec.resources.requests.memory=128Mi \\ + --set alertmanager.alertmanagerSpec.resources.requests.cpu=50m \\ + --set grafana.enabled=true \\ + --set grafana.adminPassword=admin \\ + --set grafana.persistence.enabled=true \\ + --set grafana.persistence.size=10Gi \\ + --set grafana.resources.requests.memory=256Mi \\ + --set grafana.resources.requests.cpu=100m \\ + --set grafana.resources.limits.memory=512Mi \\ + --set grafana.resources.limits.cpu=500m \\ + --set grafana.sidecar.dashboards.enabled=true \\ + --set grafana.sidecar.dashboards.searchNamespace=ALL \\ + --set prometheus.prometheusSpec.serviceMonitorSelectorNilUsesHelmValues=false \\ + --set prometheus.prometheusSpec.podMonitorSelectorNilUsesHelmValues=false \\ + --wait --timeout 10m + +echo "" +echo "=== Prometheus Stack installed ===" +echo "" +kubectl -n {namespace} get pods +""" + + +def generate_standalone_grafana_script(namespace: str = "monitoring") -> str: + """Generate script to install standalone Grafana with provisioned dashboards.""" + return f"""#!/bin/bash +set -euo pipefail + +echo "=== Installing Standalone Grafana ===" + +helm repo add grafana https://grafana.github.io/helm-charts 2>/dev/null || true +helm repo update + +kubectl create namespace {namespace} --dry-run=client -o yaml | kubectl apply -f - + +helm upgrade --install grafana grafana/grafana \\ + --namespace {namespace} \\ + --set adminPassword=admin \\ + --set persistence.enabled=true \\ + --set persistence.size=10Gi \\ + --set 
# Catalogue of importable dashboards, keyed by the short identifier callers pass
# in ``dashboard_keys``. ``gnet_id`` is the grafana.net dashboard number that the
# generated script echoes and posts to the Grafana API.
GRAFANA_DASHBOARDS = {
    "cluster-overview": {
        "name": "Kubernetes Cluster Overview",
        "description": "Overall cluster health, node status, resource utilization",
        "gnet_id": 15520,
    },
    "node-exporter": {
        "name": "Node Exporter Full",
        "description": "Detailed node metrics — CPU, memory, disk, network",
        "gnet_id": 1860,
    },
    "pod-monitoring": {
        "name": "Kubernetes Pods",
        "description": "Pod-level CPU, memory, network, restarts",
        "gnet_id": 15760,
    },
    "namespace-resources": {
        "name": "Namespace Resources",
        "description": "Resource usage per namespace with quota tracking",
        "gnet_id": 15758,
    },
    "coredns": {
        "name": "CoreDNS",
        "description": "DNS query rates, latency, errors",
        "gnet_id": 15762,
    },
    "etcd": {
        "name": "etcd",
        "description": "etcd cluster health, leader changes, WAL sync duration",
        "gnet_id": 3070,
    },
    "api-server": {
        "name": "Kubernetes API Server",
        "description": "API server request rates, latency, errors",
        "gnet_id": 15761,
    },
    "persistent-volumes": {
        "name": "Persistent Volumes",
        "description": "PV/PVC usage and capacity tracking",
        "gnet_id": 13646,
    },
}


def _dashboard_configmap_snippet(key: str, dash: dict, namespace: str) -> str:
    """Return the shell snippet that applies one dashboard stub as a ConfigMap.

    Args:
        key: Catalogue key; used for the ConfigMap name, the data file name and
            the dashboard ``uid``.
        dash: Catalogue entry providing ``name``, ``description`` and ``gnet_id``.
        namespace: Namespace written into the ConfigMap metadata.
    """
    # Function-scope imports: the module's import block lies outside this section.
    import json
    import shlex
    import textwrap

    stub = {
        "annotations": {"list": []},
        "description": dash["description"],
        "editable": True,
        "gnetId": dash["gnet_id"],
        "title": dash["name"],
        "uid": key,
        "version": 1,
        "__inputs": [
            {
                "name": "DS_PROMETHEUS",
                "label": "Prometheus",
                "type": "datasource",
                "pluginId": "prometheus",
            }
        ],
    }
    # json.dumps escapes quotes/backslashes/newlines, so the embedded document is
    # always valid JSON; the 4-space indent nests it under the YAML block scalar.
    stub_json = textwrap.indent(json.dumps(stub, indent=2, ensure_ascii=False), "    ")
    # Quote the progress message so a dashboard name cannot break the echo.
    done_msg = shlex.quote(f"  Imported: {dash['name']} (grafana.net #{dash['gnet_id']})")

    return f"""
# Import: {dash['name']}
cat <<'DASHEOF' | kubectl apply -f -
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-dashboard-{key}
  namespace: {namespace}
  labels:
    grafana_dashboard: "1"
data:
  {key}.json: |
{stub_json}
DASHEOF
echo {done_msg}
"""


def generate_dashboard_import_script(
    dashboard_keys: list[str],
    namespace: str = "monitoring",
) -> str:
    """Generate a bash script that imports the selected Grafana dashboards.

    The script first applies one labelled ConfigMap per dashboard, then, if a
    Grafana pod is found, port-forwards to it and posts each grafana.net ID to
    Grafana's dashboard import endpoint.

    Args:
        dashboard_keys: Keys of ``GRAFANA_DASHBOARDS`` to import. Unknown keys
            are ignored and duplicates are collapsed, keeping first-seen order.
        namespace: Namespace holding Grafana and receiving the ConfigMaps.

    Returns:
        The complete script text.
    """
    import shlex  # function-scope: the module's import block lies outside this section

    # dict.fromkeys de-duplicates while preserving the caller's ordering.
    selected = [key for key in dict.fromkeys(dashboard_keys) if key in GRAFANA_DASHBOARDS]
    script_body = "\n".join(
        _dashboard_configmap_snippet(key, GRAFANA_DASHBOARDS[key], namespace)
        for key in selected
    )
    gnet_ids = " ".join(str(GRAFANA_DASHBOARDS[key]["gnet_id"]) for key in selected)
    ns_arg = shlex.quote(namespace)  # safe to splice into the kubectl command lines

    # NOTE(review): the POST body below carries only a gnetId and an empty
    # dashboard object; confirm against the Grafana HTTP API docs that the import
    # endpoint resolves grafana.net IDs server-side. The step is best-effort.
    return f"""#!/bin/bash
set -euo pipefail

echo "=== Importing Grafana Dashboards ==="
{script_body}

# Also import from grafana.net directly via Grafana API
GRAFANA_POD=$(kubectl -n {ns_arg} get pods -l app.kubernetes.io/name=grafana -o jsonpath='{{.items[0].metadata.name}}' 2>/dev/null || echo "")

if [ -n "$GRAFANA_POD" ]; then
  echo ""
  echo ">> Importing full dashboards from grafana.net via API..."
  kubectl -n {ns_arg} port-forward "$GRAFANA_POD" 3000:3000 &
  PF_PID=$!
  # Stop the port-forward however the script exits (set -e may abort early).
  trap 'kill "$PF_PID" 2>/dev/null || true' EXIT
  sleep 3

  for gnet_id in {gnet_ids}; do
    # --fail: an HTTP 4xx/5xx answer must take the "Failed" branch.
    curl -sf -X POST http://localhost:3000/api/dashboards/import \\
      -H "Content-Type: application/json" \\
      -u admin:admin \\
      -d "{{
        \\"dashboard\\": {{\\"id\\": null}},
        \\"overwrite\\": true,
        \\"inputs\\": [{{\\"name\\": \\"DS_PROMETHEUS\\", \\"type\\": \\"datasource\\", \\"pluginId\\": \\"prometheus\\", \\"value\\": \\"Prometheus\\"}}],
        \\"folderId\\": 0,
        \\"gnetId\\": $gnet_id
      }}" 2>/dev/null && echo "  Imported grafana.net #$gnet_id" || echo "  Failed grafana.net #$gnet_id (non-critical)"
  done

  kill $PF_PID 2>/dev/null || true
fi

echo ""
echo "=== Dashboard import complete ==="
"""
+ - alert: NodeHighMemory + expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 > 85 + for: 10m + labels: + severity: warning + annotations: + summary: "Node {{{{ $labels.instance }}}} has high memory usage" + description: "Memory usage is above 85% for more than 10 minutes." + - alert: NodeDiskPressure + expr: (1 - node_filesystem_avail_bytes{{mountpoint="/"}} / node_filesystem_size_bytes{{mountpoint="/"}}) * 100 > 85 + for: 5m + labels: + severity: warning + annotations: + summary: "Node {{{{ $labels.instance }}}} disk usage is high" + description: "Root filesystem usage is above 85%." + - name: k8s-pod-alerts + rules: + - alert: PodCrashLooping + expr: rate(kube_pod_container_status_restarts_total[15m]) * 60 * 15 > 5 + for: 5m + labels: + severity: critical + annotations: + summary: "Pod {{{{ $labels.namespace }}}}/{{{{ $labels.pod }}}} is crash looping" + description: "Pod has restarted more than 5 times in the last 15 minutes." + - alert: PodNotReady + expr: kube_pod_status_ready{{condition="true"}} == 0 + for: 10m + labels: + severity: warning + annotations: + summary: "Pod {{{{ $labels.namespace }}}}/{{{{ $labels.pod }}}} is not ready" + description: "Pod has been in a non-ready state for more than 10 minutes." + - alert: PVCAlmostFull + expr: kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes * 100 > 85 + for: 5m + labels: + severity: warning + annotations: + summary: "PVC {{{{ $labels.persistentvolumeclaim }}}} is almost full" + description: "PVC in namespace {{{{ $labels.namespace }}}} is over 85% full." + - name: k8s-etcd-alerts + rules: + - alert: EtcdHighLatency + expr: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) > 0.5 + for: 10m + labels: + severity: warning + annotations: + summary: "etcd WAL fsync latency is high" + description: "99th percentile etcd WAL fsync duration exceeds 500ms." 
+EOF + +echo "=== Alert rules installed ===" +""" + + +def install_helm(control_plane_node: dict | None = None, profile: ClusterProfile | None = None) -> SSHResult: + """Install Helm on the control-plane node or locally for imported clusters.""" + return _run_on_cluster(control_plane_node, generate_helm_install_script(), profile=profile, timeout=120) + + +def install_prometheus_stack( + control_plane_node: dict | None = None, + namespace: str = "monitoring", + profile: ClusterProfile | None = None, +) -> SSHResult: + """Install the full kube-prometheus-stack.""" + return _run_on_cluster(control_plane_node, generate_prometheus_install_script(namespace), profile=profile, timeout=900) + + +def install_dashboards( + control_plane_node: dict | None = None, + dashboard_keys: list[str] | None = None, + namespace: str = "monitoring", + profile: ClusterProfile | None = None, +) -> SSHResult: + """Import selected Grafana dashboards.""" + dashboard_keys = dashboard_keys or [] + return _run_on_cluster(control_plane_node, generate_dashboard_import_script(dashboard_keys, namespace), profile=profile, timeout=300) + + +def install_alert_rules( + control_plane_node: dict | None = None, + namespace: str = "monitoring", + profile: ClusterProfile | None = None, +) -> SSHResult: + """Install Prometheus alerting rules.""" + return _run_on_cluster(control_plane_node, generate_alerting_rules_script(namespace), profile=profile, timeout=60) + + +def get_monitoring_status( + control_plane_node: dict | None = None, + namespace: str = "monitoring", + profile: ClusterProfile | None = None, +) -> SSHResult: + """Check the status of the monitoring stack.""" + command = f""" +echo "=== Monitoring Stack Status ===" +echo "" +echo ">> Pods:" +kubectl -n {namespace} get pods -o wide +echo "" +echo ">> Services:" +kubectl -n {namespace} get svc +echo "" +echo ">> PVCs:" +kubectl -n {namespace} get pvc +echo "" +echo ">> PrometheusRules:" +kubectl -n {namespace} get prometheusrules 2>/dev/null || echo 
"No PrometheusRules found" +echo "" +echo ">> ServiceMonitors:" +kubectl -n {namespace} get servicemonitors 2>/dev/null || echo "No ServiceMonitors found" +""" + return _run_on_cluster(control_plane_node, command, profile=profile, timeout=30) + + +def get_monitoring_advice( + profile: ClusterProfile, + current_status: str = "", +) -> str: + """Ask the LLM for monitoring setup advice. + + Returns a graceful message when the LLM is not configured. + """ + from modules.llm_client import query_llm # lazy import — LLM is optional + + prompt = f"""I have a Kubernetes cluster with the following setup: +- Kubernetes: {profile.kubernetes_version} +- Runtime: CRI-O {profile.crio_version} +- CNI: Flannel +- Nodes: {len(profile.nodes)} ({len(profile.get_control_plane_nodes())} control-plane, {len(profile.get_worker_nodes())} workers) + +Current monitoring status: +{current_status or 'Not yet installed'} + +Please recommend: +1. The optimal Prometheus retention and resource settings for this cluster size +2. Essential Grafana dashboards to install +3. Critical alerting rules beyond the standard set +4. Any additional exporters I should install (e.g., blackbox, SNMP) +5. Log aggregation recommendations (Loki, EFK, etc.) 
@dataclass
class NodeInfo:
    """Represents a node in the cluster."""

    hostname: str
    ip_address: str
    role: str  # "control-plane" or "worker"
    ssh_user: str = "root"
    ssh_port: int = 22
    ssh_key_path: str = "~/.ssh/id_rsa"


@dataclass
class ClusterProfile:
    """Represents a complete cluster profile configuration.

    Instances are serialised to JSON with ``dataclasses.asdict`` and rebuilt
    from the stored mapping, so every field must stay JSON-serialisable.
    """

    name: str
    description: str = ""
    kubernetes_version: str = "1.30"
    crio_version: str = "1.30"
    cni_plugin: str = "flannel"
    pod_cidr: str = "10.244.0.0/16"
    service_cidr: str = "10.96.0.0/12"
    dns_domain: str = "cluster.local"
    nodes: list[dict] = field(default_factory=list)  # node dicts; "role" is read below
    created_at: str = ""  # set by save_profile on first save
    updated_at: str = ""  # refreshed by save_profile on every save
    status: str = "draft"  # draft, provisioning, active, error
    kubeconfig_path: str = ""
    monitoring_enabled: bool = False
    pod_security_standard: str = "restricted"  # privileged, baseline, restricted
    # CRI-O storage paths (override defaults in /var/lib)
    crio_root: str = "/var/lib/containers/storage"  # container storage root
    crio_runroot: str = "/run/containers/storage"  # runtime root
    kubelet_root: str = "/var/lib/kubelet"  # kubelet data dir
    log_root: str = "/var/log"  # base log directory
    # Proxy settings for master node
    http_proxy: str = ""
    https_proxy: str = ""
    no_proxy: str = ""
    http_proxy_alt: str = ""  # alternate proxy
    https_proxy_alt: str = ""  # alternate proxy
    # Offline manifest paths — user-provided files for air-gapped environments
    flannel_manifest_path: str = ""  # local path to kube-flannel.yml
    prometheus_manifest_path: str = ""  # local path to prometheus manifest
    # Kubeconfig for existing clusters (imported, not provisioned)
    kubeconfig_content: str = ""  # raw kubeconfig YAML content
    cluster_source: str = "provisioned"  # "provisioned" or "imported"

    def _nodes_with_role(self, role: str) -> list[dict]:
        """Return the node dicts whose ``role`` entry equals ``role``."""
        return [node for node in self.nodes if node.get("role") == role]

    def get_control_plane_nodes(self) -> list[dict]:
        """Return the nodes whose role is ``"control-plane"``."""
        return self._nodes_with_role("control-plane")

    def get_worker_nodes(self) -> list[dict]:
        """Return the nodes whose role is ``"worker"``."""
        return self._nodes_with_role("worker")


def _safe_profile_stem(name: str) -> str:
    """Turn a profile name into a lowercase file-name stem.

    Spaces and both slash styles become underscores so the stem can never
    contain a path separator on any platform.
    """
    return name.translate(str.maketrans({" ": "_", "/": "_", "\\": "_"})).lower()


def _profile_from_dict(data: dict) -> ClusterProfile:
    """Build a ``ClusterProfile`` from a decoded JSON mapping.

    Keys that are not ``ClusterProfile`` fields are ignored, so a file that
    carries extra keys still loads.

    Raises:
        TypeError: If ``data`` is not a mapping or lacks the required ``name``.
    """
    from dataclasses import fields  # function-scope: not among the module's imports

    if not isinstance(data, dict):
        raise TypeError(f"profile data must be a JSON object, got {type(data).__name__}")
    known = {spec.name for spec in fields(ClusterProfile)}
    return ClusterProfile(**{key: value for key, value in data.items() if key in known})


def _profile_path(name: str) -> str:
    """Return the file path for a given profile name."""
    return os.path.join(config.PROFILES_DIR, f"{_safe_profile_stem(name)}.json")


def save_profile(profile: ClusterProfile) -> str:
    """Save a cluster profile to disk.

    Stamps ``created_at`` (first save only) and ``updated_at`` on ``profile``,
    then writes the JSON through a 0600 temporary file that is renamed over the
    destination, so an interrupted write cannot leave a truncated profile.

    Returns the file path where the profile was saved.
    """
    import tempfile  # function-scope: not among the module's imports

    now = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
    if not profile.created_at:
        profile.created_at = now
    profile.updated_at = now

    path = _profile_path(profile.name)
    # Serialise before touching the filesystem so a bad field leaves no debris.
    payload = json.dumps(asdict(profile), indent=2)

    directory = os.path.dirname(path)
    if directory:
        os.makedirs(directory, exist_ok=True)
    # mkstemp creates the file readable and writable only by the current user;
    # the ".tmp" suffix keeps it out of list_profiles' "*.json" scan.
    fd, tmp_path = tempfile.mkstemp(dir=directory or ".", prefix=".profile-", suffix=".tmp")
    try:
        with os.fdopen(fd, "w", encoding="utf-8") as handle:
            handle.write(payload)
        os.replace(tmp_path, path)
    except BaseException:
        try:
            os.unlink(tmp_path)
        except OSError:
            pass  # already moved or never fully created
        raise
    return path


def load_profile(name: str) -> Optional[ClusterProfile]:
    """Load a cluster profile from disk by name.

    Returns ``None`` when no file exists for ``name``. A file that exists but
    holds invalid JSON or an invalid profile still raises, as before.
    """
    try:
        with open(_profile_path(name), "r", encoding="utf-8") as handle:
            data = json.load(handle)
    except FileNotFoundError:
        return None
    return _profile_from_dict(data)


def list_profiles() -> list[ClusterProfile]:
    """List all saved cluster profiles, ordered by file name.

    Files that cannot be read or decoded are skipped with a logged warning so
    one bad profile does not hide the rest.
    """
    import logging  # function-scope: not among the module's imports

    try:
        filenames = sorted(os.listdir(config.PROFILES_DIR))
    except FileNotFoundError:
        return []

    profiles = []
    for filename in filenames:
        if not filename.endswith(".json"):
            continue
        filepath = os.path.join(config.PROFILES_DIR, filename)
        try:
            with open(filepath, "r", encoding="utf-8") as handle:
                profiles.append(_profile_from_dict(json.load(handle)))
        except (OSError, ValueError, TypeError) as exc:
            # ValueError covers json.JSONDecodeError and undecodable bytes.
            logging.getLogger(__name__).warning(
                "Skipping unreadable profile %s: %s", filepath, exc
            )
    return profiles


def delete_profile(name: str) -> bool:
    """Delete a cluster profile by name.

    Returns True if the profile was deleted, False if it didn't exist.
    """
    try:
        os.remove(_profile_path(name))
    except FileNotFoundError:
        return False
    return True


def update_profile_status(name: str, status: str) -> bool:
    """Update the status field of an existing profile.

    Returns False when the profile does not exist, True after it is re-saved.
    """
    profile = load_profile(name)
    if profile is None:
        return False
    profile.status = status
    save_profile(profile)
    return True