From fe9f155f5c7518ea615424973cd0121929a20798 Mon Sep 17 00:00:00 2001 From: Jesus Paz Date: Sun, 18 Jan 2026 21:01:46 -0500 Subject: [PATCH 1/8] feat: improve nodes dashboard --- cmd/dashboard/main.go | 3 + docs/TECHDEBT.md | 4 + internal/api/handlers_nodes.go | 24 + internal/api/router.go | 3 + internal/store/pricing.go | 57 +- internal/store/pricing_test.go | 2 +- internal/store/store.go | 61 +- internal/store/store_test.go | 2 +- internal/vm/client.go | 78 +++ internal/vm/dashboard.go | 144 ++++- internal/vm/ingestor.go | 10 +- web/package-lock.json | 60 +- web/package.json | 4 +- web/src/components/common/MetricCard.tsx | 39 ++ web/src/components/nodes/EfficiencyBar.tsx | 73 +++ web/src/components/nodes/NodeDetailSheet.tsx | 97 +++- web/src/components/ui/progress.tsx | 6 +- web/src/components/ui/tooltip.tsx | 28 + web/src/lib/api.ts | 22 +- .../pages/namespaces/NamespacesPage.test.tsx | 1 + web/src/pages/nodes/NodesPage.tsx | 541 ++++++++---------- web/src/pages/resources/ResourcesPage.tsx | 1 + web/vite.config.ts | 58 +- 23 files changed, 936 insertions(+), 382 deletions(-) create mode 100644 web/src/components/common/MetricCard.tsx create mode 100644 web/src/components/nodes/EfficiencyBar.tsx create mode 100644 web/src/components/ui/tooltip.tsx diff --git a/cmd/dashboard/main.go b/cmd/dashboard/main.go index 25d6e59..3e26622 100644 --- a/cmd/dashboard/main.go +++ b/cmd/dashboard/main.go @@ -56,6 +56,9 @@ func main() { // Initialize FinOps Engine finopsEngine := finops.NewEngine(vmClient, st.PricingCatalog()) + // Share Pricing Catalog with VM Client + vmClient.SetPricingCatalog(st.PricingCatalog()) + auth.SetSecret(cfg.JWTSecret) srv := &http.Server{ diff --git a/docs/TECHDEBT.md b/docs/TECHDEBT.md index a2fbc10..3a80bf6 100644 --- a/docs/TECHDEBT.md +++ b/docs/TECHDEBT.md @@ -15,3 +15,7 @@ ## Future Considerations - [ ] **Retention Policies**: Configure distinct retention periods for high-precision metrics (15s interval) vs. aggregated historical data. 
- [ ] **Refactor Store Locking**: Evaluate moving from heavy `RWMutex` usage in `store.go` to a more concurrent pattern if contention increases with 100+ agents. +- [ ] **Dynamic Pricing & Savings Plans Support**: + - Problem: Agents no longer send costs; backend relies on static On-Demand rates. + - Solution: Implement a **Dynamic Pricing Engine** with DB overrides for Savings Plans, Reserved Instances, and Spot Pricing. + - Design: See `dynamic_pricing_design.md` artifact. diff --git a/internal/api/handlers_nodes.go b/internal/api/handlers_nodes.go index fcb30ef..b43e825 100644 --- a/internal/api/handlers_nodes.go +++ b/internal/api/handlers_nodes.go @@ -2,6 +2,7 @@ package api import ( "net/http" + "time" "github.com/go-chi/chi/v5" @@ -58,3 +59,26 @@ func (h *Handler) NodeDetail(w http.ResponseWriter, r *http.Request) { writeJSON(w, http.StatusOK, node) } + +// NodeStats returns historical usage and cost stats for a node. +func (h *Handler) NodeStats(w http.ResponseWriter, r *http.Request) { + name := chi.URLParam(r, "name") + if name == "" { + writeError(w, http.StatusBadRequest, "node name is required") + return + } + windowStr := r.URL.Query().Get("window") + window, _ := time.ParseDuration(windowStr) + if window <= 0 { + window = 24 * time.Hour + } + + ctx := vm.WithClusterID(r.Context(), clusterIDFromRequest(r)) + stats, err := h.vm.GetNodeStats(ctx, "", name, window) + if err != nil { + writeError(w, http.StatusInternalServerError, err.Error()) + return + } + + writeJSON(w, http.StatusOK, stats) +} diff --git a/internal/api/router.go b/internal/api/router.go index 28d403a..06f2e37 100644 --- a/internal/api/router.go +++ b/internal/api/router.go @@ -4,6 +4,7 @@ import ( "context" "encoding/json" "net/http" + "time" "github.com/go-chi/chi/v5" "github.com/go-chi/chi/v5/middleware" @@ -28,6 +29,7 @@ type MetricsProvider interface { Agents(ctx context.Context) ([]store.AgentInfo, error) ClusterMetadata(ctx context.Context) (store.ClusterMetadata, error) 
NetworkTopology(ctx context.Context, opts store.NetworkTopologyOptions) ([]store.NetworkEdge, error) + GetNodeStats(ctx context.Context, clusterID, nodeName string, window time.Duration) (store.NodeStats, error) } // Handler wires HTTP requests to the VictoriaMetrics client. @@ -73,6 +75,7 @@ func NewRouter(vmClient MetricsProvider, db *db.Store, st *store.Store, finopsEn cost.Get("/namespaces/{name}", h.NamespaceDetail) cost.Get("/nodes", h.Nodes) cost.Get("/nodes/{name}", h.NodeDetail) + cost.Get("/nodes/{name}/stats", h.NodeStats) cost.Get("/resources", h.Resources) }) protected.Get("/agent", h.AgentStatus) diff --git a/internal/store/pricing.go b/internal/store/pricing.go index 3d4e060..6ddaaef 100644 --- a/internal/store/pricing.go +++ b/internal/store/pricing.go @@ -1,6 +1,11 @@ package store -import "context" +import ( + "context" + "fmt" + + "github.com/clustercost/clustercost-dashboard/internal/pricing" +) // Pricing constants const ( @@ -10,51 +15,33 @@ const ( CostEgressInternal = 0.00 // Free ) -// PricingProvider defines the interface for fetching node pricing. -type PricingProvider interface { - GetNodePrice(ctx context.Context, region, instanceType string) (float64, error) -} - // PricingCatalog allows looking up node prices. type PricingCatalog struct { - // Map instance type to hourly price - InstancePrices map[string]float64 - Provider PricingProvider + // No provider needed, we use static data from internal/pricing } -// NewPricingCatalog returns a catalog with some default mocked pricing. -func NewPricingCatalog(provider PricingProvider) *PricingCatalog { - return &PricingCatalog{ - InstancePrices: map[string]float64{ - "t3.medium": 0.0416, - "t3.large": 0.0832, - "m5.large": 0.096, - "m5.xlarge": 0.192, - "c5.large": 0.085, - "r5.large": 0.126, - "default": 0.05, // Fallback - }, - Provider: provider, - } +// NewPricingCatalog returns a catalog. 
+func NewPricingCatalog() *PricingCatalog { + return &PricingCatalog{} } // GetTotalNodePrice returns the total hourly cost of a node. func (pc *PricingCatalog) GetTotalNodePrice(ctx context.Context, region, instanceType string) float64 { - // Try Provider first - if pc.Provider != nil && instanceType != "" && region != "" { - price, err := pc.Provider.GetNodePrice(ctx, region, instanceType) - if err == nil && price > 0 { - pc.InstancePrices[instanceType] = price // Update cache - return price - } + // 1. Try Shared Static Data + key := fmt.Sprintf("%s|%s", region, instanceType) + if price, ok := pricing.InstancePrices[key]; ok { + return price } - // Fallback to local cache - price, ok := pc.InstancePrices[instanceType] - if !ok { - price = pc.InstancePrices["default"] + // 2. Fallback to generic defaults if completely unknown + // check if we have a default for the instance type regardless of region (common for US-East-1 based defaults) + // (Optional optimization: try "us-east-1|instanceType" as fallback?) + fallbackKey := fmt.Sprintf("us-east-1|%s", instanceType) + if price, ok := pricing.InstancePrices[fallbackKey]; ok { + return price } - return price + + return 0.05 // Ultimate fallback } // GetNodeResourcePrices calculates the cost per vCPU and per GB of RAM based on the instance type. 
diff --git a/internal/store/pricing_test.go b/internal/store/pricing_test.go index 7ab6916..630cdd7 100644 --- a/internal/store/pricing_test.go +++ b/internal/store/pricing_test.go @@ -6,7 +6,7 @@ import ( ) func TestPricingCatalog_GetNodeResourcePrices(t *testing.T) { - pc := NewPricingCatalog(nil) + pc := NewPricingCatalog() // Test case 1: m5.large (2 vCPU, 8GB RAM) // Price: $0.096/hr diff --git a/internal/store/store.go b/internal/store/store.go index 9df7b58..2abfc82 100644 --- a/internal/store/store.go +++ b/internal/store/store.go @@ -11,7 +11,6 @@ import ( "math" "github.com/clustercost/clustercost-dashboard/internal/config" - "github.com/clustercost/clustercost-dashboard/internal/pricing" agentv1 "github.com/clustercost/clustercost-dashboard/internal/proto/agent/v1" ) @@ -122,6 +121,11 @@ type NodeSummary struct { InstanceType string `json:"instanceType,omitempty"` Labels map[string]string `json:"labels"` Taints []string `json:"taints"` + // Resource Requests (Allocated) + CPURequestedMilli int64 `json:"cpuRequestedMilli"` + CPULimitMilli int64 `json:"cpuLimitMilli"` + MemoryRequestedBytes int64 `json:"memoryRequestedBytes"` + MemoryLimitBytes int64 `json:"memoryLimitBytes"` // Network (Host Level) NetTxBytes int64 `json:"netTxBytes"` NetRxBytes int64 `json:"netRxBytes"` @@ -282,6 +286,16 @@ type PodContext struct { InstanceType string } +// NodeStats contains historical usage and cost analysis for a node. +type NodeStats struct { + NodeName string `json:"nodeName"` + P95CPUUsagePercent float64 `json:"p95CpuUsagePercent"` + P95MemoryUsagePercent float64 `json:"p95MemoryUsagePercent"` + TotalMonthlyCost float64 `json:"totalMonthlyCost"` + RealUsageMonthlyCost float64 `json:"realUsageMonthlyCost"` + Window string `json:"window"` +} + // New creates a store seeded with agent configurations. 
func New(cfgs []config.AgentConfig, recommendedAgentVersion string) *Store { agentConfigs := make(map[string]config.AgentConfig, len(cfgs)) @@ -290,14 +304,13 @@ func New(cfgs []config.AgentConfig, recommendedAgentVersion string) *Store { } // Initialize Static Pricing Provider - // Context is just placeholder for interface, static client doesn't need it - pricingClient, _ := pricing.NewAWSClient(context.Background()) + // We use the static map from internal/pricing/data.go, so no dynamic client needed. return &Store{ agentConfigs: agentConfigs, snapshots: make(map[string]*AgentSnapshot, len(cfgs)), recommendedAgentVersion: recommendedAgentVersion, - pricing: NewPricingCatalog(pricingClient), + pricing: NewPricingCatalog(), } } @@ -972,6 +985,28 @@ func (s *Store) aggregateNodesLocked() (map[string]*NodeSummary, error) { } haveData = true + // Pre-calculate Pod Limits/Requests per Node + nodeLimits := make(map[string]struct { + cpuReq, cpuLim, memReq, memLim int64 + }) + + for _, p := range snap.Report.Pods { + nodeName := snap.Report.NodeName + if nodeName == "" { + continue + } + stats := nodeLimits[nodeName] + if p.Cpu != nil { + stats.cpuReq += safeInt64(p.Cpu.RequestMillicores) + stats.cpuLim += safeInt64(p.Cpu.LimitMillicores) + } + if p.Memory != nil { + stats.memReq += safeInt64(p.Memory.RequestBytes) + stats.memLim += safeInt64(p.Memory.LimitBytes) + } + nodeLimits[nodeName] = stats + } + // Iterate over all nodes reported by this agent for _, n := range snap.Report.Nodes { if n == nil || n.NodeName == "" { @@ -979,6 +1014,11 @@ func (s *Store) aggregateNodesLocked() (map[string]*NodeSummary, error) { } name := n.NodeName + // Use aggregated values from pods if available, fallback to node metric if generic + // The Agent V2 NodeMetric.Requested... is arguably the same, but doesn't have Limits. + // We prioritize our calculated limits. 
+ podStats := nodeLimits[name] + entry, ok := nodes[name] if !ok { entry = &NodeSummary{ @@ -988,7 +1028,20 @@ func (s *Store) aggregateNodesLocked() (map[string]*NodeSummary, error) { InstanceType: "default", // placeholder CPUAllocatableMilli: safeInt64(n.AllocatableCpuMillicores), MemoryAllocatableBytes: safeInt64(n.AllocatableMemoryBytes), + CPURequestedMilli: safeInt64(n.RequestedCpuMillicores), // Fallback to agent metric + CPULimitMilli: podStats.cpuLim, + MemoryRequestedBytes: safeInt64(n.RequestedMemoryBytes), // Fallback to agent metric + MemoryLimitBytes: podStats.memLim, + IsUnderPressure: n.ThrottlingNs > 1_000_000, } + // If agent metric is 0 (older agent?) use our aggregation for Requests too + if entry.CPURequestedMilli == 0 { + entry.CPURequestedMilli = podStats.cpuReq + } + if entry.MemoryRequestedBytes == 0 { + entry.MemoryRequestedBytes = podStats.memReq + } + nodes[name] = entry } diff --git a/internal/store/store_test.go b/internal/store/store_test.go index 7702b7e..7911687 100644 --- a/internal/store/store_test.go +++ b/internal/store/store_test.go @@ -13,7 +13,7 @@ func newTestStore() *Store { } s := New(cfgs, "v1.0.0") // Inject Mock Pricing - s.pricing = NewPricingCatalog(&MockPricing{}) + s.pricing = NewPricingCatalog() return s } diff --git a/internal/vm/client.go b/internal/vm/client.go index 91a7b8a..e14deaf 100644 --- a/internal/vm/client.go +++ b/internal/vm/client.go @@ -16,6 +16,7 @@ import ( "time" "github.com/clustercost/clustercost-dashboard/internal/config" + "github.com/clustercost/clustercost-dashboard/internal/store" ) // ErrNoData indicates that VictoriaMetrics returned no usable data. @@ -42,6 +43,12 @@ type Client struct { cacheTTL time.Duration cacheMu sync.Mutex cache map[string]cachedQuery + pricing *store.PricingCatalog +} + +// SetPricingCatalog allows injecting the pricing catalog. 
+func (c *Client) SetPricingCatalog(p *store.PricingCatalog) { + c.pricing = p } type cachedQuery struct { @@ -376,6 +383,77 @@ func (c *Client) GetPodP95Usage(ctx context.Context, clusterID, namespace, podNa return cpuCores, memoryBytes, nil } +// GetNodeStats calculates the average usage and real cost of a node over a time window. +func (c *Client) GetNodeStats(ctx context.Context, clusterID, nodeName string, window time.Duration) (store.NodeStats, error) { + if nodeName == "" { + return store.NodeStats{}, fmt.Errorf("node name is required") + } + if window <= 0 { + window = 24 * time.Hour + } + windowStr := formatDuration(window) + + labels := map[string]string{ + "node": nodeName, + } + if clusterID != "" { + labels["cluster_id"] = clusterID + } + + // 1. Get P95 Usage % + // quantile_over_time(0.95, clustercost_node_cpu_usage_percent{node="name"}[window]) + cpuQuery := fmt.Sprintf("quantile_over_time(0.95, clustercost_node_cpu_usage_percent%s[%s])", + formatLabels(labels), windowStr) + memQuery := fmt.Sprintf("quantile_over_time(0.95, clustercost_node_memory_usage_percent%s[%s])", + formatLabels(labels), windowStr) + + // 2. 
Get Average Hourly Cost (to account for potential spot price fluctuations or just stability) + costQuery := fmt.Sprintf("avg_over_time(clustercost_node_hourly_cost%s[%s])", + formatLabels(labels), windowStr) + + cpuSamples, err := c.query(ctx, cpuQuery) + if err != nil { + return store.NodeStats{}, fmt.Errorf("query cpu stats: %w", err) + } + memSamples, err := c.query(ctx, memQuery) + if err != nil { + return store.NodeStats{}, fmt.Errorf("query mem stats: %w", err) + } + costSamples, err := c.query(ctx, costQuery) + if err != nil { + return store.NodeStats{}, fmt.Errorf("query cost stats: %w", err) + } + + p95Cpu := 0.0 + if len(cpuSamples) > 0 { + p95Cpu = cpuSamples[0].value + } + p95Mem := 0.0 + if len(memSamples) > 0 { + p95Mem = memSamples[0].value + } + avgHourlyCost := 0.0 + if len(costSamples) > 0 { + avgHourlyCost = costSamples[0].value + } + + // Calculate Costs + totalMonthly := avgHourlyCost * hoursPerMonth // 720 hours + // "Real Usage" = (Total * 0.5 * Cpu%) + (Total * 0.5 * Mem%) + // Percentages are 0-100 in VM usually (based on ingestion code: `writeFloatSample(..., cpuPct, ...)` where `cpuPct` was * 100) + + realUsageMonthly := (totalMonthly * 0.5 * (p95Cpu / 100.0)) + (totalMonthly * 0.5 * (p95Mem / 100.0)) + + return store.NodeStats{ + NodeName: nodeName, + P95CPUUsagePercent: p95Cpu, + P95MemoryUsagePercent: p95Mem, + TotalMonthlyCost: totalMonthly, + RealUsageMonthlyCost: realUsageMonthly, + Window: windowStr, + }, nil +} + func formatLabels(labels map[string]string) string { if len(labels) == 0 { return "" diff --git a/internal/vm/dashboard.go b/internal/vm/dashboard.go index 34cc685..f5d0703 100644 --- a/internal/vm/dashboard.go +++ b/internal/vm/dashboard.go @@ -755,7 +755,12 @@ func (c *Client) namespaceMetrics(ctx context.Context, environment, namespace st }) } - pricing := store.NewPricingCatalog(nil) + var pricing *store.PricingCatalog + if c.pricing != nil { + pricing = c.pricing + } else { + pricing = store.NewPricingCatalog() + } 
totalNodeCost := 0.0 totalCpuCores := 0.0 totalMemGB := 0.0 @@ -801,12 +806,50 @@ func (c *Client) nodeMetrics(ctx context.Context, nodeName string) (map[string]* e.HourlyCost = v if e.InstanceType == "" { e.InstanceType = l["instance_type"] + if e.InstanceType == "" { + e.InstanceType = l["node_label_node_kubernetes_io_instance_type"] + } + if e.InstanceType == "" { + e.InstanceType = l["node_label_beta_kubernetes_io_instance_type"] + } + } + for k, v := range l { + e.Labels[k] = v } }}, {"clustercost_node_cpu_usage_percent", func(e *store.NodeSummary, v float64, _ map[string]string) { e.CPUUsagePercent = v }}, {"clustercost_node_memory_usage_percent", func(e *store.NodeSummary, v float64, _ map[string]string) { e.MemoryUsagePercent = v }}, - {"clustercost_node_cpu_allocatable_milli", func(e *store.NodeSummary, v float64, _ map[string]string) { e.CPUAllocatableMilli = int64(v) }}, + {"clustercost_node_cpu_allocatable_milli", func(e *store.NodeSummary, v float64, l map[string]string) { + e.CPUAllocatableMilli = int64(v) + if e.InstanceType == "" { + e.InstanceType = l["instance_type"] + if e.InstanceType == "" { + e.InstanceType = l["node_label_node_kubernetes_io_instance_type"] + } + if e.InstanceType == "" { + e.InstanceType = l["node_label_beta_kubernetes_io_instance_type"] + } + } + // Capture region from labels if not already present in basic labels + for k, v := range l { + e.Labels[k] = v + } + }}, {"clustercost_node_memory_allocatable_bytes", func(e *store.NodeSummary, v float64, _ map[string]string) { e.MemoryAllocatableBytes = int64(v) }}, + {"clustercost_node_cpu_requested_milli", func(e *store.NodeSummary, v float64, l map[string]string) { + e.CPURequestedMilli = int64(v) + if e.InstanceType == "" { + e.InstanceType = l["instance_type"] + } + }}, + {"clustercost_node_memory_requested_bytes", func(e *store.NodeSummary, v float64, l map[string]string) { + e.MemoryRequestedBytes = int64(v) + if e.InstanceType == "" { + e.InstanceType = l["instance_type"] + 
} + }}, + {"clustercost_node_cpu_limit_milli", func(e *store.NodeSummary, v float64, _ map[string]string) { e.CPULimitMilli = int64(v) }}, + {"clustercost_node_memory_limit_bytes", func(e *store.NodeSummary, v float64, _ map[string]string) { e.MemoryLimitBytes = int64(v) }}, {"clustercost_node_pod_count", func(e *store.NodeSummary, v float64, _ map[string]string) { e.PodCount = int(v) }}, {"clustercost_node_under_pressure", func(e *store.NodeSummary, v float64, _ map[string]string) { e.IsUnderPressure = v > 0.5 }}, } @@ -814,8 +857,14 @@ func (c *Client) nodeMetrics(ctx context.Context, nodeName string) (map[string]* out := make(map[string]*store.NodeSummary) for _, metric := range metrics { by := "node" - if metric.name == "clustercost_node_hourly_cost" { - by = "node,instance_type" + // We want to preserve instance_type and region information for all relevant metrics + // specifically request and allocatable metrics which often carry this metadata. + if metric.name == "clustercost_node_hourly_cost" || + metric.name == "clustercost_node_cpu_requested_milli" || + metric.name == "clustercost_node_memory_requested_bytes" || + metric.name == "clustercost_node_cpu_allocatable_milli" { + // Include standard and legacy k8s labels to survive aggregation + by = "node,instance_type,node_label_node_kubernetes_io_instance_type,node_label_beta_kubernetes_io_instance_type,cluster_region,topology_kubernetes_io_region,failure_domain_beta_kubernetes_io_region" } expr := fmt.Sprintf("max by (%s) (%s)", by, c.lookbackExpr(metric.name, labels, clusterID)) samples, err := c.query(ctx, expr) @@ -840,6 +889,50 @@ func (c *Client) nodeMetrics(ctx context.Context, nodeName string) (map[string]* } } + // Fetch metadata from Agent Status if available (User suggestion) + agentSamples, err := c.seriesTimestamp(ctx, "clustercost_agent_up", labels) + if err == nil { + for _, s := range agentSamples { + node := s.labels["node"] + if node == "" { + continue + } + entry := out[node] + if entry == 
nil { + // We don't create new entries just from agent_up, only enrich existing ones + // or maybe we should? For now, let's just enrich. + continue + } + + // Extract Instance Type if missing + if entry.InstanceType == "" { + if v := s.labels["instance_type"]; v != "" { + entry.InstanceType = v + } else if v := s.labels["node_label_node_kubernetes_io_instance_type"]; v != "" { + entry.InstanceType = v + } else if v := s.labels["node_label_beta_kubernetes_io_instance_type"]; v != "" { + entry.InstanceType = v + } + } + + // extract region if missing + if entry.Labels["topology_kubernetes_io_region"] == "" { + if v := s.labels["cluster_region"]; v != "" { + entry.Labels["cluster_region"] = v + entry.Labels["topology_kubernetes_io_region"] = v // Normalizing + } else { + // Regex fallback for AWS DNS names + // e.g. ip-10-30-16-166.us-west-2.compute.internal + re := regexp.MustCompile(`\.(us-[a-z]+-\d+)\.`) + matches := re.FindStringSubmatch(entry.NodeName) + if len(matches) > 1 { + entry.Labels["topology_kubernetes_io_region"] = matches[1] + } + } + } + } + } + statusSamples, err := c.seriesTimestamp(ctx, "clustercost_node_status", labels) if err != nil && err != ErrNoData { return nil, time.Time{}, err @@ -851,6 +944,49 @@ func (c *Client) nodeMetrics(ctx context.Context, nodeName string) (map[string]* } } + var pricing *store.PricingCatalog + if c.pricing != nil { + pricing = c.pricing + } else { + pricing = store.NewPricingCatalog() + } + + // Backfill Costs if missing using the Pricing Catalog + for _, node := range out { + if node.HourlyCost > 0 { + continue + } + + // Try to find region + region := node.Labels["topology_kubernetes_io_region"] + if region == "" { + region = node.Labels["failure_domain_beta_kubernetes_io_region"] + } + if region == "" { + region = node.Labels["cluster_region"] + } + if region == "" { + // Fallback: Default region if unknown, often us-east-1 or inferred from node name + // e.g. 
ip-10-30-12-16.us-west-2.compute.internal + if strings.Contains(node.NodeName, "us-east-1") { + region = "us-east-1" + } else if strings.Contains(node.NodeName, "us-west-2") { + region = "us-west-2" + } else if strings.Contains(node.NodeName, "eu-west-1") { + region = "eu-west-1" + } else { + region = "us-east-1" // ultimate fallback + } + } + + instanceType := node.InstanceType + if instanceType == "" { + instanceType = "m5.large" // Default fallback to avoid 0 cost + } + + node.HourlyCost = pricing.GetTotalNodePrice(ctx, region, instanceType) + } + latest := c.seriesTimestampSafe(ctx, "clustercost_node_hourly_cost") return out, latest, nil } diff --git a/internal/vm/ingestor.go b/internal/vm/ingestor.go index 5ba8fee..d2aab8f 100644 --- a/internal/vm/ingestor.go +++ b/internal/vm/ingestor.go @@ -325,7 +325,7 @@ func (i *Ingestor) appendMetricsReport(buf, labelBuf *bytes.Buffer, scratch []by } // map[namespace]*nsAgg nsMap := make(map[string]*nsAgg) - pricing := store.NewPricingCatalog(nil) + pricing := store.NewPricingCatalog() region := req.Region if region == "" { region = req.AvailabilityZone @@ -534,6 +534,14 @@ func (i *Ingestor) appendMetricsReport(buf, labelBuf *bytes.Buffer, scratch []by writeFloatSample(buf, scratch, "clustercost_node_memory_usage_percent", nodeLabelsBlob, memPct, tsMillis) } + // Calculate Node Hourly Cost + // We use Capacity because you pay for the whole node, not just allocatable. 
+ nodeCpuCores := float64(node.CapacityCpuMillicores) / 1000.0 + nodeMemGB := float64(node.CapacityMemoryBytes) / (1024 * 1024 * 1024) + nodeHourlyCost := (nodeCpuCores * cpuPrice) + (nodeMemGB * memPrice) + + writeFloatSample(buf, scratch, "clustercost_node_hourly_cost", nodeLabelsBlob, nodeHourlyCost, tsMillis) + // Node Network Metrics (Host Traffic) if node.Network != nil { nodeTx := safeInt64(node.Network.BytesSent) diff --git a/web/package-lock.json b/web/package-lock.json index 227b0d7..39dceb8 100644 --- a/web/package-lock.json +++ b/web/package-lock.json @@ -14,6 +14,7 @@ "@radix-ui/react-select": "^2.2.6", "@radix-ui/react-slot": "^1.2.4", "@radix-ui/react-tabs": "^1.1.13", + "@radix-ui/react-tooltip": "^1.2.8", "class-variance-authority": "^0.7.1", "clsx": "^2.1.1", "lucide-react": "^0.298.0", @@ -35,6 +36,7 @@ "@types/react-dom": "^18.2.7", "@vitejs/plugin-react": "^4.2.1", "autoprefixer": "^10.4.16", + "baseline-browser-mapping": "^2.9.15", "jsdom": "^24.0.0", "postcss": "^8.4.31", "tailwindcss": "^3.4.14", @@ -1709,6 +1711,58 @@ } } }, + "node_modules/@radix-ui/react-tooltip": { + "version": "1.2.8", + "resolved": "https://registry.npmjs.org/@radix-ui/react-tooltip/-/react-tooltip-1.2.8.tgz", + "integrity": "sha512-tY7sVt1yL9ozIxvmbtN5qtmH2krXcBCfjEiCgKGLqunJHvgvZG2Pcl2oQ3kbcZARb1BGEHdkLzcYGO8ynVlieg==", + "license": "MIT", + "dependencies": { + "@radix-ui/primitive": "1.1.3", + "@radix-ui/react-compose-refs": "1.1.2", + "@radix-ui/react-context": "1.1.2", + "@radix-ui/react-dismissable-layer": "1.1.11", + "@radix-ui/react-id": "1.1.1", + "@radix-ui/react-popper": "1.2.8", + "@radix-ui/react-portal": "1.1.9", + "@radix-ui/react-presence": "1.1.5", + "@radix-ui/react-primitive": "2.1.3", + "@radix-ui/react-slot": "1.2.3", + "@radix-ui/react-use-controllable-state": "1.2.2", + "@radix-ui/react-visually-hidden": "1.2.3" + }, + "peerDependencies": { + "@types/react": "*", + "@types/react-dom": "*", + "react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || 
^19.0.0-rc", + "react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc" + }, + "peerDependenciesMeta": { + "@types/react": { + "optional": true + }, + "@types/react-dom": { + "optional": true + } + } + }, + "node_modules/@radix-ui/react-tooltip/node_modules/@radix-ui/react-slot": { + "version": "1.2.3", + "resolved": "https://registry.npmjs.org/@radix-ui/react-slot/-/react-slot-1.2.3.tgz", + "integrity": "sha512-aeNmHnBxbi2St0au6VBVC7JXFlhLlOnvIIlePNniyUNAClzmtAUEY8/pBiK3iHjufOlwA+c20/8jngo7xcrg8A==", + "license": "MIT", + "dependencies": { + "@radix-ui/react-compose-refs": "1.1.2" + }, + "peerDependencies": { + "@types/react": "*", + "react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc" + }, + "peerDependenciesMeta": { + "@types/react": { + "optional": true + } + } + }, "node_modules/@radix-ui/react-use-callback-ref": { "version": "1.1.1", "resolved": "https://registry.npmjs.org/@radix-ui/react-use-callback-ref/-/react-use-callback-ref-1.1.1.tgz", @@ -3047,9 +3101,9 @@ "license": "MIT" }, "node_modules/baseline-browser-mapping": { - "version": "2.8.28", - "resolved": "https://registry.npmjs.org/baseline-browser-mapping/-/baseline-browser-mapping-2.8.28.tgz", - "integrity": "sha512-gYjt7OIqdM0PcttNYP2aVrr2G0bMALkBaoehD4BuRGjAOtipg0b6wHg1yNL+s5zSnLZZrGHOw4IrND8CD+3oIQ==", + "version": "2.9.15", + "resolved": "https://registry.npmjs.org/baseline-browser-mapping/-/baseline-browser-mapping-2.9.15.tgz", + "integrity": "sha512-kX8h7K2srmDyYnXRIppo4AH/wYgzWVCs+eKr3RusRSQ5PvRYoEFmR/I0PbdTjKFAoKqp5+kbxnNTFO9jOfSVJg==", "dev": true, "license": "Apache-2.0", "bin": { diff --git a/web/package.json b/web/package.json index 5daf043..9ee6cae 100644 --- a/web/package.json +++ b/web/package.json @@ -17,13 +17,14 @@ "@radix-ui/react-select": "^2.2.6", "@radix-ui/react-slot": "^1.2.4", "@radix-ui/react-tabs": "^1.1.13", + "@radix-ui/react-tooltip": "^1.2.8", "class-variance-authority": "^0.7.1", "clsx": "^2.1.1", "lucide-react": "^0.298.0", "react": "^18.2.0", "react-dom": 
"^18.2.0", - "reactflow": "^11.11.4", "react-router-dom": "^6.20.0", + "reactflow": "^11.11.4", "recharts": "^2.8.0", "shadcn-ui": "^0.9.5", "tailwind-merge": "^1.14.0", @@ -38,6 +39,7 @@ "@types/react-dom": "^18.2.7", "@vitejs/plugin-react": "^4.2.1", "autoprefixer": "^10.4.16", + "baseline-browser-mapping": "^2.9.15", "jsdom": "^24.0.0", "postcss": "^8.4.31", "tailwindcss": "^3.4.14", diff --git a/web/src/components/common/MetricCard.tsx b/web/src/components/common/MetricCard.tsx new file mode 100644 index 0000000..83c7e7c --- /dev/null +++ b/web/src/components/common/MetricCard.tsx @@ -0,0 +1,39 @@ +import { Card, CardContent, CardHeader, CardTitle } from "../ui/card"; +import { cn } from "../../lib/utils"; + +interface MetricCardProps { + title: string; + value: string; + subtext?: string; + trend?: string; + trendUp?: boolean; + className?: string; + valueClassName?: string; +} + +export function MetricCard({ title, value, subtext, trend, trendUp, className, valueClassName }: MetricCardProps) { + return ( + + + + {title} + + + +
+ {value} +
+ {(subtext || trend) && ( +
+ {trend && ( + + {trend} + + )} + {subtext} +
+ )} +
+
+ ); +} diff --git a/web/src/components/nodes/EfficiencyBar.tsx b/web/src/components/nodes/EfficiencyBar.tsx new file mode 100644 index 0000000..de3cb22 --- /dev/null +++ b/web/src/components/nodes/EfficiencyBar.tsx @@ -0,0 +1,73 @@ +import { Progress } from "@/components/ui/progress"; +import { formatCurrency } from "../../lib/utils"; +import { Tooltip, TooltipContent, TooltipProvider, TooltipTrigger } from "@/components/ui/tooltip"; + +interface EfficiencyBarProps { + usagePercent: number; + requestPercent: number; + costPerMonth: number; + cpuCores?: string; + className?: string; // Added to match usage +} + +export function EfficiencyBar({ usagePercent, requestPercent, costPerMonth }: EfficiencyBarProps) { + // FinOps Logic: + // If Request >>> Usage, we have waste. + // The "Gap" visually shows this. + + const wastePercent = Math.max(0, requestPercent - usagePercent); + const wastedCost = costPerMonth * (wastePercent / 100); + + return ( +
+ {/* Top Bar: Actual Usage (The "Real" Work) */} +
+ Usage + {usagePercent.toFixed(0)}% +
+ + + {/* Bottom Bar: Reserved / Requested (The "Billable" Reservation) */} +
+ Reserved + {requestPercent.toFixed(0)}% +
+ + + +
+ +
+
+ +
+

Efficiency Gap

+
+ Usage: + {usagePercent.toFixed(1)}% +
+
+ Reserved: + {requestPercent.toFixed(1)}% +
+ {wastedCost > 1 && ( +
+ Waste: + {formatCurrency(wastedCost)}/mo +
+ )} +
+
+
+
+
+ ); +} diff --git a/web/src/components/nodes/NodeDetailSheet.tsx b/web/src/components/nodes/NodeDetailSheet.tsx index 45a5e79..83509dd 100644 --- a/web/src/components/nodes/NodeDetailSheet.tsx +++ b/web/src/components/nodes/NodeDetailSheet.tsx @@ -1,8 +1,10 @@ import { Sheet, SheetContent, SheetDescription, SheetHeader, SheetTitle } from "../ui/sheet"; import { Progress } from "../ui/progress"; import { Badge } from "../ui/badge"; +import { Tabs, TabsList, TabsTrigger } from "../ui/tabs"; import { formatCurrency } from "../../lib/utils"; -import type { NodeCost } from "../../lib/api"; +import { fetchNodeStats, type NodeCost, type NodeStats } from "../../lib/api"; +import { useState, useEffect } from "react"; interface NodeDetailSheetProps { node: (NodeCost & { monthlyCost: number }) | null; @@ -17,6 +19,25 @@ const statusStyles: Record = { }; const NodeDetailSheet = ({ node, open, onOpenChange }: NodeDetailSheetProps) => { + const [window, setWindow] = useState("24h"); + const [stats, setStats] = useState(null); + const [loading, setLoading] = useState(false); + + useEffect(() => { + if (open && node) { + setLoading(true); + fetchNodeStats(node.nodeName, window) + .then(setStats) + .catch((err) => { + console.error("Failed to fetch node stats", err); + setStats(null); + }) + .finally(() => setLoading(false)); + } else { + setStats(null); + } + }, [open, node, window]); + if (!node) return null; const statusBadge = ( @@ -33,7 +54,7 @@ const NodeDetailSheet = ({ node, open, onOpenChange }: NodeDetailSheetProps) => return ( - + {node.nodeName} @@ -47,14 +68,23 @@ const NodeDetailSheet = ({ node, open, onOpenChange }: NodeDetailSheetProps) => -
+
-

Monthly cost

-

{formatCurrency(node.monthlyCost)}

-

{formatCurrency(node.hourlyCost, { maximumFractionDigits: 2 })}/hr

+

Current State

+
+
+

{formatCurrency(node.monthlyCost)}

+

Monthly cost

+
+
+

{formatCurrency(node.hourlyCost, { maximumFractionDigits: 2 })}

+

Hourly cost

+
+
+

Current Usage

CPU usage @@ -69,11 +99,60 @@ const NodeDetailSheet = ({ node, open, onOpenChange }: NodeDetailSheetProps) =>
+

{usageSummary}

-
-

Status

-

{usageSummary}

+
+ +
+
+

Historical Analysis

+ + + 24h + 7d + 30d + + +
+ + {loading ? ( +
+
+
+
+ ) : stats ? ( + <> +
+
+ Real Usage Cost + {formatCurrency(stats.realUsageMonthlyCost)} + + vs {formatCurrency(stats.totalMonthlyCost)} potential + +
+
+ +
+
+
+ P95 CPU ({window}) + {stats.p95CpuUsagePercent.toFixed(1)}% +
+ +
+
+
+ P95 Memory ({window}) + {stats.p95MemoryUsagePercent.toFixed(1)}% +
+ +
+
+ + ) : ( +

No historical data available.

+ )}
diff --git a/web/src/components/ui/progress.tsx b/web/src/components/ui/progress.tsx index 3fd47ad..54d2611 100644 --- a/web/src/components/ui/progress.tsx +++ b/web/src/components/ui/progress.tsx @@ -5,8 +5,8 @@ import { cn } from "@/lib/utils" const Progress = React.forwardRef< React.ElementRef, - React.ComponentPropsWithoutRef ->(({ className, value, ...props }, ref) => ( + React.ComponentPropsWithoutRef & { indicatorClassName?: string } +>(({ className, indicatorClassName, value, ...props }, ref) => ( diff --git a/web/src/components/ui/tooltip.tsx b/web/src/components/ui/tooltip.tsx new file mode 100644 index 0000000..b0542cb --- /dev/null +++ b/web/src/components/ui/tooltip.tsx @@ -0,0 +1,28 @@ +import * as React from "react" +import * as TooltipPrimitive from "@radix-ui/react-tooltip" + +import { cn } from "../../lib/utils" + +const TooltipProvider = TooltipPrimitive.Provider + +const Tooltip = TooltipPrimitive.Root + +const TooltipTrigger = TooltipPrimitive.Trigger + +const TooltipContent = React.forwardRef< + React.ElementRef, + React.ComponentPropsWithoutRef +>(({ className, sideOffset = 4, ...props }, ref) => ( + +)) +TooltipContent.displayName = TooltipPrimitive.Content.displayName + +export { Tooltip, TooltipTrigger, TooltipContent, TooltipProvider } diff --git a/web/src/lib/api.ts b/web/src/lib/api.ts index 3805d12..0901e5b 100644 --- a/web/src/lib/api.ts +++ b/web/src/lib/api.ts @@ -1,6 +1,9 @@ import type { Environment } from "./utils"; -const API_PREFIX = "/api"; +// Allow overriding the API base via env var (useful for production builds or direct CORS) +// If VITE_API_URL is set (e.g. "https://api.example.com"), we use that. +// Otherwise default to local proxy "/api". 
+const API_PREFIX = import.meta.env.VITE_API_URL || "/api"; const normalizeEnvironment = (value?: string): Environment => { switch ((value || "").toLowerCase()) { @@ -112,6 +115,10 @@ type NodeCostApi = { hourlyCost: number; cpuUsagePercent: number; memoryUsagePercent: number; + cpuRequestedMilli?: number; + cpuLimitMilli?: number; + memoryRequestedBytes?: number; + memoryLimitBytes?: number; cpuAllocatableMilli?: number; memoryAllocatableBytes?: number; podCount: number; @@ -341,6 +348,19 @@ export const fetchNodes = async (): Promise => { })); }; +export interface NodeStats { + nodeName: string; + p95CpuUsagePercent: number; + p95MemoryUsagePercent: number; + totalMonthlyCost: number; + realUsageMonthlyCost: number; + window: string; +} +// fetchNodeStats loads historical stats for one node. Day windows ("7d") are sent as hours ("168h") because Go's time.ParseDuration, used by the stats handler, has no day unit; the node name is URL-encoded as a path segment. +export const fetchNodeStats = async (name: string, window: string): Promise => { + return request(`/cost/nodes/${encodeURIComponent(name)}/stats?window=${encodeURIComponent(window.replace(/^(\d+)d$/, (_match, days) => `${Number(days) * 24}h`))}`); +}; + export const fetchResources = async (): Promise => { const resp = await request("/cost/resources"); return { diff --git a/web/src/pages/namespaces/NamespacesPage.test.tsx b/web/src/pages/namespaces/NamespacesPage.test.tsx index 2f115ed..d8783e4 100644 --- a/web/src/pages/namespaces/NamespacesPage.test.tsx +++ b/web/src/pages/namespaces/NamespacesPage.test.tsx @@ -11,6 +11,7 @@ vi.mock("../../hooks/useApiData", () => ({ })); // Mock ResizeObserver +// @ts-ignore global.ResizeObserver = class ResizeObserver { observe() { } unobserve() { } diff --git a/web/src/pages/nodes/NodesPage.tsx b/web/src/pages/nodes/NodesPage.tsx index e78218d..f612888 100644 --- a/web/src/pages/nodes/NodesPage.tsx +++ b/web/src/pages/nodes/NodesPage.tsx @@ -1,365 +1,310 @@ import { useMemo, useState, type ChangeEvent } from "react"; import { fetchNodes, type NodeCost } from "../../lib/api"; -import { formatCurrency, formatPercentage, relativeTimeFromIso, toMonthlyCost } from "../../lib/utils"; +import { formatCurrency, formatPercentage, relativeTimeFromIso, toMonthlyCost, milliToCores } from
"../../lib/utils"; import { useApiData } from "../../hooks/useApiData"; import { Card, CardContent, CardHeader, CardTitle } from "@/components/ui/card"; import { Button } from "@/components/ui/button"; -import { Progress } from "@/components/ui/progress"; import { Badge } from "@/components/ui/badge"; import { Input } from "@/components/ui/input"; import { Table, TableBody, TableCell, TableHead, TableHeader, TableRow } from "@/components/ui/table"; import { Skeleton } from "@/components/ui/skeleton"; import NodeDetailSheet from "@/components/nodes/NodeDetailSheet"; +import { MetricCard } from "@/components/common/MetricCard"; +import { EfficiencyBar } from "@/components/nodes/EfficiencyBar"; +import { AlertTriangleIcon, CheckCircle2Icon, SearchIcon, ArrowDownIcon } from "lucide-react"; +import { Tooltip, TooltipContent, TooltipProvider, TooltipTrigger } from "@/components/ui/tooltip"; -const statusStyles: Record = { - Ready: "border-emerald-500/40 bg-emerald-500/10 text-emerald-200", - NotReady: "border-destructive/40 bg-destructive/10 text-destructive", - Unknown: "border-muted bg-muted/40 text-muted-foreground" -}; - -type SortKey = "cost" | "cpu" | "memory"; +type SortKey = "cost" | "waste" | "efficiency"; const NodesPage = () => { const { data, loading, error, refresh } = useApiData(fetchNodes); const nodes = data ?? 
[]; const [search, setSearch] = useState(""); - const [sortKey, setSortKey] = useState("cost"); + const [sortKey, setSortKey] = useState("waste"); const [sortDirection, setSortDirection] = useState<"asc" | "desc">("desc"); const [selectedNode, setSelectedNode] = useState<(NodeCost & { monthlyCost: number }) | null>(null); + // Fallback monthly cost estimate keyed on the instance-size suffix; "2xlarge" is tested before "xlarge" because every "2xlarge" name also contains "xlarge". + const getEstimatedCost = (instanceType: string | undefined): number => { + if (!instanceType) return 73; + if (instanceType.includes("nano")) return 4; + if (instanceType.includes("micro")) return 8; + if (instanceType.includes("small")) return 16; + if (instanceType.includes("medium")) return 32; + if (instanceType.includes("large") && !instanceType.includes("xlarge")) return 64; + if (instanceType.includes("2xlarge")) return 256; + if (instanceType.includes("xlarge")) return 128; + return 73; + }; + const derivedNodes = useMemo(() => { - return nodes.map((node) => ({ - ...node, - cpuUsagePercent: node.cpuUsagePercent ?? 0, - memoryUsagePercent: node.memoryUsagePercent ?? 0, - hourlyCost: node.hourlyCost ?? 0, - monthlyCost: toMonthlyCost(node.hourlyCost ?? 0) - })); + return nodes.map((node) => { + let hourlyCost = node.hourlyCost ?? 0; + let isEstimate = false; + + if (hourlyCost === 0) { + hourlyCost = getEstimatedCost(node.instanceType) / 730; + isEstimate = true; + } + + const monthlyCost = hourlyCost * 730; + const cpuAllocatable = node.cpuAllocatableMilli ?? 0; + + const cpuRequestPercent = cpuAllocatable > 0 ? ((node.cpuRequestedMilli ?? 0) / cpuAllocatable) * 100 : 0; + const cpuUsage = node.cpuUsagePercent ?? 0; + + // Calculate Memory Stats + const memAllocatable = node.memoryAllocatableBytes ?? 0; + const memRequestPercent = memAllocatable > 0 ? ((node.memoryRequestedBytes ?? 0) / memAllocatable) * 100 : 0; + const memUsage = node.memoryUsagePercent ??
0; + + // FinOps Waste Calculation: Paying for Request but not Using it (CPU dominant for now, but could blend) + const wastePercent = Math.max(0, cpuRequestPercent - cpuUsage); + const wasteAmount = monthlyCost * (wastePercent / 100); + + const isEfficient = wastePercent < 15; + const isOverProvisioned = wastePercent > 30; + + return { + ...node, + cpuUsagePercent: cpuUsage, + cpuRequestPercent, + memoryUsagePercent: memUsage, + memRequestPercent, + monthlyCost, + isEstimate, + wastePercent, + wasteAmount, + isEfficient, + isOverProvisioned, + shortName: node.nodeName.length > 20 ? node.nodeName.substring(0, 15) + "..." : node.nodeName + }; + }); }, [nodes]); const summary = useMemo(() => { - const totalMonthly = derivedNodes.reduce((sum, node) => sum + node.monthlyCost, 0); - const avgCpu = derivedNodes.length - ? derivedNodes.reduce((sum, node) => sum + node.cpuUsagePercent, 0) / derivedNodes.length - : 0; - const avgMem = derivedNodes.length - ? derivedNodes.reduce((sum, node) => sum + node.memoryUsagePercent, 0) / derivedNodes.length - : 0; - const issueCount = derivedNodes.filter((node) => node.status !== "Ready" || node.isUnderPressure).length; - return { totalMonthly, avgCpu, avgMem, ready: derivedNodes.length - issueCount, issues: issueCount }; - }, [derivedNodes]); + const totalMonthly = derivedNodes.reduce((sum, n) => sum + n.monthlyCost, 0); + const totalWaste = derivedNodes.reduce((sum, n) => sum + n.wasteAmount, 0); + const potentialSavings = totalWaste * 0.6; // Conservative achievable savings - const filteredNodes = useMemo(() => { - const term = search.trim().toLowerCase(); - if (!term) return derivedNodes; - return derivedNodes.filter((node) => node.nodeName.toLowerCase().includes(term)); - }, [derivedNodes, search]); + return { totalMonthly, totalWaste, potentialSavings }; + }, [derivedNodes]); const sortedNodes = useMemo(() => { - const rows = [...filteredNodes]; - const valueFor = (node: (typeof derivedNodes)[number]) => { - if (sortKey === 
"cpu") return node.cpuUsagePercent; - if (sortKey === "memory") return node.memoryUsagePercent; - return node.monthlyCost; - }; + const rows = [...derivedNodes]; rows.sort((a, b) => { - const diff = valueFor(a) - valueFor(b); - return sortDirection === "asc" ? diff : -diff; + const valA = sortKey === "waste" ? a.wasteAmount : (sortKey === "cost" ? a.monthlyCost : a.wastePercent); + const valB = sortKey === "waste" ? b.wasteAmount : (sortKey === "cost" ? b.monthlyCost : b.wastePercent); + return sortDirection === "asc" ? valA - valB : valB - valA; }); return rows; - }, [filteredNodes, sortKey, sortDirection]); + }, [derivedNodes, sortKey, sortDirection]); + // Sorting Handler const handleSort = (key: SortKey) => { - if (key === sortKey) { - setSortDirection((dir) => (dir === "desc" ? "asc" : "desc")); - } else { - setSortKey(key); - setSortDirection("desc"); - } + if (key === sortKey) setSortDirection(d => d === "desc" ? "asc" : "desc"); + else { setSortKey(key); setSortDirection("desc"); } }; - const optimizationCandidates = useMemo(() => { - if (!derivedNodes.length) return []; - const sortedCosts = [...derivedNodes].sort((a, b) => b.monthlyCost - a.monthlyCost); - const index = Math.max(0, Math.floor(sortedCosts.length * 0.3) - 1); - const costThreshold = sortedCosts[index]?.monthlyCost ?? 
0; - return derivedNodes - .filter( - (node) => - node.monthlyCost >= costThreshold && node.cpuUsagePercent < 35 && node.memoryUsagePercent < 35 - ) - .sort((a, b) => b.monthlyCost - a.monthlyCost) - .slice(0, 5); - }, [derivedNodes]); - - const alerts = useMemo(() => { - return derivedNodes - .map((node) => { - const reasons: string[] = []; - if (node.status !== "Ready") reasons.push(node.status); - if (node.cpuUsagePercent >= 85) reasons.push(`CPU ${node.cpuUsagePercent.toFixed(0)}%`); - if (node.memoryUsagePercent >= 85) reasons.push(`Memory ${node.memoryUsagePercent.toFixed(0)}%`); - if (node.isUnderPressure) reasons.push("Under pressure"); - return { node, reasons }; - }) - .filter((item) => item.reasons.length > 0) - .slice(0, 5); - }, [derivedNodes]); - - const lastUpdatedLabel = data?.[0]?.lastUpdated ? relativeTimeFromIso(data[0].lastUpdated) : "moments ago"; - - const renderSortLabel = (key: SortKey, label: string) => ( - - ); - - if (loading && !data) { - return ; - } - - if (error) { - return ( - - {error} - - ); - } - - if (!derivedNodes.length) { - return ( - - - No nodes - - -

We couldn’t find any nodes. Once data arrives it will show up here.

- -
-
- ); - } + if (loading && !data) return ; + if (error) return
Failed to load data: {error}
; return ( -
-
+
+ {/* Header Section */} +
-

Nodes

-

See how much each node costs and how full it is.

+

Cluster Financials

+

Real-time analysis of infrastructure efficiency and waste.

-
Last updated {lastUpdatedLabel}
-
+
+ + +
+
-
- - - Total node cost - - -

{formatCurrency(summary.totalMonthly)}

-

Monthly (hourly x 30 days)

-
-
- + {/* The "Truth" Cards - High Impact Typography */} +
+ - Avg CPU usage + Monthly Spend -

{formatPercentage(summary.avgCpu, { fractionDigits: 0 })}

- +
+ {formatCurrency(summary.totalMonthly)} +
+

+ Run rate based on current capacity +

- + + +
+ {summary.totalWaste > 0 && ( + Action Required + )} +
- Avg memory usage + + Monthly Waste + -

{formatPercentage(summary.avgMem, { fractionDigits: 0 })}

- +
0 ? "text-destructive" : "text-emerald-500"}`}> + {formatCurrency(summary.totalWaste)} +
+

+ Money burned on unused reservations +

- + + - Node health + Actionable Savings -

- {summary.ready} Ready · {summary.issues} With issues +

+ {formatCurrency(summary.potentialSavings)} +
+

+ Conservative achievable reduction

-

Issues = NotReady or under pressure

-
+
-
- - -
- Nodes -

Sorted by monthly cost

-
-
- ) => setSearch(event.target.value)} - className="h-9 w-full" - /> -
-
- -
-
-
- - - - Node - {renderSortLabel("cost", "Monthly cost")} - {renderSortLabel("cpu", "CPU usage")} - {renderSortLabel("memory", "Memory usage")} - Status - - - - {sortedNodes.map((node) => ( - setSelectedNode(node)}> - -
- {node.nodeName} - {node.instanceType && ( - - {node.instanceType} - - )} -
-
- - {formatCurrency(node.monthlyCost)} - - -
- - {node.cpuUsagePercent.toFixed(0)}% -
-
- -
- - {node.memoryUsagePercent.toFixed(0)}% -
-
- - - {node.status} - - -
- ))} -
-
-
-
-
-
- {sortedNodes.map((node) => ( - - ))} -
-
-
+ {/* FinOps Table - High Density */} + +
+
+ + {sortedNodes.length} Nodes + + {/* Future: Add Filter logic here */} +
+
+ + setSearch(e.target.value)} + /> +
+
+ +
+ + + + Node Identity + handleSort("cost")}> + Cost + + CPU Efficiency + Memory Efficiency + handleSort("waste")}>Action + + + + {sortedNodes.filter(n => n.nodeName.includes(search)).map((node) => ( + -
- - - Optimization ideas - - - {optimizationCandidates.length === 0 ? ( -

Looks good, no obvious wasted nodes right now.

- ) : ( - optimizationCandidates.map((node) => ( -
-
- {node.nodeName} - {formatCurrency(node.monthlyCost)} + {/* Column 1: Identity */} + +
+ {node.instanceType || "Unknown"} + + + + + {node.nodeName} + + + +

{node.nodeName}

+
+
+
-

- {node.cpuUsagePercent.toFixed(0)}% CPU · {node.memoryUsagePercent.toFixed(0)}% Mem -

-
- )) - )} - - + - - - Alerts - - - {alerts.length === 0 ? ( -

All nodes healthy.

- ) : ( - alerts.map(({ node, reasons }) => ( -
-
- {node.nodeName} - Check + {/* Column 2: Cost */} + +
+
+ + {formatCurrency(node.monthlyCost)} + + {node.isEstimate && ( + + + * + Estimated Cost + + + )} +
+ + ${Number(node.hourlyCost.toFixed(4))}/hr +
-

{reasons.join(" · ")}

-
- )) - )} - - + + + {/* Column 3: CPU Efficiency (Stacked) */} + + + + + {/* Column 4: RAM Efficiency (Stacked) */} + + + + + {/* Column 5: Action */} + + {node.isEfficient ? ( + + Optimized + + ) : ( +
+ + + Save ~{formatCurrency(node.wasteAmount)} + +
+ )} +
+ + ))} + +
-
+ { - if (!open) { - setSelectedNode(null); - } - }} + onOpenChange={(open) => { if (!open) setSelectedNode(null); }} />
); diff --git a/web/src/pages/resources/ResourcesPage.tsx b/web/src/pages/resources/ResourcesPage.tsx index 66d173a..b1ae086 100644 --- a/web/src/pages/resources/ResourcesPage.tsx +++ b/web/src/pages/resources/ResourcesPage.tsx @@ -13,6 +13,7 @@ import { import { Card, CardContent, CardHeader, CardTitle } from "@/components/ui/card"; import { Progress } from "@/components/ui/progress"; import { Skeleton } from "@/components/ui/skeleton"; +import { Button } from "@/components/ui/button"; import { Sheet, SheetContent, SheetDescription, SheetHeader, SheetTitle } from "@/components/ui/sheet"; import { Badge } from "@/components/ui/badge"; import { ResponsiveContainer, Bar, BarChart, Legend, Tooltip, XAxis, YAxis } from "recharts"; diff --git a/web/vite.config.ts b/web/vite.config.ts index 7178a0a..a1afe31 100644 --- a/web/vite.config.ts +++ b/web/vite.config.ts @@ -1,27 +1,43 @@ -import { defineConfig } from "vite"; +import { defineConfig, loadEnv } from "vite"; import react from "@vitejs/plugin-react"; import path from "path" -export default defineConfig({ - plugins: [react()], - resolve: { - alias: { - "@": path.resolve(__dirname, "./src"), +export default defineConfig(({ mode }) => { + // Load env file based on `mode` in the current working directory. + // Set the third parameter to '' to load all env regardless of the `VITE_` prefix. 
+ const env = loadEnv(mode, process.cwd(), ''); + + // Priority: Shell Env (process.env) > .env File (env) > Default + const apiTarget = process.env.VITE_API_TARGET || env.VITE_API_TARGET || "http://localhost:9090"; + + console.log(`[Vite] Proxying /api to: ${apiTarget}`); + + return { + plugins: [react()], + resolve: { + alias: { + "@": path.resolve(__dirname, "./src"), + }, + }, + server: { + port: 5173, + proxy: { + "/api": { + target: apiTarget, + changeOrigin: true, + // If you want to strip /api prefix from the request when proxying: + // rewrite: (path) => path.replace(/^\/api/, ''), + } + } + }, + build: { + outDir: "dist", + emptyOutDir: true }, - }, - server: { - port: 5173, - proxy: { - "/api": "http://localhost:9090" + test: { + globals: true, + environment: "jsdom", + setupFiles: "./src/test/setup.ts" } - }, - build: { - outDir: "dist", - emptyOutDir: true - }, - test: { - globals: true, - environment: "jsdom", - setupFiles: "./src/test/setup.ts" - } + }; }); From b9f43fd2fc57b6dedf981b292d5fd05b4c45f8eb Mon Sep 17 00:00:00 2001 From: Jesus Paz Date: Tue, 20 Jan 2026 20:34:25 -0500 Subject: [PATCH 2/8] feat: update node dashboard --- internal/api/handlers_nodes.go | 1 + internal/store/store.go | 5 + internal/vm/dashboard.go | 308 +++++++++++++++++------------- web/src/lib/api.ts | 3 + web/src/pages/nodes/NodesPage.tsx | 94 ++++++--- 5 files changed, 247 insertions(+), 164 deletions(-) diff --git a/internal/api/handlers_nodes.go b/internal/api/handlers_nodes.go index b43e825..b78e54a 100644 --- a/internal/api/handlers_nodes.go +++ b/internal/api/handlers_nodes.go @@ -23,6 +23,7 @@ func (h *Handler) Nodes(w http.ResponseWriter, r *http.Request) { Search: q.Get("search"), Limit: parseLimit(q.Get("limit"), defaultNodeLimit, maxNodeLimit), Offset: parseOffset(q.Get("offset")), + Window: q.Get("window"), // "24h", "7d", "30d" } resp, err := h.vm.NodeList(ctx, filter) diff --git a/internal/store/store.go b/internal/store/store.go index 2abfc82..09b960c 100644 
--- a/internal/store/store.go +++ b/internal/store/store.go @@ -132,6 +132,10 @@ type NodeSummary struct { EgressPublicBytes int64 `json:"egressPublicBytes"` EgressCrossAZBytes int64 `json:"egressCrossAZBytes"` EgressInternalBytes int64 `json:"egressInternalBytes"` + // Historical / Window Data + ActiveHours float64 `json:"activeHours"` // Hours active in the selected window + ActiveRatio float64 `json:"activeRatio"` // 0.0 - 1.0 + WindowCost float64 `json:"windowCost"` // Actual cost incurred in the window } // NodeListResponse wraps paginated node results. @@ -275,6 +279,7 @@ type NodeFilter struct { Search string Limit int Offset int + Window string // "24h", "7d", "30d" } // PodContext wraps a PodMetric with its location metadata. diff --git a/internal/vm/dashboard.go b/internal/vm/dashboard.go index f5d0703..b5dba82 100644 --- a/internal/vm/dashboard.go +++ b/internal/vm/dashboard.go @@ -131,7 +131,7 @@ func (c *Client) NamespaceDetail(ctx context.Context, name string) (store.Namesp } func (c *Client) NodeList(ctx context.Context, filter store.NodeFilter) (store.NodeListResponse, error) { - nodes, ts, err := c.nodeMetrics(ctx, "") + nodes, ts, err := c.nodeMetrics(ctx, "", filter.Window) if err != nil { return store.NodeListResponse{}, err } @@ -169,7 +169,7 @@ func (c *Client) NodeList(ctx context.Context, filter store.NodeFilter) (store.N } func (c *Client) NodeDetail(ctx context.Context, name string) (store.NodeSummary, error) { - nodes, _, err := c.nodeMetrics(ctx, name) + nodes, _, err := c.nodeMetrics(ctx, name, "") if err != nil { return store.NodeSummary{}, err } @@ -790,7 +790,27 @@ func (c *Client) namespaceMetrics(ctx context.Context, environment, namespace st return out, latest, nil } -func (c *Client) nodeMetrics(ctx context.Context, nodeName string) (map[string]*store.NodeSummary, time.Time, error) { +func (c *Client) Nodes(ctx context.Context, window string) ([]store.NodeSummary, error) { + nodeMetrics, _, err := c.nodeMetrics(ctx, "", window) + 
if err != nil { + return nil, err + } + + out := make([]store.NodeSummary, 0, len(nodeMetrics)) + for _, n := range nodeMetrics { + n.Labels = nil // Optimization: potentially clear heavy labels if not needed + out = append(out, *n) + } + + // Sort by Cost desc + sort.Slice(out, func(i, j int) bool { + return out[i].WindowCost > out[j].WindowCost + }) + + return out, nil +} + +func (c *Client) nodeMetrics(ctx context.Context, nodeName, window string) (map[string]*store.NodeSummary, time.Time, error) { clusterID := c.resolveClusterID(ctx) ctx = WithClusterID(ctx, clusterID) labels := map[string]string{} @@ -798,141 +818,153 @@ func (c *Client) nodeMetrics(ctx context.Context, nodeName string) (map[string]* labels["node"] = nodeName } - metrics := []struct { - name string - assign func(entry *store.NodeSummary, value float64, labels map[string]string) - }{ - {"clustercost_node_hourly_cost", func(e *store.NodeSummary, v float64, l map[string]string) { - e.HourlyCost = v - if e.InstanceType == "" { - e.InstanceType = l["instance_type"] - if e.InstanceType == "" { - e.InstanceType = l["node_label_node_kubernetes_io_instance_type"] - } - if e.InstanceType == "" { - e.InstanceType = l["node_label_beta_kubernetes_io_instance_type"] - } - } - for k, v := range l { - e.Labels[k] = v + // Parse Window + var windowDur time.Duration + var lookbackFunc string = "max_over_time" // Default for "current" view (snapshot-ish) + var windowStr string = c.lookback.String() // Default internal lookback + + if window != "" { + d, err := time.ParseDuration(window) + if err == nil { + windowDur = d + windowStr = window + lookbackFunc = "avg_over_time" + } + } else { + // Assuming standard "current" view implies "1h" or just last scrape? + // For consistency with existing logic, we keep standard lookback but use max/last. 
+ } + + // If Windowed View: Primary source is agent_up to find ALL nodes active in window + // If Snapshot View: Primary source is usually node_info or just scraping metrics. + // We'll use the same multi-metric approach but adjust the aggregation. + + out := make(map[string]*store.NodeSummary) + + // Helper to safely assign to out map + getOrCreate := func(node string) *store.NodeSummary { + if node == "" { + return nil + } + if _, ok := out[node]; !ok { + out[node] = &store.NodeSummary{ + NodeName: node, + Labels: map[string]string{}, + Taints: []string{}, } - }}, - {"clustercost_node_cpu_usage_percent", func(e *store.NodeSummary, v float64, _ map[string]string) { e.CPUUsagePercent = v }}, - {"clustercost_node_memory_usage_percent", func(e *store.NodeSummary, v float64, _ map[string]string) { e.MemoryUsagePercent = v }}, - {"clustercost_node_cpu_allocatable_milli", func(e *store.NodeSummary, v float64, l map[string]string) { - e.CPUAllocatableMilli = int64(v) - if e.InstanceType == "" { - e.InstanceType = l["instance_type"] - if e.InstanceType == "" { - e.InstanceType = l["node_label_node_kubernetes_io_instance_type"] + } + return out[node] + } + + // 1. 
Availability / Active Time + // Query: avg_over_time(clustercost_agent_up[window]) + // Value: 0.0 - 1.0 (fraction of time active) + availExpr := fmt.Sprintf("avg_over_time(clustercost_agent_up%s[%s])", formatLabels(c.scopedLabels(labels, clusterID)), windowStr) + availSamples, err := c.query(ctx, availExpr) + if err == nil { + for _, s := range availSamples { + node := s.labels["node"] + entry := getOrCreate(node) + if entry != nil { + entry.ActiveRatio = s.value + if windowDur > 0 { + entry.ActiveHours = s.value * windowDur.Hours() + } else { + // Default assumption if no window: 100% active (snapshot) + entry.ActiveRatio = 1.0 + entry.ActiveHours = 24 * 30 // Monthly projection basis } - if e.InstanceType == "" { - e.InstanceType = l["node_label_beta_kubernetes_io_instance_type"] + + // Extract Metadata from Agent Up + if entry.InstanceType == "" { + entry.InstanceType = valueOrDefault(s.labels["instance_type"], + valueOrDefault(s.labels["node_label_node_kubernetes_io_instance_type"], + s.labels["node_label_beta_kubernetes_io_instance_type"])) + } + if entry.Labels["topology_kubernetes_io_region"] == "" { + entry.Labels["topology_kubernetes_io_region"] = s.labels["cluster_region"] } } - // Capture region from labels if not already present in basic labels - for k, v := range l { - e.Labels[k] = v - } - }}, - {"clustercost_node_memory_allocatable_bytes", func(e *store.NodeSummary, v float64, _ map[string]string) { e.MemoryAllocatableBytes = int64(v) }}, - {"clustercost_node_cpu_requested_milli", func(e *store.NodeSummary, v float64, l map[string]string) { - e.CPURequestedMilli = int64(v) - if e.InstanceType == "" { - e.InstanceType = l["instance_type"] - } - }}, - {"clustercost_node_memory_requested_bytes", func(e *store.NodeSummary, v float64, l map[string]string) { - e.MemoryRequestedBytes = int64(v) + } + } + + // 2. Metrics List + metrics := []struct { + name string + validLookback bool // if false, use standard lookback (e.g. 
for info that doesn't vary) + assign func(entry *store.NodeSummary, value float64, labels map[string]string) + }{ + {"clustercost_node_hourly_cost", true, func(e *store.NodeSummary, v float64, l map[string]string) { + e.HourlyCost = v if e.InstanceType == "" { - e.InstanceType = l["instance_type"] + e.InstanceType = valueOrDefault(l["instance_type"], + valueOrDefault(l["node_label_node_kubernetes_io_instance_type"], + l["node_label_beta_kubernetes_io_instance_type"])) } }}, - {"clustercost_node_cpu_limit_milli", func(e *store.NodeSummary, v float64, _ map[string]string) { e.CPULimitMilli = int64(v) }}, - {"clustercost_node_memory_limit_bytes", func(e *store.NodeSummary, v float64, _ map[string]string) { e.MemoryLimitBytes = int64(v) }}, - {"clustercost_node_pod_count", func(e *store.NodeSummary, v float64, _ map[string]string) { e.PodCount = int(v) }}, - {"clustercost_node_under_pressure", func(e *store.NodeSummary, v float64, _ map[string]string) { e.IsUnderPressure = v > 0.5 }}, + {"clustercost_node_cpu_usage_percent", true, func(e *store.NodeSummary, v float64, _ map[string]string) { e.CPUUsagePercent = v }}, + {"clustercost_node_memory_usage_percent", true, func(e *store.NodeSummary, v float64, _ map[string]string) { e.MemoryUsagePercent = v }}, + {"clustercost_node_cpu_allocatable_milli", true, func(e *store.NodeSummary, v float64, _ map[string]string) { e.CPUAllocatableMilli = int64(v) }}, + {"clustercost_node_memory_allocatable_bytes", true, func(e *store.NodeSummary, v float64, _ map[string]string) { e.MemoryAllocatableBytes = int64(v) }}, + {"clustercost_node_cpu_requested_milli", true, func(e *store.NodeSummary, v float64, _ map[string]string) { e.CPURequestedMilli = int64(v) }}, + {"clustercost_node_memory_requested_bytes", true, func(e *store.NodeSummary, v float64, _ map[string]string) { e.MemoryRequestedBytes = int64(v) }}, + {"clustercost_node_cpu_limit_milli", true, func(e *store.NodeSummary, v float64, _ map[string]string) { e.CPULimitMilli = 
int64(v) }}, + {"clustercost_node_memory_limit_bytes", true, func(e *store.NodeSummary, v float64, _ map[string]string) { e.MemoryLimitBytes = int64(v) }}, } - out := make(map[string]*store.NodeSummary) for _, metric := range metrics { by := "node" - // We want to preserve instance_type and region information for all relevant metrics - // specifically request and allocatable metrics which often carry this metadata. - if metric.name == "clustercost_node_hourly_cost" || - metric.name == "clustercost_node_cpu_requested_milli" || - metric.name == "clustercost_node_memory_requested_bytes" || - metric.name == "clustercost_node_cpu_allocatable_milli" { - // Include standard and legacy k8s labels to survive aggregation - by = "node,instance_type,node_label_node_kubernetes_io_instance_type,node_label_beta_kubernetes_io_instance_type,cluster_region,topology_kubernetes_io_region,failure_domain_beta_kubernetes_io_region" + // Preserve metadata labels in aggregation + if strings.Contains(metric.name, "hourly_cost") || + strings.Contains(metric.name, "requested") || + strings.Contains(metric.name, "allocatable") { + by = "node,instance_type,node_label_node_kubernetes_io_instance_type,node_label_beta_kubernetes_io_instance_type,cluster_region,topology_kubernetes_io_region" } - expr := fmt.Sprintf("max by (%s) (%s)", by, c.lookbackExpr(metric.name, labels, clusterID)) - samples, err := c.query(ctx, expr) + + // determine function + fn := lookbackFunc + // For cost, average over time gives the average hourly rate during that window. + // For usage %, average makes sense. + // For Requests/Limits/Allocatable, they might vary if node resized (rare) or replaced. Average is decent. + + expr := fmt.Sprintf("%s(%s%s[%s])", fn, metric.name, formatLabels(c.scopedLabels(labels, clusterID)), windowStr) + // Need aggregation to preserve labels and unique by node + // max by (...) for snapshots, but avg by (...) for windows? + // Actually "avg by" works for all if we want the average stat. 
+ aggOp := "avg" + if !strings.Contains(metric.name, "percent") && !strings.Contains(metric.name, "cost") { + // For allocatable/requests, max is often safer to see peak reservation? + // But for "Ghost Cost", average request is better? + // Let's stick to Average for Historical Analysis. + aggOp = "avg" + } + + fullExpr := fmt.Sprintf("%s by (%s) (%s)", aggOp, by, expr) + + samples, err := c.query(ctx, fullExpr) if err != nil { - return nil, time.Time{}, err + continue // Skip failing metrics rather than crash whole request } + for _, sample := range samples { node := sample.labels["node"] - if node == "" { - continue + entry := getOrCreate(node) + if entry != nil { + metric.assign(entry, sample.value, sample.labels) } - entry := out[node] - if entry == nil { - entry = &store.NodeSummary{ - NodeName: node, - Labels: map[string]string{}, - Taints: []string{}, - } - out[node] = entry - } - metric.assign(entry, sample.value, sample.labels) } } - // Fetch metadata from Agent Status if available (User suggestion) - agentSamples, err := c.seriesTimestamp(ctx, "clustercost_agent_up", labels) - if err == nil { - for _, s := range agentSamples { - node := s.labels["node"] - if node == "" { - continue - } - entry := out[node] - if entry == nil { - // We don't create new entries just from agent_up, only enrich existing ones - // or maybe we should? For now, let's just enrich. 
- continue - } - - // Extract Instance Type if missing - if entry.InstanceType == "" { - if v := s.labels["instance_type"]; v != "" { - entry.InstanceType = v - } else if v := s.labels["node_label_node_kubernetes_io_instance_type"]; v != "" { - entry.InstanceType = v - } else if v := s.labels["node_label_beta_kubernetes_io_instance_type"]; v != "" { - entry.InstanceType = v - } - } - - // extract region if missing - if entry.Labels["topology_kubernetes_io_region"] == "" { - if v := s.labels["cluster_region"]; v != "" { - entry.Labels["cluster_region"] = v - entry.Labels["topology_kubernetes_io_region"] = v // Normalizing - } else { - // Regex fallback for AWS DNS names - // e.g. ip-10-30-16-166.us-west-2.compute.internal - re := regexp.MustCompile(`\.(us-[a-z]+-\d+)\.`) - matches := re.FindStringSubmatch(entry.NodeName) - if len(matches) > 1 { - entry.Labels["topology_kubernetes_io_region"] = matches[1] - } - } - } - } + // 3. Post-Processing & Cost Backfill + var pricing *store.PricingCatalog + if c.pricing != nil { + pricing = c.pricing + } else { + // NewPricingCatalog now takes 0 args (static) + pricing = store.NewPricingCatalog() } + // Fetch node status statusSamples, err := c.seriesTimestamp(ctx, "clustercost_node_status", labels) if err != nil && err != ErrNoData { return nil, time.Time{}, err @@ -944,30 +976,19 @@ func (c *Client) nodeMetrics(ctx context.Context, nodeName string) (map[string]* } } - var pricing *store.PricingCatalog - if c.pricing != nil { - pricing = c.pricing - } else { - pricing = store.NewPricingCatalog() - } - - // Backfill Costs if missing using the Pricing Catalog for _, node := range out { - if node.HourlyCost > 0 { - continue + // Extract region from name fallback + if node.Labels["topology_kubernetes_io_region"] == "" { + re := regexp.MustCompile(`\.(us-[a-z]+-\d+)\.`) + matches := re.FindStringSubmatch(node.NodeName) + if len(matches) > 1 { + node.Labels["topology_kubernetes_io_region"] = matches[1] + } } - // Try to find region 
region := node.Labels["topology_kubernetes_io_region"] - if region == "" { - region = node.Labels["failure_domain_beta_kubernetes_io_region"] - } - if region == "" { - region = node.Labels["cluster_region"] - } if region == "" { // Fallback: Default region if unknown, often us-east-1 or inferred from node name - // e.g. ip-10-30-12-16.us-west-2.compute.internal if strings.Contains(node.NodeName, "us-east-1") { region = "us-east-1" } else if strings.Contains(node.NodeName, "us-west-2") { @@ -979,12 +1000,29 @@ func (c *Client) nodeMetrics(ctx context.Context, nodeName string) (map[string]* } } + // Update region in labels so it persists + if node.Labels == nil { + node.Labels = map[string]string{} + } + node.Labels["topology_kubernetes_io_region"] = region + instanceType := node.InstanceType if instanceType == "" { instanceType = "m5.large" // Default fallback to avoid 0 cost } - node.HourlyCost = pricing.GetTotalNodePrice(ctx, region, instanceType) + if node.HourlyCost == 0 { + node.HourlyCost = pricing.GetTotalNodePrice(context.Background(), region, instanceType) + } + + // CALCULATE WINDOW COST / TOTAL COST + if windowDur > 0 { + // Real Cost = HourlyRate * ActiveHours + node.WindowCost = node.HourlyCost * node.ActiveHours + } else { + // Snapshot projection (Monthly) + node.WindowCost = node.HourlyCost * 730 + } } latest := c.seriesTimestampSafe(ctx, "clustercost_node_hourly_cost") diff --git a/web/src/lib/api.ts b/web/src/lib/api.ts index 0901e5b..0b82bb0 100644 --- a/web/src/lib/api.ts +++ b/web/src/lib/api.ts @@ -113,6 +113,9 @@ export interface NamespacesResponse { type NodeCostApi = { nodeName: string; hourlyCost: number; + windowCost: number; + activeHours: number; + activeRatio: number; cpuUsagePercent: number; memoryUsagePercent: number; cpuRequestedMilli?: number; diff --git a/web/src/pages/nodes/NodesPage.tsx b/web/src/pages/nodes/NodesPage.tsx index f612888..2856804 100644 --- a/web/src/pages/nodes/NodesPage.tsx +++ b/web/src/pages/nodes/NodesPage.tsx 
@@ -1,4 +1,4 @@ -import { useMemo, useState, type ChangeEvent } from "react"; +import { useMemo, useState, useCallback, type ChangeEvent } from "react"; import { fetchNodes, type NodeCost } from "../../lib/api"; import { formatCurrency, formatPercentage, relativeTimeFromIso, toMonthlyCost, milliToCores } from "../../lib/utils"; import { useApiData } from "../../hooks/useApiData"; @@ -7,6 +7,7 @@ import { Button } from "@/components/ui/button"; import { Badge } from "@/components/ui/badge"; import { Input } from "@/components/ui/input"; import { Table, TableBody, TableCell, TableHead, TableHeader, TableRow } from "@/components/ui/table"; +import { Select, SelectContent, SelectItem, SelectTrigger, SelectValue } from "@/components/ui/select"; import { Skeleton } from "@/components/ui/skeleton"; import NodeDetailSheet from "@/components/nodes/NodeDetailSheet"; import { MetricCard } from "@/components/common/MetricCard"; @@ -17,10 +18,15 @@ import { Tooltip, TooltipContent, TooltipProvider, TooltipTrigger } from "@/comp type SortKey = "cost" | "waste" | "efficiency"; const NodesPage = () => { - const { data, loading, error, refresh } = useApiData(fetchNodes); + const [timeWindow, setTimeWindow] = useState("24h"); + + // Fetch with window + const fetchNodesWithWindow = useCallback(() => fetchNodes(timeWindow), [timeWindow]); + const { data, loading, error, refresh } = useApiData(fetchNodesWithWindow); + const nodes = data ?? []; const [search, setSearch] = useState(""); - const [sortKey, setSortKey] = useState("waste"); + const [sortKey, setSortKey] = useState("cost"); // Default to cost for financial view const [sortDirection, setSortDirection] = useState<"asc" | "desc">("desc"); const [selectedNode, setSelectedNode] = useState<(NodeCost & { monthlyCost: number }) | null>(null); @@ -47,9 +53,11 @@ const NodesPage = () => { isEstimate = true; } - const monthlyCost = hourlyCost * 730; - const cpuAllocatable = node.cpuAllocatableMilli ?? 
0; + // Use backend provided WindowCost if activeHours logic applied, otherwise calculate projection + const windowCost = node.windowCost || (hourlyCost * 24); // fallback + const monthlyCost = hourlyCost * 730; // Still useful for reference + const cpuAllocatable = node.cpuAllocatableMilli ?? 0; const cpuRequestPercent = cpuAllocatable > 0 ? ((node.cpuRequestedMilli ?? 0) / cpuAllocatable) * 100 : 0; const cpuUsage = node.cpuUsagePercent ?? 0; @@ -58,9 +66,10 @@ const NodesPage = () => { const memRequestPercent = memAllocatable > 0 ? ((node.memoryRequestedBytes ?? 0) / memAllocatable) * 100 : 0; const memUsage = node.memoryUsagePercent ?? 0; - // FinOps Waste Calculation: Paying for Request but not Using it (CPU dominant for now, but could blend) + // FinOps Waste Calculation const wastePercent = Math.max(0, cpuRequestPercent - cpuUsage); - const wasteAmount = monthlyCost * (wastePercent / 100); + // Waste Amount based on Window Cost + const wasteAmount = windowCost * (wastePercent / 100); const isEfficient = wastePercent < 15; const isOverProvisioned = wastePercent > 30; @@ -72,40 +81,49 @@ const NodesPage = () => { memoryUsagePercent: memUsage, memRequestPercent, monthlyCost, + windowCost, isEstimate, wastePercent, wasteAmount, isEfficient, isOverProvisioned, - shortName: node.nodeName.length > 20 ? node.nodeName.substring(0, 15) + "..." 
: node.nodeName + shortName: node.nodeName // no truncation }; }); }, [nodes]); const summary = useMemo(() => { - const totalMonthly = derivedNodes.reduce((sum, n) => sum + n.monthlyCost, 0); + const totalWindowCost = derivedNodes.reduce((sum, n) => sum + n.windowCost, 0); const totalWaste = derivedNodes.reduce((sum, n) => sum + n.wasteAmount, 0); - const potentialSavings = totalWaste * 0.6; // Conservative achievable savings + const potentialSavings = totalWaste * 0.6; - return { totalMonthly, totalWaste, potentialSavings }; + return { totalWindowCost, totalWaste, potentialSavings }; }, [derivedNodes]); const sortedNodes = useMemo(() => { const rows = [...derivedNodes]; rows.sort((a, b) => { - const valA = sortKey === "waste" ? a.wasteAmount : (sortKey === "cost" ? a.monthlyCost : a.wastePercent); - const valB = sortKey === "waste" ? b.wasteAmount : (sortKey === "cost" ? b.monthlyCost : b.wastePercent); + const valA = sortKey === "waste" ? a.wasteAmount : (sortKey === "cost" ? a.windowCost : a.wastePercent); + const valB = sortKey === "waste" ? b.wasteAmount : (sortKey === "cost" ? b.windowCost : b.wastePercent); return sortDirection === "asc" ? valA - valB : valB - valA; }); return rows; }, [derivedNodes, sortKey, sortDirection]); - // Sorting Handler const handleSort = (key: SortKey) => { if (key === sortKey) setSortDirection(d => d === "desc" ? "asc" : "desc"); else { setSortKey(key); setSortDirection("desc"); } }; + const getWindowLabel = (w: string) => { + switch (w) { + case "24h": return "Last 24 Hours"; + case "7d": return "Last 7 Days"; + case "30d": return "Last 30 Days"; + default: return w; + } + }; + if (loading && !data) return ; if (error) return
Failed to load data: {error}
; @@ -115,26 +133,38 @@ const NodesPage = () => {

Cluster Financials

-

Real-time analysis of infrastructure efficiency and waste.

+

Real-time analysis based on actual uptime.

-
+
+
- {/* The "Truth" Cards - High Impact Typography */} + {/* The "Truth" Cards */}
- Monthly Spend + + Spend ({timeWindow}) +
- {formatCurrency(summary.totalMonthly)} + {formatCurrency(summary.totalWindowCost)}

- Run rate based on current capacity + Actual cost based on {getWindowLabel(timeWindow)} uptime

@@ -147,7 +177,7 @@ const NodesPage = () => {
- Monthly Waste + Waste ({timeWindow}) @@ -155,7 +185,7 @@ const NodesPage = () => { {formatCurrency(summary.totalWaste)}

- Money burned on unused reservations + Money burned on unused capacity

@@ -180,9 +210,8 @@ const NodesPage = () => {
- {sortedNodes.length} Nodes + {sortedNodes.length} Nodes (Active in {timeWindow}) - {/* Future: Add Filter logic here */}
@@ -201,7 +230,7 @@ const NodesPage = () => { Node Identity handleSort("cost")}> - Cost + Cost ({timeWindow}) CPU Efficiency Memory Efficiency @@ -236,7 +265,7 @@ const NodesPage = () => {
- {formatCurrency(node.monthlyCost)} + {formatCurrency(node.windowCost)} {node.isEstimate && ( @@ -247,9 +276,16 @@ const NodesPage = () => { )}
- - ${Number(node.hourlyCost.toFixed(4))}/hr - +
+ + ${Number(node.hourlyCost.toFixed(4))}/hr + + {node.activeHours > 0 && ( + + {node.activeHours.toFixed(1)}h active ({((node.activeRatio || 0) * 100).toFixed(0)}%) + + )} +
From 3e928108a2bec500be8a0c7718ddc47d16727b9f Mon Sep 17 00:00:00 2001 From: Jesus Paz Date: Tue, 20 Jan 2026 20:53:11 -0500 Subject: [PATCH 3/8] fixup! feat: update node dashboard --- internal/vm/dashboard.go | 86 ++++++++++++++++------------------------ 1 file changed, 35 insertions(+), 51 deletions(-) diff --git a/internal/vm/dashboard.go b/internal/vm/dashboard.go index b5dba82..31996e0 100644 --- a/internal/vm/dashboard.go +++ b/internal/vm/dashboard.go @@ -198,10 +198,8 @@ func (c *Client) Resources(ctx context.Context) (store.ResourcesPayload, error) if err != nil && err != ErrNoData { return store.ResourcesPayload{}, err } - nodeHourlyCost, _, err := c.scalarMetric(ctx, "clustercost_cluster_total_node_hourly_cost") - if err != nil && err != ErrNoData { - return store.ResourcesPayload{}, err - } + // Node Hourly Cost is now fully calculated, no stored metric + nodeHourlyCost := 0.0 // Fetch Network Metrics netTx, _, _ := c.scalarMetric(ctx, "clustercost_cluster_network_tx_bytes_total") @@ -432,7 +430,7 @@ func (c *Client) AgentStatus(ctx context.Context) (store.AgentStatusPayload, err } nsTS := c.seriesTimestampSafe(ctx, "clustercost_namespace_hourly_cost") - nodeTS := c.seriesTimestampSafe(ctx, "clustercost_node_hourly_cost") + nodeTS := c.seriesTimestampSafe(ctx, "clustercost_node_cpu_allocatable_milli") resTS := c.seriesTimestampSafe(ctx, "clustercost_cluster_cpu_usage_milli_total") datasets := store.AgentDatasetHealth{ @@ -678,36 +676,6 @@ func (c *Client) namespaceMetrics(ctx context.Context, environment, namespace st } } - queryScalar := func(expr string) (float64, error) { - samples, err := c.query(ctx, expr) - if err != nil { - return 0, err - } - if len(samples) == 0 { - return 0, ErrNoData - } - return samples[0].value, nil - } - - nodeCostExpr := fmt.Sprintf("sum(max by (node) (%s))", c.lookbackExpr("clustercost_node_hourly_cost", nil, clusterID)) - cpuAllocExpr := fmt.Sprintf("sum(max by (node) (%s))", 
c.lookbackExpr("clustercost_node_cpu_allocatable_milli", nil, clusterID)) - memAllocExpr := fmt.Sprintf("sum(max by (node) (%s))", c.lookbackExpr("clustercost_node_memory_allocatable_bytes", nil, clusterID)) - - nodeCost, err := queryScalar(nodeCostExpr) - if err == nil && nodeCost > 0 { - cpuAllocMilli, errCPU := queryScalar(cpuAllocExpr) - memAllocBytes, errMem := queryScalar(memAllocExpr) - if errCPU == nil && errMem == nil && cpuAllocMilli > 0 && memAllocBytes > 0 { - cpuPrice := (nodeCost * 0.5) / (cpuAllocMilli / 1000.0) - memPrice := (nodeCost * 0.5) / (memAllocBytes / (1024.0 * 1024.0 * 1024.0)) - for _, entry := range out { - cpuUsageCores := float64(entry.CPUUsageMilli) / 1000.0 - memUsageGB := float64(entry.MemoryUsageBytes) / (1024.0 * 1024.0 * 1024.0) - entry.HourlyCost = (cpuUsageCores * cpuPrice) + (memUsageGB * memPrice) - } - } - } - latest = c.seriesTimestampSafe(ctx, "clustercost_namespace_memory_rss_bytes_total") type nodeAlloc struct { @@ -888,26 +856,43 @@ func (c *Client) nodeMetrics(ctx context.Context, nodeName, window string) (map[ } } + // Helper to extract metadata from labels + updateMeta := func(entry *store.NodeSummary, labels map[string]string) { + if entry.InstanceType == "" { + entry.InstanceType = valueOrDefault(labels["instance_type"], + valueOrDefault(labels["node_label_node_kubernetes_io_instance_type"], + labels["node_label_beta_kubernetes_io_instance_type"])) + } + if entry.Labels["topology_kubernetes_io_region"] == "" { + entry.Labels["topology_kubernetes_io_region"] = labels["cluster_region"] + } + } + // 2. Metrics List metrics := []struct { name string validLookback bool // if false, use standard lookback (e.g. 
for info that doesn't vary) assign func(entry *store.NodeSummary, value float64, labels map[string]string) }{ - {"clustercost_node_hourly_cost", true, func(e *store.NodeSummary, v float64, l map[string]string) { - e.HourlyCost = v - if e.InstanceType == "" { - e.InstanceType = valueOrDefault(l["instance_type"], - valueOrDefault(l["node_label_node_kubernetes_io_instance_type"], - l["node_label_beta_kubernetes_io_instance_type"])) - } - }}, + // hourly_cost metric removed as it's deprecated. Cost is calculated in post-processing. {"clustercost_node_cpu_usage_percent", true, func(e *store.NodeSummary, v float64, _ map[string]string) { e.CPUUsagePercent = v }}, {"clustercost_node_memory_usage_percent", true, func(e *store.NodeSummary, v float64, _ map[string]string) { e.MemoryUsagePercent = v }}, - {"clustercost_node_cpu_allocatable_milli", true, func(e *store.NodeSummary, v float64, _ map[string]string) { e.CPUAllocatableMilli = int64(v) }}, - {"clustercost_node_memory_allocatable_bytes", true, func(e *store.NodeSummary, v float64, _ map[string]string) { e.MemoryAllocatableBytes = int64(v) }}, - {"clustercost_node_cpu_requested_milli", true, func(e *store.NodeSummary, v float64, _ map[string]string) { e.CPURequestedMilli = int64(v) }}, - {"clustercost_node_memory_requested_bytes", true, func(e *store.NodeSummary, v float64, _ map[string]string) { e.MemoryRequestedBytes = int64(v) }}, + {"clustercost_node_cpu_allocatable_milli", true, func(e *store.NodeSummary, v float64, l map[string]string) { + e.CPUAllocatableMilli = int64(v) + updateMeta(e, l) + }}, + {"clustercost_node_memory_allocatable_bytes", true, func(e *store.NodeSummary, v float64, l map[string]string) { + e.MemoryAllocatableBytes = int64(v) + updateMeta(e, l) + }}, + {"clustercost_node_cpu_requested_milli", true, func(e *store.NodeSummary, v float64, l map[string]string) { + e.CPURequestedMilli = int64(v) + updateMeta(e, l) + }}, + {"clustercost_node_memory_requested_bytes", true, func(e 
*store.NodeSummary, v float64, l map[string]string) { + e.MemoryRequestedBytes = int64(v) + updateMeta(e, l) + }}, {"clustercost_node_cpu_limit_milli", true, func(e *store.NodeSummary, v float64, _ map[string]string) { e.CPULimitMilli = int64(v) }}, {"clustercost_node_memory_limit_bytes", true, func(e *store.NodeSummary, v float64, _ map[string]string) { e.MemoryLimitBytes = int64(v) }}, } @@ -915,8 +900,7 @@ func (c *Client) nodeMetrics(ctx context.Context, nodeName, window string) (map[ for _, metric := range metrics { by := "node" // Preserve metadata labels in aggregation - if strings.Contains(metric.name, "hourly_cost") || - strings.Contains(metric.name, "requested") || + if strings.Contains(metric.name, "requested") || strings.Contains(metric.name, "allocatable") { by = "node,instance_type,node_label_node_kubernetes_io_instance_type,node_label_beta_kubernetes_io_instance_type,cluster_region,topology_kubernetes_io_region" } @@ -1025,7 +1009,7 @@ func (c *Client) nodeMetrics(ctx context.Context, nodeName, window string) (map[ } } - latest := c.seriesTimestampSafe(ctx, "clustercost_node_hourly_cost") + latest := c.seriesTimestampSafe(ctx, "clustercost_node_cpu_allocatable_milli") return out, latest, nil } @@ -1200,7 +1184,7 @@ func pickLatestStatus(samples []sample) map[string]string { func (c *Client) nodeNames(ctx context.Context) []string { clusterID := c.resolveClusterID(ctx) - expr := fmt.Sprintf("max by (node) (%s)", c.lookbackExpr("clustercost_node_hourly_cost", nil, clusterID)) + expr := fmt.Sprintf("max by (node) (%s)", c.lookbackExpr("clustercost_node_cpu_allocatable_milli", nil, clusterID)) samples, err := c.query(ctx, expr) if err != nil { return nil From cb5483c6c9ba28bd32ce0d823e851266aaeae0f1 Mon Sep 17 00:00:00 2001 From: Jesus Paz Date: Tue, 20 Jan 2026 21:14:58 -0500 Subject: [PATCH 4/8] fixup! 
feat: update node dashboard --- web/src/components/nodes/EfficiencyBar.tsx | 67 +++++++++++++--------- web/src/lib/api.ts | 5 +- web/src/pages/nodes/NodesPage.tsx | 20 ++++++- 3 files changed, 62 insertions(+), 30 deletions(-) diff --git a/web/src/components/nodes/EfficiencyBar.tsx b/web/src/components/nodes/EfficiencyBar.tsx index de3cb22..c5fed02 100644 --- a/web/src/components/nodes/EfficiencyBar.tsx +++ b/web/src/components/nodes/EfficiencyBar.tsx @@ -6,44 +6,55 @@ interface EfficiencyBarProps { usagePercent: number; requestPercent: number; costPerMonth: number; - cpuCores?: string; - className?: string; // Added to match usage + usageAbsolute: number; + totalAbsolute: number; + unit: string; } -export function EfficiencyBar({ usagePercent, requestPercent, costPerMonth }: EfficiencyBarProps) { +export function EfficiencyBar({ + usagePercent, + requestPercent, + costPerMonth, + usageAbsolute, + totalAbsolute, + unit +}: EfficiencyBarProps) { // FinOps Logic: - // If Request >>> Usage, we have waste. - // The "Gap" visually shows this. + // Gap between Usage (Cyan) and Reserved (White) = Waste. const wastePercent = Math.max(0, requestPercent - usagePercent); const wastedCost = costPerMonth * (wastePercent / 100); return ( -
- {/* Top Bar: Actual Usage (The "Real" Work) */} -
- Usage - {usagePercent.toFixed(0)}% +
+ {/* Micro-Text Label: "1.2 / 4.0 vCPUs" */} +
+ + {usageAbsolute.toFixed(1)} + / {totalAbsolute.toFixed(1)} {unit} + + {wastePercent > 0 && ( + + Gap: {wastePercent.toFixed(0)}% + + )}
- - {/* Bottom Bar: Reserved / Requested (The "Billable" Reservation) */} -
- Reserved - {requestPercent.toFixed(0)}% -
+ {/* Stacked Progress Bar */} -
- + {/* Layer 1: Reserved (Requests) - Light/White */} +
+ + {/* Layer 2: Actual Usage - Cyan */} +
@@ -52,12 +63,16 @@ export function EfficiencyBar({ usagePercent, requestPercent, costPerMonth }: Ef

Efficiency Gap

Usage: - {usagePercent.toFixed(1)}% + {usagePercent.toFixed(1)}% ({usageAbsolute.toFixed(2)} {unit})
Reserved: {requestPercent.toFixed(1)}%
+
+ Total: + {totalAbsolute.toFixed(1)} {unit} +
{wastedCost > 1 && (
Waste: diff --git a/web/src/lib/api.ts b/web/src/lib/api.ts index 0b82bb0..865b336 100644 --- a/web/src/lib/api.ts +++ b/web/src/lib/api.ts @@ -341,8 +341,9 @@ export const fetchNamespaces = async (): Promise => { }; }; -export const fetchNodes = async (): Promise => { - const resp = await request("/cost/nodes"); +export const fetchNodes = async (window?: string): Promise => { + const query = window ? `?window=${window}` : ""; + const resp = await request(`/cost/nodes${query}`); return resp.items.map((node) => ({ ...node, labels: node.labels ?? {}, diff --git a/web/src/pages/nodes/NodesPage.tsx b/web/src/pages/nodes/NodesPage.tsx index 2856804..0e3c9f5 100644 --- a/web/src/pages/nodes/NodesPage.tsx +++ b/web/src/pages/nodes/NodesPage.tsx @@ -232,8 +232,18 @@ const NodesPage = () => { handleSort("cost")}> Cost ({timeWindow}) - CPU Efficiency - Memory Efficiency + + CPU Efficiency + + ( Usage / Reserved) + + + + Memory Efficiency + + ( Usage / Reserved) + + handleSort("waste")}>Action @@ -295,6 +305,9 @@ const NodesPage = () => { usagePercent={node.cpuUsagePercent} requestPercent={node.cpuRequestPercent} costPerMonth={node.monthlyCost} + usageAbsolute={node.cpuAllocatableMilli ? (node.cpuUsagePercent / 100) * (node.cpuAllocatableMilli / 1000) : 0} + totalAbsolute={node.cpuAllocatableMilli ? node.cpuAllocatableMilli / 1000 : 0} + unit="vCPUs" /> @@ -304,6 +317,9 @@ const NodesPage = () => { usagePercent={node.memoryUsagePercent} requestPercent={node.memRequestPercent} costPerMonth={node.monthlyCost} + usageAbsolute={node.memoryAllocatableBytes ? (node.memoryUsagePercent / 100) * (node.memoryAllocatableBytes / (1024 * 1024 * 1024)) : 0} + totalAbsolute={node.memoryAllocatableBytes ? node.memoryAllocatableBytes / (1024 * 1024 * 1024) : 0} + unit="GiB" /> From 34a7d04ad3e948da508ed4cbd48652e42d0268d8 Mon Sep 17 00:00:00 2001 From: Jesus Paz Date: Tue, 20 Jan 2026 21:28:20 -0500 Subject: [PATCH 5/8] fixup! 
feat: update node dashboard --- web/src/components/nodes/EfficiencyBar.tsx | 71 ++++++++++++++++------ 1 file changed, 51 insertions(+), 20 deletions(-) diff --git a/web/src/components/nodes/EfficiencyBar.tsx b/web/src/components/nodes/EfficiencyBar.tsx index c5fed02..3ae37c1 100644 --- a/web/src/components/nodes/EfficiencyBar.tsx +++ b/web/src/components/nodes/EfficiencyBar.tsx @@ -20,61 +20,92 @@ export function EfficiencyBar({ unit }: EfficiencyBarProps) { // FinOps Logic: - // Gap between Usage (Cyan) and Reserved (White) = Waste. + // "High Contrast" Strategy: + // - Container = Total Node Capacity + // - Reserved = Light Rail (bg-white/15) + // - Usage = Active Bar (Cyan vs Neon Orange) + // - Threshold = White Line (Always visible on top) + const isOverLimit = usagePercent > requestPercent; const wastePercent = Math.max(0, requestPercent - usagePercent); + const overflowPercent = Math.max(0, usagePercent - requestPercent); + + // Calculate formatted values const wastedCost = costPerMonth * (wastePercent / 100); return (
- {/* Micro-Text Label: "1.2 / 4.0 vCPUs" */} -
- + {/* Micro-Text Label Row */} +
+ {usageAbsolute.toFixed(1)} / {totalAbsolute.toFixed(1)} {unit} - {wastePercent > 0 && ( - + + {/* Status Indicator */} + {isOverLimit ? ( + + Risk: +{overflowPercent.toFixed(0)}% + + ) : wastePercent > 0 ? ( + Gap: {wastePercent.toFixed(0)}% - )} + ) : null}
- {/* Stacked Progress Bar */} + {/* Bar Container */} -
- {/* Layer 1: Reserved (Requests) - Light/White */} + {/* Layer 0: Total Capacity Container */} +
+ + {/* Layer 1: Reserved (The Contract Rail) */} + {/* Lighter grey to contrast with dark background */}
- {/* Layer 2: Actual Usage - Cyan */} + {/* Layer 2: Actual Usage (The Active Liquid) */} + {/* Sits ON TOP of Reserved. */}
+ + {/* Layer 3: The Contract Line (Threshold) */} + {/* ALWAYS visible, white, sits on top of everything (z-20) */} + {requestPercent > 0 && ( +
+ )}
- +
-

Efficiency Gap

+

+ {isOverLimit ? "Stability Risk" : "Efficiency Gap"} +

Usage: - {usagePercent.toFixed(1)}% ({usageAbsolute.toFixed(2)} {unit}) + + {usagePercent.toFixed(1)}% ({usageAbsolute.toFixed(2)} {unit}) +
Reserved: - {requestPercent.toFixed(1)}% + {requestPercent.toFixed(1)}%
Total: - {totalAbsolute.toFixed(1)} {unit} + {totalAbsolute.toFixed(1)} {unit}
- {wastedCost > 1 && ( -
+ {!isOverLimit && wastedCost > 1 && ( +
Waste: {formatCurrency(wastedCost)}/mo
From 57ebf9c2999ad07981754d185ae51405faa7f181 Mon Sep 17 00:00:00 2001 From: Jesus Paz Date: Tue, 20 Jan 2026 21:53:26 -0500 Subject: [PATCH 6/8] fixup! feat: update node dashboard --- web/src/components/nodes/EfficiencyBar.tsx | 42 ++++++++++++++++------ 1 file changed, 31 insertions(+), 11 deletions(-) diff --git a/web/src/components/nodes/EfficiencyBar.tsx b/web/src/components/nodes/EfficiencyBar.tsx index 3ae37c1..dc964c9 100644 --- a/web/src/components/nodes/EfficiencyBar.tsx +++ b/web/src/components/nodes/EfficiencyBar.tsx @@ -1,4 +1,4 @@ -import { Progress } from "@/components/ui/progress"; +import { Badge } from "@/components/ui/badge"; import { formatCurrency } from "../../lib/utils"; import { Tooltip, TooltipContent, TooltipProvider, TooltipTrigger } from "@/components/ui/tooltip"; @@ -20,17 +20,28 @@ export function EfficiencyBar({ unit }: EfficiencyBarProps) { // FinOps Logic: - // "High Contrast" Strategy: + // "Comfort Zone" Strategy: // - Container = Total Node Capacity // - Reserved = Light Rail (bg-white/15) - // - Usage = Active Bar (Cyan vs Neon Orange) // - Threshold = White Line (Always visible on top) + // - Logic: + // - < 90% of Reserved: Gap (Cyan) + // - 90% - 110% of Reserved: Optimized (Green Badge, Cyan Bar) + // - > 110% of Reserved: Risk (Orange Bar, Orange Text) + + const ratio = requestPercent > 0 ? usagePercent / requestPercent : 0; + + // Logic Refinement (Hybrid Absolute/Relative): + // 1. High Risk (Orange): STRICTLY > 110% of reservation (Relative > 1.1). + // 2. Optimized (Green): Not High Risk AND Absolute Difference <= 10% (suppress "Gap: 3%" noise). + // 3. Gap (Cyan): Everything else (Absolute Difference > 10%). 
+ + const isHighRisk = ratio > 1.1; + const diff = Math.abs(usagePercent - requestPercent); + const isOptimized = !isHighRisk && diff <= 10; - const isOverLimit = usagePercent > requestPercent; const wastePercent = Math.max(0, requestPercent - usagePercent); const overflowPercent = Math.max(0, usagePercent - requestPercent); - - // Calculate formatted values const wastedCost = costPerMonth * (wastePercent / 100); return ( @@ -43,7 +54,11 @@ export function EfficiencyBar({ {/* Status Indicator */} - {isOverLimit ? ( + {isOptimized ? ( + + Optimized + + ) : isHighRisk ? ( Risk: +{overflowPercent.toFixed(0)}% @@ -71,7 +86,7 @@ export function EfficiencyBar({ {/* Layer 2: Actual Usage (The Active Liquid) */} {/* Sits ON TOP of Reserved. */}
@@ -88,11 +103,11 @@ export function EfficiencyBar({

- {isOverLimit ? "Stability Risk" : "Efficiency Gap"} + {isOptimized ? "State: Optimized" : isHighRisk ? "Stability Risk" : "Efficiency Gap"}

Usage: - + {usagePercent.toFixed(1)}% ({usageAbsolute.toFixed(2)} {unit})
@@ -104,12 +119,17 @@ export function EfficiencyBar({ Total: {totalAbsolute.toFixed(1)} {unit}
- {!isOverLimit && wastedCost > 1 && ( + {!isHighRisk && !isOptimized && wastedCost > 1 && (
Waste: {formatCurrency(wastedCost)}/mo
)} + {isOptimized && ( +
+ Usage is within 10% of reservation. Perfect balance. +
+ )}
From 0c26a351e19ac829f91124d1d0fce585e36ff74e Mon Sep 17 00:00:00 2001 From: Jesus Paz Date: Tue, 20 Jan 2026 22:04:29 -0500 Subject: [PATCH 7/8] fixup! feat: update node dashboard --- web/src/components/nodes/EfficiencyBar.tsx | 94 ++++++++++++++++------ 1 file changed, 69 insertions(+), 25 deletions(-) diff --git a/web/src/components/nodes/EfficiencyBar.tsx b/web/src/components/nodes/EfficiencyBar.tsx index dc964c9..991aa08 100644 --- a/web/src/components/nodes/EfficiencyBar.tsx +++ b/web/src/components/nodes/EfficiencyBar.tsx @@ -1,6 +1,7 @@ import { Badge } from "@/components/ui/badge"; import { formatCurrency } from "../../lib/utils"; import { Tooltip, TooltipContent, TooltipProvider, TooltipTrigger } from "@/components/ui/tooltip"; +import { AlertTriangle, CheckCircle2, TrendingDown } from "lucide-react"; interface EfficiencyBarProps { usagePercent: number; @@ -100,34 +101,77 @@ export function EfficiencyBar({ )}
- -
-

- {isOptimized ? "State: Optimized" : isHighRisk ? "Stability Risk" : "Efficiency Gap"} -

-
- Usage: - - {usagePercent.toFixed(1)}% ({usageAbsolute.toFixed(2)} {unit}) - + + {/* Professional Context Card Tooltip */} + +
+ {/* Header Section */} +
+ {isHighRisk ? ( + + ) : isOptimized ? ( + + ) : ( + + )} +
+

+ {isHighRisk ? "Stability Risk: Bursting" : isOptimized ? "Perfectly Rightsized" : "Efficiency Gap Detected"} +

+

+ {isHighRisk + ? "Operating above guaranteed limits." + : isOptimized + ? "Balanced resource utilization." + : "Resources reserved but unused."} +

+
-
- Reserved: - {requestPercent.toFixed(1)}% + + {/* Technical Evidence Section */} +
+
+ Usage: + + {usagePercent.toFixed(1)}% ({usageAbsolute.toFixed(2)} {unit}) + +
+
+ Reserved: + {requestPercent.toFixed(1)}% ({((totalAbsolute * requestPercent) / 100 || 0).toFixed(2)} {unit}) +
-
- Total: - {totalAbsolute.toFixed(1)} {unit} + + {/* Educational Context Section */} +
+ {isHighRisk && ( + + This node is running at {ratio.toFixed(2)}x its reservation. + It relies on unguaranteed burst capacity and is a top candidate for OOMKill if cluster pressure increases. + + )} + {isOptimized && ( + + Usage is within the ±10% ideal stability window. + This configuration maximizes ROI without risking instability. No action required. + + )} + {!isHighRisk && !isOptimized && ( + + You are paying for {(requestPercent - usagePercent).toFixed(0)}% more capacity than needed. + This "air gap" provides no technical value and is pure financial waste. + + )}
- {!isHighRisk && !isOptimized && wastedCost > 1 && ( -
- Waste: - {formatCurrency(wastedCost)}/mo -
- )} - {isOptimized && ( -
- Usage is within 10% of reservation. Perfect balance. + + {/* Financial Impact Footer */} + {!isOptimized && wastedCost > 0.01 && ( +
+ Monthly Waste + {formatCurrency(wastedCost)}
)}
From 5fdd38268901f719be479793a0c626ad0c192bd8 Mon Sep 17 00:00:00 2001 From: Jesus Paz Date: Tue, 20 Jan 2026 22:39:01 -0500 Subject: [PATCH 8/8] fixup! feat: update node dashboard --- internal/api/handlers_health_test.go | 6 + internal/api/handlers_nodes.go | 23 ++ internal/api/router.go | 2 + internal/store/store.go | 11 + internal/vm/client.go | 130 +++++++ web/src/components/nodes/NodeDetailSheet.tsx | 359 +++++++++++++------ web/src/lib/api.ts | 14 + 7 files changed, 430 insertions(+), 115 deletions(-) diff --git a/internal/api/handlers_health_test.go b/internal/api/handlers_health_test.go index f5cf9b9..6dd6ba0 100644 --- a/internal/api/handlers_health_test.go +++ b/internal/api/handlers_health_test.go @@ -47,6 +47,12 @@ func (f *fakeMetricsProvider) ClusterMetadata(context.Context) (store.ClusterMet func (f *fakeMetricsProvider) NetworkTopology(context.Context, store.NetworkTopologyOptions) ([]store.NetworkEdge, error) { return nil, vm.ErrNoData } +func (f *fakeMetricsProvider) GetNodeStats(context.Context, string, string, time.Duration) (store.NodeStats, error) { + return store.NodeStats{}, vm.ErrNoData +} +func (f *fakeMetricsProvider) GetNodePods(context.Context, string, string, time.Duration) ([]store.PodMetrics, error) { + return nil, vm.ErrNoData +} func newTestHandler(meta store.ClusterMetadata, status store.AgentStatusPayload) *Handler { return &Handler{vm: &fakeMetricsProvider{meta: meta, status: status}} diff --git a/internal/api/handlers_nodes.go b/internal/api/handlers_nodes.go index b78e54a..b6a3884 100644 --- a/internal/api/handlers_nodes.go +++ b/internal/api/handlers_nodes.go @@ -83,3 +83,26 @@ func (h *Handler) NodeStats(w http.ResponseWriter, r *http.Request) { writeJSON(w, http.StatusOK, stats) } + +// NodePods returns the list of pods for a node with P95 metrics (Pod Audit). 
+func (h *Handler) NodePods(w http.ResponseWriter, r *http.Request) { + name := chi.URLParam(r, "name") + if name == "" { + writeError(w, http.StatusBadRequest, "node name is required") + return + } + windowStr := r.URL.Query().Get("window") + window, _ := time.ParseDuration(windowStr) + if window <= 0 { + window = 24 * time.Hour + } + + ctx := vm.WithClusterID(r.Context(), clusterIDFromRequest(r)) + pods, err := h.vm.GetNodePods(ctx, "", name, window) + if err != nil { + writeError(w, http.StatusInternalServerError, err.Error()) + return + } + + writeJSON(w, http.StatusOK, pods) +} diff --git a/internal/api/router.go b/internal/api/router.go index 06f2e37..778f45d 100644 --- a/internal/api/router.go +++ b/internal/api/router.go @@ -30,6 +30,7 @@ type MetricsProvider interface { ClusterMetadata(ctx context.Context) (store.ClusterMetadata, error) NetworkTopology(ctx context.Context, opts store.NetworkTopologyOptions) ([]store.NetworkEdge, error) GetNodeStats(ctx context.Context, clusterID, nodeName string, window time.Duration) (store.NodeStats, error) + GetNodePods(ctx context.Context, clusterID, nodeName string, window time.Duration) ([]store.PodMetrics, error) } // Handler wires HTTP requests to the VictoriaMetrics client. @@ -76,6 +77,7 @@ func NewRouter(vmClient MetricsProvider, db *db.Store, st *store.Store, finopsEn cost.Get("/nodes", h.Nodes) cost.Get("/nodes/{name}", h.NodeDetail) cost.Get("/nodes/{name}/stats", h.NodeStats) + cost.Get("/nodes/{name}/pods", h.NodePods) cost.Get("/resources", h.Resources) }) protected.Get("/agent", h.AgentStatus) diff --git a/internal/store/store.go b/internal/store/store.go index 09b960c..cda9041 100644 --- a/internal/store/store.go +++ b/internal/store/store.go @@ -301,6 +301,17 @@ type NodeStats struct { Window string `json:"window"` } +// PodMetrics contains resource usage analysis for a single pod. 
+type PodMetrics struct { + PodName string `json:"podName"` + Namespace string `json:"namespace"` + QoSClass string `json:"qosClass"` + CPURequestMilli int64 `json:"cpuRequestMilli"` + CPUP95Milli float64 `json:"cpuP95Milli"` + MemoryRequestBytes int64 `json:"memoryRequestBytes"` + MemoryP95Bytes float64 `json:"memoryP95Bytes"` +} + // New creates a store seeded with agent configurations. func New(cfgs []config.AgentConfig, recommendedAgentVersion string) *Store { agentConfigs := make(map[string]config.AgentConfig, len(cfgs)) diff --git a/internal/vm/client.go b/internal/vm/client.go index e14deaf..ecd5b7e 100644 --- a/internal/vm/client.go +++ b/internal/vm/client.go @@ -478,3 +478,133 @@ func formatLabels(labels map[string]string) string { b.WriteByte('}') return b.String() } + +// GetNodePods returns P95 usage and max request metrics for all pods on a specific node over the given window (24h when window is not positive). It returns an error when nodeName is empty or when any of the underlying queries fails. +func (c *Client) GetNodePods(ctx context.Context, clusterID, nodeName string, window time.Duration) ([]store.PodMetrics, error) { + if nodeName == "" { + return nil, fmt.Errorf("node name is required") + } + if window <= 0 { + window = 24 * time.Hour + } + windowStr := formatDuration(window) + + labels := map[string]string{ + "node": nodeName, + } + if clusterID != "" { + labels["cluster_id"] = clusterID + } + labelStr := formatLabels(labels) + + // We need 5 metrics per pod: + // 1. CPU Request (Max) + // 2. CPU Limit (Max) - to determine QoS + // 3. Mem Request (Max) + // 4. CPU Usage (P95) + // 5.
Mem Usage (P95) + + queries := map[string]string{ + "cpu_req_max": fmt.Sprintf("max_over_time(clustercost_pod_cpu_request_millicores%s[%s])", labelStr, windowStr), + "cpu_lim_max": fmt.Sprintf("max_over_time(clustercost_pod_cpu_limit_millicores%s[%s])", labelStr, windowStr), + "mem_req_max": fmt.Sprintf("max_over_time(clustercost_pod_memory_request_bytes%s[%s])", labelStr, windowStr), + "cpu_add_p95": fmt.Sprintf("quantile_over_time(0.95, clustercost_pod_cpu_usage_milli%s[%s])", labelStr, windowStr), + "mem_add_p95": fmt.Sprintf("quantile_over_time(0.95, clustercost_pod_memory_rss_bytes%s[%s])", labelStr, windowStr), + } + + // Helper struct to aggregate the per-pod values returned by the five queries + type podData struct { + Namespace string + PodName string + CPUReq float64 + CPULim float64 + MemReq float64 + CPUP95 float64 + MemP95 float64 + } + podMap := make(map[string]*podData) + + var wg sync.WaitGroup + var mu sync.Mutex + var firstErr error + + for key, query := range queries { + wg.Add(1) + go func(k, q string) { + defer wg.Done() + samples, err := c.query(ctx, q) + if err != nil { + mu.Lock() + if firstErr == nil { + firstErr = err + } + mu.Unlock() + return + } + mu.Lock() + for _, s := range samples { + ns := s.labels["namespace"] + pod := s.labels["pod"] + if ns == "" || pod == "" { + continue + } + id := ns + "|" + pod + if _, exists := podMap[id]; !exists { + podMap[id] = &podData{Namespace: ns, PodName: pod} + } + p := podMap[id] + + switch k { + case "cpu_req_max": + p.CPUReq = s.value + case "cpu_lim_max": + p.CPULim = s.value + case "mem_req_max": + p.MemReq = s.value + case "cpu_add_p95": + p.CPUP95 = s.value + case "mem_add_p95": + p.MemP95 = s.value + } + } + mu.Unlock() + }(key, query) + } + wg.Wait() + + if firstErr != nil { + return nil, fmt.Errorf("failed to query pod metrics: %w", firstErr) + } + + results := make([]store.PodMetrics, 0, len(podMap)) + for _, p := range podMap { + // QoS logic: Burstable by default, BestEffort when both CPU and memory requests are zero, Guaranteed when the CPU request equals a non-zero CPU limit + qos := "Burstable" + if p.CPUReq == 0 && p.MemReq == 0 { + qos = "BestEffort" + }
else if p.CPUReq == p.CPULim && p.CPULim > 0 { + qos = "Guaranteed" // Simplified, strictly checking CPU for now + } + + results = append(results, store.PodMetrics{ + PodName: p.PodName, + Namespace: p.Namespace, + QoSClass: qos, + CPURequestMilli: int64(p.CPUReq), + CPUP95Milli: p.CPUP95, + MemoryRequestBytes: int64(p.MemReq), + MemoryP95Bytes: p.MemP95, + }) + } + + // Sort by namespace, then pod name, so the output order is deterministic + sort.Slice(results, func(i, j int) bool { + // Just sorting by name for stability for now, frontend handles logic sort + if results[i].Namespace != results[j].Namespace { + return results[i].Namespace < results[j].Namespace + } + return results[i].PodName < results[j].PodName + }) + + return results, nil +} diff --git a/web/src/components/nodes/NodeDetailSheet.tsx b/web/src/components/nodes/NodeDetailSheet.tsx index 83509dd..ceae404 100644 --- a/web/src/components/nodes/NodeDetailSheet.tsx +++ b/web/src/components/nodes/NodeDetailSheet.tsx @@ -1,10 +1,14 @@ -import { Sheet, SheetContent, SheetDescription, SheetHeader, SheetTitle } from "../ui/sheet"; -import { Progress } from "../ui/progress"; -import { Badge } from "../ui/badge"; -import { Tabs, TabsList, TabsTrigger } from "../ui/tabs"; +import { Sheet, SheetContent, SheetHeader, SheetTitle } from "@/components/ui/sheet"; +import { Progress } from "@/components/ui/progress"; +import { Badge } from "@/components/ui/badge"; +import { Button } from "@/components/ui/button"; +import { Tabs, TabsList, TabsTrigger } from "@/components/ui/tabs"; +import { Table, TableBody, TableCell, TableHead, TableHeader, TableRow } from "@/components/ui/table"; import { formatCurrency } from "../../lib/utils"; -import { fetchNodeStats, type NodeCost, type NodeStats } from "../../lib/api"; -import { useState, useEffect } from "react"; +import { fetchNodeStats, fetchNodePods, type NodeCost, type NodeStats, type PodMetrics } from "../../lib/api"; +import { useState, useEffect, useMemo } from "react"; +import { CopyIcon,
ShieldAlertIcon, ScissorsIcon, CheckCircle2Icon } from "lucide-react"; +import { Tooltip, TooltipContent, TooltipProvider, TooltipTrigger } from "@/components/ui/tooltip"; interface NodeDetailSheetProps { node: (NodeCost & { monthlyCost: number }) | null; @@ -12,152 +16,277 @@ interface NodeDetailSheetProps { onOpenChange: (open: boolean) => void; } -const statusStyles: Record = { - Ready: "border-emerald-500/40 bg-emerald-500/10 text-emerald-200", - NotReady: "border-destructive/40 bg-destructive/10 text-destructive", - Unknown: "border-muted bg-muted/40 text-muted-foreground" -}; - const NodeDetailSheet = ({ node, open, onOpenChange }: NodeDetailSheetProps) => { const [window, setWindow] = useState("24h"); const [stats, setStats] = useState(null); + const [pods, setPods] = useState([]); const [loading, setLoading] = useState(false); useEffect(() => { if (open && node) { setLoading(true); - fetchNodeStats(node.nodeName, window) - .then(setStats) - .catch((err) => { - console.error("Failed to fetch node stats", err); - setStats(null); - }) - .finally(() => setLoading(false)); + // Parallel Fetch: Stats + Pods + Promise.all([ + fetchNodeStats(node.nodeName, window).catch(e => { console.error(e); return null; }), + fetchNodePods(node.nodeName, window).catch(e => { console.error(e); return []; }) + ]).then(([statsData, podsData]) => { + setStats(statsData); + setPods(podsData || []); + setLoading(false); + }); } else { setStats(null); + setPods([]); } }, [open, node, window]); + const copyToClipboard = (text: string) => { + navigator.clipboard.writeText(text); + }; + + const getRec = (p95: number) => Math.ceil(p95 * 1.15); // P95 + 15% + + const generatePatch = (pod: PodMetrics, type: "cpu" | "memory" | "both", reason: "fix" | "shield") => { + const rawCpu = getRec(pod.cpuP95Milli); + // Ensure we don't go below 10m for CPU to be safe + const targetCpu = Math.max(10, rawCpu); + const targetCpuStr = `${targetCpu}m`; + + const rawMem = getRec(pod.memoryP95Bytes); + // 
Convert to Mi + const targetMemMi = Math.ceil(rawMem / (1024 * 1024)); + const targetMemStr = `${targetMemMi}Mi`; + + const resources: any = { requests: {} }; + if (type === "cpu" || type === "both") resources.requests.cpu = targetCpuStr; + if (type === "memory" || type === "both") resources.requests.memory = targetMemStr; + + // Use container name approximation or index 0 for now as we don't have container name in PodMetrics yet. + // Using simple approach: Assume first container needs fix found in spec. + // Actually we need the container name. Backend aggregates by pod... + // For now we will use pod name prefix as best effort, or just "name". + // A better approach is usually `kubectl set resources` or patch assuming single container or main container. + // Let's use the first container approach in the patch for now: `spec: { containers: [ { name: "?", ... } ] }`. + // Wait, we don't know the container name. + // To make this robust without container name, we can try to patch by index `containers[0]`. + // JSON Patch: `[{"op": "replace", "path": "/spec/containers/0/resources/requests/cpu", "value": "..."}]` + // But let's stick to the user's requested text format: `kubectl patch ...` + // We will use `deployment` logic usually, but here we patch the POD? Pods are ephemeral. + // Ideally we patch the deployment. + // User asked for "Fix YAML". 
+ const containerName = pod.podName.split("-")[0]; // Heuristic + + return `kubectl patch pod ${pod.podName} -n ${pod.namespace} --patch '{"spec":{"containers":[{"name":"${containerName}", "resources":{"requests":{"cpu":"${targetCpuStr}","memory":"${targetMemStr}"}}}]}}'`; + }; + if (!node) return null; - const statusBadge = ( - - {node.status} - - ); + // SORTING LOGIC: Financial Impact (Savings First) + // Heuristic: ~$32/vCPU/mo, ~$4/GB/mo + const COST_PER_VCPU = 32; + const COST_PER_GB = 4; + + const getSavings = (pod: PodMetrics) => { + const cpuRec = getRec(pod.cpuP95Milli); + const cpuWasteCores = (pod.cpuRequestMilli - cpuRec) / 1000; + const cpuSavings = cpuWasteCores * COST_PER_VCPU; + + const memRec = getRec(pod.memoryP95Bytes); + const memWasteGB = (pod.memoryRequestBytes - memRec) / (1024 * 1024 * 1024); + const memSavings = memWasteGB * COST_PER_GB; - const usageSummary = (() => { - if (node.cpuUsagePercent > 70 || node.memoryUsagePercent > 70) return "Node is heavily used."; - if (node.cpuUsagePercent < 30 && node.memoryUsagePercent < 30) return "This node is mostly idle."; - return "Usage looks normal."; - })(); + return cpuSavings + memSavings; + }; + + const sortedPods = [...pods].sort((a, b) => { + return getSavings(b) - getSavings(a); // Descending (Biggest Savings First) + }); return ( - - + + {/* HEADER */} + - {node.nodeName} - - {(node.instanceType ?? "Unknown type")} · {node.podCount} pods - - - - {statusBadge} - {node.isUnderPressure && Under pressure} - - - -
-
-

Current State

-
-
-

{formatCurrency(node.monthlyCost)}

-

Monthly cost

-
-
-

{formatCurrency(node.hourlyCost, { maximumFractionDigits: 2 })}

-

Hourly cost

-
+ {node.nodeName} +
+ {node.instanceType} + {node.podCount} Pods
-
- -
-

Current Usage

-
-
- CPU usage - {node.cpuUsagePercent.toFixed(0)}% -
- -
-
-
- Memory usage - {node.memoryUsagePercent.toFixed(0)}% -
- + +
+
+

Monthly Cost

+

{formatCurrency(node.monthlyCost)}

-

{usageSummary}

-
- -
- -
-
-

Historical Analysis

- - - 24h - 7d - 30d - - +
stats.totalMonthlyCost * 0.1 ? "border-emerald-500/20 bg-emerald-500/5 text-emerald-500" : "text-muted-foreground"}`}> +

Potential Savings

+

+ {stats ? formatCurrency(stats.totalMonthlyCost - stats.realUsageMonthlyCost) : "..."} +

+
+ - {loading ? ( -
-
-
-
- ) : stats ? ( - <> -
-
- Real Usage Cost - {formatCurrency(stats.realUsageMonthlyCost)} - - vs {formatCurrency(stats.totalMonthlyCost)} potential - -
-
+ -
+
+
+ {/* P95 METRICS */} + {stats && ( +
+

Node P95 Analysis ({window})

+
-
- P95 CPU ({window}) - {stats.p95CpuUsagePercent.toFixed(1)}% +
+ CPU P95 Load + {stats.p95CpuUsagePercent.toFixed(1)}%
- +
-
- P95 Memory ({window}) - {stats.p95MemoryUsagePercent.toFixed(1)}% +
+ Memory P95 Load + {stats.p95MemoryUsagePercent.toFixed(1)}%
- +
- - ) : ( -

No historical data available.

+
)} -
+ + {/* FULL AUDIT TABLE */} +
+

Full Pod Audit (P95 + 15% Safety Margin)

+
+ + + + Pod (QoS) + CPU (Req → P95) + RAM (Req → P95) + Action + + + + {loading ? ( + + + Analyzing pod logs & metrics... + + + ) : pods.length === 0 ? ( + + + No pods found or agent not reporting deep metrics yet. + + + ) : ( + sortedPods.map(pod => { + // CPU Analysis + const cpuRec = getRec(pod.cpuP95Milli); + const cpuDiff = cpuRec - pod.cpuRequestMilli; + const cpuRisk = pod.cpuP95Milli > pod.cpuRequestMilli; + const cpuOptimized = !cpuRisk && Math.abs(cpuDiff) <= (0.1 * pod.cpuRequestMilli); + + // MEM Analysis + const memRec = getRec(pod.memoryP95Bytes); + const memDiff = memRec - pod.memoryRequestBytes; + const memRisk = pod.memoryP95Bytes > pod.memoryRequestBytes; + const memOptimized = !memRisk && Math.abs(memDiff) <= (0.1 * pod.memoryRequestBytes); + + // Global State + const isRisk = cpuRisk || memRisk; + const isOptimized = cpuOptimized && memOptimized; + + const cpuReqStr = `${pod.cpuRequestMilli}m`; + const cpuP95Str = `${pod.cpuP95Milli.toFixed(0)}m`; + const memReqStr = `${(pod.memoryRequestBytes / (1024 * 1024)).toFixed(0)}Mi`; + const memP95Str = `${(pod.memoryP95Bytes / (1024 * 1024)).toFixed(0)}Mi`; + + return ( + + +
+ {pod.podName} + {pod.namespace} + {pod.qosClass} +
+
+ + +
+ + {cpuReqStr} → {cpuP95Str} + + {cpuRisk && RISK} + {!cpuRisk && !cpuOptimized && Waste} +
+
+ + +
+ + {memReqStr} → {memP95Str} + + {memRisk && RISK} + {!memRisk && !memOptimized && Waste} +
+
+ + + {isOptimized ? ( + + Optimized + + ) : ( + + + + + + + Copy {isRisk ? "Upsize" : "Downsize"} Patch (CPU & RAM) + + + + )} + +
+ ); + }) + )} +
+
+
+
+
); }; +const SectionTabs = ({ window, setWindow }: { window: string; setWindow: (w: string) => void }) => ( +
+ + + 24h + 7d + 30d + + +
+); + export default NodeDetailSheet; diff --git a/web/src/lib/api.ts b/web/src/lib/api.ts index 865b336..e0f4596 100644 --- a/web/src/lib/api.ts +++ b/web/src/lib/api.ts @@ -365,6 +365,20 @@ export const fetchNodeStats = async (name: string, window: string): Promise(`/cost/nodes/${name}/stats?window=${window}`); }; +export interface PodMetrics { + podName: string; + namespace: string; + qosClass: string; + cpuRequestMilli: number; + cpuP95Milli: number; + memoryRequestBytes: number; + memoryP95Bytes: number; +} + +export const fetchNodePods = async (name: string, window: string): Promise => { + return request(`/cost/nodes/${name}/pods?window=${window}`); +}; + export const fetchResources = async (): Promise => { const resp = await request("/cost/resources"); return {