diff --git a/cmd/dashboard/main.go b/cmd/dashboard/main.go index 25d6e59..3e26622 100644 --- a/cmd/dashboard/main.go +++ b/cmd/dashboard/main.go @@ -56,6 +56,9 @@ func main() { // Initialize FinOps Engine finopsEngine := finops.NewEngine(vmClient, st.PricingCatalog()) + // Share Pricing Catalog with VM Client + vmClient.SetPricingCatalog(st.PricingCatalog()) + auth.SetSecret(cfg.JWTSecret) srv := &http.Server{ diff --git a/docs/TECHDEBT.md b/docs/TECHDEBT.md index a2fbc10..3a80bf6 100644 --- a/docs/TECHDEBT.md +++ b/docs/TECHDEBT.md @@ -15,3 +15,7 @@ ## Future Considerations - [ ] **Retention Policies**: Configure distinct retention periods for high-precision metrics (15s interval) vs. aggregated historical data. - [ ] **Refactor Store Locking**: Evaluate moving from heavy `RWMutex` usage in `store.go` to a more concurrent pattern if contention increases with 100+ agents. +- [ ] **Dynamic Pricing & Savings Plans Support**: + - Problem: Agents no longer send costs; backend relies on static On-Demand rates. + - Solution: Implement a **Dynamic Pricing Engine** with DB overrides for Savings Plans, Reserved Instances, and Spot Pricing. + - Design: See `dynamic_pricing_design.md` artifact. 
diff --git a/internal/api/handlers_health_test.go b/internal/api/handlers_health_test.go
index f5cf9b9..6dd6ba0 100644
--- a/internal/api/handlers_health_test.go
+++ b/internal/api/handlers_health_test.go
@@ -47,6 +47,12 @@ func (f *fakeMetricsProvider) ClusterMetadata(context.Context) (store.ClusterMet
 func (f *fakeMetricsProvider) NetworkTopology(context.Context, store.NetworkTopologyOptions) ([]store.NetworkEdge, error) {
 	return nil, vm.ErrNoData
 }
+func (f *fakeMetricsProvider) GetNodeStats(context.Context, string, string, time.Duration) (store.NodeStats, error) {
+	return store.NodeStats{}, vm.ErrNoData
+}
+func (f *fakeMetricsProvider) GetNodePods(context.Context, string, string, time.Duration) ([]store.PodMetrics, error) {
+	return nil, vm.ErrNoData
+}
 
 func newTestHandler(meta store.ClusterMetadata, status store.AgentStatusPayload) *Handler {
 	return &Handler{vm: &fakeMetricsProvider{meta: meta, status: status}}
diff --git a/internal/api/handlers_nodes.go b/internal/api/handlers_nodes.go
index fcb30ef..b6a3884 100644
--- a/internal/api/handlers_nodes.go
+++ b/internal/api/handlers_nodes.go
@@ -2,6 +2,8 @@ package api
 
 import (
 	"net/http"
+	"strconv"
+	"time"
 
 	"github.com/go-chi/chi/v5"
 
@@ -22,6 +24,7 @@ func (h *Handler) Nodes(w http.ResponseWriter, r *http.Request) {
 		Search: q.Get("search"),
 		Limit: parseLimit(q.Get("limit"), defaultNodeLimit, maxNodeLimit),
 		Offset: parseOffset(q.Get("offset")),
+		Window: q.Get("window"), // "24h", "7d", "30d"
 	}
 
 	resp, err := h.vm.NodeList(ctx, filter)
@@ -58,3 +61,75 @@ func (h *Handler) NodeDetail(w http.ResponseWriter, r *http.Request) {
 
 	writeJSON(w, http.StatusOK, node)
 }
+
+// defaultStatsWindow is the look-back period used when the "window" query
+// parameter is missing, malformed, or not positive.
+const defaultStatsWindow = 24 * time.Hour
+
+// parseStatsWindow converts a "window" query value into a duration.
+//
+// time.ParseDuration has no day unit, so values such as "7d" or "30d" are
+// handled here as a whole number of 24-hour days; everything else is passed
+// to time.ParseDuration. Unusable input yields defaultStatsWindow.
+func parseStatsWindow(raw string) time.Duration {
+	const day = 24 * time.Hour
+	if n := len(raw); n > 1 && raw[n-1] == 'd' {
+		days, err := strconv.Atoi(raw[:n-1])
+		if err != nil || days <= 0 {
+			return defaultStatsWindow
+		}
+		window := time.Duration(days) * day
+		// Dividing back exposes int64 overflow from absurdly large day counts.
+		if window/day != time.Duration(days) {
+			return defaultStatsWindow
+		}
+		return window
+	}
+	window, err := time.ParseDuration(raw)
+	if err != nil || window <= 0 {
+		return defaultStatsWindow
+	}
+	return window
+}
+
+// NodeStats returns historical usage and cost stats for a node.
+// The look-back period comes from the optional "window" query parameter
+// (see parseStatsWindow). A missing node name yields 400; provider errors 500.
+func (h *Handler) NodeStats(w http.ResponseWriter, r *http.Request) {
+	name := chi.URLParam(r, "name")
+	if name == "" {
+		writeError(w, http.StatusBadRequest, "node name is required")
+		return
+	}
+	window := parseStatsWindow(r.URL.Query().Get("window"))
+
+	ctx := vm.WithClusterID(r.Context(), clusterIDFromRequest(r))
+	stats, err := h.vm.GetNodeStats(ctx, "", name, window)
+	if err != nil {
+		writeError(w, http.StatusInternalServerError, err.Error())
+		return
+	}
+
+	writeJSON(w, http.StatusOK, stats)
+}
+
+// NodePods returns the list of pods for a node with P95 metrics (Pod Audit).
+// The look-back period comes from the optional "window" query parameter
+// (see parseStatsWindow). A missing node name yields 400; provider errors 500.
+func (h *Handler) NodePods(w http.ResponseWriter, r *http.Request) {
+	name := chi.URLParam(r, "name")
+	if name == "" {
+		writeError(w, http.StatusBadRequest, "node name is required")
+		return
+	}
+	window := parseStatsWindow(r.URL.Query().Get("window"))
+
+	ctx := vm.WithClusterID(r.Context(), clusterIDFromRequest(r))
+	pods, err := h.vm.GetNodePods(ctx, "", name, window)
+	if err != nil {
+		writeError(w, http.StatusInternalServerError, err.Error())
+		return
+	}
+
+	writeJSON(w, http.StatusOK, pods)
+}
diff --git a/internal/api/router.go b/internal/api/router.go
index 28d403a..778f45d 100644
--- a/internal/api/router.go
+++ b/internal/api/router.go
@@ -4,6 +4,7 @@ import (
 	"context"
 	"encoding/json"
 	"net/http"
+	"time"
 
 	"github.com/go-chi/chi/v5"
 	"github.com/go-chi/chi/v5/middleware"
@@ -28,6 +29,8 @@ type MetricsProvider interface {
 	Agents(ctx context.Context) ([]store.AgentInfo, error)
 	ClusterMetadata(ctx context.Context) (store.ClusterMetadata, error)
 	NetworkTopology(ctx context.Context, opts store.NetworkTopologyOptions) ([]store.NetworkEdge, error)
+	GetNodeStats(ctx context.Context, clusterID, nodeName string, window time.Duration) (store.NodeStats, error)
+	GetNodePods(ctx context.Context, clusterID, nodeName string,
window time.Duration) ([]store.PodMetrics, error) } // Handler wires HTTP requests to the VictoriaMetrics client. @@ -73,6 +76,8 @@ func NewRouter(vmClient MetricsProvider, db *db.Store, st *store.Store, finopsEn cost.Get("/namespaces/{name}", h.NamespaceDetail) cost.Get("/nodes", h.Nodes) cost.Get("/nodes/{name}", h.NodeDetail) + cost.Get("/nodes/{name}/stats", h.NodeStats) + cost.Get("/nodes/{name}/pods", h.NodePods) cost.Get("/resources", h.Resources) }) protected.Get("/agent", h.AgentStatus) diff --git a/internal/store/pricing.go b/internal/store/pricing.go index 3d4e060..6ddaaef 100644 --- a/internal/store/pricing.go +++ b/internal/store/pricing.go @@ -1,6 +1,11 @@ package store -import "context" +import ( + "context" + "fmt" + + "github.com/clustercost/clustercost-dashboard/internal/pricing" +) // Pricing constants const ( @@ -10,51 +15,33 @@ const ( CostEgressInternal = 0.00 // Free ) -// PricingProvider defines the interface for fetching node pricing. -type PricingProvider interface { - GetNodePrice(ctx context.Context, region, instanceType string) (float64, error) -} - // PricingCatalog allows looking up node prices. type PricingCatalog struct { - // Map instance type to hourly price - InstancePrices map[string]float64 - Provider PricingProvider + // No provider needed, we use static data from internal/pricing } -// NewPricingCatalog returns a catalog with some default mocked pricing. -func NewPricingCatalog(provider PricingProvider) *PricingCatalog { - return &PricingCatalog{ - InstancePrices: map[string]float64{ - "t3.medium": 0.0416, - "t3.large": 0.0832, - "m5.large": 0.096, - "m5.xlarge": 0.192, - "c5.large": 0.085, - "r5.large": 0.126, - "default": 0.05, // Fallback - }, - Provider: provider, - } +// NewPricingCatalog returns a catalog. +func NewPricingCatalog() *PricingCatalog { + return &PricingCatalog{} } // GetTotalNodePrice returns the total hourly cost of a node. 
func (pc *PricingCatalog) GetTotalNodePrice(ctx context.Context, region, instanceType string) float64 { - // Try Provider first - if pc.Provider != nil && instanceType != "" && region != "" { - price, err := pc.Provider.GetNodePrice(ctx, region, instanceType) - if err == nil && price > 0 { - pc.InstancePrices[instanceType] = price // Update cache - return price - } + // 1. Exact match: look up the "region|instanceType" key in the shared static price table + key := fmt.Sprintf("%s|%s", region, instanceType) + if price, ok := pricing.InstancePrices[key]; ok { + return price } - // Fallback to local cache - price, ok := pc.InstancePrices[instanceType] - if !ok { - price = pc.InstancePrices["default"] + // 2. No entry for this region: fall back to the us-east-1 price for the same instance type + // (when region is already "us-east-1" this repeats the lookup above and misses again) + // A miss here as well means the table has no us-east-1 entry for this instance type either + fallbackKey := fmt.Sprintf("us-east-1|%s", instanceType) + if price, ok := pricing.InstancePrices[fallbackKey]; ok { + return price } - return price + + return 0.05 // Ultimate fallback: flat hourly rate used when neither table lookup matches } // GetNodeResourcePrices calculates the cost per vCPU and per GB of RAM based on the instance type. 
diff --git a/internal/store/pricing_test.go b/internal/store/pricing_test.go index 7ab6916..630cdd7 100644 --- a/internal/store/pricing_test.go +++ b/internal/store/pricing_test.go @@ -6,7 +6,7 @@ import ( ) func TestPricingCatalog_GetNodeResourcePrices(t *testing.T) { - pc := NewPricingCatalog(nil) + pc := NewPricingCatalog() // Test case 1: m5.large (2 vCPU, 8GB RAM) // Price: $0.096/hr diff --git a/internal/store/store.go b/internal/store/store.go index 9df7b58..cda9041 100644 --- a/internal/store/store.go +++ b/internal/store/store.go @@ -11,7 +11,6 @@ import ( "math" "github.com/clustercost/clustercost-dashboard/internal/config" - "github.com/clustercost/clustercost-dashboard/internal/pricing" agentv1 "github.com/clustercost/clustercost-dashboard/internal/proto/agent/v1" ) @@ -122,12 +121,21 @@ type NodeSummary struct { InstanceType string `json:"instanceType,omitempty"` Labels map[string]string `json:"labels"` Taints []string `json:"taints"` + // Resource Requests (Allocated) + CPURequestedMilli int64 `json:"cpuRequestedMilli"` + CPULimitMilli int64 `json:"cpuLimitMilli"` + MemoryRequestedBytes int64 `json:"memoryRequestedBytes"` + MemoryLimitBytes int64 `json:"memoryLimitBytes"` // Network (Host Level) NetTxBytes int64 `json:"netTxBytes"` NetRxBytes int64 `json:"netRxBytes"` EgressPublicBytes int64 `json:"egressPublicBytes"` EgressCrossAZBytes int64 `json:"egressCrossAZBytes"` EgressInternalBytes int64 `json:"egressInternalBytes"` + // Historical / Window Data + ActiveHours float64 `json:"activeHours"` // Hours active in the selected window + ActiveRatio float64 `json:"activeRatio"` // 0.0 - 1.0 + WindowCost float64 `json:"windowCost"` // Actual cost incurred in the window } // NodeListResponse wraps paginated node results. @@ -271,6 +279,7 @@ type NodeFilter struct { Search string Limit int Offset int + Window string // "24h", "7d", "30d" } // PodContext wraps a PodMetric with its location metadata. 
@@ -282,6 +291,27 @@ type PodContext struct { InstanceType string } +// NodeStats contains historical usage and cost analysis for a node. +type NodeStats struct { + NodeName string `json:"nodeName"` + P95CPUUsagePercent float64 `json:"p95CpuUsagePercent"` + P95MemoryUsagePercent float64 `json:"p95MemoryUsagePercent"` + TotalMonthlyCost float64 `json:"totalMonthlyCost"` + RealUsageMonthlyCost float64 `json:"realUsageMonthlyCost"` + Window string `json:"window"` +} + +// PodMetrics contains resource usage analysis for a single pod. +type PodMetrics struct { + PodName string `json:"podName"` + Namespace string `json:"namespace"` + QoSClass string `json:"qosClass"` + CPURequestMilli int64 `json:"cpuRequestMilli"` + CPUP95Milli float64 `json:"cpuP95Milli"` + MemoryRequestBytes int64 `json:"memoryRequestBytes"` + MemoryP95Bytes float64 `json:"memoryP95Bytes"` +} + // New creates a store seeded with agent configurations. func New(cfgs []config.AgentConfig, recommendedAgentVersion string) *Store { agentConfigs := make(map[string]config.AgentConfig, len(cfgs)) @@ -290,14 +320,13 @@ func New(cfgs []config.AgentConfig, recommendedAgentVersion string) *Store { } // Initialize Static Pricing Provider - // Context is just placeholder for interface, static client doesn't need it - pricingClient, _ := pricing.NewAWSClient(context.Background()) + // We use the static map from internal/pricing/data.go, so no dynamic client needed. 
return &Store{ agentConfigs: agentConfigs, snapshots: make(map[string]*AgentSnapshot, len(cfgs)), recommendedAgentVersion: recommendedAgentVersion, - pricing: NewPricingCatalog(pricingClient), + pricing: NewPricingCatalog(), } } @@ -972,6 +1001,28 @@ func (s *Store) aggregateNodesLocked() (map[string]*NodeSummary, error) { } haveData = true + // Pre-calculate Pod Limits/Requests per Node + nodeLimits := make(map[string]struct { + cpuReq, cpuLim, memReq, memLim int64 + }) + + for _, p := range snap.Report.Pods { + nodeName := snap.Report.NodeName + if nodeName == "" { + continue + } + stats := nodeLimits[nodeName] + if p.Cpu != nil { + stats.cpuReq += safeInt64(p.Cpu.RequestMillicores) + stats.cpuLim += safeInt64(p.Cpu.LimitMillicores) + } + if p.Memory != nil { + stats.memReq += safeInt64(p.Memory.RequestBytes) + stats.memLim += safeInt64(p.Memory.LimitBytes) + } + nodeLimits[nodeName] = stats + } + // Iterate over all nodes reported by this agent for _, n := range snap.Report.Nodes { if n == nil || n.NodeName == "" { @@ -979,6 +1030,11 @@ func (s *Store) aggregateNodesLocked() (map[string]*NodeSummary, error) { } name := n.NodeName + // Use aggregated values from pods if available, fallback to node metric if generic + // The Agent V2 NodeMetric.Requested... is arguably the same, but doesn't have Limits. + // We prioritize our calculated limits. 
+ podStats := nodeLimits[name] + entry, ok := nodes[name] if !ok { entry = &NodeSummary{ @@ -988,7 +1044,20 @@ func (s *Store) aggregateNodesLocked() (map[string]*NodeSummary, error) { InstanceType: "default", // placeholder CPUAllocatableMilli: safeInt64(n.AllocatableCpuMillicores), MemoryAllocatableBytes: safeInt64(n.AllocatableMemoryBytes), + CPURequestedMilli: safeInt64(n.RequestedCpuMillicores), // Fallback to agent metric + CPULimitMilli: podStats.cpuLim, + MemoryRequestedBytes: safeInt64(n.RequestedMemoryBytes), // Fallback to agent metric + MemoryLimitBytes: podStats.memLim, + IsUnderPressure: n.ThrottlingNs > 1_000_000, + } + // If agent metric is 0 (older agent?) use our aggregation for Requests too + if entry.CPURequestedMilli == 0 { + entry.CPURequestedMilli = podStats.cpuReq } + if entry.MemoryRequestedBytes == 0 { + entry.MemoryRequestedBytes = podStats.memReq + } + nodes[name] = entry } diff --git a/internal/store/store_test.go b/internal/store/store_test.go index 7702b7e..7911687 100644 --- a/internal/store/store_test.go +++ b/internal/store/store_test.go @@ -13,7 +13,7 @@ func newTestStore() *Store { } s := New(cfgs, "v1.0.0") // Inject Mock Pricing - s.pricing = NewPricingCatalog(&MockPricing{}) + s.pricing = NewPricingCatalog() return s } diff --git a/internal/vm/client.go b/internal/vm/client.go index 91a7b8a..ecd5b7e 100644 --- a/internal/vm/client.go +++ b/internal/vm/client.go @@ -16,6 +16,7 @@ import ( "time" "github.com/clustercost/clustercost-dashboard/internal/config" + "github.com/clustercost/clustercost-dashboard/internal/store" ) // ErrNoData indicates that VictoriaMetrics returned no usable data. @@ -42,6 +43,12 @@ type Client struct { cacheTTL time.Duration cacheMu sync.Mutex cache map[string]cachedQuery + pricing *store.PricingCatalog +} + +// SetPricingCatalog allows injecting the pricing catalog. 
+func (c *Client) SetPricingCatalog(p *store.PricingCatalog) { + c.pricing = p } type cachedQuery struct { @@ -376,6 +383,77 @@ func (c *Client) GetPodP95Usage(ctx context.Context, clusterID, namespace, podNa return cpuCores, memoryBytes, nil } +// GetNodeStats calculates the average usage and real cost of a node over a time window. +func (c *Client) GetNodeStats(ctx context.Context, clusterID, nodeName string, window time.Duration) (store.NodeStats, error) { + if nodeName == "" { + return store.NodeStats{}, fmt.Errorf("node name is required") + } + if window <= 0 { + window = 24 * time.Hour + } + windowStr := formatDuration(window) + + labels := map[string]string{ + "node": nodeName, + } + if clusterID != "" { + labels["cluster_id"] = clusterID + } + + // 1. Get P95 Usage % + // quantile_over_time(0.95, clustercost_node_cpu_usage_percent{node="name"}[window]) + cpuQuery := fmt.Sprintf("quantile_over_time(0.95, clustercost_node_cpu_usage_percent%s[%s])", + formatLabels(labels), windowStr) + memQuery := fmt.Sprintf("quantile_over_time(0.95, clustercost_node_memory_usage_percent%s[%s])", + formatLabels(labels), windowStr) + + // 2. 
Get Average Hourly Cost (to account for potential spot price fluctuations or just stability) + costQuery := fmt.Sprintf("avg_over_time(clustercost_node_hourly_cost%s[%s])", + formatLabels(labels), windowStr) + + cpuSamples, err := c.query(ctx, cpuQuery) + if err != nil { + return store.NodeStats{}, fmt.Errorf("query cpu stats: %w", err) + } + memSamples, err := c.query(ctx, memQuery) + if err != nil { + return store.NodeStats{}, fmt.Errorf("query mem stats: %w", err) + } + costSamples, err := c.query(ctx, costQuery) + if err != nil { + return store.NodeStats{}, fmt.Errorf("query cost stats: %w", err) + } + + p95Cpu := 0.0 + if len(cpuSamples) > 0 { + p95Cpu = cpuSamples[0].value + } + p95Mem := 0.0 + if len(memSamples) > 0 { + p95Mem = memSamples[0].value + } + avgHourlyCost := 0.0 + if len(costSamples) > 0 { + avgHourlyCost = costSamples[0].value + } + + // Calculate Costs + totalMonthly := avgHourlyCost * hoursPerMonth // 720 hours + // "Real Usage" = (Total * 0.5 * Cpu%) + (Total * 0.5 * Mem%) + // Percentages are 0-100 in VM usually (based on ingestion code: `writeFloatSample(..., cpuPct, ...)` where `cpuPct` was * 100) + + realUsageMonthly := (totalMonthly * 0.5 * (p95Cpu / 100.0)) + (totalMonthly * 0.5 * (p95Mem / 100.0)) + + return store.NodeStats{ + NodeName: nodeName, + P95CPUUsagePercent: p95Cpu, + P95MemoryUsagePercent: p95Mem, + TotalMonthlyCost: totalMonthly, + RealUsageMonthlyCost: realUsageMonthly, + Window: windowStr, + }, nil +} + func formatLabels(labels map[string]string) string { if len(labels) == 0 { return "" @@ -400,3 +478,133 @@ func formatLabels(labels map[string]string) string { b.WriteByte('}') return b.String() } + +// GetNodePods returns 24h P95 and Request metrics for all pods on a specific node. 
+func (c *Client) GetNodePods(ctx context.Context, clusterID, nodeName string, window time.Duration) ([]store.PodMetrics, error) { + if nodeName == "" { + return nil, fmt.Errorf("node name is required") + } + if window <= 0 { + window = 24 * time.Hour + } + windowStr := formatDuration(window) + + labels := map[string]string{ + "node": nodeName, + } + if clusterID != "" { + labels["cluster_id"] = clusterID + } + labelStr := formatLabels(labels) + + // We need 5 metrics per pod: + // 1. CPU Request (Max) + // 2. CPU Limit (Max) - to determine QoS + // 3. Mem Request (Max) + // 4. CPU Usage (P95) + // 5. Mem Usage (P95) + + queries := map[string]string{ + "cpu_req_max": fmt.Sprintf("max_over_time(clustercost_pod_cpu_request_millicores%s[%s])", labelStr, windowStr), + "cpu_lim_max": fmt.Sprintf("max_over_time(clustercost_pod_cpu_limit_millicores%s[%s])", labelStr, windowStr), + "mem_req_max": fmt.Sprintf("max_over_time(clustercost_pod_memory_request_bytes%s[%s])", labelStr, windowStr), + "cpu_add_p95": fmt.Sprintf("quantile_over_time(0.95, clustercost_pod_cpu_usage_milli%s[%s])", labelStr, windowStr), + "mem_add_p95": fmt.Sprintf("quantile_over_time(0.95, clustercost_pod_memory_rss_bytes%s[%s])", labelStr, windowStr), + } + + // Helper struct to aggregate data + type podData struct { + Namespace string + PodName string + CPUReq float64 + CPULim float64 + MemReq float64 + CPUP95 float64 + MemP95 float64 + } + podMap := make(map[string]*podData) + + var wg sync.WaitGroup + var mu sync.Mutex + var firstErr error + + for key, query := range queries { + wg.Add(1) + go func(k, q string) { + defer wg.Done() + samples, err := c.query(ctx, q) + if err != nil { + mu.Lock() + if firstErr == nil { + firstErr = err + } + mu.Unlock() + return + } + mu.Lock() + for _, s := range samples { + ns := s.labels["namespace"] + pod := s.labels["pod"] + if ns == "" || pod == "" { + continue + } + id := ns + "|" + pod + if _, exists := podMap[id]; !exists { + podMap[id] = &podData{Namespace: 
ns, PodName: pod} + } + p := podMap[id] + + switch k { + case "cpu_req_max": + p.CPUReq = s.value + case "cpu_lim_max": + p.CPULim = s.value + case "mem_req_max": + p.MemReq = s.value + case "cpu_add_p95": + p.CPUP95 = s.value + case "mem_add_p95": + p.MemP95 = s.value + } + } + mu.Unlock() + }(key, query) + } + wg.Wait() + + if firstErr != nil { + return nil, fmt.Errorf("failed to query pod metrics: %w", firstErr) + } + + results := make([]store.PodMetrics, 0, len(podMap)) + for _, p := range podMap { + // QoS Logic + qos := "Burstable" + if p.CPUReq == 0 && p.MemReq == 0 { + qos = "BestEffort" + } else if p.CPUReq == p.CPULim && p.CPULim > 0 { + qos = "Guaranteed" // Simplified, strictly checking CPU for now + } + + results = append(results, store.PodMetrics{ + PodName: p.PodName, + Namespace: p.Namespace, + QoSClass: qos, + CPURequestMilli: int64(p.CPUReq), + CPUP95Milli: p.CPUP95, + MemoryRequestBytes: int64(p.MemReq), + MemoryP95Bytes: p.MemP95, + }) + } + + // Sort by Waste Amount (heuristic: max diff) + sort.Slice(results, func(i, j int) bool { + // Just sorting by name for stability for now, frontend handles logic sort + if results[i].Namespace != results[j].Namespace { + return results[i].Namespace < results[j].Namespace + } + return results[i].PodName < results[j].PodName + }) + + return results, nil +} diff --git a/internal/vm/dashboard.go b/internal/vm/dashboard.go index 34cc685..31996e0 100644 --- a/internal/vm/dashboard.go +++ b/internal/vm/dashboard.go @@ -131,7 +131,7 @@ func (c *Client) NamespaceDetail(ctx context.Context, name string) (store.Namesp } func (c *Client) NodeList(ctx context.Context, filter store.NodeFilter) (store.NodeListResponse, error) { - nodes, ts, err := c.nodeMetrics(ctx, "") + nodes, ts, err := c.nodeMetrics(ctx, "", filter.Window) if err != nil { return store.NodeListResponse{}, err } @@ -169,7 +169,7 @@ func (c *Client) NodeList(ctx context.Context, filter store.NodeFilter) (store.N } func (c *Client) NodeDetail(ctx 
context.Context, name string) (store.NodeSummary, error) { - nodes, _, err := c.nodeMetrics(ctx, name) + nodes, _, err := c.nodeMetrics(ctx, name, "") if err != nil { return store.NodeSummary{}, err } @@ -198,10 +198,8 @@ func (c *Client) Resources(ctx context.Context) (store.ResourcesPayload, error) if err != nil && err != ErrNoData { return store.ResourcesPayload{}, err } - nodeHourlyCost, _, err := c.scalarMetric(ctx, "clustercost_cluster_total_node_hourly_cost") - if err != nil && err != ErrNoData { - return store.ResourcesPayload{}, err - } + // Node Hourly Cost is now fully calculated, no stored metric + nodeHourlyCost := 0.0 // Fetch Network Metrics netTx, _, _ := c.scalarMetric(ctx, "clustercost_cluster_network_tx_bytes_total") @@ -432,7 +430,7 @@ func (c *Client) AgentStatus(ctx context.Context) (store.AgentStatusPayload, err } nsTS := c.seriesTimestampSafe(ctx, "clustercost_namespace_hourly_cost") - nodeTS := c.seriesTimestampSafe(ctx, "clustercost_node_hourly_cost") + nodeTS := c.seriesTimestampSafe(ctx, "clustercost_node_cpu_allocatable_milli") resTS := c.seriesTimestampSafe(ctx, "clustercost_cluster_cpu_usage_milli_total") datasets := store.AgentDatasetHealth{ @@ -678,36 +676,6 @@ func (c *Client) namespaceMetrics(ctx context.Context, environment, namespace st } } - queryScalar := func(expr string) (float64, error) { - samples, err := c.query(ctx, expr) - if err != nil { - return 0, err - } - if len(samples) == 0 { - return 0, ErrNoData - } - return samples[0].value, nil - } - - nodeCostExpr := fmt.Sprintf("sum(max by (node) (%s))", c.lookbackExpr("clustercost_node_hourly_cost", nil, clusterID)) - cpuAllocExpr := fmt.Sprintf("sum(max by (node) (%s))", c.lookbackExpr("clustercost_node_cpu_allocatable_milli", nil, clusterID)) - memAllocExpr := fmt.Sprintf("sum(max by (node) (%s))", c.lookbackExpr("clustercost_node_memory_allocatable_bytes", nil, clusterID)) - - nodeCost, err := queryScalar(nodeCostExpr) - if err == nil && nodeCost > 0 { - cpuAllocMilli, 
errCPU := queryScalar(cpuAllocExpr) - memAllocBytes, errMem := queryScalar(memAllocExpr) - if errCPU == nil && errMem == nil && cpuAllocMilli > 0 && memAllocBytes > 0 { - cpuPrice := (nodeCost * 0.5) / (cpuAllocMilli / 1000.0) - memPrice := (nodeCost * 0.5) / (memAllocBytes / (1024.0 * 1024.0 * 1024.0)) - for _, entry := range out { - cpuUsageCores := float64(entry.CPUUsageMilli) / 1000.0 - memUsageGB := float64(entry.MemoryUsageBytes) / (1024.0 * 1024.0 * 1024.0) - entry.HourlyCost = (cpuUsageCores * cpuPrice) + (memUsageGB * memPrice) - } - } - } - latest = c.seriesTimestampSafe(ctx, "clustercost_namespace_memory_rss_bytes_total") type nodeAlloc struct { @@ -755,7 +723,12 @@ func (c *Client) namespaceMetrics(ctx context.Context, environment, namespace st }) } - pricing := store.NewPricingCatalog(nil) + var pricing *store.PricingCatalog + if c.pricing != nil { + pricing = c.pricing + } else { + pricing = store.NewPricingCatalog() + } totalNodeCost := 0.0 totalCpuCores := 0.0 totalMemGB := 0.0 @@ -785,7 +758,27 @@ func (c *Client) namespaceMetrics(ctx context.Context, environment, namespace st return out, latest, nil } -func (c *Client) nodeMetrics(ctx context.Context, nodeName string) (map[string]*store.NodeSummary, time.Time, error) { +func (c *Client) Nodes(ctx context.Context, window string) ([]store.NodeSummary, error) { + nodeMetrics, _, err := c.nodeMetrics(ctx, "", window) + if err != nil { + return nil, err + } + + out := make([]store.NodeSummary, 0, len(nodeMetrics)) + for _, n := range nodeMetrics { + n.Labels = nil // Optimization: potentially clear heavy labels if not needed + out = append(out, *n) + } + + // Sort by Cost desc + sort.Slice(out, func(i, j int) bool { + return out[i].WindowCost > out[j].WindowCost + }) + + return out, nil +} + +func (c *Client) nodeMetrics(ctx context.Context, nodeName, window string) (map[string]*store.NodeSummary, time.Time, error) { clusterID := c.resolveClusterID(ctx) ctx = WithClusterID(ctx, clusterID) labels := 
map[string]string{} @@ -793,53 +786,169 @@ func (c *Client) nodeMetrics(ctx context.Context, nodeName string) (map[string]* labels["node"] = nodeName } + // Parse Window + var windowDur time.Duration + var lookbackFunc string = "max_over_time" // Default for "current" view (snapshot-ish) + var windowStr string = c.lookback.String() // Default internal lookback + + if window != "" { + d, err := time.ParseDuration(window) + if err == nil { + windowDur = d + windowStr = window + lookbackFunc = "avg_over_time" + } + } else { + // Assuming standard "current" view implies "1h" or just last scrape? + // For consistency with existing logic, we keep standard lookback but use max/last. + } + + // If Windowed View: Primary source is agent_up to find ALL nodes active in window + // If Snapshot View: Primary source is usually node_info or just scraping metrics. + // We'll use the same multi-metric approach but adjust the aggregation. + + out := make(map[string]*store.NodeSummary) + + // Helper to safely assign to out map + getOrCreate := func(node string) *store.NodeSummary { + if node == "" { + return nil + } + if _, ok := out[node]; !ok { + out[node] = &store.NodeSummary{ + NodeName: node, + Labels: map[string]string{}, + Taints: []string{}, + } + } + return out[node] + } + + // 1. 
Availability / Active Time + // Query: avg_over_time(clustercost_agent_up[window]) + // Value: 0.0 - 1.0 (fraction of time active) + availExpr := fmt.Sprintf("avg_over_time(clustercost_agent_up%s[%s])", formatLabels(c.scopedLabels(labels, clusterID)), windowStr) + availSamples, err := c.query(ctx, availExpr) + if err == nil { + for _, s := range availSamples { + node := s.labels["node"] + entry := getOrCreate(node) + if entry != nil { + entry.ActiveRatio = s.value + if windowDur > 0 { + entry.ActiveHours = s.value * windowDur.Hours() + } else { + // Default assumption if no window: 100% active (snapshot) + entry.ActiveRatio = 1.0 + entry.ActiveHours = 24 * 30 // Monthly projection basis + } + + // Extract Metadata from Agent Up + if entry.InstanceType == "" { + entry.InstanceType = valueOrDefault(s.labels["instance_type"], + valueOrDefault(s.labels["node_label_node_kubernetes_io_instance_type"], + s.labels["node_label_beta_kubernetes_io_instance_type"])) + } + if entry.Labels["topology_kubernetes_io_region"] == "" { + entry.Labels["topology_kubernetes_io_region"] = s.labels["cluster_region"] + } + } + } + } + + // Helper to extract metadata from labels + updateMeta := func(entry *store.NodeSummary, labels map[string]string) { + if entry.InstanceType == "" { + entry.InstanceType = valueOrDefault(labels["instance_type"], + valueOrDefault(labels["node_label_node_kubernetes_io_instance_type"], + labels["node_label_beta_kubernetes_io_instance_type"])) + } + if entry.Labels["topology_kubernetes_io_region"] == "" { + entry.Labels["topology_kubernetes_io_region"] = labels["cluster_region"] + } + } + + // 2. Metrics List metrics := []struct { - name string - assign func(entry *store.NodeSummary, value float64, labels map[string]string) + name string + validLookback bool // if false, use standard lookback (e.g. 
for info that doesn't vary) + assign func(entry *store.NodeSummary, value float64, labels map[string]string) }{ - {"clustercost_node_hourly_cost", func(e *store.NodeSummary, v float64, l map[string]string) { - e.HourlyCost = v - if e.InstanceType == "" { - e.InstanceType = l["instance_type"] - } + // hourly_cost metric removed as it's deprecated. Cost is calculated in post-processing. + {"clustercost_node_cpu_usage_percent", true, func(e *store.NodeSummary, v float64, _ map[string]string) { e.CPUUsagePercent = v }}, + {"clustercost_node_memory_usage_percent", true, func(e *store.NodeSummary, v float64, _ map[string]string) { e.MemoryUsagePercent = v }}, + {"clustercost_node_cpu_allocatable_milli", true, func(e *store.NodeSummary, v float64, l map[string]string) { + e.CPUAllocatableMilli = int64(v) + updateMeta(e, l) }}, - {"clustercost_node_cpu_usage_percent", func(e *store.NodeSummary, v float64, _ map[string]string) { e.CPUUsagePercent = v }}, - {"clustercost_node_memory_usage_percent", func(e *store.NodeSummary, v float64, _ map[string]string) { e.MemoryUsagePercent = v }}, - {"clustercost_node_cpu_allocatable_milli", func(e *store.NodeSummary, v float64, _ map[string]string) { e.CPUAllocatableMilli = int64(v) }}, - {"clustercost_node_memory_allocatable_bytes", func(e *store.NodeSummary, v float64, _ map[string]string) { e.MemoryAllocatableBytes = int64(v) }}, - {"clustercost_node_pod_count", func(e *store.NodeSummary, v float64, _ map[string]string) { e.PodCount = int(v) }}, - {"clustercost_node_under_pressure", func(e *store.NodeSummary, v float64, _ map[string]string) { e.IsUnderPressure = v > 0.5 }}, + {"clustercost_node_memory_allocatable_bytes", true, func(e *store.NodeSummary, v float64, l map[string]string) { + e.MemoryAllocatableBytes = int64(v) + updateMeta(e, l) + }}, + {"clustercost_node_cpu_requested_milli", true, func(e *store.NodeSummary, v float64, l map[string]string) { + e.CPURequestedMilli = int64(v) + updateMeta(e, l) + }}, + 
{"clustercost_node_memory_requested_bytes", true, func(e *store.NodeSummary, v float64, l map[string]string) { + e.MemoryRequestedBytes = int64(v) + updateMeta(e, l) + }}, + {"clustercost_node_cpu_limit_milli", true, func(e *store.NodeSummary, v float64, _ map[string]string) { e.CPULimitMilli = int64(v) }}, + {"clustercost_node_memory_limit_bytes", true, func(e *store.NodeSummary, v float64, _ map[string]string) { e.MemoryLimitBytes = int64(v) }}, } - out := make(map[string]*store.NodeSummary) for _, metric := range metrics { by := "node" - if metric.name == "clustercost_node_hourly_cost" { - by = "node,instance_type" + // Preserve metadata labels in aggregation + if strings.Contains(metric.name, "requested") || + strings.Contains(metric.name, "allocatable") { + by = "node,instance_type,node_label_node_kubernetes_io_instance_type,node_label_beta_kubernetes_io_instance_type,cluster_region,topology_kubernetes_io_region" } - expr := fmt.Sprintf("max by (%s) (%s)", by, c.lookbackExpr(metric.name, labels, clusterID)) - samples, err := c.query(ctx, expr) + + // determine function + fn := lookbackFunc + // For cost, average over time gives the average hourly rate during that window. + // For usage %, average makes sense. + // For Requests/Limits/Allocatable, they might vary if node resized (rare) or replaced. Average is decent. + + expr := fmt.Sprintf("%s(%s%s[%s])", fn, metric.name, formatLabels(c.scopedLabels(labels, clusterID)), windowStr) + // Need aggregation to preserve labels and unique by node + // max by (...) for snapshots, but avg by (...) for windows? + // Actually "avg by" works for all if we want the average stat. + aggOp := "avg" + if !strings.Contains(metric.name, "percent") && !strings.Contains(metric.name, "cost") { + // For allocatable/requests, max is often safer to see peak reservation? + // But for "Ghost Cost", average request is better? + // Let's stick to Average for Historical Analysis. 
+ aggOp = "avg" + } + + fullExpr := fmt.Sprintf("%s by (%s) (%s)", aggOp, by, expr) + + samples, err := c.query(ctx, fullExpr) if err != nil { - return nil, time.Time{}, err + continue // Skip failing metrics rather than crash whole request } + for _, sample := range samples { node := sample.labels["node"] - if node == "" { - continue - } - entry := out[node] - if entry == nil { - entry = &store.NodeSummary{ - NodeName: node, - Labels: map[string]string{}, - Taints: []string{}, - } - out[node] = entry + entry := getOrCreate(node) + if entry != nil { + metric.assign(entry, sample.value, sample.labels) } - metric.assign(entry, sample.value, sample.labels) } } + // 3. Post-Processing & Cost Backfill + var pricing *store.PricingCatalog + if c.pricing != nil { + pricing = c.pricing + } else { + // NewPricingCatalog now takes 0 args (static) + pricing = store.NewPricingCatalog() + } + + // Fetch node status statusSamples, err := c.seriesTimestamp(ctx, "clustercost_node_status", labels) if err != nil && err != ErrNoData { return nil, time.Time{}, err @@ -851,7 +960,56 @@ func (c *Client) nodeMetrics(ctx context.Context, nodeName string) (map[string]* } } - latest := c.seriesTimestampSafe(ctx, "clustercost_node_hourly_cost") + for _, node := range out { + // Extract region from name fallback + if node.Labels["topology_kubernetes_io_region"] == "" { + re := regexp.MustCompile(`\.(us-[a-z]+-\d+)\.`) + matches := re.FindStringSubmatch(node.NodeName) + if len(matches) > 1 { + node.Labels["topology_kubernetes_io_region"] = matches[1] + } + } + + region := node.Labels["topology_kubernetes_io_region"] + if region == "" { + // Fallback: Default region if unknown, often us-east-1 or inferred from node name + if strings.Contains(node.NodeName, "us-east-1") { + region = "us-east-1" + } else if strings.Contains(node.NodeName, "us-west-2") { + region = "us-west-2" + } else if strings.Contains(node.NodeName, "eu-west-1") { + region = "eu-west-1" + } else { + region = "us-east-1" // 
ultimate fallback + } + } + + // Update region in labels so it persists + if node.Labels == nil { + node.Labels = map[string]string{} + } + node.Labels["topology_kubernetes_io_region"] = region + + instanceType := node.InstanceType + if instanceType == "" { + instanceType = "m5.large" // Default fallback to avoid 0 cost + } + + if node.HourlyCost == 0 { + node.HourlyCost = pricing.GetTotalNodePrice(context.Background(), region, instanceType) + } + + // CALCULATE WINDOW COST / TOTAL COST + if windowDur > 0 { + // Real Cost = HourlyRate * ActiveHours + node.WindowCost = node.HourlyCost * node.ActiveHours + } else { + // Snapshot projection (Monthly) + node.WindowCost = node.HourlyCost * 730 + } + } + + latest := c.seriesTimestampSafe(ctx, "clustercost_node_cpu_allocatable_milli") return out, latest, nil } @@ -1026,7 +1184,7 @@ func pickLatestStatus(samples []sample) map[string]string { func (c *Client) nodeNames(ctx context.Context) []string { clusterID := c.resolveClusterID(ctx) - expr := fmt.Sprintf("max by (node) (%s)", c.lookbackExpr("clustercost_node_hourly_cost", nil, clusterID)) + expr := fmt.Sprintf("max by (node) (%s)", c.lookbackExpr("clustercost_node_cpu_allocatable_milli", nil, clusterID)) samples, err := c.query(ctx, expr) if err != nil { return nil diff --git a/internal/vm/ingestor.go b/internal/vm/ingestor.go index 5ba8fee..d2aab8f 100644 --- a/internal/vm/ingestor.go +++ b/internal/vm/ingestor.go @@ -325,7 +325,7 @@ func (i *Ingestor) appendMetricsReport(buf, labelBuf *bytes.Buffer, scratch []by } // map[namespace]*nsAgg nsMap := make(map[string]*nsAgg) - pricing := store.NewPricingCatalog(nil) + pricing := store.NewPricingCatalog() region := req.Region if region == "" { region = req.AvailabilityZone @@ -534,6 +534,14 @@ func (i *Ingestor) appendMetricsReport(buf, labelBuf *bytes.Buffer, scratch []by writeFloatSample(buf, scratch, "clustercost_node_memory_usage_percent", nodeLabelsBlob, memPct, tsMillis) } + // Calculate Node Hourly Cost + // We use 
Capacity because you pay for the whole node, not just allocatable. + nodeCpuCores := float64(node.CapacityCpuMillicores) / 1000.0 + nodeMemGB := float64(node.CapacityMemoryBytes) / (1024 * 1024 * 1024) + nodeHourlyCost := (nodeCpuCores * cpuPrice) + (nodeMemGB * memPrice) + + writeFloatSample(buf, scratch, "clustercost_node_hourly_cost", nodeLabelsBlob, nodeHourlyCost, tsMillis) + // Node Network Metrics (Host Traffic) if node.Network != nil { nodeTx := safeInt64(node.Network.BytesSent) diff --git a/web/package-lock.json b/web/package-lock.json index 227b0d7..39dceb8 100644 --- a/web/package-lock.json +++ b/web/package-lock.json @@ -14,6 +14,7 @@ "@radix-ui/react-select": "^2.2.6", "@radix-ui/react-slot": "^1.2.4", "@radix-ui/react-tabs": "^1.1.13", + "@radix-ui/react-tooltip": "^1.2.8", "class-variance-authority": "^0.7.1", "clsx": "^2.1.1", "lucide-react": "^0.298.0", @@ -35,6 +36,7 @@ "@types/react-dom": "^18.2.7", "@vitejs/plugin-react": "^4.2.1", "autoprefixer": "^10.4.16", + "baseline-browser-mapping": "^2.9.15", "jsdom": "^24.0.0", "postcss": "^8.4.31", "tailwindcss": "^3.4.14", @@ -1709,6 +1711,58 @@ } } }, + "node_modules/@radix-ui/react-tooltip": { + "version": "1.2.8", + "resolved": "https://registry.npmjs.org/@radix-ui/react-tooltip/-/react-tooltip-1.2.8.tgz", + "integrity": "sha512-tY7sVt1yL9ozIxvmbtN5qtmH2krXcBCfjEiCgKGLqunJHvgvZG2Pcl2oQ3kbcZARb1BGEHdkLzcYGO8ynVlieg==", + "license": "MIT", + "dependencies": { + "@radix-ui/primitive": "1.1.3", + "@radix-ui/react-compose-refs": "1.1.2", + "@radix-ui/react-context": "1.1.2", + "@radix-ui/react-dismissable-layer": "1.1.11", + "@radix-ui/react-id": "1.1.1", + "@radix-ui/react-popper": "1.2.8", + "@radix-ui/react-portal": "1.1.9", + "@radix-ui/react-presence": "1.1.5", + "@radix-ui/react-primitive": "2.1.3", + "@radix-ui/react-slot": "1.2.3", + "@radix-ui/react-use-controllable-state": "1.2.2", + "@radix-ui/react-visually-hidden": "1.2.3" + }, + "peerDependencies": { + "@types/react": "*", + 
"@types/react-dom": "*", + "react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc", + "react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc" + }, + "peerDependenciesMeta": { + "@types/react": { + "optional": true + }, + "@types/react-dom": { + "optional": true + } + } + }, + "node_modules/@radix-ui/react-tooltip/node_modules/@radix-ui/react-slot": { + "version": "1.2.3", + "resolved": "https://registry.npmjs.org/@radix-ui/react-slot/-/react-slot-1.2.3.tgz", + "integrity": "sha512-aeNmHnBxbi2St0au6VBVC7JXFlhLlOnvIIlePNniyUNAClzmtAUEY8/pBiK3iHjufOlwA+c20/8jngo7xcrg8A==", + "license": "MIT", + "dependencies": { + "@radix-ui/react-compose-refs": "1.1.2" + }, + "peerDependencies": { + "@types/react": "*", + "react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc" + }, + "peerDependenciesMeta": { + "@types/react": { + "optional": true + } + } + }, "node_modules/@radix-ui/react-use-callback-ref": { "version": "1.1.1", "resolved": "https://registry.npmjs.org/@radix-ui/react-use-callback-ref/-/react-use-callback-ref-1.1.1.tgz", @@ -3047,9 +3101,9 @@ "license": "MIT" }, "node_modules/baseline-browser-mapping": { - "version": "2.8.28", - "resolved": "https://registry.npmjs.org/baseline-browser-mapping/-/baseline-browser-mapping-2.8.28.tgz", - "integrity": "sha512-gYjt7OIqdM0PcttNYP2aVrr2G0bMALkBaoehD4BuRGjAOtipg0b6wHg1yNL+s5zSnLZZrGHOw4IrND8CD+3oIQ==", + "version": "2.9.15", + "resolved": "https://registry.npmjs.org/baseline-browser-mapping/-/baseline-browser-mapping-2.9.15.tgz", + "integrity": "sha512-kX8h7K2srmDyYnXRIppo4AH/wYgzWVCs+eKr3RusRSQ5PvRYoEFmR/I0PbdTjKFAoKqp5+kbxnNTFO9jOfSVJg==", "dev": true, "license": "Apache-2.0", "bin": { diff --git a/web/package.json b/web/package.json index 5daf043..9ee6cae 100644 --- a/web/package.json +++ b/web/package.json @@ -17,13 +17,14 @@ "@radix-ui/react-select": "^2.2.6", "@radix-ui/react-slot": "^1.2.4", "@radix-ui/react-tabs": "^1.1.13", + "@radix-ui/react-tooltip": "^1.2.8", "class-variance-authority": "^0.7.1", "clsx": 
"^2.1.1", "lucide-react": "^0.298.0", "react": "^18.2.0", "react-dom": "^18.2.0", - "reactflow": "^11.11.4", "react-router-dom": "^6.20.0", + "reactflow": "^11.11.4", "recharts": "^2.8.0", "shadcn-ui": "^0.9.5", "tailwind-merge": "^1.14.0", @@ -38,6 +39,7 @@ "@types/react-dom": "^18.2.7", "@vitejs/plugin-react": "^4.2.1", "autoprefixer": "^10.4.16", + "baseline-browser-mapping": "^2.9.15", "jsdom": "^24.0.0", "postcss": "^8.4.31", "tailwindcss": "^3.4.14", diff --git a/web/src/components/common/MetricCard.tsx b/web/src/components/common/MetricCard.tsx new file mode 100644 index 0000000..83c7e7c --- /dev/null +++ b/web/src/components/common/MetricCard.tsx @@ -0,0 +1,39 @@ +import { Card, CardContent, CardHeader, CardTitle } from "../ui/card"; +import { cn } from "../../lib/utils"; + +interface MetricCardProps { + title: string; + value: string; + subtext?: string; + trend?: string; + trendUp?: boolean; + className?: string; + valueClassName?: string; +} + +export function MetricCard({ title, value, subtext, trend, trendUp, className, valueClassName }: MetricCardProps) { + return ( + + + + {title} + + + +
+ {value} +
+ {(subtext || trend) && ( +
+ {trend && ( + + {trend} + + )} + {subtext} +
+ )} +
+
+ ); +} diff --git a/web/src/components/nodes/EfficiencyBar.tsx b/web/src/components/nodes/EfficiencyBar.tsx new file mode 100644 index 0000000..991aa08 --- /dev/null +++ b/web/src/components/nodes/EfficiencyBar.tsx @@ -0,0 +1,183 @@ +import { Badge } from "@/components/ui/badge"; +import { formatCurrency } from "../../lib/utils"; +import { Tooltip, TooltipContent, TooltipProvider, TooltipTrigger } from "@/components/ui/tooltip"; +import { AlertTriangle, CheckCircle2, TrendingDown } from "lucide-react"; + +interface EfficiencyBarProps { + usagePercent: number; + requestPercent: number; + costPerMonth: number; + usageAbsolute: number; + totalAbsolute: number; + unit: string; +} + +export function EfficiencyBar({ + usagePercent, + requestPercent, + costPerMonth, + usageAbsolute, + totalAbsolute, + unit +}: EfficiencyBarProps) { + // FinOps Logic: + // "Comfort Zone" Strategy: + // - Container = Total Node Capacity + // - Reserved = Light Rail (bg-white/15) + // - Threshold = White Line (Always visible on top) + // - Logic: + // - < 90% of Reserved: Gap (Cyan) + // - 90% - 110% of Reserved: Optimized (Green Badge, Cyan Bar) + // - > 110% of Reserved: Risk (Orange Bar, Orange Text) + + const ratio = requestPercent > 0 ? usagePercent / requestPercent : 0; + + // Logic Refinement (Hybrid Absolute/Relative): + // 1. High Risk (Orange): STRICTLY > 110% of reservation (Relative > 1.1). + // 2. Optimized (Green): Not High Risk AND Absolute Difference <= 10% (suppress "Gap: 3%" noise). + // 3. Gap (Cyan): Everything else (Absolute Difference > 10%). + + const isHighRisk = ratio > 1.1; + const diff = Math.abs(usagePercent - requestPercent); + const isOptimized = !isHighRisk && diff <= 10; + + const wastePercent = Math.max(0, requestPercent - usagePercent); + const overflowPercent = Math.max(0, usagePercent - requestPercent); + const wastedCost = costPerMonth * (wastePercent / 100); + + return ( +
+ {/* Micro-Text Label Row */} +
+ + {usageAbsolute.toFixed(1)} + / {totalAbsolute.toFixed(1)} {unit} + + + {/* Status Indicator */} + {isOptimized ? ( + + Optimized + + ) : isHighRisk ? ( + + Risk: +{overflowPercent.toFixed(0)}% + + ) : wastePercent > 0 ? ( + + Gap: {wastePercent.toFixed(0)}% + + ) : null} +
+ + {/* Bar Container */} + + + + {/* Layer 0: Total Capacity Container */} +
+ + {/* Layer 1: Reserved (The Contract Rail) */} + {/* Lighter grey to contrast with dark background */} +
+ + {/* Layer 2: Actual Usage (The Active Liquid) */} + {/* Sits ON TOP of Reserved. */} +
+ + {/* Layer 3: The Contract Line (Threshold) */} + {/* ALWAYS visible, white, sits on top of everything (z-20) */} + {requestPercent > 0 && ( +
+ )} +
+ + + {/* Professional Context Card Tooltip */} + +
+ {/* Header Section */} +
+ {isHighRisk ? ( + + ) : isOptimized ? ( + + ) : ( + + )} +
+

+ {isHighRisk ? "Stability Risk: Bursting" : isOptimized ? "Perfectly Rightsized" : "Efficiency Gap Detected"} +

+

+ {isHighRisk + ? "Operating above guaranteed limits." + : isOptimized + ? "Balanced resource utilization." + : "Resources reserved but unused."} +

+
+
+ + {/* Technical Evidence Section */} +
+
+ Usage: + + {usagePercent.toFixed(1)}% ({usageAbsolute.toFixed(2)} {unit}) + +
+
+ Reserved: + {requestPercent.toFixed(1)}% ({((totalAbsolute * requestPercent) / 100 || 0).toFixed(2)} {unit}) +
+
+ + {/* Educational Context Section */} +
+ {isHighRisk && ( + + This node is running at {ratio.toFixed(2)}x its reservation. + It relies on unguaranteed burst capacity and is a top candidate for OOMKill if cluster pressure increases. + + )} + {isOptimized && ( + + Usage is within the ±10% ideal stability window. + This configuration maximizes ROI without risking instability. No action required. + + )} + {!isHighRisk && !isOptimized && ( + + You are paying for {(requestPercent - usagePercent).toFixed(0)}% more capacity than needed. + This "air gap" provides no technical value and is pure financial waste. + + )} +
+ + {/* Financial Impact Footer */} + {!isOptimized && wastedCost > 0.01 && ( +
+ Monthly Waste + {formatCurrency(wastedCost)} +
+ )} +
+
+ + +
+ ); +} diff --git a/web/src/components/nodes/NodeDetailSheet.tsx b/web/src/components/nodes/NodeDetailSheet.tsx index 45a5e79..ceae404 100644 --- a/web/src/components/nodes/NodeDetailSheet.tsx +++ b/web/src/components/nodes/NodeDetailSheet.tsx @@ -1,8 +1,14 @@ -import { Sheet, SheetContent, SheetDescription, SheetHeader, SheetTitle } from "../ui/sheet"; -import { Progress } from "../ui/progress"; -import { Badge } from "../ui/badge"; +import { Sheet, SheetContent, SheetHeader, SheetTitle } from "@/components/ui/sheet"; +import { Progress } from "@/components/ui/progress"; +import { Badge } from "@/components/ui/badge"; +import { Button } from "@/components/ui/button"; +import { Tabs, TabsList, TabsTrigger } from "@/components/ui/tabs"; +import { Table, TableBody, TableCell, TableHead, TableHeader, TableRow } from "@/components/ui/table"; import { formatCurrency } from "../../lib/utils"; -import type { NodeCost } from "../../lib/api"; +import { fetchNodeStats, fetchNodePods, type NodeCost, type NodeStats, type PodMetrics } from "../../lib/api"; +import { useState, useEffect, useMemo } from "react"; +import { CopyIcon, ShieldAlertIcon, ScissorsIcon, CheckCircle2Icon } from "lucide-react"; +import { Tooltip, TooltipContent, TooltipProvider, TooltipTrigger } from "@/components/ui/tooltip"; interface NodeDetailSheetProps { node: (NodeCost & { monthlyCost: number }) | null; @@ -10,75 +16,277 @@ interface NodeDetailSheetProps { onOpenChange: (open: boolean) => void; } -const statusStyles: Record = { - Ready: "border-emerald-500/40 bg-emerald-500/10 text-emerald-200", - NotReady: "border-destructive/40 bg-destructive/10 text-destructive", - Unknown: "border-muted bg-muted/40 text-muted-foreground" -}; - const NodeDetailSheet = ({ node, open, onOpenChange }: NodeDetailSheetProps) => { + const [window, setWindow] = useState("24h"); + const [stats, setStats] = useState(null); + const [pods, setPods] = useState([]); + const [loading, setLoading] = useState(false); + + 
useEffect(() => { + if (open && node) { + setLoading(true); + // Parallel Fetch: Stats + Pods + Promise.all([ + fetchNodeStats(node.nodeName, window).catch(e => { console.error(e); return null; }), + fetchNodePods(node.nodeName, window).catch(e => { console.error(e); return []; }) + ]).then(([statsData, podsData]) => { + setStats(statsData); + setPods(podsData || []); + setLoading(false); + }); + } else { + setStats(null); + setPods([]); + } + }, [open, node, window]); + + const copyToClipboard = (text: string) => { + navigator.clipboard.writeText(text); + }; + + const getRec = (p95: number) => Math.ceil(p95 * 1.15); // P95 + 15% + + const generatePatch = (pod: PodMetrics, type: "cpu" | "memory" | "both", reason: "fix" | "shield") => { + const rawCpu = getRec(pod.cpuP95Milli); + // Ensure we don't go below 10m for CPU to be safe + const targetCpu = Math.max(10, rawCpu); + const targetCpuStr = `${targetCpu}m`; + + const rawMem = getRec(pod.memoryP95Bytes); + // Convert to Mi + const targetMemMi = Math.ceil(rawMem / (1024 * 1024)); + const targetMemStr = `${targetMemMi}Mi`; + + const resources: any = { requests: {} }; + if (type === "cpu" || type === "both") resources.requests.cpu = targetCpuStr; + if (type === "memory" || type === "both") resources.requests.memory = targetMemStr; + + // Use container name approximation or index 0 for now as we don't have container name in PodMetrics yet. + // Using simple approach: Assume first container needs fix found in spec. + // Actually we need the container name. Backend aggregates by pod... + // For now we will use pod name prefix as best effort, or just "name". + // A better approach is usually `kubectl set resources` or patch assuming single container or main container. + // Let's use the first container approach in the patch for now: `spec: { containers: [ { name: "?", ... } ] }`. + // Wait, we don't know the container name. + // To make this robust without container name, we can try to patch by index `containers[0]`. 
+ // JSON Patch: `[{"op": "replace", "path": "/spec/containers/0/resources/requests/cpu", "value": "..."}]` + // But let's stick to the user's requested text format: `kubectl patch ...` + // We will use `deployment` logic usually, but here we patch the POD? Pods are ephemeral. + // Ideally we patch the deployment. + // User asked for "Fix YAML". + const containerName = pod.podName.split("-")[0]; // Heuristic + + return `kubectl patch pod ${pod.podName} -n ${pod.namespace} --patch '{"spec":{"containers":[{"name":"${containerName}", "resources":{"requests":{"cpu":"${targetCpuStr}","memory":"${targetMemStr}"}}}]}}'`; + }; + if (!node) return null; - const statusBadge = ( - - {node.status} - - ); + // SORTING LOGIC: Financial Impact (Savings First) + // Heuristic: ~$32/vCPU/mo, ~$4/GB/mo + const COST_PER_VCPU = 32; + const COST_PER_GB = 4; - const usageSummary = (() => { - if (node.cpuUsagePercent > 70 || node.memoryUsagePercent > 70) return "Node is heavily used."; - if (node.cpuUsagePercent < 30 && node.memoryUsagePercent < 30) return "This node is mostly idle."; - return "Usage looks normal."; - })(); + const getSavings = (pod: PodMetrics) => { + const cpuRec = getRec(pod.cpuP95Milli); + const cpuWasteCores = (pod.cpuRequestMilli - cpuRec) / 1000; + const cpuSavings = cpuWasteCores * COST_PER_VCPU; + + const memRec = getRec(pod.memoryP95Bytes); + const memWasteGB = (pod.memoryRequestBytes - memRec) / (1024 * 1024 * 1024); + const memSavings = memWasteGB * COST_PER_GB; + + return cpuSavings + memSavings; + }; + + const sortedPods = [...pods].sort((a, b) => { + return getSavings(b) - getSavings(a); // Descending (Biggest Savings First) + }); return ( - - + + {/* HEADER */} + - {node.nodeName} - - {(node.instanceType ?? "Unknown type")} · {node.podCount} pods - + {node.nodeName} +
+ {node.instanceType} + {node.podCount} Pods +
- - {statusBadge} - {node.isUnderPressure && Under pressure} - -
- -
-
-

Monthly cost

-

{formatCurrency(node.monthlyCost)}

-

{formatCurrency(node.hourlyCost, { maximumFractionDigits: 2 })}/hr

-
- -
-
-
- CPU usage - {node.cpuUsagePercent.toFixed(0)}% -
- +
+
+

Monthly Cost

+

{formatCurrency(node.monthlyCost)}

-
-
- Memory usage - {node.memoryUsagePercent.toFixed(0)}% -
- +
stats.totalMonthlyCost * 0.1 ? "border-emerald-500/20 bg-emerald-500/5 text-emerald-500" : "text-muted-foreground"}`}> +

Potential Savings

+

+ {stats ? formatCurrency(stats.totalMonthlyCost - stats.realUsageMonthlyCost) : "..."} +

-
+
+
+ + + +
+
+ {/* P95 METRICS */} + {stats && ( +
+

Node P95 Analysis ({window})

+
+
+
+ CPU P95 Load + {stats.p95CpuUsagePercent.toFixed(1)}% +
+ +
+
+
+ Memory P95 Load + {stats.p95MemoryUsagePercent.toFixed(1)}% +
+ +
+
+
+ )} + + {/* FULL AUDIT TABLE */} +
+

Full Pod Audit (P95 + 15% Safety Margin)

+
+ + + + Pod (QoS) + CPU (Req → P95) + RAM (Req → P95) + Action + + + + {loading ? ( + + + Analyzing pod logs & metrics... + + + ) : pods.length === 0 ? ( + + + No pods found or agent not reporting deep metrics yet. + + + ) : ( + sortedPods.map(pod => { + // CPU Analysis + const cpuRec = getRec(pod.cpuP95Milli); + const cpuDiff = cpuRec - pod.cpuRequestMilli; + const cpuRisk = pod.cpuP95Milli > pod.cpuRequestMilli; + const cpuOptimized = !cpuRisk && Math.abs(cpuDiff) <= (0.1 * pod.cpuRequestMilli); -
-

Status

-

{usageSummary}

-
+ // MEM Analysis + const memRec = getRec(pod.memoryP95Bytes); + const memDiff = memRec - pod.memoryRequestBytes; + const memRisk = pod.memoryP95Bytes > pod.memoryRequestBytes; + const memOptimized = !memRisk && Math.abs(memDiff) <= (0.1 * pod.memoryRequestBytes); + + // Global State + const isRisk = cpuRisk || memRisk; + const isOptimized = cpuOptimized && memOptimized; + + const cpuReqStr = `${pod.cpuRequestMilli}m`; + const cpuP95Str = `${pod.cpuP95Milli.toFixed(0)}m`; + const memReqStr = `${(pod.memoryRequestBytes / (1024 * 1024)).toFixed(0)}Mi`; + const memP95Str = `${(pod.memoryP95Bytes / (1024 * 1024)).toFixed(0)}Mi`; + + return ( + + +
+ {pod.podName} + {pod.namespace} + {pod.qosClass} +
+
+ + +
+ + {cpuReqStr} → {cpuP95Str} + + {cpuRisk && RISK} + {!cpuRisk && !cpuOptimized && Waste} +
+
+ + +
+ + {memReqStr} → {memP95Str} + + {memRisk && RISK} + {!memRisk && !memOptimized && Waste} +
+
+ + + {isOptimized ? ( + + Optimized + + ) : ( + + + + + + + Copy {isRisk ? "Upsize" : "Downsize"} Patch (CPU & RAM) + + + + )} + +
+ ); + }) + )} +
+
+
+
+
); }; +const SectionTabs = ({ window, setWindow }: { window: string; setWindow: (w: string) => void }) => ( +
+ + + 24h + 7d + 30d + + +
+); + export default NodeDetailSheet; diff --git a/web/src/components/ui/progress.tsx b/web/src/components/ui/progress.tsx index 3fd47ad..54d2611 100644 --- a/web/src/components/ui/progress.tsx +++ b/web/src/components/ui/progress.tsx @@ -5,8 +5,8 @@ import { cn } from "@/lib/utils" const Progress = React.forwardRef< React.ElementRef, - React.ComponentPropsWithoutRef ->(({ className, value, ...props }, ref) => ( + React.ComponentPropsWithoutRef & { indicatorClassName?: string } +>(({ className, indicatorClassName, value, ...props }, ref) => ( diff --git a/web/src/components/ui/tooltip.tsx b/web/src/components/ui/tooltip.tsx new file mode 100644 index 0000000..b0542cb --- /dev/null +++ b/web/src/components/ui/tooltip.tsx @@ -0,0 +1,28 @@ +import * as React from "react" +import * as TooltipPrimitive from "@radix-ui/react-tooltip" + +import { cn } from "../../lib/utils" + +const TooltipProvider = TooltipPrimitive.Provider + +const Tooltip = TooltipPrimitive.Root + +const TooltipTrigger = TooltipPrimitive.Trigger + +const TooltipContent = React.forwardRef< + React.ElementRef, + React.ComponentPropsWithoutRef +>(({ className, sideOffset = 4, ...props }, ref) => ( + +)) +TooltipContent.displayName = TooltipPrimitive.Content.displayName + +export { Tooltip, TooltipTrigger, TooltipContent, TooltipProvider } diff --git a/web/src/lib/api.ts b/web/src/lib/api.ts index 3805d12..e0f4596 100644 --- a/web/src/lib/api.ts +++ b/web/src/lib/api.ts @@ -1,6 +1,9 @@ import type { Environment } from "./utils"; -const API_PREFIX = "/api"; +// Allow overriding the API base via env var (useful for production builds or direct CORS) +// If VITE_API_URL is set (e.g. "https://api.example.com"), we use that. +// Otherwise default to local proxy "/api". 
+const API_PREFIX = import.meta.env.VITE_API_URL || "/api"; const normalizeEnvironment = (value?: string): Environment => { switch ((value || "").toLowerCase()) { @@ -110,8 +113,15 @@ export interface NamespacesResponse { type NodeCostApi = { nodeName: string; hourlyCost: number; + windowCost: number; + activeHours: number; + activeRatio: number; cpuUsagePercent: number; memoryUsagePercent: number; + cpuRequestedMilli?: number; + cpuLimitMilli?: number; + memoryRequestedBytes?: number; + memoryLimitBytes?: number; cpuAllocatableMilli?: number; memoryAllocatableBytes?: number; podCount: number; @@ -331,8 +341,9 @@ export const fetchNamespaces = async (): Promise => { }; }; -export const fetchNodes = async (): Promise => { - const resp = await request("/cost/nodes"); +export const fetchNodes = async (window?: string): Promise => { + const query = window ? `?window=${window}` : ""; + const resp = await request(`/cost/nodes${query}`); return resp.items.map((node) => ({ ...node, labels: node.labels ?? 
{}, @@ -341,6 +352,33 @@ export const fetchNodes = async (): Promise => { })); }; +export interface NodeStats { + nodeName: string; + p95CpuUsagePercent: number; + p95MemoryUsagePercent: number; + totalMonthlyCost: number; + realUsageMonthlyCost: number; + window: string; +} + +export const fetchNodeStats = async (name: string, window: string): Promise => { + return request(`/cost/nodes/${name}/stats?window=${window}`); +}; + +export interface PodMetrics { + podName: string; + namespace: string; + qosClass: string; + cpuRequestMilli: number; + cpuP95Milli: number; + memoryRequestBytes: number; + memoryP95Bytes: number; +} + +export const fetchNodePods = async (name: string, window: string): Promise => { + return request(`/cost/nodes/${name}/pods?window=${window}`); +}; + export const fetchResources = async (): Promise => { const resp = await request("/cost/resources"); return { diff --git a/web/src/pages/namespaces/NamespacesPage.test.tsx b/web/src/pages/namespaces/NamespacesPage.test.tsx index 2f115ed..d8783e4 100644 --- a/web/src/pages/namespaces/NamespacesPage.test.tsx +++ b/web/src/pages/namespaces/NamespacesPage.test.tsx @@ -11,6 +11,7 @@ vi.mock("../../hooks/useApiData", () => ({ })); // Mock ResizeObserver +// @ts-ignore global.ResizeObserver = class ResizeObserver { observe() { } unobserve() { } diff --git a/web/src/pages/nodes/NodesPage.tsx b/web/src/pages/nodes/NodesPage.tsx index e78218d..0e3c9f5 100644 --- a/web/src/pages/nodes/NodesPage.tsx +++ b/web/src/pages/nodes/NodesPage.tsx @@ -1,365 +1,362 @@ -import { useMemo, useState, type ChangeEvent } from "react"; +import { useMemo, useState, useCallback, type ChangeEvent } from "react"; import { fetchNodes, type NodeCost } from "../../lib/api"; -import { formatCurrency, formatPercentage, relativeTimeFromIso, toMonthlyCost } from "../../lib/utils"; +import { formatCurrency, formatPercentage, relativeTimeFromIso, toMonthlyCost, milliToCores } from "../../lib/utils"; import { useApiData } from 
"../../hooks/useApiData"; import { Card, CardContent, CardHeader, CardTitle } from "@/components/ui/card"; import { Button } from "@/components/ui/button"; -import { Progress } from "@/components/ui/progress"; import { Badge } from "@/components/ui/badge"; import { Input } from "@/components/ui/input"; import { Table, TableBody, TableCell, TableHead, TableHeader, TableRow } from "@/components/ui/table"; +import { Select, SelectContent, SelectItem, SelectTrigger, SelectValue } from "@/components/ui/select"; import { Skeleton } from "@/components/ui/skeleton"; import NodeDetailSheet from "@/components/nodes/NodeDetailSheet"; +import { MetricCard } from "@/components/common/MetricCard"; +import { EfficiencyBar } from "@/components/nodes/EfficiencyBar"; +import { AlertTriangleIcon, CheckCircle2Icon, SearchIcon, ArrowDownIcon } from "lucide-react"; +import { Tooltip, TooltipContent, TooltipProvider, TooltipTrigger } from "@/components/ui/tooltip"; -const statusStyles: Record = { - Ready: "border-emerald-500/40 bg-emerald-500/10 text-emerald-200", - NotReady: "border-destructive/40 bg-destructive/10 text-destructive", - Unknown: "border-muted bg-muted/40 text-muted-foreground" -}; - -type SortKey = "cost" | "cpu" | "memory"; +type SortKey = "cost" | "waste" | "efficiency"; const NodesPage = () => { - const { data, loading, error, refresh } = useApiData(fetchNodes); + const [timeWindow, setTimeWindow] = useState("24h"); + + // Fetch with window + const fetchNodesWithWindow = useCallback(() => fetchNodes(timeWindow), [timeWindow]); + const { data, loading, error, refresh } = useApiData(fetchNodesWithWindow); + const nodes = data ?? 
[]; const [search, setSearch] = useState(""); - const [sortKey, setSortKey] = useState("cost"); + const [sortKey, setSortKey] = useState("cost"); // Default to cost for financial view const [sortDirection, setSortDirection] = useState<"asc" | "desc">("desc"); const [selectedNode, setSelectedNode] = useState<(NodeCost & { monthlyCost: number }) | null>(null); + // Fallback Cost Logic + const getEstimatedCost = (instanceType: string | undefined): number => { + if (!instanceType) return 73; + if (instanceType.includes("nano")) return 4; + if (instanceType.includes("micro")) return 8; + if (instanceType.includes("small")) return 16; + if (instanceType.includes("medium")) return 32; + if (instanceType.includes("large") && !instanceType.includes("xlarge")) return 64; + if (instanceType.includes("xlarge")) return 128; + if (instanceType.includes("2xlarge")) return 256; + return 73; + }; + const derivedNodes = useMemo(() => { - return nodes.map((node) => ({ - ...node, - cpuUsagePercent: node.cpuUsagePercent ?? 0, - memoryUsagePercent: node.memoryUsagePercent ?? 0, - hourlyCost: node.hourlyCost ?? 0, - monthlyCost: toMonthlyCost(node.hourlyCost ?? 0) - })); + return nodes.map((node) => { + let hourlyCost = node.hourlyCost ?? 0; + let isEstimate = false; + + if (hourlyCost === 0) { + hourlyCost = getEstimatedCost(node.instanceType) / 730; + isEstimate = true; + } + + // Use backend provided WindowCost if activeHours logic applied, otherwise calculate projection + const windowCost = node.windowCost || (hourlyCost * 24); // fallback + const monthlyCost = hourlyCost * 730; // Still useful for reference + + const cpuAllocatable = node.cpuAllocatableMilli ?? 0; + const cpuRequestPercent = cpuAllocatable > 0 ? ((node.cpuRequestedMilli ?? 0) / cpuAllocatable) * 100 : 0; + const cpuUsage = node.cpuUsagePercent ?? 0; + + // Calculate Memory Stats + const memAllocatable = node.memoryAllocatableBytes ?? 0; + const memRequestPercent = memAllocatable > 0 ? ((node.memoryRequestedBytes ?? 
0) / memAllocatable) * 100 : 0; + const memUsage = node.memoryUsagePercent ?? 0; + + // FinOps Waste Calculation + const wastePercent = Math.max(0, cpuRequestPercent - cpuUsage); + // Waste Amount based on Window Cost + const wasteAmount = windowCost * (wastePercent / 100); + + const isEfficient = wastePercent < 15; + const isOverProvisioned = wastePercent > 30; + + return { + ...node, + cpuUsagePercent: cpuUsage, + cpuRequestPercent, + memoryUsagePercent: memUsage, + memRequestPercent, + monthlyCost, + windowCost, + isEstimate, + wastePercent, + wasteAmount, + isEfficient, + isOverProvisioned, + shortName: node.nodeName // no truncation + }; + }); }, [nodes]); const summary = useMemo(() => { - const totalMonthly = derivedNodes.reduce((sum, node) => sum + node.monthlyCost, 0); - const avgCpu = derivedNodes.length - ? derivedNodes.reduce((sum, node) => sum + node.cpuUsagePercent, 0) / derivedNodes.length - : 0; - const avgMem = derivedNodes.length - ? derivedNodes.reduce((sum, node) => sum + node.memoryUsagePercent, 0) / derivedNodes.length - : 0; - const issueCount = derivedNodes.filter((node) => node.status !== "Ready" || node.isUnderPressure).length; - return { totalMonthly, avgCpu, avgMem, ready: derivedNodes.length - issueCount, issues: issueCount }; - }, [derivedNodes]); + const totalWindowCost = derivedNodes.reduce((sum, n) => sum + n.windowCost, 0); + const totalWaste = derivedNodes.reduce((sum, n) => sum + n.wasteAmount, 0); + const potentialSavings = totalWaste * 0.6; - const filteredNodes = useMemo(() => { - const term = search.trim().toLowerCase(); - if (!term) return derivedNodes; - return derivedNodes.filter((node) => node.nodeName.toLowerCase().includes(term)); - }, [derivedNodes, search]); + return { totalWindowCost, totalWaste, potentialSavings }; + }, [derivedNodes]); const sortedNodes = useMemo(() => { - const rows = [...filteredNodes]; - const valueFor = (node: (typeof derivedNodes)[number]) => { - if (sortKey === "cpu") return 
node.cpuUsagePercent; - if (sortKey === "memory") return node.memoryUsagePercent; - return node.monthlyCost; - }; + const rows = [...derivedNodes]; rows.sort((a, b) => { - const diff = valueFor(a) - valueFor(b); - return sortDirection === "asc" ? diff : -diff; + const valA = sortKey === "waste" ? a.wasteAmount : (sortKey === "cost" ? a.windowCost : a.wastePercent); + const valB = sortKey === "waste" ? b.wasteAmount : (sortKey === "cost" ? b.windowCost : b.wastePercent); + return sortDirection === "asc" ? valA - valB : valB - valA; }); return rows; - }, [filteredNodes, sortKey, sortDirection]); + }, [derivedNodes, sortKey, sortDirection]); const handleSort = (key: SortKey) => { - if (key === sortKey) { - setSortDirection((dir) => (dir === "desc" ? "asc" : "desc")); - } else { - setSortKey(key); - setSortDirection("desc"); - } + if (key === sortKey) setSortDirection(d => d === "desc" ? "asc" : "desc"); + else { setSortKey(key); setSortDirection("desc"); } }; - const optimizationCandidates = useMemo(() => { - if (!derivedNodes.length) return []; - const sortedCosts = [...derivedNodes].sort((a, b) => b.monthlyCost - a.monthlyCost); - const index = Math.max(0, Math.floor(sortedCosts.length * 0.3) - 1); - const costThreshold = sortedCosts[index]?.monthlyCost ?? 
0; - return derivedNodes - .filter( - (node) => - node.monthlyCost >= costThreshold && node.cpuUsagePercent < 35 && node.memoryUsagePercent < 35 - ) - .sort((a, b) => b.monthlyCost - a.monthlyCost) - .slice(0, 5); - }, [derivedNodes]); - - const alerts = useMemo(() => { - return derivedNodes - .map((node) => { - const reasons: string[] = []; - if (node.status !== "Ready") reasons.push(node.status); - if (node.cpuUsagePercent >= 85) reasons.push(`CPU ${node.cpuUsagePercent.toFixed(0)}%`); - if (node.memoryUsagePercent >= 85) reasons.push(`Memory ${node.memoryUsagePercent.toFixed(0)}%`); - if (node.isUnderPressure) reasons.push("Under pressure"); - return { node, reasons }; - }) - .filter((item) => item.reasons.length > 0) - .slice(0, 5); - }, [derivedNodes]); - - const lastUpdatedLabel = data?.[0]?.lastUpdated ? relativeTimeFromIso(data[0].lastUpdated) : "moments ago"; - - const renderSortLabel = (key: SortKey, label: string) => ( - - ); - - if (loading && !data) { - return ; - } + const getWindowLabel = (w: string) => { + switch (w) { + case "24h": return "Last 24 Hours"; + case "7d": return "Last 7 Days"; + case "30d": return "Last 30 Days"; + default: return w; + } + }; - if (error) { - return ( - - {error} - - ); - } - - if (!derivedNodes.length) { - return ( - - - No nodes - - -

We couldn’t find any nodes. Once data arrives it will show up here.

- -
-
- ); - } + if (loading && !data) return ; + if (error) return
Failed to load data: {error}
; return ( -
-
+
+ {/* Header Section */} +
-

Nodes

-

See how much each node costs and how full it is.

+

Cluster Financials

+

Real-time analysis based on actual uptime.

-
Last updated {lastUpdatedLabel}
-
+
+ + + +
+
-
- + {/* The "Truth" Cards */} +
+ - Total node cost + + Spend ({timeWindow}) + -

{formatCurrency(summary.totalMonthly)}

-

Monthly (hourly x 30 days)

-
-
- - - Avg CPU usage - - -

{formatPercentage(summary.avgCpu, { fractionDigits: 0 })}

- +
+ {formatCurrency(summary.totalWindowCost)} +
+

+ Actual cost based on {getWindowLabel(timeWindow)} uptime +

- + + +
+ {summary.totalWaste > 0 && ( + Action Required + )} +
- Avg memory usage + + Waste ({timeWindow}) + -

{formatPercentage(summary.avgMem, { fractionDigits: 0 })}

- +
0 ? "text-destructive" : "text-emerald-500"}`}> + {formatCurrency(summary.totalWaste)} +
+

+ Money burned on unused capacity +

- + + - Node health + Actionable Savings -

- {summary.ready} Ready · {summary.issues} With issues -

-

Issues = NotReady or under pressure

-
-
-
- -
- - -
- Nodes -

Sorted by monthly cost

-
-
- ) => setSearch(event.target.value)} - className="h-9 w-full" - /> -
-
- -
-
-
- - - - Node - {renderSortLabel("cost", "Monthly cost")} - {renderSortLabel("cpu", "CPU usage")} - {renderSortLabel("memory", "Memory usage")} - Status - - - - {sortedNodes.map((node) => ( - setSelectedNode(node)}> - -
- {node.nodeName} - {node.instanceType && ( - - {node.instanceType} - - )} -
-
- - {formatCurrency(node.monthlyCost)} - - -
- - {node.cpuUsagePercent.toFixed(0)}% -
-
- -
- - {node.memoryUsagePercent.toFixed(0)}% -
-
- - - {node.status} - - -
- ))} -
-
-
-
-
-
- {sortedNodes.map((node) => ( - - ))} +
+ {formatCurrency(summary.potentialSavings)}
+

+ Conservative achievable reduction +

+
+ + {/* FinOps Table - High Density */} + +
+
+ + {sortedNodes.length} Nodes (Active in {timeWindow}) + +
+
+ + setSearch(e.target.value)} + /> +
+
-
- - - Optimization ideas - - - {optimizationCandidates.length === 0 ? ( -

Looks good, no obvious wasted nodes right now.

- ) : ( - optimizationCandidates.map((node) => ( -
-
- {node.nodeName} - {formatCurrency(node.monthlyCost)} +
+ + + + Node Identity + handleSort("cost")}> + Cost ({timeWindow}) + + + CPU Efficiency + + ( Usage / Reserved) + + + + Memory Efficiency + + ( Usage / Reserved) + + + handleSort("waste")}>Action + + + + {sortedNodes.filter(n => n.nodeName.includes(search)).map((node) => ( + + + {/* Column 1: Identity */} + +
+ {node.instanceType || "Unknown"} + + + + + {node.nodeName} + + + +

{node.nodeName}

+
+
+
-

- {node.cpuUsagePercent.toFixed(0)}% CPU · {node.memoryUsagePercent.toFixed(0)}% Mem -

- - )) - )} - - - - - - Alerts - - - {alerts.length === 0 ? ( -

All nodes healthy.

- ) : ( - alerts.map(({ node, reasons }) => ( -
-
- {node.nodeName} - Check + + + {/* Column 2: Cost */} + +
+
+ + {formatCurrency(node.windowCost)} + + {node.isEstimate && ( + + + * + Estimated Cost + + + )} +
+
+ + ${Number(node.hourlyCost.toFixed(4))}/hr + + {node.activeHours > 0 && ( + + {node.activeHours.toFixed(1)}h active ({((node.activeRatio || 0) * 100).toFixed(0)}%) + + )} +
-

{reasons.join(" · ")}

-
- )) - )} - - + + + {/* Column 3: CPU Efficiency (Stacked) */} + + + + + {/* Column 4: RAM Efficiency (Stacked) */} + + + + + {/* Column 5: Action */} + + {node.isEfficient ? ( + + Optimized + + ) : ( +
+ + + Save ~{formatCurrency(node.wasteAmount)} + +
+ )} +
+ + ))} + +
-
+ { - if (!open) { - setSelectedNode(null); - } - }} + onOpenChange={(open) => { if (!open) setSelectedNode(null); }} />
); diff --git a/web/src/pages/resources/ResourcesPage.tsx b/web/src/pages/resources/ResourcesPage.tsx index 66d173a..b1ae086 100644 --- a/web/src/pages/resources/ResourcesPage.tsx +++ b/web/src/pages/resources/ResourcesPage.tsx @@ -13,6 +13,7 @@ import { import { Card, CardContent, CardHeader, CardTitle } from "@/components/ui/card"; import { Progress } from "@/components/ui/progress"; import { Skeleton } from "@/components/ui/skeleton"; +import { Button } from "@/components/ui/button"; import { Sheet, SheetContent, SheetDescription, SheetHeader, SheetTitle } from "@/components/ui/sheet"; import { Badge } from "@/components/ui/badge"; import { ResponsiveContainer, Bar, BarChart, Legend, Tooltip, XAxis, YAxis } from "recharts"; diff --git a/web/vite.config.ts b/web/vite.config.ts index 7178a0a..a1afe31 100644 --- a/web/vite.config.ts +++ b/web/vite.config.ts @@ -1,27 +1,43 @@ -import { defineConfig } from "vite"; +import { defineConfig, loadEnv } from "vite"; import react from "@vitejs/plugin-react"; import path from "path" -export default defineConfig({ - plugins: [react()], - resolve: { - alias: { - "@": path.resolve(__dirname, "./src"), +export default defineConfig(({ mode }) => { + // Load env file based on `mode` in the current working directory. + // Set the third parameter to '' to load all env regardless of the `VITE_` prefix. 
+ const env = loadEnv(mode, process.cwd(), ''); + + // Priority: Shell Env (process.env) > .env File (env) > Default + const apiTarget = process.env.VITE_API_TARGET || env.VITE_API_TARGET || "http://localhost:9090"; + + console.log(`[Vite] Proxying /api to: ${apiTarget}`); + + return { + plugins: [react()], + resolve: { + alias: { + "@": path.resolve(__dirname, "./src"), + }, + }, + server: { + port: 5173, + proxy: { + "/api": { + target: apiTarget, + changeOrigin: true, + // If you want to strip /api prefix from the request when proxying: + // rewrite: (path) => path.replace(/^\/api/, ''), + } + } + }, + build: { + outDir: "dist", + emptyOutDir: true }, - }, - server: { - port: 5173, - proxy: { - "/api": "http://localhost:9090" + test: { + globals: true, + environment: "jsdom", + setupFiles: "./src/test/setup.ts" } - }, - build: { - outDir: "dist", - emptyOutDir: true - }, - test: { - globals: true, - environment: "jsdom", - setupFiles: "./src/test/setup.ts" - } + }; });