Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions cmd/dashboard/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,9 @@ func main() {
// Initialize FinOps Engine
finopsEngine := finops.NewEngine(vmClient, st.PricingCatalog())

// Share Pricing Catalog with VM Client
vmClient.SetPricingCatalog(st.PricingCatalog())

auth.SetSecret(cfg.JWTSecret)

srv := &http.Server{
Expand Down
4 changes: 4 additions & 0 deletions docs/TECHDEBT.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,7 @@
## Future Considerations
- [ ] **Retention Policies**: Configure distinct retention periods for high-precision metrics (15s interval) vs. aggregated historical data.
- [ ] **Refactor Store Locking**: Evaluate moving from heavy `RWMutex` usage in `store.go` to a more concurrent pattern if contention increases with 100+ agents.
- [ ] **Dynamic Pricing & Savings Plans Support**:
- Problem: Agents no longer send costs; backend relies on static On-Demand rates.
- Solution: Implement a **Dynamic Pricing Engine** with DB overrides for Savings Plans, Reserved Instances, and Spot Pricing.
- Design: See `dynamic_pricing_design.md` artifact.
6 changes: 6 additions & 0 deletions internal/api/handlers_health_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,12 @@ func (f *fakeMetricsProvider) ClusterMetadata(context.Context) (store.ClusterMet
func (f *fakeMetricsProvider) NetworkTopology(context.Context, store.NetworkTopologyOptions) ([]store.NetworkEdge, error) {
return nil, vm.ErrNoData
}
func (f *fakeMetricsProvider) GetNodeStats(context.Context, string, string, time.Duration) (store.NodeStats, error) {
return store.NodeStats{}, vm.ErrNoData
}
func (f *fakeMetricsProvider) GetNodePods(context.Context, string, string, time.Duration) ([]store.PodMetrics, error) {
return nil, vm.ErrNoData
}

func newTestHandler(meta store.ClusterMetadata, status store.AgentStatusPayload) *Handler {
return &Handler{vm: &fakeMetricsProvider{meta: meta, status: status}}
Expand Down
48 changes: 48 additions & 0 deletions internal/api/handlers_nodes.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package api

import (
"net/http"
"time"

"github.com/go-chi/chi/v5"

Expand All @@ -22,6 +23,7 @@ func (h *Handler) Nodes(w http.ResponseWriter, r *http.Request) {
Search: q.Get("search"),
Limit: parseLimit(q.Get("limit"), defaultNodeLimit, maxNodeLimit),
Offset: parseOffset(q.Get("offset")),
Window: q.Get("window"), // "24h", "7d", "30d"
}

resp, err := h.vm.NodeList(ctx, filter)
Expand Down Expand Up @@ -58,3 +60,49 @@ func (h *Handler) NodeDetail(w http.ResponseWriter, r *http.Request) {

writeJSON(w, http.StatusOK, node)
}

// NodeStats returns historical usage and cost stats for a node.
func (h *Handler) NodeStats(w http.ResponseWriter, r *http.Request) {
name := chi.URLParam(r, "name")
if name == "" {
writeError(w, http.StatusBadRequest, "node name is required")
return
}
windowStr := r.URL.Query().Get("window")
window, _ := time.ParseDuration(windowStr)
if window <= 0 {
window = 24 * time.Hour
}

ctx := vm.WithClusterID(r.Context(), clusterIDFromRequest(r))
stats, err := h.vm.GetNodeStats(ctx, "", name, window)
if err != nil {
writeError(w, http.StatusInternalServerError, err.Error())
return
}

writeJSON(w, http.StatusOK, stats)
}

// NodePods returns the list of pods for a node with P95 metrics (Pod Audit).
func (h *Handler) NodePods(w http.ResponseWriter, r *http.Request) {
name := chi.URLParam(r, "name")
if name == "" {
writeError(w, http.StatusBadRequest, "node name is required")
return
}
windowStr := r.URL.Query().Get("window")
window, _ := time.ParseDuration(windowStr)
if window <= 0 {
window = 24 * time.Hour
}

ctx := vm.WithClusterID(r.Context(), clusterIDFromRequest(r))
pods, err := h.vm.GetNodePods(ctx, "", name, window)
if err != nil {
writeError(w, http.StatusInternalServerError, err.Error())
return
}

writeJSON(w, http.StatusOK, pods)
}
5 changes: 5 additions & 0 deletions internal/api/router.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"context"
"encoding/json"
"net/http"
"time"

"github.com/go-chi/chi/v5"
"github.com/go-chi/chi/v5/middleware"
Expand All @@ -28,6 +29,8 @@ type MetricsProvider interface {
Agents(ctx context.Context) ([]store.AgentInfo, error)
ClusterMetadata(ctx context.Context) (store.ClusterMetadata, error)
NetworkTopology(ctx context.Context, opts store.NetworkTopologyOptions) ([]store.NetworkEdge, error)
GetNodeStats(ctx context.Context, clusterID, nodeName string, window time.Duration) (store.NodeStats, error)
GetNodePods(ctx context.Context, clusterID, nodeName string, window time.Duration) ([]store.PodMetrics, error)
}

// Handler wires HTTP requests to the VictoriaMetrics client.
Expand Down Expand Up @@ -73,6 +76,8 @@ func NewRouter(vmClient MetricsProvider, db *db.Store, st *store.Store, finopsEn
cost.Get("/namespaces/{name}", h.NamespaceDetail)
cost.Get("/nodes", h.Nodes)
cost.Get("/nodes/{name}", h.NodeDetail)
cost.Get("/nodes/{name}/stats", h.NodeStats)
cost.Get("/nodes/{name}/pods", h.NodePods)
cost.Get("/resources", h.Resources)
})
protected.Get("/agent", h.AgentStatus)
Expand Down
57 changes: 22 additions & 35 deletions internal/store/pricing.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
package store

import "context"
import (
"context"
"fmt"

"github.com/clustercost/clustercost-dashboard/internal/pricing"
)

// Pricing constants
const (
Expand All @@ -10,51 +15,33 @@ const (
CostEgressInternal = 0.00 // Free
)

// PricingProvider defines the interface for fetching node pricing.
type PricingProvider interface {
GetNodePrice(ctx context.Context, region, instanceType string) (float64, error)
}

// PricingCatalog allows looking up node prices.
type PricingCatalog struct {
// Map instance type to hourly price
InstancePrices map[string]float64
Provider PricingProvider
// No provider needed, we use static data from internal/pricing
}

// NewPricingCatalog returns a catalog with some default mocked pricing.
func NewPricingCatalog(provider PricingProvider) *PricingCatalog {
return &PricingCatalog{
InstancePrices: map[string]float64{
"t3.medium": 0.0416,
"t3.large": 0.0832,
"m5.large": 0.096,
"m5.xlarge": 0.192,
"c5.large": 0.085,
"r5.large": 0.126,
"default": 0.05, // Fallback
},
Provider: provider,
}
// NewPricingCatalog returns a catalog.
func NewPricingCatalog() *PricingCatalog {
return &PricingCatalog{}
}

// GetTotalNodePrice returns the total hourly cost of a node.
func (pc *PricingCatalog) GetTotalNodePrice(ctx context.Context, region, instanceType string) float64 {
// Try Provider first
if pc.Provider != nil && instanceType != "" && region != "" {
price, err := pc.Provider.GetNodePrice(ctx, region, instanceType)
if err == nil && price > 0 {
pc.InstancePrices[instanceType] = price // Update cache
return price
}
// 1. Try Shared Static Data
key := fmt.Sprintf("%s|%s", region, instanceType)
if price, ok := pricing.InstancePrices[key]; ok {
return price
}

// Fallback to local cache
price, ok := pc.InstancePrices[instanceType]
if !ok {
price = pc.InstancePrices["default"]
// 2. Fallback to generic defaults if completely unknown
// check if we have a default for the instance type regardless of region (common for US-East-1 based defaults)
// (Optional optimization: try "us-east-1|instanceType" as fallback?)
fallbackKey := fmt.Sprintf("us-east-1|%s", instanceType)
if price, ok := pricing.InstancePrices[fallbackKey]; ok {
return price
}
return price

return 0.05 // Ultimate fallback
}

// GetNodeResourcePrices calculates the cost per vCPU and per GB of RAM based on the instance type.
Expand Down
2 changes: 1 addition & 1 deletion internal/store/pricing_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ import (
)

func TestPricingCatalog_GetNodeResourcePrices(t *testing.T) {
pc := NewPricingCatalog(nil)
pc := NewPricingCatalog()

// Test case 1: m5.large (2 vCPU, 8GB RAM)
// Price: $0.096/hr
Expand Down
77 changes: 73 additions & 4 deletions internal/store/store.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ import (
"math"

"github.com/clustercost/clustercost-dashboard/internal/config"
"github.com/clustercost/clustercost-dashboard/internal/pricing"
agentv1 "github.com/clustercost/clustercost-dashboard/internal/proto/agent/v1"
)

Expand Down Expand Up @@ -122,12 +121,21 @@ type NodeSummary struct {
InstanceType string `json:"instanceType,omitempty"`
Labels map[string]string `json:"labels"`
Taints []string `json:"taints"`
// Resource Requests (Allocated)
CPURequestedMilli int64 `json:"cpuRequestedMilli"`
CPULimitMilli int64 `json:"cpuLimitMilli"`
MemoryRequestedBytes int64 `json:"memoryRequestedBytes"`
MemoryLimitBytes int64 `json:"memoryLimitBytes"`
// Network (Host Level)
NetTxBytes int64 `json:"netTxBytes"`
NetRxBytes int64 `json:"netRxBytes"`
EgressPublicBytes int64 `json:"egressPublicBytes"`
EgressCrossAZBytes int64 `json:"egressCrossAZBytes"`
EgressInternalBytes int64 `json:"egressInternalBytes"`
// Historical / Window Data
ActiveHours float64 `json:"activeHours"` // Hours active in the selected window
ActiveRatio float64 `json:"activeRatio"` // 0.0 - 1.0
WindowCost float64 `json:"windowCost"` // Actual cost incurred in the window
}

// NodeListResponse wraps paginated node results.
Expand Down Expand Up @@ -271,6 +279,7 @@ type NodeFilter struct {
Search string
Limit int
Offset int
Window string // "24h", "7d", "30d"
}

// PodContext wraps a PodMetric with its location metadata.
Expand All @@ -282,6 +291,27 @@ type PodContext struct {
InstanceType string
}

// NodeStats contains historical usage and cost analysis for a node.
type NodeStats struct {
NodeName string `json:"nodeName"`
P95CPUUsagePercent float64 `json:"p95CpuUsagePercent"`
P95MemoryUsagePercent float64 `json:"p95MemoryUsagePercent"`
TotalMonthlyCost float64 `json:"totalMonthlyCost"`
RealUsageMonthlyCost float64 `json:"realUsageMonthlyCost"`
Window string `json:"window"`
}

// PodMetrics contains resource usage analysis for a single pod.
type PodMetrics struct {
PodName string `json:"podName"`
Namespace string `json:"namespace"`
QoSClass string `json:"qosClass"`
CPURequestMilli int64 `json:"cpuRequestMilli"`
CPUP95Milli float64 `json:"cpuP95Milli"`
MemoryRequestBytes int64 `json:"memoryRequestBytes"`
MemoryP95Bytes float64 `json:"memoryP95Bytes"`
}

// New creates a store seeded with agent configurations.
func New(cfgs []config.AgentConfig, recommendedAgentVersion string) *Store {
agentConfigs := make(map[string]config.AgentConfig, len(cfgs))
Expand All @@ -290,14 +320,13 @@ func New(cfgs []config.AgentConfig, recommendedAgentVersion string) *Store {
}

// Initialize Static Pricing Provider
// Context is just placeholder for interface, static client doesn't need it
pricingClient, _ := pricing.NewAWSClient(context.Background())
// We use the static map from internal/pricing/data.go, so no dynamic client needed.

return &Store{
agentConfigs: agentConfigs,
snapshots: make(map[string]*AgentSnapshot, len(cfgs)),
recommendedAgentVersion: recommendedAgentVersion,
pricing: NewPricingCatalog(pricingClient),
pricing: NewPricingCatalog(),
}
}

Expand Down Expand Up @@ -972,13 +1001,40 @@ func (s *Store) aggregateNodesLocked() (map[string]*NodeSummary, error) {
}
haveData = true

// Pre-calculate Pod Limits/Requests per Node
nodeLimits := make(map[string]struct {
cpuReq, cpuLim, memReq, memLim int64
})

for _, p := range snap.Report.Pods {
nodeName := snap.Report.NodeName
if nodeName == "" {
continue
}
stats := nodeLimits[nodeName]
if p.Cpu != nil {
stats.cpuReq += safeInt64(p.Cpu.RequestMillicores)
stats.cpuLim += safeInt64(p.Cpu.LimitMillicores)
}
if p.Memory != nil {
stats.memReq += safeInt64(p.Memory.RequestBytes)
stats.memLim += safeInt64(p.Memory.LimitBytes)
}
nodeLimits[nodeName] = stats
}

// Iterate over all nodes reported by this agent
for _, n := range snap.Report.Nodes {
if n == nil || n.NodeName == "" {
continue
}
name := n.NodeName

// Use aggregated values from pods if available, fallback to node metric if generic
// The Agent V2 NodeMetric.Requested... is arguably the same, but doesn't have Limits.
// We prioritize our calculated limits.
podStats := nodeLimits[name]

entry, ok := nodes[name]
if !ok {
entry = &NodeSummary{
Expand All @@ -988,7 +1044,20 @@ func (s *Store) aggregateNodesLocked() (map[string]*NodeSummary, error) {
InstanceType: "default", // placeholder
CPUAllocatableMilli: safeInt64(n.AllocatableCpuMillicores),
MemoryAllocatableBytes: safeInt64(n.AllocatableMemoryBytes),
CPURequestedMilli: safeInt64(n.RequestedCpuMillicores), // Fallback to agent metric
CPULimitMilli: podStats.cpuLim,
MemoryRequestedBytes: safeInt64(n.RequestedMemoryBytes), // Fallback to agent metric
MemoryLimitBytes: podStats.memLim,
IsUnderPressure: n.ThrottlingNs > 1_000_000,
}
// If agent metric is 0 (older agent?) use our aggregation for Requests too
if entry.CPURequestedMilli == 0 {
entry.CPURequestedMilli = podStats.cpuReq
}
if entry.MemoryRequestedBytes == 0 {
entry.MemoryRequestedBytes = podStats.memReq
}

nodes[name] = entry
}

Expand Down
2 changes: 1 addition & 1 deletion internal/store/store_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ func newTestStore() *Store {
}
s := New(cfgs, "v1.0.0")
// Inject Mock Pricing
s.pricing = NewPricingCatalog(&MockPricing{})
s.pricing = NewPricingCatalog()
return s
}

Expand Down
Loading
Loading