From 91d6ead5b3d4839d31b6061806116203576a4a53 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20LIARD?= Date: Fri, 15 May 2026 11:29:00 +0200 Subject: [PATCH] =?UTF-8?q?fix(pricing):=20April=202026=20drift=20?= =?UTF-8?q?=E2=80=94=20Opus=203=C3=97=20over-billing,=20missing=20GPT-5/De?= =?UTF-8?q?epSeek-V4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Static pricing table reflected pre-2026 rates. This caused silent over-billing (Opus 4.6/4.7 at $15/$75 instead of $5/$25 since 4.6 shipped), under-billing (Gemini 2.5 Pro output at $5/M instead of $10/M), and complete fallback miss for models with no row at all (GPT-5/5.5, DeepSeek V4, Grok 4.1, Mercury 2). Critical corrections: - claude-opus-4-7 (added at $5/$25) - claude-opus-4-6: $15/$75 → $5/$25 - claude-haiku-4-5: $0.80/$4 → $1/$5 - MiniMax-M2.5: $0.30/$1.20 → $0.15/$0.95 - gemini-2.5-pro: $1.25/$5 → $1.25/$10 - deepseek-chat/reasoner: rerouted to V4-Flash rates ($0.14/$0.28) New entries: - OpenAI: gpt-5, gpt-5.5, gpt-5.5-pro, o1/o1-mini, o3/o3-mini, gpt-oss-20b/120b - DeepSeek: deepseek-v4-flash, deepseek-v4-pro - Z.ai: glm-4.5-flash - MiniMax: M2.7 - Llama: llama-4-scout-17b (Groq free) - Inception: mercury-2 ($0.25/$0.75, supersedes mercury-coder-small) - xAI: grok-4.1-fast - Anthropic: claude-sonnet-4-7 Reasoning-token 3× surcharge (o-series) deferred to follow-up PR introducing CostInputs + cache_read/creation multipliers. Test asserts and one upstream test in features::token_pricing updated to reflect the corrected Opus rates. 
--- src/features/token_pricing/mod.rs | 6 +- src/pricing.rs | 273 ++++++++++++++++++++++++++---- 2 files changed, 240 insertions(+), 39 deletions(-) diff --git a/src/features/token_pricing/mod.rs b/src/features/token_pricing/mod.rs index f7c221ff..6525cc5a 100644 --- a/src/features/token_pricing/mod.rs +++ b/src/features/token_pricing/mod.rs @@ -249,10 +249,10 @@ mod tests { let table = PricingTable::from_known(); assert!(!table.is_empty()); - // Exact match + // Exact match — Opus 4.6 moved to $5/$25 in early 2026 (was $15/$75). let (inp, out) = table.get("claude-opus-4-6").unwrap(); - assert_eq!(inp, 15.0); - assert_eq!(out, 75.0); + assert_eq!(inp, 5.0); + assert_eq!(out, 25.0); } #[test] diff --git a/src/pricing.rs b/src/pricing.rs index e01f9847..bd6e3515 100644 --- a/src/pricing.rs +++ b/src/pricing.rs @@ -3,6 +3,11 @@ //! Provides per-model cost estimates from a hardcoded fallback table. //! Both `providers::streaming` and `features::token_pricing` import from //! here, avoiding a circular dependency between those two modules. +//! +//! Prices reflect publicly listed rates as of 2026-04-30 (USD per 1M tokens). +//! Update this file when providers publish price changes; the upstream +//! OpenRouter feed in [`crate::features::token_pricing`] takes precedence +//! at runtime so this is the cold-start fallback. use serde::Serialize; @@ -35,13 +40,20 @@ impl ModelPricing { } } -/// Known model pricing (USD) - fallback for models not available on OpenRouter. +/// Known model pricing (USD) — fallback for models not available on OpenRouter. pub static KNOWN_PRICING: &[ModelPricing] = &[ - // Anthropic + // ----------------------------------------------------------------- + // Anthropic — Opus 4.6+ moved to $5/$25 (was $15/$75 on 4.5 and earlier). 
+ // ----------------------------------------------------------------- + ModelPricing { + model: "claude-opus-4-7", + input_per_million: 5.0, + output_per_million: 25.0, + }, ModelPricing { model: "claude-opus-4-6", - input_per_million: 15.0, - output_per_million: 75.0, + input_per_million: 5.0, + output_per_million: 25.0, }, ModelPricing { model: "claude-opus-4-5", @@ -53,6 +65,11 @@ pub static KNOWN_PRICING: &[ModelPricing] = &[ input_per_million: 15.0, output_per_million: 75.0, }, + ModelPricing { + model: "claude-sonnet-4-7", + input_per_million: 3.0, + output_per_million: 15.0, + }, ModelPricing { model: "claude-sonnet-4-6", input_per_million: 3.0, @@ -68,17 +85,35 @@ pub static KNOWN_PRICING: &[ModelPricing] = &[ input_per_million: 3.0, output_per_million: 15.0, }, + // Haiku 4.5 raised to $1/$5 in early 2026. ModelPricing { model: "claude-haiku-4-5", - input_per_million: 0.8, - output_per_million: 4.0, + input_per_million: 1.0, + output_per_million: 5.0, }, ModelPricing { model: "claude-haiku-3-5", input_per_million: 0.8, output_per_million: 4.0, }, - // OpenAI + // ----------------------------------------------------------------- + // OpenAI — GPT-5 family added April 2026. + // ----------------------------------------------------------------- + ModelPricing { + model: "gpt-5", + input_per_million: 0.625, + output_per_million: 5.0, + }, + ModelPricing { + model: "gpt-5.5", + input_per_million: 5.0, + output_per_million: 30.0, + }, + ModelPricing { + model: "gpt-5.5-pro", + input_per_million: 30.0, + output_per_million: 180.0, + }, ModelPricing { model: "gpt-4o", input_per_million: 2.5, @@ -99,24 +134,76 @@ pub static KNOWN_PRICING: &[ModelPricing] = &[ input_per_million: 0.5, output_per_million: 1.5, }, - // DeepSeek + // OpenAI reasoning models (o-series). Listed price excludes the + // reasoning-token surcharge — the 3× multiplier on `reasoning_tokens` is + // deferred to the token billing infra follow-up (PR #TODO). 
+ ModelPricing { + model: "o1", + input_per_million: 15.0, + output_per_million: 60.0, + }, + ModelPricing { + model: "o1-mini", + input_per_million: 3.0, + output_per_million: 12.0, + }, + ModelPricing { + model: "o3", + input_per_million: 15.0, + output_per_million: 60.0, + }, + ModelPricing { + model: "o3-mini", + input_per_million: 3.0, + output_per_million: 12.0, + }, + // OpenAI open-source weights served on Groq free tier (gpt-oss). + ModelPricing { + model: "gpt-oss-20b", + input_per_million: 0.0, + output_per_million: 0.0, + }, + ModelPricing { + model: "gpt-oss-120b", + input_per_million: 0.0, + output_per_million: 0.0, + }, + // ----------------------------------------------------------------- + // DeepSeek — V4 family. Legacy endpoints retire 2026-07-24. + // V4-Pro promo ($0.435/$0.87) ended 2026-05-05 → list price. + // ----------------------------------------------------------------- + ModelPricing { + model: "deepseek-v4-flash", + input_per_million: 0.14, + output_per_million: 0.28, + }, + ModelPricing { + model: "deepseek-v4-pro", + input_per_million: 1.74, + output_per_million: 3.48, + }, + // Legacy endpoints reroute to V4-Flash since 2026-Q1. ModelPricing { model: "deepseek-chat", - input_per_million: 0.27, - output_per_million: 1.10, + input_per_million: 0.14, + output_per_million: 0.28, }, ModelPricing { model: "deepseek-reasoner", - input_per_million: 0.55, - output_per_million: 2.19, + input_per_million: 0.14, + output_per_million: 0.28, }, - // Devstral + // ----------------------------------------------------------------- + // Devstral (Mistral AI coding model). + // ----------------------------------------------------------------- ModelPricing { model: "devstral-small", input_per_million: 0.10, output_per_million: 0.30, }, - // Chinese Models + // ----------------------------------------------------------------- + // Z.ai / GLM family. 
+ // ----------------------------------------------------------------- ModelPricing { model: "glm-4", input_per_million: 0.14, @@ -132,12 +219,21 @@ pub static KNOWN_PRICING: &[ModelPricing] = &[ input_per_million: 0.14, output_per_million: 0.14, }, - // GLM-4.7-Flash (Z.ai, released 2026-01-19) — free tier per Z.ai pricing + // GLM-4.5-Flash and 4.7-Flash are free tier on Z.ai for registered users. + ModelPricing { + model: "glm-4.5-flash", + input_per_million: 0.0, + output_per_million: 0.0, + }, ModelPricing { model: "glm-4.7-flash", input_per_million: 0.0, output_per_million: 0.0, }, + // ----------------------------------------------------------------- + // MiniMax — M2.5 list price corrected to $0.15/$0.95 (was billed at + // $0.30/$1.20 — 2× over on input, ~1.26× on output). + // ----------------------------------------------------------------- ModelPricing { model: "MiniMax-M2", input_per_million: 0.30, @@ -148,18 +244,18 @@ pub static KNOWN_PRICING: &[ModelPricing] = &[ input_per_million: 0.30, output_per_million: 1.20, }, - // MiniMax M2.5 (released 2026-02-12) — same input price as M2, same output ModelPricing { model: "MiniMax-M2.5", - input_per_million: 0.30, - output_per_million: 1.20, + input_per_million: 0.15, + output_per_million: 0.95, }, ModelPricing { model: "minimax-m2.5", - input_per_million: 0.30, - output_per_million: 1.20, + input_per_million: 0.15, + output_per_million: 0.95, }, - // M2.5-Lightning: identical capability, 2x throughput, 2x output cost + // M2.5-Lightning tier prices unverified — keep historical rate as + // best-effort until Anthropic-compat endpoint publishes a list. ModelPricing { model: "MiniMax-M2.5-Lightning", input_per_million: 0.30, @@ -170,6 +266,17 @@ pub static KNOWN_PRICING: &[ModelPricing] = &[ input_per_million: 0.30, output_per_million: 2.40, }, + // M2.7 released March 2026; assumes M2.5 rates pending official table. 
+ ModelPricing { + model: "MiniMax-M2.7", + input_per_million: 0.15, + output_per_million: 0.95, + }, + ModelPricing { + model: "minimax-m2.7", + input_per_million: 0.15, + output_per_million: 0.95, + }, ModelPricing { model: "kimi-k2", input_per_million: 2.00, @@ -180,7 +287,9 @@ pub static KNOWN_PRICING: &[ModelPricing] = &[ input_per_million: 2.00, output_per_million: 8.00, }, - // Groq + // ----------------------------------------------------------------- + // Groq (Llama family on dedicated inference). + // ----------------------------------------------------------------- ModelPricing { model: "llama-3.1-70b", input_per_million: 0.59, @@ -191,23 +300,46 @@ pub static KNOWN_PRICING: &[ModelPricing] = &[ input_per_million: 0.05, output_per_million: 0.08, }, - // Llama 3.3 70B (Groq versatile endpoint) ModelPricing { model: "llama-3.3-70b-versatile", input_per_million: 0.59, output_per_million: 0.79, }, - // Inception Labs (diffusion LLM, fast) + // Llama 4 Scout 17B on Groq — free tier with 30K TPM cap. + ModelPricing { + model: "llama-4-scout-17b", + input_per_million: 0.0, + output_per_million: 0.0, + }, + // ----------------------------------------------------------------- + // Inception Labs (diffusion LLM). Mercury 2 supersedes mercury-coder-small. + // ----------------------------------------------------------------- + ModelPricing { + model: "mercury-2", + input_per_million: 0.25, + output_per_million: 0.75, + }, ModelPricing { model: "mercury-coder-small", input_per_million: 0.25, output_per_million: 1.25, }, - // Google Gemini + // ----------------------------------------------------------------- + // xAI Grok. + // ----------------------------------------------------------------- + ModelPricing { + model: "grok-4.1-fast", + input_per_million: 0.20, + output_per_million: 0.50, + }, + // ----------------------------------------------------------------- + // Google Gemini — 2.5 Pro output is $10/M (was billed $5/M — 2× under). 
+ // Prices apply to the ≤200K-context tier; >200K tier is $2.50/$15. + // ----------------------------------------------------------------- ModelPricing { model: "gemini-2.5-pro", input_per_million: 1.25, - output_per_million: 5.00, + output_per_million: 10.00, }, ]; @@ -221,7 +353,7 @@ static PRICING_MAP: std::sync::LazyLock< /// /// ``` /// use grob::pricing::pricing; -/// let p = pricing("claude-opus-4-6").unwrap(); +/// let p = pricing("claude-opus-4-7").unwrap(); /// assert!(p.input_per_million > 0.0); /// assert!(pricing("unknown-model-xyz").is_none()); /// ``` @@ -238,14 +370,68 @@ mod tests { use super::*; #[test] - fn minimax_m25_lookup() { + fn opus_47_listed_at_new_price() { + let p = pricing("claude-opus-4-7").expect("opus 4.7 listed"); + assert_eq!(p.input_per_million, 5.0); + assert_eq!(p.output_per_million, 25.0); + } + + #[test] + fn opus_46_corrected_to_new_price() { + // Regression: was $15/$75 (legacy 4.5-era rate) — 3× over-billing. + let p = pricing("claude-opus-4-6").expect("opus 4.6 listed"); + assert_eq!(p.input_per_million, 5.0); + assert_eq!(p.output_per_million, 25.0); + } + + #[test] + fn haiku_45_corrected() { + // Regression: was $0.80/$4 — raised to $1/$5. 
+ let p = pricing("claude-haiku-4-5").expect("haiku 4.5 listed"); + assert_eq!(p.input_per_million, 1.0); + assert_eq!(p.output_per_million, 5.0); + } + + #[test] + fn gpt_5_listed() { + let p = pricing("gpt-5").expect("gpt-5 listed"); + assert_eq!(p.input_per_million, 0.625); + assert_eq!(p.output_per_million, 5.0); + } + + #[test] + fn gpt_55_listed() { + let p = pricing("gpt-5.5").expect("gpt-5.5 listed"); + assert_eq!(p.input_per_million, 5.0); + assert_eq!(p.output_per_million, 30.0); + } + + #[test] + fn deepseek_v4_flash_listed() { + let p = pricing("deepseek-v4-flash").expect("V4-Flash listed"); + assert_eq!(p.input_per_million, 0.14); + assert_eq!(p.output_per_million, 0.28); + } + + #[test] + fn deepseek_legacy_rerouted_to_v4_rates() { + // Legacy deepseek-chat endpoint reroutes to V4-Flash since 2026-Q1. + let p = pricing("deepseek-chat").expect("legacy chat listed"); + assert_eq!(p.input_per_million, 0.14); + assert_eq!(p.output_per_million, 0.28); + } + + #[test] + fn minimax_m25_corrected() { + // Regression: was $0.30/$1.20 — over-billing (2× input, ~1.26× output). let p = pricing("MiniMax-M2.5").expect("M2.5 listed"); - assert_eq!(p.input_per_million, 0.30); - assert_eq!(p.output_per_million, 1.20); + assert_eq!(p.input_per_million, 0.15); + assert_eq!(p.output_per_million, 0.95); } #[test] - fn minimax_m25_lightning_double_output_cost() { + fn minimax_m25_lightning_unchanged() { + // Lightning tier list unverified; kept as historical rate. 
let p = pricing("MiniMax-M2.5-Lightning").expect("Lightning listed"); assert_eq!(p.input_per_million, 0.30); assert_eq!(p.output_per_million, 2.40); @@ -266,23 +452,38 @@ mod tests { } #[test] - fn mercury_coder_small_listed() { - let p = pricing("mercury-coder-small").expect("listed"); + fn mercury_2_listed() { + let p = pricing("mercury-2").expect("mercury 2 listed"); + assert_eq!(p.input_per_million, 0.25); + assert_eq!(p.output_per_million, 0.75); + } + + #[test] + fn mercury_coder_small_kept_for_backcompat() { + let p = pricing("mercury-coder-small").expect("legacy mercury listed"); assert_eq!(p.input_per_million, 0.25); assert_eq!(p.output_per_million, 1.25); } #[test] - fn gemini_25_pro_listed() { + fn grok_41_fast_listed() { + let p = pricing("grok-4.1-fast").expect("grok 4.1 fast listed"); + assert_eq!(p.input_per_million, 0.20); + assert_eq!(p.output_per_million, 0.50); + } + + #[test] + fn gemini_25_pro_output_corrected() { + // Regression: output was $5/M — actually $10/M on ≤200k tier (2× under). let p = pricing("gemini-2.5-pro").expect("listed"); assert_eq!(p.input_per_million, 1.25); - assert_eq!(p.output_per_million, 5.00); + assert_eq!(p.output_per_million, 10.00); } #[test] fn calculate_one_million_input() { let p = pricing("MiniMax-M2.5").unwrap(); let cost = p.calculate(1_000_000, 0); - assert!((cost - 0.30).abs() < 1e-9); + assert!((cost - 0.15).abs() < 1e-9); } }