From f4f567a3e21279c9c3e40e69dc1da0bc522fb441 Mon Sep 17 00:00:00 2001
From: zhangminchao1 <zhangminchao1@jd.com>
Date: Fri, 22 May 2026 00:03:55 +0800
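Subject: [PATCH] feat: expose cached token usage in responses.

Also enables the qwen3 reasoning-parser auto mapping and fixes the in-place
absl::StrReplaceAll call in ReasoningDetector::parse_streaming_increment.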

---
 tests/api_service/usage_json_test.cpp         | 95 +++++++++++++++++++
 .../framework/block/block_manager_test.cpp    | 48 ++++++++++
 xllm/api_service/chat_service_impl.cpp        |  8 +-
 xllm/api_service/utils.h                      | 21 +++-
 xllm/c_api/internal/helper.cpp                |  1 +
 xllm/c_api/test/utils.cpp                     |  2 +
 xllm/c_api/test/xllm_test.proto               |  3 +-
 xllm/c_api/types.h                            |  3 +
 xllm/cc_api/internal.h                        |  1 +
 xllm/cc_api/types.h                           |  3 +
 xllm/core/framework/request/request.cpp       |  8 ++
 xllm/core/framework/request/request_output.h  |  3 +
 xllm/core/framework/request/sequence.cpp      | 34 +++++++
 xllm/core/framework/request/sequence.h        |  8 ++
 .../framework/request/sequence_kv_state.cpp   |  7 ++
 .../framework/request/sequence_kv_state.h     |  1 +
 xllm/core/runtime/xservice_client.cpp         |  2 +
 xllm/parser/detector_registry.cpp             |  2 +-
 xllm/parser/reasoning_detector.cpp            |  5 +-
 xllm/proto/common.proto                       | 24 +++++
 xllm/proto/disagg_pd.proto                    |  2 +
 xllm/pybind/bind.cpp                          |  8 +-
 22 files changed, 276 insertions(+), 13 deletions(-)
 create mode 100644 tests/api_service/usage_json_test.cpp
diff --git a/tests/api_service/usage_json_test.cpp b/tests/api_service/usage_json_test.cpp
new file mode 100644
index 0000000000..7091553c7c
--- /dev/null
+++ b/tests/api_service/usage_json_test.cpp
@@ -0,0 +1,95 @@
+/* Copyright 2026 The xLLM Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    https://github.com/jd-opensource/xllm/blob/main/LICENSE
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <gtest/gtest.h>
+#include <json2pb/pb_to_json.h>
+
+#include <nlohmann/json.hpp>
+#include <string>
+
+#include "api_service/utils.h"
+#include "chat.pb.h"
+
+namespace xllm {
+namespace {
+
+TEST(UsageJsonTest, ChatUsageSerializesOpenAICachedTokensField) {
+  // 896 of the 1024 prompt tokens are reported as prefix-cache hits.
+  Usage usage;
+  usage.num_prompt_tokens = 1024;
+  usage.num_generated_tokens = 50;
+  usage.num_total_tokens = 1074;
+  usage.num_cached_tokens = 896;
+  proto::ChatResponse response;
+  api_service::set_proto_usage(response.mutable_usage(), usage);
+
+  json2pb::Pb2JsonOptions json_options;
+  json_options.bytes_to_base64 = false;
+  json_options.jsonify_empty_array = true;
+  json_options.always_print_primitive_fields = true;
+  std::string serialized;
+  std::string error;
+  ASSERT_TRUE(
+      json2pb::ProtoMessageToJson(response, &serialized, json_options, &error))
+      << error;
+
+  nlohmann::json parsed = nlohmann::json::parse(serialized);
+  ASSERT_TRUE(parsed.contains("usage"));
+  nlohmann::json& usage_json = parsed["usage"];
+  EXPECT_EQ(usage_json["prompt_tokens"], 1024);
+  EXPECT_EQ(usage_json["completion_tokens"], 50);
+  EXPECT_EQ(usage_json["total_tokens"], 1074);
+  ASSERT_TRUE(usage_json.contains("prompt_tokens_details"));
+  EXPECT_EQ(usage_json["prompt_tokens_details"]["cached_tokens"], 896);
+  EXPECT_EQ(usage_json["prompt_tokens_details"]["audio_tokens"], 0);
+  ASSERT_TRUE(usage_json.contains("completion_tokens_details"));
+  EXPECT_EQ(usage_json["completion_tokens_details"]["reasoning_tokens"], 0);
+  EXPECT_EQ(usage_json["completion_tokens_details"]["audio_tokens"], 0);
+  EXPECT_EQ(usage_json["completion_tokens_details"].size(), 2);
+}
+
+TEST(UsageJsonTest, ChatUsagePrintsZeroCachedTokens) {
+  // A zero cache-hit count must still show up in the serialized usage.
+  Usage usage;
+  usage.num_prompt_tokens = 12;
+  usage.num_generated_tokens = 3;
+  usage.num_total_tokens = 15;
+  usage.num_cached_tokens = 0;
+  proto::ChatResponse response;
+  api_service::set_proto_usage(response.mutable_usage(), usage);
+
+  json2pb::Pb2JsonOptions json_options;
+  json_options.bytes_to_base64 = false;
+  json_options.jsonify_empty_array = true;
+  json_options.always_print_primitive_fields = true;
+  std::string serialized;
+  std::string error;
+  ASSERT_TRUE(
+      json2pb::ProtoMessageToJson(response, &serialized, json_options, &error))
+      << error;
+
+  nlohmann::json parsed = nlohmann::json::parse(serialized);
+  nlohmann::json& usage_json = parsed["usage"];
+  ASSERT_TRUE(usage_json.contains("prompt_tokens_details"));
+  EXPECT_EQ(usage_json["prompt_tokens_details"]["cached_tokens"], 0);
+  EXPECT_EQ(usage_json["prompt_tokens_details"]["audio_tokens"], 0);
+  ASSERT_TRUE(usage_json.contains("completion_tokens_details"));
+  EXPECT_EQ(usage_json["completion_tokens_details"]["reasoning_tokens"], 0);
+  EXPECT_EQ(usage_json["completion_tokens_details"]["audio_tokens"], 0);
+  EXPECT_EQ(usage_json["completion_tokens_details"].size(), 2);
+}
+
+}  // namespace
+}  // namespace xllm
diff --git a/tests/core/framework/block/block_manager_test.cpp b/tests/core/framework/block/block_manager_test.cpp
index 411668e5b8..a9d840636c 100644
--- a/tests/core/framework/block/block_manager_test.cpp
+++ b/tests/core/framework/block/block_manager_test.cpp
@@ -330,4 +330,52 @@ TEST(BlockManagerPoolTest, SequenceCopyDoesNotReuseSingleBlockSlot) {
   EXPECT_NE(GetSingleBlockIdOrFail(clone), GetSingleBlockIdOrFail(src));
 }
 
+TEST(BlockManagerPoolTest, PrefixCacheHitsAreReportedAsCachedTokens) {
+  ScopedValue<int32_t> max_seqs_guard(
+      &SchedulerConfig::get_instance().max_seqs_per_batch(), 2);
+
+  BlockManagerPool::Options options;
+  options.num_blocks(16).host_num_blocks(0).block_size(4).enable_prefix_cache(
+      true);
+  BlockManagerPool pool(options, /*dp_size=*/1);
+
+  {
+    Sequence warm_seq =
+        MakeSequence(0, /*prompt_tokens=*/{1, 2, 3, 4, 5, 6, 7, 8, 9});
+    ASSERT_TRUE(pool.allocate(&warm_seq));
+    warm_seq.kv_state().set_kv_cache_tokens_num(warm_seq.num_prompt_tokens());
+    EXPECT_EQ(warm_seq.num_cached_tokens(), 0);
+    pool.cache(&warm_seq);
+    // The probe prompt shares its first two 4-token blocks with warm_seq.
+    Sequence probe_seq =
+        MakeSequence(0, /*prompt_tokens=*/{1, 2, 3, 4, 5, 6, 7, 8, 10});
+    ASSERT_TRUE(pool.allocate(&probe_seq));
+    const size_t cached_tokens = probe_seq.num_cached_tokens();
+    EXPECT_EQ(cached_tokens, 8);
+    EXPECT_LE(cached_tokens, probe_seq.num_prompt_tokens());
+    EXPECT_EQ(cached_tokens % options.block_size(), 0);
+  }
+
+  int32_t dp_rank = 0;
+  auto filler_blocks =
+      pool.allocate((options.num_blocks() - 1) * options.block_size(), dp_rank);
+  EXPECT_EQ(filler_blocks.size(), options.num_blocks() - 1);
+  EXPECT_EQ(pool.num_blocks_in_prefix_cache()[0], 0);
+}
+
+TEST(BlockManagerPoolTest, PrefixCacheDisabledReportsZeroCachedTokens) {
+  ScopedValue<int32_t> max_seqs_guard(
+      &SchedulerConfig::get_instance().max_seqs_per_batch(), 2);
+  BlockManagerPool::Options options;
+  options.num_blocks(16).host_num_blocks(0).block_size(4).enable_prefix_cache(
+      false);
+  BlockManagerPool pool(options, /*dp_size=*/1);
+
+  // Prefix caching is off, so the sequence is expected to report no hits.
+  Sequence uncached_seq =
+      MakeSequence(0, /*prompt_tokens=*/{1, 2, 3, 4, 5, 6, 7, 8, 9});
+  ASSERT_TRUE(pool.allocate(&uncached_seq));
+  EXPECT_EQ(uncached_seq.num_cached_tokens(), 0);
+}
+
 }  // namespace xllm
diff --git a/xllm/api_service/chat_service_impl.cpp b/xllm/api_service/chat_service_impl.cpp
index 0503de4c67..0b65c88d2e 100644
--- a/xllm/api_service/chat_service_impl.cpp
+++ b/xllm/api_service/chat_service_impl.cpp
@@ -373,9 +373,7 @@ bool send_delta_to_client_brpc(
     response.set_created(created_time);
     response.set_model(model);
     auto* proto_usage = response.mutable_usage();
-    proto_usage->set_prompt_tokens(usage.num_prompt_tokens);
-    proto_usage->set_completion_tokens(usage.num_generated_tokens);
-    proto_usage->set_total_tokens(usage.num_total_tokens);
+    api_service::set_proto_usage(proto_usage, usage);
     if (!call->write(response)) {
       return false;
     }
@@ -460,9 +458,7 @@ bool send_result_to_client_brpc(std::shared_ptr<ChatCall> call,
   if (req_output.usage.has_value()) {
     const auto& usage = req_output.usage.value();
     auto* proto_usage = response.mutable_usage();
-    proto_usage->set_prompt_tokens(usage.num_prompt_tokens);
-    proto_usage->set_completion_tokens(usage.num_generated_tokens);
-    proto_usage->set_total_tokens(usage.num_total_tokens);
+    api_service::set_proto_usage(proto_usage, usage);
   }
 
   return call->write_and_finish(response);
diff --git a/xllm/api_service/utils.h b/xllm/api_service/utils.h
index 5b4b2de410..fe0697812c 100644
--- a/xllm/api_service/utils.h
+++ b/xllm/api_service/utils.h
@@ -27,7 +27,9 @@ limitations under the License.
 
 #include "api_service/stream_output_parser.h"
 #include "chat.pb.h"
+#include "common.pb.h"
 #include "core/common/types.h"
+#include "core/framework/request/request_output.h"
 #include "function_call/function_call.h"
 
 namespace xllm {
@@ -36,6 +38,23 @@ namespace api_service {
 // Check for unstreamed tool arguments and send them using the provided sender
 // This is shared between Chat API and Anthropic API implementations
 using SendFunc = std::function<bool(const std::string&, int)>;
+
+// Copies `usage` into the proto message, including the nested token details.
+inline void set_proto_usage(proto::Usage* proto_usage,
+                            const xllm::Usage& usage) {
+  CHECK(proto_usage != nullptr);
+  proto_usage->set_prompt_tokens(usage.num_prompt_tokens);
+  proto_usage->set_completion_tokens(usage.num_generated_tokens);
+  proto_usage->set_total_tokens(usage.num_total_tokens);
+  auto* prompt_details = proto_usage->mutable_prompt_tokens_details();
+  prompt_details->set_cached_tokens(usage.num_cached_tokens);
+  prompt_details->set_audio_tokens(0);
+
+  auto* completion_details = proto_usage->mutable_completion_tokens_details();
+  completion_details->set_reasoning_tokens(0);
+  completion_details->set_audio_tokens(0);
+}
+
 inline bool check_for_unstreamed_tool_args(
     std::shared_ptr<StreamOutputParser> stream_parser,
     size_t index,
@@ -154,4 +173,4 @@ inline nlohmann::json struct_to_json(
 }
 
 }  // namespace api_service
-}  // namespace xllm
\ No newline at end of file
+}  // namespace xllm
diff --git a/xllm/c_api/internal/helper.cpp b/xllm/c_api/internal/helper.cpp
index fdd976e28c..1b5eb3ff7b 100644
--- a/xllm/c_api/internal/helper.cpp
+++ b/xllm/c_api/internal/helper.cpp
@@ -311,6 +311,7 @@ XLLM_Response* build_success_response(const InferenceType& inference_type,
     response->usage.prompt_tokens = usage.num_prompt_tokens;
     response->usage.completion_tokens = usage.num_generated_tokens;
     response->usage.total_tokens = usage.num_total_tokens;
+    response->usage.cached_tokens = usage.num_cached_tokens;
   }
 
   return response;
diff --git a/xllm/c_api/test/utils.cpp b/xllm/c_api/test/utils.cpp
index da72775f43..915f8848a1 100644
--- a/xllm/c_api/test/utils.cpp
+++ b/xllm/c_api/test/utils.cpp
@@ -475,12 +475,14 @@ void PbToXllmUsage(const c_api_test::XLLM_Usage& pb, XLLM_Usage* out) {
   out->prompt_tokens = pb.prompt_tokens();
   out->completion_tokens = pb.completion_tokens();
   out->total_tokens = pb.total_tokens();
+  out->cached_tokens = pb.cached_tokens();
 }
 
 void XllmUsageToPb(const XLLM_Usage& in, c_api_test::XLLM_Usage* pb) {
   pb->set_prompt_tokens(in.prompt_tokens);
   pb->set_completion_tokens(in.completion_tokens);
   pb->set_total_tokens(in.total_tokens);
+  pb->set_cached_tokens(in.cached_tokens);  // prefix-cache hit count
 }
 
 void PbToXllmLogProbs(const c_api_test::XLLM_LogProbs& pb,
diff --git a/xllm/c_api/test/xllm_test.proto b/xllm/c_api/test/xllm_test.proto
index cdf86a290a..15954f7a25 100644
--- a/xllm/c_api/test/xllm_test.proto
+++ b/xllm/c_api/test/xllm_test.proto
@@ -162,6 +162,7 @@ message XLLM_Usage {
   int32 prompt_tokens = 1;
   int32 completion_tokens = 2;
   int32 total_tokens = 3;
+  int32 cached_tokens = 4;
 }
 
 // --- XLLM_LogProb / XLLM_LogProbs ---
@@ -221,4 +222,4 @@ message XLLM_DumpRecord {
 // --backend: rec -> xllm_rec_*; llm -> xllm_llm_*).
 service XllmRecCapiService {
   rpc Inference(XLLM_Request) returns (XLLM_Response);
-}
\ No newline at end of file
+}
diff --git a/xllm/c_api/types.h b/xllm/c_api/types.h
index 5ecda608db..ca04a6a57c 100644
--- a/xllm/c_api/types.h
+++ b/xllm/c_api/types.h
@@ -302,6 +302,9 @@ typedef struct XLLM_CAPI_EXPORT XLLM_Usage {
 
   /** Total tokens used (prompt + completion) */
   int32_t total_tokens;
+
+  /** Number of prompt tokens served from prefix cache */
+  int32_t cached_tokens;
 } XLLM_Usage;
 
 /**
diff --git a/xllm/cc_api/internal.h b/xllm/cc_api/internal.h
index 292cc1e5fa..c9b50bc1bd 100644
--- a/xllm/cc_api/internal.h
+++ b/xllm/cc_api/internal.h
@@ -147,6 +147,7 @@ XLLM_Response build_success_response(const RequestOutput& output,
     response.usage.prompt_tokens = usage.num_prompt_tokens;
     response.usage.completion_tokens = usage.num_generated_tokens;
     response.usage.total_tokens = usage.num_total_tokens;
+    response.usage.cached_tokens = usage.num_cached_tokens;
   }
 
   return response;
diff --git a/xllm/cc_api/types.h b/xllm/cc_api/types.h
index 5fd1a1bf1c..bfa15fabef 100644
--- a/xllm/cc_api/types.h
+++ b/xllm/cc_api/types.h
@@ -239,6 +239,9 @@ struct XLLM_CAPI_EXPORT XLLM_Usage {
 
   // The total number of tokens used in the request (prompt + completion).
   int32_t total_tokens;
+
+  // The number of prompt tokens served from prefix cache.
+  int32_t cached_tokens;
 };
 
 struct XLLM_CAPI_EXPORT XLLM_LogProbData {
diff --git a/xllm/core/framework/request/request.cpp b/xllm/core/framework/request/request.cpp
index 388221ff44..7ee40ae8bf 100644
--- a/xllm/core/framework/request/request.cpp
+++ b/xllm/core/framework/request/request.cpp
@@ -20,7 +20,9 @@ limitations under the License.
 #include <absl/time/time.h>
 #include <glog/logging.h>
 
+#include <algorithm>
 #include <cstdint>
+#include <limits>
 #include <string>
 #include <vector>
 
@@ -99,6 +101,7 @@ void Request::log_statistic(double total_latency) {
               << "finish_reason: "
               << seq->finish_reason().to_string().value_or("") << ", "
               << "prompt_tokens: " << seq->num_prompt_tokens() << ", "
+              << "cached_tokens: " << seq->num_cached_tokens() << ", "
               << "generated_tokens: " << gen_tokens << ", " << std::fixed
               << std::setprecision(1) << "ttft: " << ttft * 1000 << "ms, "
               << "total_latency: " << total_latency * 1000 << "ms, "
@@ -155,13 +158,18 @@ RequestOutput Request::generate_output(const Tokenizer& tokenizer,
   // summarize statistics for all sequences
   Usage usage;
   usage.num_prompt_tokens = state_.prompt_tokens.size();
+  size_t num_cached_tokens = 0;
   for (const auto& seq : sequences()) {
     usage.num_generated_tokens += seq->num_generated_tokens();
+    num_cached_tokens = std::max(num_cached_tokens, seq->num_cached_tokens());
     // NOTE: Avoid counting the extra execution step in overlap scenario.
     if (state_.enable_schedule_overlap) {
       usage.num_generated_tokens--;
     }
   }
+  CHECK_LE(num_cached_tokens,
+           static_cast<size_t>(std::numeric_limits<int32_t>::max()));
+  usage.num_cached_tokens = static_cast<int32_t>(num_cached_tokens);
   usage.num_total_tokens = usage.num_prompt_tokens + usage.num_generated_tokens;
 
   RequestOutput output;
diff --git a/xllm/core/framework/request/request_output.h b/xllm/core/framework/request/request_output.h
index 3807d8133a..26614ef4b0 100644
--- a/xllm/core/framework/request/request_output.h
+++ b/xllm/core/framework/request/request_output.h
@@ -35,6 +35,9 @@ struct Usage {
 
   // the total number of tokens used in the request (prompt + completion).
   int32_t num_total_tokens = 0;
+
+  // the number of prompt tokens served from prefix cache.
+  int32_t num_cached_tokens = 0;
 };
 
 struct LogProbData {
diff --git a/xllm/core/framework/request/sequence.cpp b/xllm/core/framework/request/sequence.cpp
index 4c6f75b9f1..3f8d8493cb 100644
--- a/xllm/core/framework/request/sequence.cpp
+++ b/xllm/core/framework/request/sequence.cpp
@@ -289,6 +289,7 @@ Sequence::Sequence(const Sequence& other)
       num_tokens_(other.num_tokens_),
       token_to_count_map_(other.token_to_count_map_),
       num_prompt_tokens_(other.num_prompt_tokens_),
+      num_cached_tokens_(other.num_cached_tokens_),
       onerec_state_(other.onerec_state_),
       volatile_num_prompt_tokens_(other.volatile_num_prompt_tokens_),
       request_id_(other.request_id_),
@@ -690,8 +691,38 @@ void Sequence::add_host_kv_blocks(const std::vector<Block>& blocks) {
   host_kv_state_.add_kv_blocks(blocks);
 }
 
+size_t Sequence::current_num_cached_tokens() const {
+  const size_t shared_tokens = std::max(kv_state_.shared_kv_tokens_num(),
+                                        host_kv_state_.shared_kv_tokens_num());
+  if (shared_tokens <= num_prompt_tokens_) {
+    return shared_tokens;
+  }
+  // Shared blocks reach past the prompt: report only whole prompt blocks.
+  const bool device_shared =
+      kv_state_.shared_kv_blocks_num() > 0 && kv_state_.num_kv_blocks() > 0;
+  const bool host_shared = host_kv_state_.shared_kv_blocks_num() > 0 &&
+                           host_kv_state_.num_kv_blocks() > 0;
+  size_t block_size = 0;
+  if (device_shared) {
+    block_size = kv_state_.kv_blocks()[0].size();
+  } else if (host_shared) {
+    block_size = host_kv_state_.kv_blocks()[0].size();
+  }
+  return block_size == 0 ? 0 : (num_prompt_tokens_ / block_size) * block_size;
+}
+
+void Sequence::record_cached_tokens() {
+  const size_t observed = current_num_cached_tokens();
+  num_cached_tokens_ = std::max(num_cached_tokens_, observed);
+}
+
+size_t Sequence::num_cached_tokens() const {
+  return std::max(current_num_cached_tokens(), num_cached_tokens_);
+}
+
 // release all cache blocks
 void Sequence::reset() {
+  record_cached_tokens();
   kv_state_.reset();
   host_kv_state_.reset();
   timer_.reset();
@@ -702,10 +733,12 @@ void Sequence::reset() {
 
 void Sequence::add_shared_kv_blocks(std::vector<Block>&& blocks) {
   kv_state_.add_shared_kv_blocks(std::move(blocks), num_tokens_);
+  record_cached_tokens();  // fold the new shared blocks into the hit count
 }
 
 void Sequence::add_shared_host_kv_blocks(std::vector<Block>&& blocks) {
   host_kv_state_.add_shared_kv_blocks(std::move(blocks), num_tokens_);
+  record_cached_tokens();  // fold the new host shared blocks into the count
 }
 
 bool Sequence::finished() const {
@@ -805,6 +838,7 @@ bool Sequence::update_prefetch_result(uint32_t timeout, uint32_t& success_cnt) {
     host_kv_state_.incr_kv_cache_tokens_num(
         success_cnt * host_kv_state_.kv_blocks()[0].size());
     host_kv_state_.incr_shared_kv_blocks_num(success_cnt);
+    record_cached_tokens();
   }
   prefetch_results_.clear();
   return true;
diff --git a/xllm/core/framework/request/sequence.h b/xllm/core/framework/request/sequence.h
index 0880832146..49fbeddcde 100644
--- a/xllm/core/framework/request/sequence.h
+++ b/xllm/core/framework/request/sequence.h
@@ -175,6 +175,8 @@ class Sequence final {
                     host_kv_state_.kv_cache_tokens_num());
   }
 
+  size_t num_cached_tokens() const;
+
   // add a new token id to the sequence and update the count
   // the token would be discarded if the sequence is still in prefill stage
   void append_token(const Token& token);
@@ -406,6 +408,8 @@ class Sequence final {
 
  private:
   void record_first_token(const Token& token);
+  size_t current_num_cached_tokens() const;
+  void record_cached_tokens();
 
   SequenceOutputType output_type();
   void generate_embeddings_output(SequenceOutput& output);
@@ -481,6 +485,10 @@ class Sequence final {
   // the length of the prompt tokens
   size_t num_prompt_tokens_ = 0;
 
+  // Prefix-cache hits must survive KV block release until final usage is
+  // emitted.
+  size_t num_cached_tokens_ = 0;
+
   std::optional<OneRecState> onerec_state_;
 
   // NOTE: MUST FIXME Later
diff --git a/xllm/core/framework/request/sequence_kv_state.cpp b/xllm/core/framework/request/sequence_kv_state.cpp
index 8e723ab00d..2e99ecb7b8 100644
--- a/xllm/core/framework/request/sequence_kv_state.cpp
+++ b/xllm/core/framework/request/sequence_kv_state.cpp
@@ -36,6 +36,13 @@ size_t KVCacheState::shared_kv_blocks_num() const {
   return num_owned_shared_blocks_;
 }
 
+size_t KVCacheState::shared_kv_tokens_num() const {
+  // Shared tokens = owned shared blocks * size of the first block; zero when
+  // no block is held or none of them is shared.
+  const bool has_shared = !blocks_.empty() && num_owned_shared_blocks_ != 0;
+  return has_shared ? num_owned_shared_blocks_ * blocks_[0].size() : 0;
+}
+
 size_t KVCacheState::kv_cache_tokens_num() const {
   return kv_cache_tokens_num_;
 }
diff --git a/xllm/core/framework/request/sequence_kv_state.h b/xllm/core/framework/request/sequence_kv_state.h
index 29caa3f750..4c58921344 100644
--- a/xllm/core/framework/request/sequence_kv_state.h
+++ b/xllm/core/framework/request/sequence_kv_state.h
@@ -32,6 +32,7 @@ class KVCacheState {
   void incr_kv_cache_tokens_num(size_t num);
   // get the number of shared blocks.
   size_t shared_kv_blocks_num() const;
+  size_t shared_kv_tokens_num() const;
 
   void add_kv_blocks(const std::vector<Block>& new_blocks);
   void add_shared_kv_blocks(std::vector<Block>&& blocks,
diff --git a/xllm/core/runtime/xservice_client.cpp b/xllm/core/runtime/xservice_client.cpp
index c9040d22d7..c4ec9f9247 100644
--- a/xllm/core/runtime/xservice_client.cpp
+++ b/xllm/core/runtime/xservice_client.cpp
@@ -556,6 +556,8 @@ std::vector<bool> XServiceClient::generations(
       proto_usage->set_num_generated_tokens(
           output.usage.value().num_generated_tokens);
       proto_usage->set_num_total_tokens(output.usage.value().num_total_tokens);
+      proto_usage->set_num_cached_tokens(
+          output.usage.value().num_cached_tokens);
     }
     req->mutable_outputs()->Reserve(output.outputs.size());
     for (auto& seq_output : output.outputs) {
diff --git a/xllm/parser/detector_registry.cpp b/xllm/parser/detector_registry.cpp
index 3b8bbef06b..eb895d4deb 100644
--- a/xllm/parser/detector_registry.cpp
+++ b/xllm/parser/detector_registry.cpp
@@ -39,7 +39,7 @@ namespace {
 // Maps reasoning_parser name to supported model_types
 const std::unordered_map<std::string, std::string> auto_paser_map = {
     // {"deepseek_v3", "deepseek-v3"},
-    // {"qwen3", "qwen3"},
+    {"qwen3", "qwen3"},
     {"glm4_moe", "glm45"},
     {"deepseek_v32", "deepseekv32"},
     {"kimi_k2", "kimi"},
diff --git a/xllm/parser/reasoning_detector.cpp b/xllm/parser/reasoning_detector.cpp
index 30b628557c..f0fff4211f 100644
--- a/xllm/parser/reasoning_detector.cpp
+++ b/xllm/parser/reasoning_detector.cpp
@@ -82,8 +82,9 @@ ReasoningResult ReasoningDetector::parse_streaming_increment(
   // Strip `<think>` token if present
   if (!stripped_think_start_ &&
       absl::StrContains(current_text, think_start_token_)) {
-    current_text = absl::StrReplaceAll(
-        {{absl::string_view(think_start_token_), ""}}, &current_text);
+    absl::StrReplaceAll(
+        {{absl::string_view(think_start_token_), absl::string_view()}},
+        &current_text);
     stripped_think_start_ = true;
     in_reasoning_ = true;
   }
diff --git a/xllm/proto/common.proto b/xllm/proto/common.proto
index e692a65585..cb3792f89b 100644
--- a/xllm/proto/common.proto
+++ b/xllm/proto/common.proto
@@ -48,6 +48,30 @@ message Usage {
 
   // the total number of tokens used in the request (prompt + completion).
   optional int32 total_tokens = 3 [json_name="total_tokens"];
+
+  message PromptTokensDetails {
+    // the number of prompt tokens served from the prefix cache.
+    optional int32 cached_tokens = 1 [json_name="cached_tokens"];
+
+    // audio input tokens in the prompt (set_proto_usage always writes 0).
+    optional int32 audio_tokens = 2 [json_name="audio_tokens"];
+  }
+
+  // details about prompt token accounting.
+  optional PromptTokensDetails prompt_tokens_details = 4
+      [json_name="prompt_tokens_details"];
+
+  message CompletionTokensDetails {
+    // tokens generated for reasoning (set_proto_usage always writes 0).
+    optional int32 reasoning_tokens = 1 [json_name="reasoning_tokens"];
+
+    // audio tokens generated by the model (set_proto_usage always writes 0).
+    optional int32 audio_tokens = 2 [json_name="audio_tokens"];
+  }
+
+  // details about completion token accounting.
+  optional CompletionTokensDetails completion_tokens_details = 5
+      [json_name="completion_tokens_details"];
 }
 
 // Options for streaming response.
diff --git a/xllm/proto/disagg_pd.proto b/xllm/proto/disagg_pd.proto
index c494b9f028..92f665b738 100644
--- a/xllm/proto/disagg_pd.proto
+++ b/xllm/proto/disagg_pd.proto
@@ -138,6 +138,8 @@ message OutputUsage {
   int32 num_generated_tokens = 2;
   // the total number of tokens used in the request (prompt + completion).
   int32 num_total_tokens = 3;
+  // the number of prompt tokens served from prefix cache.
+  int32 num_cached_tokens = 4;
 }
 
 message LogProbData {
diff --git a/xllm/pybind/bind.cpp b/xllm/pybind/bind.cpp
index f171820d1a..59cb30cf62 100644
--- a/xllm/pybind/bind.cpp
+++ b/xllm/pybind/bind.cpp
@@ -218,14 +218,18 @@ PYBIND11_MODULE(xllm_export, m) {
       .def_readwrite("num_prompt_tokens", &Usage::num_prompt_tokens)
       .def_readwrite("num_generated_tokens", &Usage::num_generated_tokens)
       .def_readwrite("num_total_tokens", &Usage::num_total_tokens)
+      .def_readwrite("num_cached_tokens", &Usage::num_cached_tokens)
       .def_property_readonly(
           "prompt_tokens",
           [](const Usage& self) { return self.num_prompt_tokens; })
       .def_property_readonly(
           "completion_tokens",
           [](const Usage& self) { return self.num_generated_tokens; })
-      .def_property_readonly("total_tokens", [](const Usage& self) {
-        return self.num_total_tokens;
+      .def_property_readonly(
+          "total_tokens",
+          [](const Usage& self) { return self.num_total_tokens; })
+      .def_property_readonly("cached_tokens", [](const Usage& self) {
+        return self.num_cached_tokens;
       });
   // 5. export RequestOutput
   py::class_<RequestOutput>(m, "RequestOutput")