From f4f567a3e21279c9c3e40e69dc1da0bc522fb441 Mon Sep 17 00:00:00 2001 From: zhangminchao1 Date: Fri, 22 May 2026 00:03:55 +0800 Subject: [PATCH] feat: expose cached token usage in responses. --- tests/api_service/usage_json_test.cpp | 95 +++++++++++++++++++ .../framework/block/block_manager_test.cpp | 48 ++++++++++ xllm/api_service/chat_service_impl.cpp | 8 +- xllm/api_service/utils.h | 21 +++- xllm/c_api/internal/helper.cpp | 1 + xllm/c_api/test/utils.cpp | 2 + xllm/c_api/test/xllm_test.proto | 3 +- xllm/c_api/types.h | 3 + xllm/cc_api/internal.h | 1 + xllm/cc_api/types.h | 3 + xllm/core/framework/request/request.cpp | 8 ++ xllm/core/framework/request/request_output.h | 3 + xllm/core/framework/request/sequence.cpp | 34 +++++++ xllm/core/framework/request/sequence.h | 8 ++ .../framework/request/sequence_kv_state.cpp | 7 ++ .../framework/request/sequence_kv_state.h | 1 + xllm/core/runtime/xservice_client.cpp | 2 + xllm/parser/detector_registry.cpp | 2 +- xllm/parser/reasoning_detector.cpp | 5 +- xllm/proto/common.proto | 24 +++++ xllm/proto/disagg_pd.proto | 2 + xllm/pybind/bind.cpp | 8 +- 22 files changed, 276 insertions(+), 13 deletions(-) create mode 100644 tests/api_service/usage_json_test.cpp diff --git a/tests/api_service/usage_json_test.cpp b/tests/api_service/usage_json_test.cpp new file mode 100644 index 0000000000..7091553c7c --- /dev/null +++ b/tests/api_service/usage_json_test.cpp @@ -0,0 +1,95 @@ +/* Copyright 2026 The xLLM Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://github.com/jd-opensource/xllm/blob/main/LICENSE + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include +#include + +#include +#include + +#include "api_service/utils.h" +#include "chat.pb.h" + +namespace xllm { +namespace { + +TEST(UsageJsonTest, ChatUsageSerializesOpenAICachedTokensField) { + Usage usage; + usage.num_prompt_tokens = 1024; + usage.num_generated_tokens = 50; + usage.num_total_tokens = 1074; + usage.num_cached_tokens = 896; + + proto::ChatResponse response; + api_service::set_proto_usage(response.mutable_usage(), usage); + + json2pb::Pb2JsonOptions options; + options.bytes_to_base64 = false; + options.jsonify_empty_array = true; + options.always_print_primitive_fields = true; + + std::string json_text; + std::string error_message; + ASSERT_TRUE(json2pb::ProtoMessageToJson( + response, &json_text, options, &error_message)) + << error_message; + + nlohmann::json json = nlohmann::json::parse(json_text); + ASSERT_TRUE(json.contains("usage")); + EXPECT_EQ(json["usage"]["prompt_tokens"], 1024); + EXPECT_EQ(json["usage"]["completion_tokens"], 50); + EXPECT_EQ(json["usage"]["total_tokens"], 1074); + ASSERT_TRUE(json["usage"].contains("prompt_tokens_details")); + EXPECT_EQ(json["usage"]["prompt_tokens_details"]["cached_tokens"], 896); + EXPECT_EQ(json["usage"]["prompt_tokens_details"]["audio_tokens"], 0); + ASSERT_TRUE(json["usage"].contains("completion_tokens_details")); + EXPECT_EQ(json["usage"]["completion_tokens_details"]["reasoning_tokens"], 0); + EXPECT_EQ(json["usage"]["completion_tokens_details"]["audio_tokens"], 0); + EXPECT_EQ(json["usage"]["completion_tokens_details"].size(), 2); +} + +TEST(UsageJsonTest, ChatUsagePrintsZeroCachedTokens) { + Usage usage; + usage.num_prompt_tokens = 12; + usage.num_generated_tokens = 3; + usage.num_total_tokens = 15; + usage.num_cached_tokens = 0; + + proto::ChatResponse response; + 
api_service::set_proto_usage(response.mutable_usage(), usage); + + json2pb::Pb2JsonOptions options; + options.bytes_to_base64 = false; + options.jsonify_empty_array = true; + options.always_print_primitive_fields = true; + + std::string json_text; + std::string error_message; + ASSERT_TRUE(json2pb::ProtoMessageToJson( + response, &json_text, options, &error_message)) + << error_message; + + nlohmann::json json = nlohmann::json::parse(json_text); + ASSERT_TRUE(json["usage"].contains("prompt_tokens_details")); + EXPECT_EQ(json["usage"]["prompt_tokens_details"]["cached_tokens"], 0); + EXPECT_EQ(json["usage"]["prompt_tokens_details"]["audio_tokens"], 0); + ASSERT_TRUE(json["usage"].contains("completion_tokens_details")); + EXPECT_EQ(json["usage"]["completion_tokens_details"]["reasoning_tokens"], 0); + EXPECT_EQ(json["usage"]["completion_tokens_details"]["audio_tokens"], 0); + EXPECT_EQ(json["usage"]["completion_tokens_details"].size(), 2); +} + +} // namespace +} // namespace xllm diff --git a/tests/core/framework/block/block_manager_test.cpp b/tests/core/framework/block/block_manager_test.cpp index 411668e5b8..a9d840636c 100644 --- a/tests/core/framework/block/block_manager_test.cpp +++ b/tests/core/framework/block/block_manager_test.cpp @@ -330,4 +330,52 @@ TEST(BlockManagerPoolTest, SequenceCopyDoesNotReuseSingleBlockSlot) { EXPECT_NE(GetSingleBlockIdOrFail(clone), GetSingleBlockIdOrFail(src)); } +TEST(BlockManagerPoolTest, PrefixCacheHitsAreReportedAsCachedTokens) { + ScopedValue max_seqs_guard( + &SchedulerConfig::get_instance().max_seqs_per_batch(), 2); + + BlockManagerPool::Options options; + options.num_blocks(16).host_num_blocks(0).block_size(4).enable_prefix_cache( + true); + BlockManagerPool pool(options, /*dp_size=*/1); + + { + Sequence seq1 = + MakeSequence(0, /*prompt_tokens=*/{1, 2, 3, 4, 5, 6, 7, 8, 9}); + ASSERT_TRUE(pool.allocate(&seq1)); + seq1.kv_state().set_kv_cache_tokens_num(seq1.num_prompt_tokens()); + EXPECT_EQ(seq1.num_cached_tokens(), 0); + 
pool.cache(&seq1); + + Sequence seq2 = + MakeSequence(0, /*prompt_tokens=*/{1, 2, 3, 4, 5, 6, 7, 8, 10}); + ASSERT_TRUE(pool.allocate(&seq2)); + + EXPECT_EQ(seq2.num_cached_tokens(), 8); + EXPECT_LE(seq2.num_cached_tokens(), seq2.num_prompt_tokens()); + EXPECT_EQ(seq2.num_cached_tokens() % options.block_size(), 0); + } + + int32_t dp_rank = 0; + auto eviction_blocks = + pool.allocate((options.num_blocks() - 1) * options.block_size(), dp_rank); + EXPECT_EQ(eviction_blocks.size(), options.num_blocks() - 1); + EXPECT_EQ(pool.num_blocks_in_prefix_cache()[0], 0); +} + +TEST(BlockManagerPoolTest, PrefixCacheDisabledReportsZeroCachedTokens) { + ScopedValue max_seqs_guard( + &SchedulerConfig::get_instance().max_seqs_per_batch(), 2); + + BlockManagerPool::Options options; + options.num_blocks(16).host_num_blocks(0).block_size(4).enable_prefix_cache( + false); + BlockManagerPool pool(options, /*dp_size=*/1); + + Sequence seq = MakeSequence(0, /*prompt_tokens=*/{1, 2, 3, 4, 5, 6, 7, 8, 9}); + ASSERT_TRUE(pool.allocate(&seq)); + + EXPECT_EQ(seq.num_cached_tokens(), 0); +} + } // namespace xllm diff --git a/xllm/api_service/chat_service_impl.cpp b/xllm/api_service/chat_service_impl.cpp index 0503de4c67..0b65c88d2e 100644 --- a/xllm/api_service/chat_service_impl.cpp +++ b/xllm/api_service/chat_service_impl.cpp @@ -373,9 +373,7 @@ bool send_delta_to_client_brpc( response.set_created(created_time); response.set_model(model); auto* proto_usage = response.mutable_usage(); - proto_usage->set_prompt_tokens(usage.num_prompt_tokens); - proto_usage->set_completion_tokens(usage.num_generated_tokens); - proto_usage->set_total_tokens(usage.num_total_tokens); + api_service::set_proto_usage(proto_usage, usage); if (!call->write(response)) { return false; } @@ -460,9 +458,7 @@ bool send_result_to_client_brpc(std::shared_ptr call, if (req_output.usage.has_value()) { const auto& usage = req_output.usage.value(); auto* proto_usage = response.mutable_usage(); - 
proto_usage->set_prompt_tokens(usage.num_prompt_tokens); - proto_usage->set_completion_tokens(usage.num_generated_tokens); - proto_usage->set_total_tokens(usage.num_total_tokens); + api_service::set_proto_usage(proto_usage, usage); } return call->write_and_finish(response); diff --git a/xllm/api_service/utils.h b/xllm/api_service/utils.h index 5b4b2de410..fe0697812c 100644 --- a/xllm/api_service/utils.h +++ b/xllm/api_service/utils.h @@ -27,7 +27,9 @@ limitations under the License. #include "api_service/stream_output_parser.h" #include "chat.pb.h" +#include "common.pb.h" #include "core/common/types.h" +#include "core/framework/request/request_output.h" #include "function_call/function_call.h" namespace xllm { @@ -36,6 +38,23 @@ namespace api_service { // Check for unstreamed tool arguments and send them using the provided sender // This is shared between Chat API and Anthropic API implementations using SendFunc = std::function; + +inline void set_proto_usage(proto::Usage* proto_usage, + const xllm::Usage& usage) { + CHECK(proto_usage != nullptr); + proto_usage->set_prompt_tokens(usage.num_prompt_tokens); + proto_usage->set_completion_tokens(usage.num_generated_tokens); + proto_usage->set_total_tokens(usage.num_total_tokens); + auto* prompt_tokens_details = proto_usage->mutable_prompt_tokens_details(); + prompt_tokens_details->set_cached_tokens(usage.num_cached_tokens); + prompt_tokens_details->set_audio_tokens(0); + + auto* completion_tokens_details = + proto_usage->mutable_completion_tokens_details(); + completion_tokens_details->set_reasoning_tokens(0); + completion_tokens_details->set_audio_tokens(0); +} + inline bool check_for_unstreamed_tool_args( std::shared_ptr stream_parser, size_t index, @@ -154,4 +173,4 @@ inline nlohmann::json struct_to_json( } } // namespace api_service -} // namespace xllm \ No newline at end of file +} // namespace xllm diff --git a/xllm/c_api/internal/helper.cpp b/xllm/c_api/internal/helper.cpp index fdd976e28c..1b5eb3ff7b 100644 --- 
a/xllm/c_api/internal/helper.cpp +++ b/xllm/c_api/internal/helper.cpp @@ -311,6 +311,7 @@ XLLM_Response* build_success_response(const InferenceType& inference_type, response->usage.prompt_tokens = usage.num_prompt_tokens; response->usage.completion_tokens = usage.num_generated_tokens; response->usage.total_tokens = usage.num_total_tokens; + response->usage.cached_tokens = usage.num_cached_tokens; } return response; diff --git a/xllm/c_api/test/utils.cpp b/xllm/c_api/test/utils.cpp index da72775f43..915f8848a1 100644 --- a/xllm/c_api/test/utils.cpp +++ b/xllm/c_api/test/utils.cpp @@ -475,12 +475,14 @@ void PbToXllmUsage(const c_api_test::XLLM_Usage& pb, XLLM_Usage* out) { out->prompt_tokens = pb.prompt_tokens(); out->completion_tokens = pb.completion_tokens(); out->total_tokens = pb.total_tokens(); + out->cached_tokens = pb.cached_tokens(); } void XllmUsageToPb(const XLLM_Usage& in, c_api_test::XLLM_Usage* pb) { pb->set_prompt_tokens(in.prompt_tokens); pb->set_completion_tokens(in.completion_tokens); pb->set_total_tokens(in.total_tokens); + pb->set_cached_tokens(in.cached_tokens); } void PbToXllmLogProbs(const c_api_test::XLLM_LogProbs& pb, diff --git a/xllm/c_api/test/xllm_test.proto b/xllm/c_api/test/xllm_test.proto index cdf86a290a..15954f7a25 100644 --- a/xllm/c_api/test/xllm_test.proto +++ b/xllm/c_api/test/xllm_test.proto @@ -162,6 +162,7 @@ message XLLM_Usage { int32 prompt_tokens = 1; int32 completion_tokens = 2; int32 total_tokens = 3; + int32 cached_tokens = 4; } // --- XLLM_LogProb / XLLM_LogProbs --- @@ -221,4 +222,4 @@ message XLLM_DumpRecord { // --backend: rec -> xllm_rec_*; llm -> xllm_llm_*). 
service XllmRecCapiService { rpc Inference(XLLM_Request) returns (XLLM_Response); -} \ No newline at end of file +} diff --git a/xllm/c_api/types.h b/xllm/c_api/types.h index 5ecda608db..ca04a6a57c 100644 --- a/xllm/c_api/types.h +++ b/xllm/c_api/types.h @@ -302,6 +302,9 @@ typedef struct XLLM_CAPI_EXPORT XLLM_Usage { /** Total tokens used (prompt + completion) */ int32_t total_tokens; + + /** Number of prompt tokens served from prefix cache */ + int32_t cached_tokens; } XLLM_Usage; /** diff --git a/xllm/cc_api/internal.h b/xllm/cc_api/internal.h index 292cc1e5fa..c9b50bc1bd 100644 --- a/xllm/cc_api/internal.h +++ b/xllm/cc_api/internal.h @@ -147,6 +147,7 @@ XLLM_Response build_success_response(const RequestOutput& output, response.usage.prompt_tokens = usage.num_prompt_tokens; response.usage.completion_tokens = usage.num_generated_tokens; response.usage.total_tokens = usage.num_total_tokens; + response.usage.cached_tokens = usage.num_cached_tokens; } return response; diff --git a/xllm/cc_api/types.h b/xllm/cc_api/types.h index 5fd1a1bf1c..bfa15fabef 100644 --- a/xllm/cc_api/types.h +++ b/xllm/cc_api/types.h @@ -239,6 +239,9 @@ struct XLLM_CAPI_EXPORT XLLM_Usage { // The total number of tokens used in the request (prompt + completion). int32_t total_tokens; + + // The number of prompt tokens served from prefix cache. + int32_t cached_tokens; }; struct XLLM_CAPI_EXPORT XLLM_LogProbData { diff --git a/xllm/core/framework/request/request.cpp b/xllm/core/framework/request/request.cpp index 388221ff44..7ee40ae8bf 100644 --- a/xllm/core/framework/request/request.cpp +++ b/xllm/core/framework/request/request.cpp @@ -20,7 +20,9 @@ limitations under the License. 
#include #include +#include #include +#include #include #include @@ -99,6 +101,7 @@ void Request::log_statistic(double total_latency) { << "finish_reason: " << seq->finish_reason().to_string().value_or("") << ", " << "prompt_tokens: " << seq->num_prompt_tokens() << ", " + << "cached_tokens: " << seq->num_cached_tokens() << ", " << "generated_tokens: " << gen_tokens << ", " << std::fixed << std::setprecision(1) << "ttft: " << ttft * 1000 << "ms, " << "total_latency: " << total_latency * 1000 << "ms, " @@ -155,13 +158,18 @@ RequestOutput Request::generate_output(const Tokenizer& tokenizer, // summarize statistics for all sequences Usage usage; usage.num_prompt_tokens = state_.prompt_tokens.size(); + size_t num_cached_tokens = 0; for (const auto& seq : sequences()) { usage.num_generated_tokens += seq->num_generated_tokens(); + num_cached_tokens = std::max(num_cached_tokens, seq->num_cached_tokens()); // NOTE: Avoid counting the extra execution step in overlap scenario. if (state_.enable_schedule_overlap) { usage.num_generated_tokens--; } } + CHECK_LE(num_cached_tokens, + static_cast(std::numeric_limits::max())); + usage.num_cached_tokens = static_cast(num_cached_tokens); usage.num_total_tokens = usage.num_prompt_tokens + usage.num_generated_tokens; RequestOutput output; diff --git a/xllm/core/framework/request/request_output.h b/xllm/core/framework/request/request_output.h index 3807d8133a..26614ef4b0 100644 --- a/xllm/core/framework/request/request_output.h +++ b/xllm/core/framework/request/request_output.h @@ -35,6 +35,9 @@ struct Usage { // the total number of tokens used in the request (prompt + completion). int32_t num_total_tokens = 0; + + // the number of prompt tokens served from prefix cache. 
+ int32_t num_cached_tokens = 0; }; struct LogProbData { diff --git a/xllm/core/framework/request/sequence.cpp b/xllm/core/framework/request/sequence.cpp index 4c6f75b9f1..3f8d8493cb 100644 --- a/xllm/core/framework/request/sequence.cpp +++ b/xllm/core/framework/request/sequence.cpp @@ -289,6 +289,7 @@ Sequence::Sequence(const Sequence& other) num_tokens_(other.num_tokens_), token_to_count_map_(other.token_to_count_map_), num_prompt_tokens_(other.num_prompt_tokens_), + num_cached_tokens_(other.num_cached_tokens_), onerec_state_(other.onerec_state_), volatile_num_prompt_tokens_(other.volatile_num_prompt_tokens_), request_id_(other.request_id_), @@ -690,8 +691,38 @@ void Sequence::add_host_kv_blocks(const std::vector& blocks) { host_kv_state_.add_kv_blocks(blocks); } +size_t Sequence::current_num_cached_tokens() const { + size_t cached_tokens = std::max(kv_state_.shared_kv_tokens_num(), + host_kv_state_.shared_kv_tokens_num()); + if (cached_tokens <= num_prompt_tokens_) { + return cached_tokens; + } + + size_t block_size = 0; + if (kv_state_.shared_kv_blocks_num() > 0 && kv_state_.num_kv_blocks() > 0) { + block_size = kv_state_.kv_blocks()[0].size(); + } else if (host_kv_state_.shared_kv_blocks_num() > 0 && + host_kv_state_.num_kv_blocks() > 0) { + block_size = host_kv_state_.kv_blocks()[0].size(); + } + if (block_size == 0) { + return 0; + } + return (num_prompt_tokens_ / block_size) * block_size; +} + +void Sequence::record_cached_tokens() { + num_cached_tokens_ = + std::max(num_cached_tokens_, current_num_cached_tokens()); +} + +size_t Sequence::num_cached_tokens() const { + return std::max(num_cached_tokens_, current_num_cached_tokens()); +} + // release all cache blocks void Sequence::reset() { + record_cached_tokens(); kv_state_.reset(); host_kv_state_.reset(); timer_.reset(); @@ -702,10 +733,12 @@ void Sequence::reset() { void Sequence::add_shared_kv_blocks(std::vector&& blocks) { kv_state_.add_shared_kv_blocks(std::move(blocks), num_tokens_); + 
record_cached_tokens(); } void Sequence::add_shared_host_kv_blocks(std::vector&& blocks) { host_kv_state_.add_shared_kv_blocks(std::move(blocks), num_tokens_); + record_cached_tokens(); } bool Sequence::finished() const { @@ -805,6 +838,7 @@ bool Sequence::update_prefetch_result(uint32_t timeout, uint32_t& success_cnt) { host_kv_state_.incr_kv_cache_tokens_num( success_cnt * host_kv_state_.kv_blocks()[0].size()); host_kv_state_.incr_shared_kv_blocks_num(success_cnt); + record_cached_tokens(); } prefetch_results_.clear(); return true; diff --git a/xllm/core/framework/request/sequence.h b/xllm/core/framework/request/sequence.h index 0880832146..49fbeddcde 100644 --- a/xllm/core/framework/request/sequence.h +++ b/xllm/core/framework/request/sequence.h @@ -175,6 +175,8 @@ class Sequence final { host_kv_state_.kv_cache_tokens_num()); } + size_t num_cached_tokens() const; + // add a new token id to the sequence and update the count // the token would be discarded if the sequence is still in prefill stage void append_token(const Token& token); @@ -406,6 +408,8 @@ class Sequence final { private: void record_first_token(const Token& token); + size_t current_num_cached_tokens() const; + void record_cached_tokens(); SequenceOutputType output_type(); void generate_embeddings_output(SequenceOutput& output); @@ -481,6 +485,10 @@ class Sequence final { // the length of the prompt tokens size_t num_prompt_tokens_ = 0; + // Prefix-cache hits must survive KV block release until final usage is + // emitted. 
+ size_t num_cached_tokens_ = 0; + std::optional onerec_state_; // NOTE: MUST FIXME Later diff --git a/xllm/core/framework/request/sequence_kv_state.cpp b/xllm/core/framework/request/sequence_kv_state.cpp index 8e723ab00d..2e99ecb7b8 100644 --- a/xllm/core/framework/request/sequence_kv_state.cpp +++ b/xllm/core/framework/request/sequence_kv_state.cpp @@ -36,6 +36,13 @@ size_t KVCacheState::shared_kv_blocks_num() const { return num_owned_shared_blocks_; } +size_t KVCacheState::shared_kv_tokens_num() const { + if (blocks_.empty() || num_owned_shared_blocks_ == 0) { + return 0; + } + return num_owned_shared_blocks_ * blocks_[0].size(); +} + size_t KVCacheState::kv_cache_tokens_num() const { return kv_cache_tokens_num_; } diff --git a/xllm/core/framework/request/sequence_kv_state.h b/xllm/core/framework/request/sequence_kv_state.h index 29caa3f750..4c58921344 100644 --- a/xllm/core/framework/request/sequence_kv_state.h +++ b/xllm/core/framework/request/sequence_kv_state.h @@ -32,6 +32,7 @@ class KVCacheState { void incr_kv_cache_tokens_num(size_t num); // get the number of shared blocks. 
size_t shared_kv_blocks_num() const; + size_t shared_kv_tokens_num() const; void add_kv_blocks(const std::vector& new_blocks); void add_shared_kv_blocks(std::vector&& blocks, diff --git a/xllm/core/runtime/xservice_client.cpp b/xllm/core/runtime/xservice_client.cpp index c9040d22d7..c4ec9f9247 100644 --- a/xllm/core/runtime/xservice_client.cpp +++ b/xllm/core/runtime/xservice_client.cpp @@ -556,6 +556,8 @@ std::vector XServiceClient::generations( proto_usage->set_num_generated_tokens( output.usage.value().num_generated_tokens); proto_usage->set_num_total_tokens(output.usage.value().num_total_tokens); + proto_usage->set_num_cached_tokens( + output.usage.value().num_cached_tokens); } req->mutable_outputs()->Reserve(output.outputs.size()); for (auto& seq_output : output.outputs) { diff --git a/xllm/parser/detector_registry.cpp b/xllm/parser/detector_registry.cpp index 3b8bbef06b..eb895d4deb 100644 --- a/xllm/parser/detector_registry.cpp +++ b/xllm/parser/detector_registry.cpp @@ -39,7 +39,7 @@ namespace { // Maps reasoning_parser name to supported model_types const std::unordered_map auto_paser_map = { // {"deepseek_v3", "deepseek-v3"}, - // {"qwen3", "qwen3"}, + {"qwen3", "qwen3"}, {"glm4_moe", "glm45"}, {"deepseek_v32", "deepseekv32"}, {"kimi_k2", "kimi"}, diff --git a/xllm/parser/reasoning_detector.cpp b/xllm/parser/reasoning_detector.cpp index 30b628557c..f0fff4211f 100644 --- a/xllm/parser/reasoning_detector.cpp +++ b/xllm/parser/reasoning_detector.cpp @@ -82,8 +82,9 @@ ReasoningResult ReasoningDetector::parse_streaming_increment( // Strip `` token if present if (!stripped_think_start_ && absl::StrContains(current_text, think_start_token_)) { - current_text = absl::StrReplaceAll( - {{absl::string_view(think_start_token_), ""}}, &current_text); + absl::StrReplaceAll( + {{absl::string_view(think_start_token_), absl::string_view()}}, + &current_text); stripped_think_start_ = true; in_reasoning_ = true; } diff --git a/xllm/proto/common.proto b/xllm/proto/common.proto index 
e692a65585..cb3792f89b 100644 --- a/xllm/proto/common.proto +++ b/xllm/proto/common.proto @@ -48,6 +48,30 @@ message Usage { // the total number of tokens used in the request (prompt + completion). optional int32 total_tokens = 3 [json_name="total_tokens"]; + + message PromptTokensDetails { + // the number of prompt tokens served from prefix cache. + optional int32 cached_tokens = 1 [json_name="cached_tokens"]; + + // the number of audio input tokens present in the prompt. + optional int32 audio_tokens = 2 [json_name="audio_tokens"]; + } + + // details about prompt token accounting. + optional PromptTokensDetails prompt_tokens_details = 4 + [json_name="prompt_tokens_details"]; + + message CompletionTokensDetails { + // the number of tokens generated by the model for reasoning. + optional int32 reasoning_tokens = 1 [json_name="reasoning_tokens"]; + + // the number of audio tokens generated by the model. + optional int32 audio_tokens = 2 [json_name="audio_tokens"]; + } + + // details about completion token accounting. + optional CompletionTokensDetails completion_tokens_details = 5 + [json_name="completion_tokens_details"]; } // Options for streaming response. diff --git a/xllm/proto/disagg_pd.proto b/xllm/proto/disagg_pd.proto index c494b9f028..92f665b738 100644 --- a/xllm/proto/disagg_pd.proto +++ b/xllm/proto/disagg_pd.proto @@ -138,6 +138,8 @@ message OutputUsage { int32 num_generated_tokens = 2; // the total number of tokens used in the request (prompt + completion). int32 num_total_tokens = 3; + // the number of prompt tokens served from prefix cache. 
+ int32 num_cached_tokens = 4; } message LogProbData { diff --git a/xllm/pybind/bind.cpp b/xllm/pybind/bind.cpp index f171820d1a..59cb30cf62 100644 --- a/xllm/pybind/bind.cpp +++ b/xllm/pybind/bind.cpp @@ -218,14 +218,18 @@ PYBIND11_MODULE(xllm_export, m) { .def_readwrite("num_prompt_tokens", &Usage::num_prompt_tokens) .def_readwrite("num_generated_tokens", &Usage::num_generated_tokens) .def_readwrite("num_total_tokens", &Usage::num_total_tokens) + .def_readwrite("num_cached_tokens", &Usage::num_cached_tokens) .def_property_readonly( "prompt_tokens", [](const Usage& self) { return self.num_prompt_tokens; }) .def_property_readonly( "completion_tokens", [](const Usage& self) { return self.num_generated_tokens; }) - .def_property_readonly("total_tokens", [](const Usage& self) { - return self.num_total_tokens; + .def_property_readonly( + "total_tokens", + [](const Usage& self) { return self.num_total_tokens; }) + .def_property_readonly("cached_tokens", [](const Usage& self) { + return self.num_cached_tokens; }); // 5. export RequestOutput py::class_(m, "RequestOutput")