jd-opensource · pjgao · May 9, 2026 · May 11, 2026 · yingxudeng · May 20, 2026
diff --git a/tests/core/framework/kv_cache/kv_cache_estimation_test.cpp b/tests/core/framework/kv_cache/kv_cache_estimation_test.cpp
@@ -37,6 +37,7 @@ KVCacheEstimateOptions make_estimate_options() {
   options.world_size = 1;
   options.n_local_kv_heads = 2;
   options.max_seqs_per_batch = 8;
+  options.max_concurrent_requests = 8;
   return options;
 }
 

@@ -437,6 +437,9 @@ KVCacheCapacity LLMEngine::estimate_kv_cache_capacity() {
   estimate_options.n_local_linear_v_heads = n_local_linear_v_heads_;
   estimate_options.max_seqs_per_batch =
       static_cast<int64_t>(options_.max_seqs_per_batch());
+  // Request cap: the estimator sizes linear-state blocks from this value.
+  estimate_options.max_concurrent_requests = static_cast<int64_t>(
+      ::xllm::SchedulerConfig::get_instance().max_concurrent_requests());
   estimate_options.is_draft_engine = options_.is_draft_engine();
   estimate_options.enable_prefix_cache =
       ::xllm::KVCacheConfig::get_instance().enable_prefix_cache();

@@ -84,7 +84,7 @@ BlockManagerPool::BlockManagerPool(const Options& options, int32_t dp_size)
     // separate and are addressed via transport fields.
     single_block_managers_.emplace_back(std::make_unique<SingleBlockManager>(
         /*num_blocks=*/::xllm::SchedulerConfig::get_instance()
-                .max_seqs_per_batch() +
+                .max_concurrent_requests() +
             2,
         /*resource_name=*/"single block",
         /*exhaustion_message=*/"No more single-block ids available"));

@@ -22,6 +22,10 @@ DEFINE_int32(max_tokens_per_batch, 10240, "Max number of tokens per batch.");
 
 DEFINE_int32(max_seqs_per_batch, 1024, "Max number of sequences per batch.");
 
+DEFINE_int32(max_concurrent_requests,
+             200,
+             "Max number of concurrent requests.");
+
 DEFINE_bool(enable_schedule_overlap,
             false,
             "Whether to enable schedule overlap.");
@@ -78,6 +82,7 @@ namespace xllm {
 void SchedulerConfig::from_flags() {
   max_tokens_per_batch(FLAGS_max_tokens_per_batch)
       .max_seqs_per_batch(FLAGS_max_seqs_per_batch)
+      .max_concurrent_requests(FLAGS_max_concurrent_requests)
       .enable_schedule_overlap(FLAGS_enable_schedule_overlap)
       .prefill_scheduling_memory_usage_threshold(
           FLAGS_prefill_scheduling_memory_usage_threshold)
@@ -99,6 +104,8 @@ void SchedulerConfig::from_json(const JsonReader& json) {
       json.value_or<int32_t>("max_tokens_per_batch", max_tokens_per_batch()))
       .max_seqs_per_batch(
           json.value_or<int32_t>("max_seqs_per_batch", max_seqs_per_batch()))
+      .max_concurrent_requests(json.value_or<int32_t>(
+          "max_concurrent_requests", max_concurrent_requests()))
       .enable_schedule_overlap(json.value_or<bool>("enable_schedule_overlap",
                                                    enable_schedule_overlap()))
       .prefill_scheduling_memory_usage_threshold(

@@ -41,6 +41,7 @@ class SchedulerConfig final {
         "SCHEDULER OPTIONS",
         {"max_tokens_per_batch",
          "max_seqs_per_batch",
+         "max_concurrent_requests",
          "enable_schedule_overlap",
          "prefill_scheduling_memory_usage_threshold",
          "enable_chunked_prefill",
@@ -61,6 +62,8 @@ class SchedulerConfig final {
 
   PROPERTY(int32_t, max_seqs_per_batch) = 1024;
 
+  PROPERTY(int32_t, max_concurrent_requests) = 200;
+
   PROPERTY(bool, enable_schedule_overlap) = false;
 
   PROPERTY(double, prefill_scheduling_memory_usage_threshold) = 0.95;

@@ -215,7 +215,7 @@ void init_dsv4_counts(const ModelArgs& model_args,
 void init_standard_counts(const ModelArgs& model_args,
                           const KVCacheEstimateOptions& options,
                           KVCacheCapacity* kv_cache_cap) {
-  kv_cache_cap->num_linear_state_blocks(options.max_seqs_per_batch + 2);
+  kv_cache_cap->num_linear_state_blocks(options.max_concurrent_requests + 2);
   for (int64_t layer_id = 0; layer_id < kv_cache_cap->n_layers(); ++layer_id) {
     if (is_full_attention_layer(model_args, layer_id)) {
       ++kv_cache_cap->num_full_attention_layers();
@@ -241,8 +241,8 @@ void init_standard_counts(const ModelArgs& model_args,
              kv_cache_cap->linear_cache_size_in_bytes())
         << "failed to reserve linear state cache for linear-attention "
            "layers: "
-        << "max_seqs_per_batch (" << options.max_seqs_per_batch
-        << ") is too large. Please reduce max_seqs_per_batch to less than "
+        << "max_concurrent_requests (" << options.max_concurrent_requests
+        << ") is too large. Please reduce max_concurrent_requests to less than "
         << kv_cache_cap->cache_size_in_bytes() /
                    (kv_cache_cap->num_linear_attention_layers() *
                     kv_cache_cap->linear_slot_size()) -

@@ -36,6 +36,7 @@ struct KVCacheEstimateOptions {
   int64_t n_local_linear_k_heads = 0;
   int64_t n_local_linear_v_heads = 0;
   int64_t max_seqs_per_batch = 0;
+  int64_t max_concurrent_requests = 0;
   bool is_draft_engine = false;
   bool enable_prefix_cache = false;
 };