Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions tests/core/framework/kv_cache/kv_cache_estimation_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ KVCacheEstimateOptions make_estimate_options() {
options.world_size = 1;
options.n_local_kv_heads = 2;
options.max_seqs_per_batch = 8;
options.max_concurrent_requests = 8;
return options;
}

Expand Down
3 changes: 3 additions & 0 deletions xllm/core/distributed_runtime/llm_engine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -437,6 +437,8 @@ KVCacheCapacity LLMEngine::estimate_kv_cache_capacity() {
estimate_options.n_local_linear_v_heads = n_local_linear_v_heads_;
estimate_options.max_seqs_per_batch =
static_cast<int64_t>(options_.max_seqs_per_batch());
estimate_options.max_concurrent_requests =
static_cast<int64_t>(::xllm::SchedulerConfig::get_instance().max_concurrent_requests());
estimate_options.is_draft_engine = options_.is_draft_engine();
estimate_options.enable_prefix_cache =
::xllm::KVCacheConfig::get_instance().enable_prefix_cache();
Expand All @@ -452,6 +454,7 @@ KVCacheCapacity LLMEngine::estimate_kv_cache_capacity() {
DeviceMonitor::get_instance().set_total_activation_memory(device.index());
}


return kv_cache_cap;
}

Expand Down
2 changes: 1 addition & 1 deletion xllm/core/framework/block/block_manager_pool.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ BlockManagerPool::BlockManagerPool(const Options& options, int32_t dp_size)
// separate and are addressed via transport fields.
single_block_managers_.emplace_back(std::make_unique<SingleBlockManager>(
/*num_blocks=*/::xllm::SchedulerConfig::get_instance()
.max_seqs_per_batch() +
.max_concurrent_requests() +
2,
/*resource_name=*/"single block",
/*exhaustion_message=*/"No more single-block ids available"));
Expand Down
7 changes: 7 additions & 0 deletions xllm/core/framework/config/scheduler_config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,10 @@ DEFINE_int32(max_tokens_per_batch, 10240, "Max number of tokens per batch.");

DEFINE_int32(max_seqs_per_batch, 1024, "Max number of sequences per batch.");

DEFINE_int32(max_concurrent_requests,
200,
"Max number of concurrent requests.");

DEFINE_bool(enable_schedule_overlap,
false,
"Whether to enable schedule overlap.");
Expand Down Expand Up @@ -78,6 +82,7 @@ namespace xllm {
void SchedulerConfig::from_flags() {
max_tokens_per_batch(FLAGS_max_tokens_per_batch)
.max_seqs_per_batch(FLAGS_max_seqs_per_batch)
.max_concurrent_requests(FLAGS_max_concurrent_requests)
.enable_schedule_overlap(FLAGS_enable_schedule_overlap)
.prefill_scheduling_memory_usage_threshold(
FLAGS_prefill_scheduling_memory_usage_threshold)
Expand All @@ -99,6 +104,8 @@ void SchedulerConfig::from_json(const JsonReader& json) {
json.value_or<int32_t>("max_tokens_per_batch", max_tokens_per_batch()))
.max_seqs_per_batch(
json.value_or<int32_t>("max_seqs_per_batch", max_seqs_per_batch()))
.max_concurrent_requests(json.value_or<int32_t>(
"max_concurrent_requests", max_concurrent_requests()))
.enable_schedule_overlap(json.value_or<bool>("enable_schedule_overlap",
enable_schedule_overlap()))
.prefill_scheduling_memory_usage_threshold(
Expand Down
3 changes: 3 additions & 0 deletions xllm/core/framework/config/scheduler_config.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ class SchedulerConfig final {
"SCHEDULER OPTIONS",
{"max_tokens_per_batch",
"max_seqs_per_batch",
"max_concurrent_requests",
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

主分支重构后,这个参数丢失了吗?

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

是的,看起来是在 #1430 这个 PR 重构时丢失的。

"enable_schedule_overlap",
"prefill_scheduling_memory_usage_threshold",
"enable_chunked_prefill",
Expand All @@ -61,6 +62,8 @@ class SchedulerConfig final {

PROPERTY(int32_t, max_seqs_per_batch) = 1024;

PROPERTY(int32_t, max_concurrent_requests) = 200;

PROPERTY(bool, enable_schedule_overlap) = false;

PROPERTY(double, prefill_scheduling_memory_usage_threshold) = 0.95;
Expand Down
6 changes: 3 additions & 3 deletions xllm/core/framework/kv_cache/kv_cache_estimation.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -215,7 +215,7 @@ void init_dsv4_counts(const ModelArgs& model_args,
void init_standard_counts(const ModelArgs& model_args,
const KVCacheEstimateOptions& options,
KVCacheCapacity* kv_cache_cap) {
kv_cache_cap->num_linear_state_blocks(options.max_seqs_per_batch + 2);
kv_cache_cap->num_linear_state_blocks(options.max_concurrent_requests + 2);
for (int64_t layer_id = 0; layer_id < kv_cache_cap->n_layers(); ++layer_id) {
if (is_full_attention_layer(model_args, layer_id)) {
++kv_cache_cap->num_full_attention_layers();
Expand All @@ -241,8 +241,8 @@ void init_standard_counts(const ModelArgs& model_args,
kv_cache_cap->linear_cache_size_in_bytes())
<< "failed to reserve linear state cache for linear-attention "
"layers: "
<< "max_seqs_per_batch (" << options.max_seqs_per_batch
<< ") is too large. Please reduce max_seqs_per_batch to less than "
<< "max_concurrent_requests (" << options.max_concurrent_requests
<< ") is too large. Please reduce max_concurrent_requests to less than "
<< kv_cache_cap->cache_size_in_bytes() /
(kv_cache_cap->num_linear_attention_layers() *
kv_cache_cap->linear_slot_size()) -
Expand Down
1 change: 1 addition & 0 deletions xllm/core/framework/kv_cache/kv_cache_estimation.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ struct KVCacheEstimateOptions {
int64_t n_local_linear_k_heads = 0;
int64_t n_local_linear_v_heads = 0;
int64_t max_seqs_per_batch = 0;
int64_t max_concurrent_requests = 0;
bool is_draft_engine = false;
bool enable_prefix_cache = false;
};
Expand Down
Loading