From 2dee0beb7d819202b550b3ae8b5e25c2b91eeffd Mon Sep 17 00:00:00 2001
From: Pedram Razavi <pedram.razavi@gmail.com>
Date: Tue, 16 Jun 2026 21:27:27 -0700
Subject: [PATCH] Expose --queue-size, --queue-timeout-secs, and
 --rate-limit-tokens-per-second CLI flags

The Rust vllm-router binary hardcoded queue_size=100, queue_timeout_secs=60, and
rate_limit_tokens_per_second=None in CliArgs::to_router_config(), even though
RouterConfig supports all three and the Python launcher (router_args.py) already
exposes them. Because of this drift, users of the binary cannot disable the
concurrency queue (--queue-size 0) to get fail-fast HTTP 429 shedding, tune the
queue timeout, or set an explicit token-bucket refill rate.

Add the three flags to CliArgs and thread them through to_router_config().
Defaults match the previously hardcoded values (100 / 60 / None), so behavior is
unchanged unless a flag is explicitly passed.

Signed-off-by: Pedram Razavi <pedram.razavi@gmail.com>
---
 src/main.rs | 21 ++++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)
diff --git a/src/main.rs b/src/main.rs
index a642898b..3cb39941 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -251,6 +251,21 @@ struct CliArgs {
     #[arg(long, default_value_t = 32768)]
     max_concurrent_requests: usize,
 
+    /// Queue size for pending requests when the max-concurrent limit is
+    /// reached (0 = no queue: shed immediately with HTTP 429)
+    #[arg(long, default_value_t = 100)]
+    queue_size: usize,
+
+    /// Maximum time in seconds a request may wait in the concurrency queue
+    /// before timing out
+    #[arg(long, default_value_t = 60)]
+    queue_timeout_secs: u64,
+
+    /// Token-bucket refill rate in tokens per second. Defaults to
+    /// --max-concurrent-requests when unset.
+    #[arg(long)]
+    rate_limit_tokens_per_second: Option<usize>,
+
     /// CORS allowed origins
     #[arg(long, num_args = 0..)]
     cors_allowed_origins: Vec<String>,
@@ -523,8 +538,8 @@ impl CliArgs {
                 Some(self.request_id_headers.clone())
             },
             max_concurrent_requests: self.max_concurrent_requests,
-            queue_size: 100,        // Default queue size
-            queue_timeout_secs: 60, // Default timeout
+            queue_size: self.queue_size,
+            queue_timeout_secs: self.queue_timeout_secs,
             cors_allowed_origins: self.cors_allowed_origins.clone(),
             retry: RetryConfig {
                 max_retries: self.retry_max_retries,
@@ -549,7 +564,7 @@ impl CliArgs {
                 endpoint: self.health_check_endpoint.clone(),
             },
             enable_igw: self.enable_igw,
-            rate_limit_tokens_per_second: None,
+            rate_limit_tokens_per_second: self.rate_limit_tokens_per_second,
             history_backend: match self.history_backend.as_str() {
                 "none" => HistoryBackend::None,
                 _ => HistoryBackend::Memory,