From 2dc73629e2950e83314bf00cbe3a497df94b9578 Mon Sep 17 00:00:00 2001 From: Aidan Do Date: Tue, 26 May 2026 10:56:25 -0700 Subject: [PATCH 1/5] llm_bench: add --targets for sending traffic to multiple deployments Adds a repeatable --targets flag (format 'url[|model][|api_key]') so a single load test can drive multiple deployments at once. Users are assigned round-robin across targets, and each request is tagged with the target URL in the locust stats name so per-target latency and throughput are visible without running separate locust processes. When --targets is unset, behavior is unchanged. --- llm_bench/load_test.py | 62 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 59 insertions(+), 3 deletions(-) diff --git a/llm_bench/load_test.py b/llm_bench/load_test.py index 0371d8a..3dce4d4 100644 --- a/llm_bench/load_test.py +++ b/llm_bench/load_test.py @@ -1164,6 +1164,47 @@ def _load_curl_like_data(text): class LLMUser(HttpUser): # no wait time, so every user creates a continuous load, sending requests as quickly as possible + _target_counter = 0 + _target_counter_lock = threading.Lock() + + @staticmethod + def _parse_targets(environment): + raw = environment.parsed_options.targets or [] + default_host = environment.host + default_model = environment.parsed_options.model + default_api_key = environment.parsed_options.api_key + if not raw: + return [{ + "url": default_host, + "model": default_model, + "api_key": default_api_key, + "label": default_host or "default", + }] + parsed = [] + for spec in raw: + parts = spec.split("|") + url = parts[0] or default_host + model = (parts[1] if len(parts) > 1 and parts[1] else default_model) + api_key = (parts[2] if len(parts) > 2 and parts[2] else default_api_key) + parsed.append({ + "url": url, + "model": model, + "api_key": api_key, + "label": url, + }) + return parsed + + def __init__(self, environment): + targets = self._parse_targets(environment) + with LLMUser._target_counter_lock: + idx = LLMUser._target_counter % len(targets) + LLMUser._target_counter += 1 + self._target = targets[idx] + # Override self.host before HttpUser.__init__ creates the HttpSession + if self._target["url"]: + self.host = self._target["url"] + super().__init__(environment) + def on_start(self): try: self._on_start() @@ -1173,7 +1214,7 @@ def on_start(self): sys.exit(1) def _guess_provider(self): - self.model = self.environment.parsed_options.model + self.model = self._target.get("model") or self.environment.parsed_options.model self.provider = self.environment.parsed_options.provider # guess based on URL if self.provider is None: @@ -1223,8 +1264,9 @@ def _guess_provider(self): def _on_start(self): self.client.headers["Content-Type"] = "application/json" - if self.environment.parsed_options.api_key: - self.client.headers["Authorization"] = "Bearer " + self.environment.parsed_options.api_key + api_key = self._target.get("api_key") or self.environment.parsed_options.api_key + if api_key: + self.client.headers["Authorization"] = "Bearer " + api_key if self.environment.parsed_options.header: for header in self.environment.parsed_options.header: key, val = header.split(":", 1) @@ -1444,6 +1486,7 @@ def _do_generate_text(self): stream=True, catch_response=True, timeout=60, + name=f"[{self._target['label']}] {self.provider_formatter.get_url()}", ) as response: combined_text = "" done = False @@ -1675,6 +1718,19 @@ def init_parser(parser): type=str, help="The model to use for generating text. If not specified we will pick the first model from the service as returned by /v1/models", ) + parser.add_argument( + "--targets", + action="append", + default=[], + help=( + "Target deployment to forward traffic to. Format: 'url[|model][|api_key]' " + "(pipe-separated, all but url optional). Repeat the flag for multiple targets. " + "When set, users are assigned round-robin across targets; --host/--model/--api-key " + "act as fallbacks for fields the per-target spec leaves blank. " + "To match load per target, pass --users N*. " + "Request stats are grouped per target via the request name prefix." + ), + ) parser.add_argument( "--tokenizer", env_var="TOKENIZER", From bd1a2e8fabcac919525bb02cc5c944b800b2aef6 Mon Sep 17 00:00:00 2001 From: Aidan Do Date: Tue, 26 May 2026 11:06:35 -0700 Subject: [PATCH 2/5] llm_bench: interpret -u/--users as per-target with --targets Multiply num_users and spawn_rate by the number of --targets in an events.init hook so '-u 100 --targets a --targets b' spawns 200 total users (100 per target). Matches what 'load per deployment' actually means in side-by-side comparisons, removes the manual multiplication step from the call site. --- llm_bench/load_test.py | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/llm_bench/load_test.py b/llm_bench/load_test.py index 3dce4d4..486ac48 100644 --- a/llm_bench/load_test.py +++ b/llm_bench/load_test.py @@ -627,6 +627,30 @@ def _defer_run_time_to_after_spawn(environment, **_kwargs): logger.info(f"Will stop after {max_requests} requests complete") +@events.init.add_listener +def _scale_users_by_targets(environment, **_kwargs): + """Multiply -u/--users and --spawn-rate by the number of --targets entries. + + With this, -u 100 means "100 users per target": the user specifies per-target + load and the script scales the total locust user count up to match. spawn-rate + is multiplied proportionally so the ramp-up rate per target stays the same. + """ + targets = getattr(environment.parsed_options, "targets", None) or [] + n = len(targets) + if n <= 1: + return + num_users = getattr(environment.parsed_options, "num_users", None) + if num_users: + environment.parsed_options.num_users = num_users * n + logger.info( + f"Scaling --users by {n} targets: {num_users} -> {num_users * n} " + f"(stays {num_users} per target)" + ) + spawn_rate = getattr(environment.parsed_options, "spawn_rate", None) + if spawn_rate: + environment.parsed_options.spawn_rate = spawn_rate * n + + @dataclass class ChunkMetadata: text: str @@ -1727,7 +1751,8 @@ def init_parser(parser): "(pipe-separated, all but url optional). Repeat the flag for multiple targets. " "When set, users are assigned round-robin across targets; --host/--model/--api-key " "act as fallbacks for fields the per-target spec leaves blank. " - "To match load per target, pass --users N*. " + "-u/--users is interpreted per-target: -u 100 with two targets spawns 200 total " + "users (100 hitting each target). --spawn-rate is scaled the same way. " "Request stats are grouped per target via the request name prefix." ), ) From 3d61e81f9738874350f39175df370c4a3dc40785 Mon Sep 17 00:00:00 2001 From: Aidan Do Date: Tue, 26 May 2026 11:11:18 -0700 Subject: [PATCH 3/5] llm_bench: tighten --targets help text --- llm_bench/load_test.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/llm_bench/load_test.py b/llm_bench/load_test.py index 486ac48..f26e18e 100644 --- a/llm_bench/load_test.py +++ b/llm_bench/load_test.py @@ -1747,13 +1747,9 @@ def init_parser(parser): action="append", default=[], help=( - "Target deployment to forward traffic to. Format: 'url[|model][|api_key]' " - "(pipe-separated, all but url optional). Repeat the flag for multiple targets. " - "When set, users are assigned round-robin across targets; --host/--model/--api-key " - "act as fallbacks for fields the per-target spec leaves blank. " - "-u/--users is interpreted per-target: -u 100 with two targets spawns 200 total " - "users (100 hitting each target). --spawn-rate is scaled the same way. " - "Request stats are grouped per target via the request name prefix." + "Target URL. Repeat for multiple targets. Format: 'url[|model][|api_key]' " + "(model and api_key optional, fall back to --model/--api-key). " + "With multiple targets, -u/--users is per-target (total = users * num_targets)." ), ) parser.add_argument( From 993516db426b7d9d9b956b14a2bd86af373e0257 Mon Sep 17 00:00:00 2001 From: Aidan Do Date: Tue, 26 May 2026 11:24:14 -0700 Subject: [PATCH 4/5] llm_bench: skip model comparison in notify_init when --targets is set Multi-target runs intentionally assign a different model per user, but notify_init asserts every user shares identical logging_params. Skip the 'model' key in that comparison when --targets is non-empty; the single-target assertion is unchanged. --- llm_bench/load_test.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/llm_bench/load_test.py b/llm_bench/load_test.py index f26e18e..2041220 100644 --- a/llm_bench/load_test.py +++ b/llm_bench/load_test.py @@ -473,9 +473,17 @@ def notify_init(cls, environment, logging_params): if cls.logging_params is None: cls.logging_params = logging_params else: + # Multi-target runs intentionally use a different model per user; + # drop fields that are expected to differ across targets before comparing. + multi_target = bool(getattr(environment.parsed_options, "targets", None)) + if multi_target: + existing = {k: v for k, v in cls.logging_params.items() if k != "model"} + incoming = {k: v for k, v in logging_params.items() if k != "model"} + else: + existing, incoming = cls.logging_params, logging_params assert ( - cls.logging_params == logging_params - ), f"Inconsistent settings between workers: {cls.logging_params} != {logging_params}" + existing == incoming + ), f"Inconsistent settings between workers: {existing} != {incoming}" @classmethod def notify_first_request(cls): From 5ff4c812ad8795da539c8b59463dd42749be4640 Mon Sep 17 00:00:00 2001 From: Aidan Do Date: Tue, 26 May 2026 12:25:00 -0700 Subject: [PATCH 5/5] llm_bench: use deployment id as target label when targets share a URL The previous label was the target URL, which collapsed stats into one row when two targets share the same host but differ by model (common with Fireworks deployment-pinned model strings like 'accounts/x/models/y#accounts/x/deployments/z'). Extract the deployment id from the '#' suffix when present, otherwise fall back to 'url model'. --- llm_bench/load_test.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/llm_bench/load_test.py b/llm_bench/load_test.py index 2041220..34b9910 100644 --- a/llm_bench/load_test.py +++ b/llm_bench/load_test.py @@ -1205,12 +1205,23 @@ def _parse_targets(environment): default_host = environment.host default_model = environment.parsed_options.model default_api_key = environment.parsed_options.api_key + def _label(url, model): + # Make the label distinguish targets even when they share a URL but + # differ by model (common with Fireworks deployment-pinned model + # strings of the form 'accounts/x/models/y#accounts/x/deployments/z'). + if model and "#" in model: + # Use the deployment id (right of '#') — short and human-recognizable. + return model.rsplit("/", 1)[-1] + if model: + return f"{url} {model}" + return url or "default" + if not raw: return [{ "url": default_host, "model": default_model, "api_key": default_api_key, - "label": default_host or "default", + "label": _label(default_host, default_model), }] parsed = [] for spec in raw: @@ -1222,7 +1233,7 @@ def _parse_targets(environment): "url": url, "model": model, "api_key": api_key, - "label": url, + "label": _label(url, model), }) return parsed