NVIDIA-NeMo · VibhuJawa · Jun 9, 2026 · Jun 10, 2026 · Jun 10, 2026 · Jun 10, 2026
@@ -60,6 +60,8 @@ class RayClient:
     Args:
         ray_port: The port number of the Ray GCS.
         ray_dashboard_port: The port number of the Ray dashboard.
+        ray_min_worker_port: The lowest port Ray worker processes may bind to. If None, Ray's default is used.
+        ray_max_worker_port: The highest port Ray worker processes may bind to. If None, Ray's default is used.
         ray_temp_dir: The temporary directory to use for Ray.
         include_dashboard: Whether to include dashboard integration. If true, adds Ray metrics service discovery.
         ray_metrics_port: The port number of the Ray metrics.
@@ -79,6 +81,8 @@ class RayClient:
     ray_port: int = DEFAULT_RAY_PORT
     ray_dashboard_port: int = DEFAULT_RAY_DASHBOARD_PORT
     ray_client_server_port: int = DEFAULT_RAY_CLIENT_SERVER_PORT
+    ray_min_worker_port: int | None = None
+    ray_max_worker_port: int | None = None
     ray_temp_dir: str = DEFAULT_RAY_TEMP_DIR
     include_dashboard: bool = True
     ray_metrics_port: int = DEFAULT_RAY_METRICS_PORT
@@ -155,6 +159,8 @@ def start(self) -> None:
                 ray_metrics_port=self.ray_metrics_port,
                 ray_client_server_port=self.ray_client_server_port,
                 ray_dashboard_host=self.ray_dashboard_host,
+                ray_min_worker_port=self.ray_min_worker_port,
+                ray_max_worker_port=self.ray_max_worker_port,
                 num_gpus=self.num_gpus,
                 num_cpus=self.num_cpus,
                 object_store_memory=self.object_store_memory,

@@ -290,7 +290,7 @@ def _resolve_effective_router(
 
         - ``mode``: honor ``router.mode`` if set; otherwise auto-pick ``"kv"``
           when any model uses ``mode="disagg"``, else leave unset so the
-          Dynamo frontend falls back to its own ``round_robin`` default.
+          Dynamo frontend falls back to its own ``round-robin`` default.
         - ``kv_events``: when we auto-pick ``mode="kv"`` we also auto-enable
           ``kv_events`` so the router consumes what prefill workers publish
           unconditionally in disagg. If the user set ``router.mode`` explicitly

@@ -36,26 +36,41 @@ def __post_init__(self) -> None:
             raise ValueError(msg)
 
 
+DynamoRouterMode = Literal[
+    "round-robin",
+    "round_robin",
+    "random",
+    "power-of-two",
+    "kv",
+    "direct",
+    "least-loaded",
+    "device-aware-weighted",
+]
+
+
 @dataclass
 class DynamoRouterConfig:
     """Frontend router config for Dynamo.
 
     ``mode=None`` means "auto": Curator picks ``"kv"`` if any model uses
     ``mode="disagg"``, else leaves ``--router-mode`` unset so the Dynamo
-    frontend falls back to its own ``round_robin`` default. ``kv_events``
+    frontend falls back to its own ``round-robin`` default. ``kv_events``
     only applies when ``mode == "kv"``: pass ``kv_events=True`` to opt into
     exact ZMQ KV-cache event publishing; the default uses the router's
     approximate tree-based tracking. Anything else is forwarded to the
     Dynamo frontend as CLI args via ``router_kwargs``.
     """
 
-    mode: Literal["round_robin", "random", "kv", "direct"] | None = None
+    mode: DynamoRouterMode | None = None
     kv_events: bool = False
     router_kwargs: dict[str, Any] = field(default_factory=dict)
 
     _RESERVED_ROUTER_KWARGS: ClassVar[frozenset[str]] = frozenset({"router_mode", "router_kv_events"})
+    _MODE_ALIASES: ClassVar[dict[str, str]] = {"round_robin": "round-robin"}
 
     def __post_init__(self) -> None:
+        if self.mode is not None:
+            self.mode = self._MODE_ALIASES.get(self.mode, self.mode)  # type: ignore[assignment]
         if self.mode is not None and self.mode != "kv" and self.kv_events:
             msg = f"kv_events=True is only meaningful when mode='kv'; got mode={self.mode!r}."
             raise ValueError(msg)

@@ -17,6 +17,7 @@
 from __future__ import annotations
 
 import json
+import os
 import tempfile
 from functools import reduce
 from pathlib import Path
@@ -67,12 +68,19 @@
     "config": {"setup_timeout_seconds": 600},
 }
 
+_USE_DRIVER_ENV_VAR = "NEMO_CURATOR_DYNAMO_USE_DRIVER_ENV"
+
 
 @ray.remote
 def _write_actor_overrides_file(path: str, body: str) -> None:
     Path(path).write_text(body)
 
 
+def _use_driver_env_for_dynamo() -> bool:
+    """Return True when Dynamo actors should use the driver's Python env (1/true/yes/on, padding ignored)."""
+    return os.environ.get(_USE_DRIVER_ENV_VAR, "0").strip().lower() in {"1", "true", "yes", "on"}
+
+
 def ensure_actor_overrides_on_all_nodes(*, ignore_head_node: bool = False) -> None:
     """Write the actor-venv ``--override`` file at a fixed path on every alive node.
 
@@ -109,13 +117,17 @@ def ensure_actor_overrides_on_all_nodes(*, ignore_head_node: bool = False) -> No
 
 def dynamo_runtime_env(model_config: DynamoVLLMModelConfig) -> dict[str, Any]:
     """Merge the user's ``runtime_env`` with the Dynamo-vLLM package pin."""
+    if _use_driver_env_for_dynamo():
+        return model_config.runtime_env or {}
     return BaseModelConfig.merge_runtime_envs(DYNAMO_VLLM_RUNTIME_ENV, model_config.runtime_env or None)
 
 
 def merge_model_runtime_envs(models: list[DynamoVLLMModelConfig]) -> dict[str, Any]:
     """Merge every model's ``runtime_env`` onto the Dynamo-vLLM pin for the shared frontend actor."""
     envs = [m.runtime_env for m in models if m.runtime_env]
     user_merged = reduce(BaseModelConfig.merge_runtime_envs, envs) if envs else None
+    if _use_driver_env_for_dynamo():
+        return user_merged or {}
     return BaseModelConfig.merge_runtime_envs(DYNAMO_VLLM_RUNTIME_ENV, user_merged)
 
 

@@ -70,11 +70,17 @@ def _deploy(self) -> None:
         llm_configs = [self._to_llm_config(model, quiet_runtime_env=quiet_env) for model in server.models]
 
         build_args: dict[str, Any] = {"llm_configs": llm_configs}
+        ingress_deployment_config = dict(server.backend.ingress_deployment_config)
         if quiet_env:
             # Suppress access logs on the OpenAI ingress deployment too.
-            build_args["ingress_deployment_config"] = {
-                "ray_actor_options": {"runtime_env": quiet_env},
-            }
+            ray_actor_options = dict(ingress_deployment_config.get("ray_actor_options", {}))
+            ray_actor_options["runtime_env"] = BaseModelConfig.merge_runtime_envs(
+                ray_actor_options.get("runtime_env", {}),
+                quiet_env,
+            )
+            ingress_deployment_config["ray_actor_options"] = ray_actor_options
+        if ingress_deployment_config:
+            build_args["ingress_deployment_config"] = ingress_deployment_config
 
         from ray import serve
         from ray.serve.llm import build_openai_app

@@ -31,3 +31,4 @@ class RayServeServerConfig(BaseServerConfig):
     """Server-level Ray Serve config."""
 
     model_configs: ClassVar[tuple[type[BaseModelConfig], ...]] = (RayServeModelConfig,)
+    ingress_deployment_config: dict[str, Any] = field(default_factory=dict)
@@ -139,6 +139,8 @@ def init_cluster(  # noqa: PLR0913
     ray_metrics_port: int,
     ray_client_server_port: int,
     ray_dashboard_host: str,
+    ray_min_worker_port: int | None = None,
+    ray_max_worker_port: int | None = None,
     num_gpus: int | None = None,
     num_cpus: int | None = None,
     object_store_memory: int | None = None,
@@ -164,6 +166,10 @@ def init_cluster(  # noqa: PLR0913
     ray_command.extend(["--dashboard-port", str(ray_dashboard_port)])
     ray_command.extend(["--ray-client-server-port", str(ray_client_server_port)])
     ray_command.extend(["--temp-dir", ray_temp_dir])
+    if ray_min_worker_port is not None:
+        ray_command.extend(["--min-worker-port", str(ray_min_worker_port)])
+    if ray_max_worker_port is not None:
+        ray_command.extend(["--max-worker-port", str(ray_max_worker_port)])
     if object_store_memory is not None:
         ray_command.extend(["--object-store-memory", str(object_store_memory)])
     ray_command.extend(["--disable-usage-stats"])

@@ -15,11 +15,14 @@
 import asyncio
 import secrets
 from abc import ABC, abstractmethod
-from collections.abc import Iterable
+from collections.abc import Awaitable, Callable, Iterable
 from dataclasses import dataclass
+from typing import TypeVar
 
 from loguru import logger
 
+T = TypeVar("T")
+
 
 class ConversationFormatter(ABC):
     """
@@ -116,23 +119,15 @@ async def _query_model_impl(
         msg = "Subclass of AsyncLLMClient must implement '_query_model_impl'"
         raise NotImplementedError(msg)
 
-    async def query_model(  # noqa: C901, PLR0912
-        self,
-        *,
-        messages: Iterable,
-        model: str,
-        conversation_formatter: ConversationFormatter | None = None,
-        generation_config: GenerationConfig | dict | None = None,
-    ) -> list[str]:
-        """
-        Query the model with automatic retry and concurrency control.
-        """
-        # Use default config if none provided
+    @staticmethod
+    def _coerce_generation_config(generation_config: GenerationConfig | dict | None) -> GenerationConfig:
         if generation_config is None:
-            generation_config = GenerationConfig()
-        elif isinstance(generation_config, dict):
-            generation_config = GenerationConfig(**generation_config)
+            return GenerationConfig()
+        if isinstance(generation_config, dict):
+            return GenerationConfig(**generation_config)
+        return generation_config
 
+    async def _run_with_retry_and_concurrency(self, operation: Callable[[], Awaitable[T]]) -> T:  # noqa: C901, PLR0912
         # Initialize semaphore if not already done or if we're in a different event loop
         current_loop = asyncio.get_running_loop()
         if self._semaphore is None or self._semaphore_loop != current_loop:
@@ -179,12 +174,7 @@ async def query_model(  # noqa: C901, PLR0912
 
                 # Attempt the query
                 try:
-                    return await self._query_model_impl(
-                        messages=messages,
-                        model=model,
-                        conversation_formatter=conversation_formatter,
-                        generation_config=generation_config,
-                    )
+                    return await operation()
                 except Exception as e:
                     last_exception = e
                     # If this is the last attempt, provide helpful error message
@@ -208,7 +198,27 @@ async def query_model(  # noqa: C901, PLR0912
                 raise last_exception
 
-            # This should never be reached, but add explicit return for linter
+            # This should never be reached; raise so a fall-through cannot be mistaken for a valid result
-            logger.warning(
-                "Unexpected code path: AsyncLLMClient.query_model completed without returning a result or raising an exception"
+            msg = "Unexpected code path: AsyncLLMClient operation completed without returning a result or raising"
+            raise RuntimeError(msg)
+
+    async def query_model(
+        self,
+        *,
+        messages: Iterable,
+        model: str,
+        conversation_formatter: ConversationFormatter | None = None,
+        generation_config: GenerationConfig | dict | None = None,
+    ) -> list[str]:
+        """
+        Query the model with automatic retry and concurrency control.
+        """
+        # Normalize None or a dict into a GenerationConfig instance
+        generation_config = self._coerce_generation_config(generation_config)
+        return await self._run_with_retry_and_concurrency(
+            lambda: self._query_model_impl(
+                messages=messages,
+                model=model,
+                conversation_formatter=conversation_formatter,
+                generation_config=generation_config,
             )
-            return []
+        )
@@ -14,13 +14,25 @@
 
 import warnings
 from collections.abc import Iterable
+from dataclasses import dataclass
+from typing import Any
 
 from loguru import logger
 from openai import AsyncOpenAI, OpenAI
 
 from nemo_curator.models.client.llm_client import AsyncLLMClient, ConversationFormatter, GenerationConfig, LLMClient
 
 
+@dataclass(frozen=True)
+class OpenAIChatCompletionResult:
+    """Immutable record of an OpenAI-compatible chat completion: choice contents plus token-usage counters."""
+
+    contents: list[str]  # message content of each returned choice
+    prompt_tokens: int | None = None  # usage counters stay None unless a value is supplied
+    completion_tokens: int | None = None
+    total_tokens: int | None = None
+
+
 class OpenAIClient(LLMClient):
     """
     A wrapper around OpenAI's Python client for querying models
@@ -45,6 +57,21 @@ def query_model(
         conversation_formatter: ConversationFormatter | None = None,
         generation_config: GenerationConfig | dict | None = None,
     ) -> list[str]:
+        return self.query_model_with_usage(
+            messages=messages,
+            model=model,
+            conversation_formatter=conversation_formatter,
+            generation_config=generation_config,
+        ).contents
+
+    def query_model_with_usage(
+        self,
+        *,
+        messages: Iterable,
+        model: str,
+        conversation_formatter: ConversationFormatter | None = None,
+        generation_config: GenerationConfig | dict | None = None,
+    ) -> OpenAIChatCompletionResult:
         if conversation_formatter is not None:
             warnings.warn("conversation_formatter is not used in an OpenAIClient", stacklevel=2)
 
@@ -80,7 +107,7 @@ def query_model(
 
         response = self.client.chat.completions.create(**create_kwargs)
 
-        return [choice.message.content for choice in response.choices]
+        return _completion_result_from_response(response)
 
 
 class AsyncOpenAIClient(AsyncLLMClient):
@@ -122,6 +149,25 @@ async def _query_model_impl(
         """
         Internal implementation of query_model without retry/concurrency logic.
         """
+        result = await self._query_model_with_usage_impl(
+            messages=messages,
+            model=model,
+            conversation_formatter=conversation_formatter,
+            generation_config=generation_config,
+        )
+        return result.contents
+
+    async def _query_model_with_usage_impl(
+        self,
+        *,
+        messages: Iterable,
+        model: str,
+        conversation_formatter: ConversationFormatter | None = None,
+        generation_config: GenerationConfig | dict | None = None,
+    ) -> OpenAIChatCompletionResult:
+        """
+        Internal implementation of query_model_with_usage without retry/concurrency logic.
+        """
         if conversation_formatter is not None:
             warnings.warn("conversation_formatter is not used in an AsyncOpenAIClient", stacklevel=2)
 
@@ -157,4 +203,50 @@ async def _query_model_impl(
 
         response = await self.client.chat.completions.create(**create_kwargs)
 
-        return [choice.message.content for choice in response.choices]
+        return _completion_result_from_response(response)
+
+    async def query_model_with_usage(
+        self,
+        *,
+        messages: Iterable,
+        model: str,
+        conversation_formatter: ConversationFormatter | None = None,
+        generation_config: GenerationConfig | dict | None = None,
+    ) -> OpenAIChatCompletionResult:
+        """
+        Query the model with retry and concurrency control, returning contents plus any usage counters.
+        """
+        generation_config = self._coerce_generation_config(generation_config)  # accepts None, dict, or GenerationConfig
+        return await self._run_with_retry_and_concurrency(
+            lambda: self._query_model_with_usage_impl(  # factory: the retry helper calls it to start each attempt
+                messages=messages,
+                model=model,
+                conversation_formatter=conversation_formatter,
+                generation_config=generation_config,
+            )
+        )
+
+
+def _completion_result_from_response(response: Any) -> OpenAIChatCompletionResult:  # noqa: ANN401
+    """Build a result from every choice's message content and the response's usage counters, if any."""
+    usage = getattr(response, "usage", None)  # a missing usage object yields None for each counter
+    texts = [choice.message.content for choice in response.choices]
+    prompt, completion, total = (_usage_int(usage, k) for k in ("prompt_tokens", "completion_tokens", "total_tokens"))
+    return OpenAIChatCompletionResult(
+        contents=texts, prompt_tokens=prompt, completion_tokens=completion, total_tokens=total
+    )
+
+
+def _usage_int(usage: Any, field: str) -> int | None:  # noqa: ANN401
+    """Return ``field`` of ``usage`` (dict key or attribute) as an int, or None if absent or non-integral."""
+    if usage is None:
+        return None
+    value = usage.get(field) if isinstance(usage, dict) else getattr(usage, field, None)
+    if isinstance(value, int) and not isinstance(value, bool):  # bool subclasses int; reject it
+        return value
+    if isinstance(value, float) and value.is_integer():
+        return int(value)
+    # isdecimal(), not isdigit(): isdigit() also accepts e.g. "²", which int() cannot parse.
+    if isinstance(value, str) and value.isdecimal():
+        return int(value)
+    return None
Original file line number	Diff line number	Diff line change
Expand Up		@@ -31,3 +31,4 @@ class RayServeServerConfig(BaseServerConfig):
		"""Server-level Ray Serve config."""

		model_configs: ClassVar[tuple[type[BaseModelConfig], ...]] = (RayServeModelConfig,)
		ingress_deployment_config: dict[str, Any] = field(default_factory=dict)