NVIDIA-NeMo · praateekmahajan · Jun 10, 2026 · Jun 11, 2026 · Jun 11, 2026 · greptile-apps
@@ -37,6 +37,35 @@
 os.environ["RAY_MAX_LIMIT_FROM_API_SERVER"] = str(API_LIMIT)
 os.environ["RAY_MAX_LIMIT_FROM_DATA_SOURCE"] = str(API_LIMIT)
 
+
+def _ensure_ray_dashboard_frontend() -> None:
+    """Stub Ray's dashboard frontend dir once (nightly ray only), before any cluster starts.
+
+    Ray *nightly* wheels omit the prebuilt dashboard frontend (``dashboard/client/build``
+    is an npm artifact built only for releases), so the dashboard process dies with
+    ``FrontendNotFoundError`` and its state API server never registers — which breaks
+    every ``ray.util.state`` call (Xenna drives pipelines through it) with "Could not
+    read 'dashboard' from GCS". Creating the dir (relative to the installed ``ray``, so
+    it works in any venv) lets the dashboard start; the web UI itself is unused.
+
+    Gated to dev/nightly builds so published releases (which ship the frontend) are
+    untouched. Best-effort: a read-only install must not break ``import``.
+    """
+    import contextlib
+    from pathlib import Path
+
+    import ray
+    from packaging.version import Version
+
+    if not Version(ray.__version__).is_devrelease:
+        return
+    # Best-effort: a read-only install must not break ``import nemo_curator``.
+    with contextlib.suppress(OSError):
+        (Path(ray.__file__).parent / "dashboard" / "client" / "build" / "static").mkdir(parents=True, exist_ok=True)
+
+
+_ensure_ray_dashboard_frontend()
+
 # Raise an informative error early to users on unsupported systems
 if sys.platform != "linux":
     _msg = (

@@ -16,6 +16,7 @@
 
 from __future__ import annotations
 
+import importlib.metadata
 import json
 import tempfile
 from functools import reduce
@@ -24,6 +25,7 @@
 
 import ray
 from loguru import logger
+from packaging.requirements import InvalidRequirement, Requirement
 
 from nemo_curator.core.serve.base import BaseModelConfig
 from nemo_curator.core.serve.dynamo.infra import (
@@ -50,19 +52,85 @@
     from nemo_curator.core.serve.placement import ReplicaBundleSpec
 
 
-# ai-dynamo[vllm]'s [vllm] extra carries a hard ray pin, but Ray refuses
-# actor venvs whose ray version differs from the cluster head's. uv has no
-# inline override syntax — only ``--override <file>`` — so we materialize a
-# tiny constraints file at a fixed path on every node via
-# ``ensure_actor_overrides_on_all_nodes``; the content is derived from the
-# driver's ``ray.__version__`` at fan-out time so a future Curator ray bump
-# doesn't need a code change here.
+# The actor venv ``uv pip install`` needs overrides that pyproject's ``[tool.uv]``
+# can't reach (Ray runs it in an empty cwd). uv has no inline override syntax —
+# only ``--override <file>`` — so we materialize a constraints file at a fixed path
+# on every node via ``ensure_actor_overrides_on_all_nodes``. It carries:
+#   * ``ray==<driver version>`` — ai-dynamo[vllm]'s [vllm] extra has a hard ray pin,
+#     but Ray refuses actor venvs whose ray differs from the cluster head's. Derived
+#     from the driver's ``ray.__version__`` so a future Curator ray bump needs no edit.
+#   * ``nixl-cu13`` dropped — ai-dynamo[vllm] pulls the CUDA-13 NIXL backend, whose
+#     eagerly-imported ``nixl_ep_cpp.so`` dlopens libcudart.so.13 (absent on this
+#     CUDA-12.9 image). The base image excludes it via pyproject, but that override
+#     doesn't reach this standalone install; re-apply it here so the cu12 backend wins.
 _ACTOR_VENV_OVERRIDES_PATH = Path(tempfile.gettempdir()) / "nemo_curator_dynamo_actor_overrides.txt"
+_ACTOR_VENV_NIXL_CU13_EXCLUSION = "nixl-cu13 ; sys_platform == 'never'"
+# The CUDA build the actor venv must match (torch ecosystem + vllm wheel variant).
+_ACTOR_VENV_CUDA_TAG = "cu129"
+
+
+def _vllm_cu129_index_url() -> str | None:
+    """The vLLM cu129 wheel index for the exact version ai-dynamo[vllm] pins.
+
+    ai-dynamo's [vllm] extra pins an exact vllm (e.g. ``==0.22.1``) that may
+    differ from Curator's base vllm — the base installs ai-dynamo WITHOUT its
+    [vllm] extra, so its vllm comes from Curator's own pin, while the actor
+    venv installs ``ai-dynamo[vllm]`` and must honor ai-dynamo's pin. vLLM
+    publishes a per-version cu129 wheel index at ``wheels.vllm.ai/<v>/cu129``;
+    pointing at the pinned version means its ``+cu129`` local build sorts above
+    the default cu130 wheel under unsafe-best-match. Derived from ai-dynamo's
+    own metadata so a nightly bump (which changes the vllm pin) needs no edit.
+
+    Returns None if ai-dynamo (or its vllm pin) can't be found — only happens
+    when the dynamo backend isn't actually installed, where this is unused.
+    """
+    try:
+        requirements = importlib.metadata.requires("ai-dynamo") or []
+    except importlib.metadata.PackageNotFoundError:
+        return None
+    for raw in requirements:
+        try:
+            req = Requirement(raw)
+        except InvalidRequirement:
+            continue  # a malformed Requires-Dist line must not break module import
+        # Match vllm only as it applies under the [vllm] extra we install (skip a vllm
+        # pin that some other ai-dynamo extra might add under a different marker).
+        if req.name != "vllm" or (req.marker is not None and not req.marker.evaluate({"extra": "vllm"})):
+            continue
+        pinned = next((spec.version for spec in req.specifier if spec.operator in ("==", "===")), None)
+        if pinned:
+            return f"https://wheels.vllm.ai/{pinned}/{_ACTOR_VENV_CUDA_TAG}"
+    return None
+
+
+# Ray builds the actor venv with a bare ``uv pip install`` in an empty cwd, so it
+# inherits none of the project's ``[tool.uv]`` index/source/prerelease config — only
+# what we pass here. Force CUDA 12.9 the way vLLM documents for uv: --torch-backend
+# routes the torch ecosystem to the cu129 index, and the per-version cu129 vllm index
+# (see ``_vllm_cu129_index_url``) keeps vllm on cu129. ``unsafe-best-match`` is REQUIRED
+# so nixl resolves (its version is split across pypi.nvidia.com and PyPI, which the
+# default first-match strategy can't combine).
+_ACTOR_VENV_UV_OPTIONS = [
+    "--override",
+    str(_ACTOR_VENV_OVERRIDES_PATH),
+    "--torch-backend",
+    _ACTOR_VENV_CUDA_TAG,
+    "--index-strategy",
+    "unsafe-best-match",
+    "--prerelease",
+    "if-necessary-or-explicit",
+    *(
+        arg
+        for url in ("https://pypi.nvidia.com", _vllm_cu129_index_url())
+        if url is not None
+        for arg in ("--extra-index-url", url)
+    ),
+]
 
 DYNAMO_VLLM_RUNTIME_ENV: dict[str, Any] = {
     "uv": {
         "packages": ["ai-dynamo[vllm]"],
-        "uv_pip_install_options": ["--override", str(_ACTOR_VENV_OVERRIDES_PATH)],
+        "uv_pip_install_options": _ACTOR_VENV_UV_OPTIONS,
     },
     "config": {"setup_timeout_seconds": 600},
 }
@@ -78,7 +146,8 @@ def ensure_actor_overrides_on_all_nodes(*, ignore_head_node: bool = False) -> No
 
     The file pins ``ray=={ray.__version__}`` (read from the driver) so the
     actor venv keeps the same ray patch as the cluster head — Ray rejects
-    any mismatch.
+    any mismatch — and drops ``nixl-cu13`` so the cu12 NIXL backend is used
+    (see module comment on :data:`_ACTOR_VENV_OVERRIDES_PATH`).
 
     Must run inside an active Ray context, before any worker spawned with
     :data:`DYNAMO_VLLM_RUNTIME_ENV` lands. The runtime_env_agent on each
@@ -91,7 +160,7 @@ def ensure_actor_overrides_on_all_nodes(*, ignore_head_node: bool = False) -> No
     run_on_each_node(
         _write_actor_overrides_file,
         str(_ACTOR_VENV_OVERRIDES_PATH),
-        f"ray=={ray.__version__}\n",
+        f"ray=={ray.__version__}\n{_ACTOR_VENV_NIXL_CU13_EXCLUSION}\n",
         ignore_head_node=ignore_head_node,
     )
 

@@ -66,7 +66,7 @@ dependencies = [
     "openai>=1.0.0",
     "pandas>=2.1.0",
     "pyarrow",
-    "ray[default,data]>=2.55.1",
+    "ray[default,data]>=2.55.1",  # nightly wheel routed per (python, arch) via [tool.uv.sources]
     "torch",
     "transformers",
 ]
@@ -76,14 +76,18 @@ cuda12 = [
     "gpustat",
     "nvidia-ml-py",
 ]
-vllm = ["vllm>=0.14.1; (platform_machine == 'x86_64' and platform_system != 'Darwin')"]
+vllm = ["vllm[flashinfer,runai,otel]==0.22.0+cu129; (platform_machine == 'x86_64' and platform_system != 'Darwin')"]
 
 # Inference Server (Ray Serve + vLLM) - for serving LLMs alongside Curator pipelines
 inference_server = [
     "nemo_curator[cuda12]",
     "nemo_curator[vllm]",
-    "vllm<0.19; (platform_machine == 'x86_64' and platform_system != 'Darwin')", # Ray Serve LLM 2.55.1 isn't compatible with vllm 0.19+
-    "ai-dynamo==1.1.0; (platform_machine == 'x86_64' and platform_system != 'Darwin')", # pin so the Dynamo actor venv resolves to the same release we test against; gated to x86_64 since vllm wheels are x86_64-only
+    "ai-dynamo>=1.3.0.dev0; (platform_machine == 'x86_64' and platform_system != 'Darwin')",
+    # First-party + explicit .dev0 marker so prerelease="if-necessary-or-explicit" enables
+    # nightlies for ai-dynamo-runtime too. ai-dynamo pins it (==<its dev>), but it's a
+    # transitive with stable releases, so without this the newest dynamo nightly can't
+    # resolve (its runtime pin is a disallowed prerelease) and uv falls back to an older dev.
+    "ai-dynamo-runtime>=1.3.0.dev0; (platform_machine == 'x86_64' and platform_system != 'Darwin')",
     "boto3>=1.35", # Get rid once https://github.com/ray-project/ray/issues/61269 is fixed
     "nixl-cu12>=0.10.0; (platform_machine == 'x86_64' and platform_system != 'Darwin')",
     "ray[serve,llm]>=2.55.1",
@@ -216,7 +220,7 @@ text_cuda12 = [
 # Video Curation Dependencies
 video_cpu = [
     "av==13.1.0",
-    "opencv-python",
+    "opencv-python-headless",  # headless: no GUI/FFmpeg (GPL) bundling or libGL system dep; identical for pipeline use and matches vllm/mistral_common/albumentations
     "torchvision",
     "einops",
     "easydict",
@@ -230,7 +234,7 @@ video_cuda12 = [
     "flash-attn<=2.8.3; (platform_machine == 'x86_64' and platform_system != 'Darwin')",
     "pycuda",
     "PyNvVideoCodec==2.0.2; (platform_machine == 'x86_64' and platform_system != 'Darwin')",
-    "torch<=2.10.0",
+    "torch<=2.11.0",
     "torchaudio",
 ]
 
@@ -252,7 +256,7 @@ interleaved_cpu = [
     "albumentations",
     "matplotlib",
     "open_clip_torch",
-    "opencv-python",
+    "opencv-python-headless",  # headless: no GUI/FFmpeg (GPL) bundling or libGL system dep; identical for pipeline use and matches vllm/mistral_common/albumentations
     "Pillow",
     "pypdfium2",
     "s3fs>=2024.12.0",
@@ -290,7 +294,7 @@ all = [
 ]
 
 [dependency-groups]
-build = ["setuptools", "torch<=2.10.0", "Cython", "packaging"]
+build = ["setuptools", "torch<=2.11.0", "Cython", "packaging"]
 dev = ["jupyter"]
 linting = ["pre-commit", "ruff==0.14.10"]
 test = [
@@ -317,6 +321,10 @@ package = true
 managed = true
 default-groups = ["dev", "test"]
 index-strategy = "unsafe-best-match"
+# Default mode: only pick a prerelease when a requirement carries an explicit
+# prerelease marker (ai-dynamo>=1.3.0.dev0) or when every version in range is a
+# prerelease (ray nightly's otel transitives). Avoids blanket prereleases elsewhere.
+prerelease = "if-necessary-or-explicit"
 no-build-isolation-package = ["flash-attn"]
 constraint-dependencies = [
     "aiohttp>=3.13.3", # Addresses CVE GHSA-6mq8-rvhq-8wgg
@@ -340,13 +348,15 @@ override-dependencies = [
     "kaldiio;  sys_platform == 'never'",
     "levenshtein;  sys_platform == 'never'",
     "numpy>=2.0.0,<=2.2.0", # Override nemo-toolkits constraint of <2.0.0, upperbounds for Numba compatibility
+    "numba==0.65.0", # Override RAPIDS/legacy caps for the inference image; vLLM 0.22 requires numba 0.65.0
     "protobuf>=5.29.5,<7.0",  # Override nemo-toolkits constraint of ~=5.29.5; <7.0 due to ray serve FieldDescriptor API breakage
     "setuptools>=80.10.1", # Override setuptools range in other dependencies to address CVE GHSA-58pv-8j8x-9vj2
-    "torch==2.10.0", # Override whisperx's <2.9 cap to match cu129 / vllm 0.18.x
-    "torchaudio==2.10.0", # Override whisperx's <2.9 cap to match cu129 / vllm 0.18.x
-    "torchvision==0.25.0", # Match torch==2.10.0
-    "torchcodec~=0.10.0; platform_machine == 'x86_64' and platform_system != 'Darwin'", # pin to torchcodec 0.10.x for torch 2.10 ABI compatibility — torchcodec doesn't declare a torch dep, so the resolver can't enforce the match; satisfies pyannote-audio's >=0.7.0 floor; x86_64-only since aarch64 lacks wheels
+    "torch==2.11.0; sys_platform == 'linux' and (platform_machine == 'x86_64' or platform_machine == 'aarch64')", # Match vLLM's CUDA requirements; Linux resolves to cu129 via tool.uv.sources
+    "torchaudio==2.11.0; sys_platform == 'linux' and (platform_machine == 'x86_64' or platform_machine == 'aarch64')", # Match torch==2.11.0
+    "torchvision==0.26.0; sys_platform == 'linux' and (platform_machine == 'x86_64' or platform_machine == 'aarch64')", # Match torch==2.11.0
+    "torchcodec~=0.11.0; platform_machine == 'x86_64' and platform_system != 'Darwin'", # pin to torchcodec 0.11.x for torch 2.11 ABI compatibility; torchcodec does not declare a torch dep, so the resolver cannot enforce the match; satisfies pyannote-audio's >=0.7.0 floor; x86_64-only since aarch64 lacks wheels
     "nixl-cu12>=0.10.0; (platform_machine == 'x86_64' and platform_system != 'Darwin')",  # Override ray[llm]'s unconditional nixl dep for ARM
+    "nixl-cu13; sys_platform == 'never'", # ray[llm]/nixl hard-pin the CUDA-13 NIXL backend. On this CUDA-12.9 image vLLM's eager `import nixl_ep` would load cu13's nixl_ep_cpp.so and dlopen the absent libcudart.so.13. Drop it; the nixl meta + nixl-cu12 backend (nixl's own default) remain.
     "xgrammar>=0.1.32", # Override vllm's ==0.1.29 pin to address CVE GHSA-7rgv-gqhr-fxg3 (DoS via multi-layer nesting)
 ]
 
@@ -365,6 +375,11 @@ name = "pytorch"
 url = "https://download.pytorch.org/whl/cu129"
 explicit = true
 
+[[tool.uv.index]]
+name = "vllm-cu129"
+url = "https://wheels.vllm.ai/0.22.0/cu129"
+explicit = true
+
 [tool.uv.sources]
 torch = [
     { index = "pytorch", marker = "sys_platform == 'linux' and (platform_machine == 'x86_64' or platform_machine == 'aarch64')" },
@@ -382,6 +397,21 @@ torchcodec = [
     { index = "pytorch", marker = "platform_machine == 'x86_64' and sys_platform != 'darwin'" },
     { index = "pypi", marker = "platform_machine != 'x86_64' or sys_platform == 'darwin'" },
 ]
+# Ray 3.0 nightly on x86_64 (the CUDA-12.9 stack's only arch): one rolling /latest/ wheel
+# per python tag — a direct URL can't be templated by markers, and Ray ships no nightly index.
+# Re-pinned by re-locking, not frozen. aarch64/other resolve ray from PyPI via the >= floor
+# (avoids dragging ray[llm]'s default cu130 vllm onto aarch64).
+ray = [
+    { url = "https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp311-cp311-manylinux2014_x86_64.whl", marker = "python_version == '3.11' and platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { url = "https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp312-cp312-manylinux2014_x86_64.whl", marker = "python_version == '3.12' and platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { url = "https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp313-cp313-manylinux2014_x86_64.whl", marker = "python_version == '3.13' and platform_machine == 'x86_64' and sys_platform == 'linux'" },
+]
+ai-dynamo = { index = "nvidia" }
+ai-dynamo-runtime = { index = "nvidia" }
+vllm = [
+    { index = "vllm-cu129", marker = "platform_machine == 'x86_64' and sys_platform != 'darwin'" },
+    { index = "pypi", marker = "platform_machine != 'x86_64' or sys_platform == 'darwin'" },
+]
 nixl = { index = "pypi" }
 nixl-cu12 = { index = "pypi" }
 

@@ -346,12 +346,33 @@ class TestEnsureActorOverridesOnAllNodes:
     ``--override`` constraints file before workers are spawned."""
 
     def test_writes_current_ray_version_at_path(self, shared_ray_client: None, tmp_path: Path) -> None:
-        """The fan-out writes ``ray=={ray.__version__}`` at the configured
-        path on every alive node. Catches regressions where the content is
-        hardcoded and silently drifts after a Curator ray bump.
+        """The fan-out writes ``ray=={ray.__version__}`` plus the nixl-cu13
+        exclusion at the configured path on every alive node. Catches
+        regressions where the content is hardcoded and silently drifts after
+        a Curator ray bump.
         """
         override_path = tmp_path / "override.txt"
         with mock.patch.object(dynamo_vllm, "_ACTOR_VENV_OVERRIDES_PATH", override_path):
             dynamo_vllm.ensure_actor_overrides_on_all_nodes()
 
-        assert override_path.read_text() == f"ray=={ray.__version__}\n"
+        assert override_path.read_text() == f"ray=={ray.__version__}\n{dynamo_vllm._ACTOR_VENV_NIXL_CU13_EXCLUSION}\n"
+
+
+def test_vllm_cu129_index_url_derives_from_dynamo_pin() -> None:
+    """Derives the per-version cu129 index from ai-dynamo's [vllm] pin, and returns
+    None (never a wrong, cu130-prone URL) when there's no exact pin or ai-dynamo is
+    absent; a malformed Requires-Dist line is skipped, not fatal."""
+    meta = dynamo_vllm.importlib.metadata
+    expected = f"https://wheels.vllm.ai/0.22.1/{dynamo_vllm._ACTOR_VENV_CUDA_TAG}"
+
+    # Exact [vllm]-extra pin wins; a malformed sibling line is skipped, not fatal.
+    with mock.patch.object(
+        meta, "requires", return_value=["bad req!!!", "vllm[flashinfer]==0.22.1 ; extra == 'vllm'"]
+    ):
+        assert dynamo_vllm._vllm_cu129_index_url() == expected
+    # No exact `==` pin -> None rather than guessing an index.
+    with mock.patch.object(meta, "requires", return_value=["vllm>=0.20 ; extra == 'vllm'"]):
+        assert dynamo_vllm._vllm_cu129_index_url() is None
+    # ai-dynamo not installed -> None.
+    with mock.patch.object(meta, "requires", side_effect=meta.PackageNotFoundError()):
+        assert dynamo_vllm._vllm_cu129_index_url() is None