Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions nemo_curator/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,35 @@
os.environ["RAY_MAX_LIMIT_FROM_API_SERVER"] = str(API_LIMIT)
os.environ["RAY_MAX_LIMIT_FROM_DATA_SOURCE"] = str(API_LIMIT)


def _ensure_ray_dashboard_frontend() -> None:
"""Stub Ray's dashboard frontend dir once (nightly ray only), before any cluster starts.

Ray *nightly* wheels omit the prebuilt dashboard frontend (``dashboard/client/build``
is an npm artifact built only for releases), so the dashboard process dies with
``FrontendNotFoundError`` and its state API server never registers — which breaks
every ``ray.util.state`` call (Xenna drives pipelines through it) with "Could not
read 'dashboard' from GCS". Creating the dir (relative to the installed ``ray``, so
it works in any venv) lets the dashboard start; the web UI itself is unused.

Gated to dev/nightly builds so published releases (which ship the frontend) are
untouched. Best-effort: a read-only install must not break ``import``.
"""
import contextlib
from pathlib import Path

import ray
from packaging.version import Version

if not Version(ray.__version__).is_devrelease:
return
# Best-effort: a read-only install must not break ``import nemo_curator``.
with contextlib.suppress(OSError):
(Path(ray.__file__).parent / "dashboard" / "client" / "build" / "static").mkdir(parents=True, exist_ok=True)


_ensure_ray_dashboard_frontend()

# Raise an informative error early to users on unsupported systems
if sys.platform != "linux":
_msg = (
Expand Down
89 changes: 79 additions & 10 deletions nemo_curator/core/serve/dynamo/vllm.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

from __future__ import annotations

import importlib.metadata
import json
import tempfile
from functools import reduce
Expand All @@ -24,6 +25,7 @@

import ray
from loguru import logger
from packaging.requirements import InvalidRequirement, Requirement

from nemo_curator.core.serve.base import BaseModelConfig
from nemo_curator.core.serve.dynamo.infra import (
Expand All @@ -50,19 +52,85 @@
from nemo_curator.core.serve.placement import ReplicaBundleSpec


# ai-dynamo[vllm]'s [vllm] extra carries a hard ray pin, but Ray refuses
# actor venvs whose ray version differs from the cluster head's. uv has no
# inline override syntax — only ``--override <file>`` — so we materialize a
# tiny constraints file at a fixed path on every node via
# ``ensure_actor_overrides_on_all_nodes``; the content is derived from the
# driver's ``ray.__version__`` at fan-out time so a future Curator ray bump
# doesn't need a code change here.
# The actor venv ``uv pip install`` needs overrides that pyproject's ``[tool.uv]``
# can't reach (Ray runs it in an empty cwd). uv has no inline override syntax —
# only ``--override <file>`` — so we materialize a constraints file at a fixed path
# on every node via ``ensure_actor_overrides_on_all_nodes``. It carries:
# * ``ray==<driver version>`` — ai-dynamo[vllm]'s [vllm] extra has a hard ray pin,
# but Ray refuses actor venvs whose ray differs from the cluster head's. Derived
# from the driver's ``ray.__version__`` so a future Curator ray bump needs no edit.
# * ``nixl-cu13`` dropped — ai-dynamo[vllm] pulls the CUDA-13 NIXL backend, whose
# eagerly-imported ``nixl_ep_cpp.so`` dlopens libcudart.so.13 (absent on this
# CUDA-12.9 image). The base image excludes it via pyproject, but that override
# doesn't reach this standalone install; re-apply it here so the cu12 backend wins.
_ACTOR_VENV_OVERRIDES_PATH = Path(tempfile.gettempdir()) / "nemo_curator_dynamo_actor_overrides.txt"
_ACTOR_VENV_NIXL_CU13_EXCLUSION = "nixl-cu13 ; sys_platform == 'never'"
# The CUDA build the actor venv must match (torch ecosystem + vllm wheel variant).
_ACTOR_VENV_CUDA_TAG = "cu129"


def _vllm_cu129_index_url() -> str | None:
"""The vLLM cu129 wheel index for the exact version ai-dynamo[vllm] pins.

ai-dynamo's [vllm] extra pins an exact vllm (e.g. ``==0.22.1``) that may
differ from Curator's base vllm — the base installs ai-dynamo WITHOUT its
[vllm] extra, so its vllm comes from Curator's own pin, while the actor
venv installs ``ai-dynamo[vllm]`` and must honor ai-dynamo's pin. vLLM
publishes a per-version cu129 wheel index at ``wheels.vllm.ai/<v>/cu129``;
pointing at the pinned version means its ``+cu129`` local build sorts above
the default cu130 wheel under unsafe-best-match. Derived from ai-dynamo's
own metadata so a nightly bump (which changes the vllm pin) needs no edit.

Returns None if ai-dynamo (or its vllm pin) can't be found — only happens
when the dynamo backend isn't actually installed, where this is unused.
"""
try:
requirements = importlib.metadata.requires("ai-dynamo") or []
except importlib.metadata.PackageNotFoundError:
return None
for raw in requirements:
try:
req = Requirement(raw)
except InvalidRequirement:
continue # a malformed Requires-Dist line must not break module import
# Match vllm only as it applies under the [vllm] extra we install (skip a vllm
# pin that some other ai-dynamo extra might add under a different marker).
if req.name != "vllm" or (req.marker is not None and not req.marker.evaluate({"extra": "vllm"})):
continue
pinned = next((spec.version for spec in req.specifier if spec.operator in ("==", "===")), None)
if pinned:
return f"https://wheels.vllm.ai/{pinned}/{_ACTOR_VENV_CUDA_TAG}"
Comment on lines +91 to +102

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 vLLM version lookup ignores extras marker, picks first matching requirement

importlib.metadata.requires("ai-dynamo") returns every requirement across all extras. The [vllm] extra's entry looks like "vllm==0.22.x ; extra == \"vllm\"", but so would any vllm pin in a hypothetical [vllm-dev] or other extra. The loop returns on the first vllm-named req without checking req.marker for extra == "vllm", so a future ai-dynamo refactor that adds a secondary vllm constraint could silently pick the wrong index URL.

Note: If this suggestion doesn't match your team's coding style, reply to this and let me know. I'll remember it for next time!

return None
Comment on lines +72 to +103

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Missing test coverage: _vllm_cu129_index_url() has four distinct return paths (PackageNotFoundError → None, no vllm in requirements → None, vllm present but no == pin → None, happy path → URL string) and is called at module level to build _ACTOR_VENV_UV_OPTIONS. Since a wrong URL here silently installs a cu130 vllm wheel in the actor venv (the exact failure this PR is preventing), it would be good to have unit tests — at minimum for the happy path and the "ai-dynamo not installed" fallback, using unittest.mock.patch over importlib.metadata.requires.



# Ray builds the actor venv with a bare ``uv pip install`` in an empty cwd, so it
# inherits none of the project's ``[tool.uv]`` index/source/prerelease config — only
# what we pass here. Force CUDA 12.9 the way vLLM documents for uv: --torch-backend
# routes the torch ecosystem to the cu129 index, and the per-version cu129 vllm index
# (see ``_vllm_cu129_index_url``) keeps vllm on cu129. ``unsafe-best-match`` is REQUIRED
# so nixl resolves (its version is split across pypi.nvidia.com and PyPI, which the
# default first-match strategy can't combine).
_ACTOR_VENV_UV_OPTIONS = [
"--override",
str(_ACTOR_VENV_OVERRIDES_PATH),
"--torch-backend",
_ACTOR_VENV_CUDA_TAG,
"--index-strategy",
"unsafe-best-match",
"--prerelease",
"if-necessary-or-explicit",
*(
arg
for url in ("https://pypi.nvidia.com", _vllm_cu129_index_url())
if url is not None
for arg in ("--extra-index-url", url)
),
]

DYNAMO_VLLM_RUNTIME_ENV: dict[str, Any] = {
"uv": {
"packages": ["ai-dynamo[vllm]"],
"uv_pip_install_options": ["--override", str(_ACTOR_VENV_OVERRIDES_PATH)],
"uv_pip_install_options": _ACTOR_VENV_UV_OPTIONS,
},
"config": {"setup_timeout_seconds": 600},
}
Expand All @@ -78,7 +146,8 @@ def ensure_actor_overrides_on_all_nodes(*, ignore_head_node: bool = False) -> No

The file pins ``ray=={ray.__version__}`` (read from the driver) so the
actor venv keeps the same ray patch as the cluster head — Ray rejects
any mismatch.
any mismatch — and drops ``nixl-cu13`` so the cu12 NIXL backend is used
(see module comment on :data:`_ACTOR_VENV_OVERRIDES_PATH`).

Must run inside an active Ray context, before any worker spawned with
:data:`DYNAMO_VLLM_RUNTIME_ENV` lands. The runtime_env_agent on each
Expand All @@ -91,7 +160,7 @@ def ensure_actor_overrides_on_all_nodes(*, ignore_head_node: bool = False) -> No
run_on_each_node(
_write_actor_overrides_file,
str(_ACTOR_VENV_OVERRIDES_PATH),
f"ray=={ray.__version__}\n",
f"ray=={ray.__version__}\n{_ACTOR_VENV_NIXL_CU13_EXCLUSION}\n",
ignore_head_node=ignore_head_node,
)

Expand Down
54 changes: 42 additions & 12 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ dependencies = [
"openai>=1.0.0",
"pandas>=2.1.0",
"pyarrow",
"ray[default,data]>=2.55.1",
"ray[default,data]>=2.55.1", # nightly wheel routed per (python, arch) via [tool.uv.sources]
"torch",
"transformers",
]
Expand All @@ -76,14 +76,18 @@ cuda12 = [
"gpustat",
"nvidia-ml-py",
]
vllm = ["vllm>=0.14.1; (platform_machine == 'x86_64' and platform_system != 'Darwin')"]
vllm = ["vllm[flashinfer,runai,otel]==0.22.0+cu129; (platform_machine == 'x86_64' and platform_system != 'Darwin')"]

# Inference Server (Ray Serve + vLLM) - for serving LLMs alongside Curator pipelines
inference_server = [
"nemo_curator[cuda12]",
"nemo_curator[vllm]",
"vllm<0.19; (platform_machine == 'x86_64' and platform_system != 'Darwin')", # Ray Serve LLM 2.55.1 isn't compatible with vllm 0.19+
"ai-dynamo==1.1.0; (platform_machine == 'x86_64' and platform_system != 'Darwin')", # pin so the Dynamo actor venv resolves to the same release we test against; gated to x86_64 since vllm wheels are x86_64-only
"ai-dynamo>=1.3.0.dev0; (platform_machine == 'x86_64' and platform_system != 'Darwin')",
# First-party + explicit .dev0 marker so prerelease="if-necessary-or-explicit" enables
# nightlies for ai-dynamo-runtime too. ai-dynamo pins it (==<its dev>), but it's a
# transitive with stable releases, so without this the newest dynamo nightly can't
# resolve (its runtime pin is a disallowed prerelease) and uv falls back to an older dev.
"ai-dynamo-runtime>=1.3.0.dev0; (platform_machine == 'x86_64' and platform_system != 'Darwin')",
"boto3>=1.35", # Get rid once https://github.com/ray-project/ray/issues/61269 is fixed
"nixl-cu12>=0.10.0; (platform_machine == 'x86_64' and platform_system != 'Darwin')",
"ray[serve,llm]>=2.55.1",
Expand Down Expand Up @@ -216,7 +220,7 @@ text_cuda12 = [
# Video Curation Dependencies
video_cpu = [
"av==13.1.0",
"opencv-python",
"opencv-python-headless", # headless: no GUI/FFmpeg (GPL) bundling or libGL system dep; identical for pipeline use and matches vllm/mistral_common/albumentations
"torchvision",
"einops",
"easydict",
Expand All @@ -230,7 +234,7 @@ video_cuda12 = [
"flash-attn<=2.8.3; (platform_machine == 'x86_64' and platform_system != 'Darwin')",
"pycuda",
"PyNvVideoCodec==2.0.2; (platform_machine == 'x86_64' and platform_system != 'Darwin')",
"torch<=2.10.0",
"torch<=2.11.0",
"torchaudio",
]

Expand All @@ -252,7 +256,7 @@ interleaved_cpu = [
"albumentations",
"matplotlib",
"open_clip_torch",
"opencv-python",
"opencv-python-headless", # headless: no GUI/FFmpeg (GPL) bundling or libGL system dep; identical for pipeline use and matches vllm/mistral_common/albumentations
"Pillow",
"pypdfium2",
"s3fs>=2024.12.0",
Expand Down Expand Up @@ -290,7 +294,7 @@ all = [
]

[dependency-groups]
build = ["setuptools", "torch<=2.10.0", "Cython", "packaging"]
build = ["setuptools", "torch<=2.11.0", "Cython", "packaging"]
dev = ["jupyter"]
linting = ["pre-commit", "ruff==0.14.10"]
test = [
Expand All @@ -317,6 +321,10 @@ package = true
managed = true
default-groups = ["dev", "test"]
index-strategy = "unsafe-best-match"
# Default mode: only pick a prerelease when a requirement carries an explicit
# prerelease marker (ai-dynamo>=1.3.0.dev0) or when every version in range is a
# prerelease (ray nightly's otel transitives). Avoids blanket prereleases elsewhere.
prerelease = "if-necessary-or-explicit"
no-build-isolation-package = ["flash-attn"]
constraint-dependencies = [
"aiohttp>=3.13.3", # Addresses CVE GHSA-6mq8-rvhq-8wgg
Expand All @@ -340,13 +348,15 @@ override-dependencies = [
"kaldiio; sys_platform == 'never'",
"levenshtein; sys_platform == 'never'",
"numpy>=2.0.0,<=2.2.0", # Override nemo-toolkits constraint of <2.0.0, upperbounds for Numba compatibility
"numba==0.65.0", # Override RAPIDS/legacy caps for the inference image; vLLM 0.22 requires numba 0.65.0
"protobuf>=5.29.5,<7.0", # Override nemo-toolkits constraint of ~=5.29.5; <7.0 due to ray serve FieldDescriptor API breakage
"setuptools>=80.10.1", # Override setuptools range in other dependencies to address CVE GHSA-58pv-8j8x-9vj2
"torch==2.10.0", # Override whisperx's <2.9 cap to match cu129 / vllm 0.18.x
"torchaudio==2.10.0", # Override whisperx's <2.9 cap to match cu129 / vllm 0.18.x
"torchvision==0.25.0", # Match torch==2.10.0
"torchcodec~=0.10.0; platform_machine == 'x86_64' and platform_system != 'Darwin'", # pin to torchcodec 0.10.x for torch 2.10 ABI compatibilitytorchcodec doesn't declare a torch dep, so the resolver can't enforce the match; satisfies pyannote-audio's >=0.7.0 floor; x86_64-only since aarch64 lacks wheels
"torch==2.11.0; sys_platform == 'linux' and (platform_machine == 'x86_64' or platform_machine == 'aarch64')", # Match vLLM's CUDA requirements; Linux resolves to cu129 via tool.uv.sources
"torchaudio==2.11.0; sys_platform == 'linux' and (platform_machine == 'x86_64' or platform_machine == 'aarch64')", # Match torch==2.11.0
"torchvision==0.26.0; sys_platform == 'linux' and (platform_machine == 'x86_64' or platform_machine == 'aarch64')", # Match torch==2.11.0
"torchcodec~=0.11.0; platform_machine == 'x86_64' and platform_system != 'Darwin'", # pin to torchcodec 0.11.x for torch 2.11 ABI compatibility; torchcodec does not declare a torch dep, so the resolver cannot enforce the match; satisfies pyannote-audio's >=0.7.0 floor; x86_64-only since aarch64 lacks wheels
"nixl-cu12>=0.10.0; (platform_machine == 'x86_64' and platform_system != 'Darwin')", # Override ray[llm]'s unconditional nixl dep for ARM
"nixl-cu13; sys_platform == 'never'", # ray[llm]/nixl hard-pin the CUDA-13 NIXL backend. On this CUDA-12.9 image vLLM's eager `import nixl_ep` would load cu13's nixl_ep_cpp.so and dlopen the absent libcudart.so.13. Drop it; the nixl meta + nixl-cu12 backend (nixl's own default) remain.
"xgrammar>=0.1.32", # Override vllm's ==0.1.29 pin to address CVE GHSA-7rgv-gqhr-fxg3 (DoS via multi-layer nesting)
]

Expand All @@ -365,6 +375,11 @@ name = "pytorch"
url = "https://download.pytorch.org/whl/cu129"
explicit = true

[[tool.uv.index]]
name = "vllm-cu129"
url = "https://wheels.vllm.ai/0.22.0/cu129"
explicit = true

[tool.uv.sources]
torch = [
{ index = "pytorch", marker = "sys_platform == 'linux' and (platform_machine == 'x86_64' or platform_machine == 'aarch64')" },
Expand All @@ -382,6 +397,21 @@ torchcodec = [
{ index = "pytorch", marker = "platform_machine == 'x86_64' and sys_platform != 'darwin'" },
{ index = "pypi", marker = "platform_machine != 'x86_64' or sys_platform == 'darwin'" },
]
# Ray 3.0 nightly on x86_64 (the CUDA-12.9 stack's only arch): one rolling /latest/ wheel
# per python tag — a direct URL can't be templated by markers, and Ray ships no nightly index.
# Re-pinned by re-locking, not frozen. aarch64/other resolve ray from PyPI via the >= floor
# (avoids dragging ray[llm]'s default cu130 vllm onto aarch64).
ray = [
{ url = "https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp311-cp311-manylinux2014_x86_64.whl", marker = "python_version == '3.11' and platform_machine == 'x86_64' and sys_platform == 'linux'" },
{ url = "https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp312-cp312-manylinux2014_x86_64.whl", marker = "python_version == '3.12' and platform_machine == 'x86_64' and sys_platform == 'linux'" },
{ url = "https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp313-cp313-manylinux2014_x86_64.whl", marker = "python_version == '3.13' and platform_machine == 'x86_64' and sys_platform == 'linux'" },
]
ai-dynamo = { index = "nvidia" }
ai-dynamo-runtime = { index = "nvidia" }
vllm = [
{ index = "vllm-cu129", marker = "platform_machine == 'x86_64' and sys_platform != 'darwin'" },
{ index = "pypi", marker = "platform_machine != 'x86_64' or sys_platform == 'darwin'" },
]
nixl = { index = "pypi" }
nixl-cu12 = { index = "pypi" }

Expand Down
29 changes: 25 additions & 4 deletions tests/core/serve/dynamo/test_vllm.py
Original file line number Diff line number Diff line change
Expand Up @@ -346,12 +346,33 @@ class TestEnsureActorOverridesOnAllNodes:
``--override`` constraints file before workers are spawned."""

def test_writes_current_ray_version_at_path(self, shared_ray_client: None, tmp_path: Path) -> None:
"""The fan-out writes ``ray=={ray.__version__}`` at the configured
path on every alive node. Catches regressions where the content is
hardcoded and silently drifts after a Curator ray bump.
"""The fan-out writes ``ray=={ray.__version__}`` plus the nixl-cu13
exclusion at the configured path on every alive node. Catches
regressions where the content is hardcoded and silently drifts after
a Curator ray bump.
"""
override_path = tmp_path / "override.txt"
with mock.patch.object(dynamo_vllm, "_ACTOR_VENV_OVERRIDES_PATH", override_path):
dynamo_vllm.ensure_actor_overrides_on_all_nodes()

assert override_path.read_text() == f"ray=={ray.__version__}\n"
assert override_path.read_text() == f"ray=={ray.__version__}\n{dynamo_vllm._ACTOR_VENV_NIXL_CU13_EXCLUSION}\n"


def test_vllm_cu129_index_url_derives_from_dynamo_pin() -> None:
"""Derives the per-version cu129 index from ai-dynamo's [vllm] pin, and returns
None (never a wrong, cu130-prone URL) when there's no exact pin or ai-dynamo is
absent; a malformed Requires-Dist line is skipped, not fatal."""
meta = dynamo_vllm.importlib.metadata
expected = f"https://wheels.vllm.ai/0.22.1/{dynamo_vllm._ACTOR_VENV_CUDA_TAG}"

# Exact [vllm]-extra pin wins; a malformed sibling line is skipped, not fatal.
with mock.patch.object(
meta, "requires", return_value=["bad req!!!", "vllm[flashinfer]==0.22.1 ; extra == 'vllm'"]
):
assert dynamo_vllm._vllm_cu129_index_url() == expected
# No exact `==` pin -> None rather than guessing an index.
with mock.patch.object(meta, "requires", return_value=["vllm>=0.20 ; extra == 'vllm'"]):
assert dynamo_vllm._vllm_cu129_index_url() is None
# ai-dynamo not installed -> None.
with mock.patch.object(meta, "requires", side_effect=meta.PackageNotFoundError()):
assert dynamo_vllm._vllm_cu129_index_url() is None
Loading
Loading