Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 13 additions & 18 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ The built image MUST:
2. Bake function discovery output (manifest) at build time:

```dockerfile
RUN mkdir -p /app/.tensorhub && python -m gen_worker.discover > /app/.tensorhub/endpoint.lock
RUN mkdir -p /app/.cozy && python -m gen_worker.discover > /app/.cozy/endpoint.lock
```

3. Use the Cozy worker runtime as the ENTRYPOINT:
Expand Down Expand Up @@ -50,7 +50,7 @@ uv add gen-worker[torch]

```python
import msgspec
from gen_worker import RequestContext, worker_function
from gen_worker import ActionContext, worker_function

class Input(msgspec.Struct):
prompt: str
Expand All @@ -59,7 +59,7 @@ class Output(msgspec.Struct):
text: str

@worker_function()
def generate(ctx: RequestContext, payload: Input) -> Output:
def generate(ctx: ActionContext, payload: Input) -> Output:
return Output(text=f"Hello, {payload.prompt}!")
```

Expand All @@ -69,7 +69,7 @@ def generate(ctx: RequestContext, payload: Input) -> Output:
- **Schema generation** - Input/output schemas extracted from msgspec types
- **Model injection** - Dependency injection for ML models with caching
- **Streaming output** - Support for incremental/streaming responses
- **Progress reporting** - Built-in progress events via `RequestContext`
- **Progress reporting** - Built-in progress events via `ActionContext`
- **Perf metrics** - Best-effort per-run metrics emitted to gen-orchestrator (`metrics.*` worker events)
- **Trainer runtime mode** - SDK-native trainer loop via `WORKER_MODE=trainer`
- **File handling** - Upload/download assets via Cozy hub file API
Expand All @@ -81,7 +81,7 @@ def generate(ctx: RequestContext, payload: Input) -> Output:

```python
import msgspec
from gen_worker import RequestContext, worker_function
from gen_worker import ActionContext, worker_function

class Input(msgspec.Struct):
prompt: str
Expand All @@ -90,7 +90,7 @@ class Output(msgspec.Struct):
result: str

@worker_function()
def my_function(ctx: RequestContext, payload: Input) -> Output:
def my_function(ctx: ActionContext, payload: Input) -> Output:
return Output(result=f"Processed: {payload.prompt}")
```

Expand All @@ -103,7 +103,7 @@ class Delta(msgspec.Struct):
chunk: str

@worker_function()
def stream(ctx: RequestContext, payload: Input) -> Iterator[Delta]:
def stream(ctx: ActionContext, payload: Input) -> Iterator[Delta]:
for word in payload.prompt.split():
if ctx.is_canceled():
raise InterruptedError("canceled")
Expand All @@ -126,7 +126,7 @@ from gen_worker.injection import ModelRef, ModelRefSource as Src

@worker_function()
def generate(
ctx: RequestContext,
ctx: ActionContext,
pipe: Annotated[DiffusionPipeline, ModelRef(Src.FIXED, "sd15")],
payload: Input,
) -> Output:
Expand All @@ -149,7 +149,7 @@ flux = { ref = "black-forest-labs/flux.2-klein-4b", dtypes = ["bf16"] }
from typing import Annotated
import msgspec
from diffusers import DiffusionPipeline
from gen_worker import RequestContext, worker_function
from gen_worker import ActionContext, worker_function
from gen_worker.injection import ModelRef, ModelRefSource as Src

class Input(msgspec.Struct):
Expand All @@ -158,7 +158,7 @@ class Input(msgspec.Struct):

@worker_function()
def generate(
ctx: RequestContext,
ctx: ActionContext,
pipe: Annotated[DiffusionPipeline, ModelRef(Src.PAYLOAD, "model")],
payload: Input,
):
Expand All @@ -173,7 +173,7 @@ arbitrary repo refs in the payload.

```python
@worker_function()
def process(ctx: RequestContext, payload: Input) -> Output:
def process(ctx: ActionContext, payload: Input) -> Output:
# Save bytes and get asset reference
asset = ctx.save_bytes(f"runs/{ctx.request_id}/outputs/output.png", image_bytes)
return Output(result=asset.ref)
Expand Down Expand Up @@ -278,7 +278,7 @@ Orchestrator-injected (production contract):
|----------|---------|-------------|
| `WORKER_MODE` | `inference` | Runtime mode selector (`inference` or `trainer`) |
| `SCHEDULER_PUBLIC_ADDR` | - | Scheduler address workers should dial |
| `SCHEDULER_ADDRS` | - | Optional comma-separated LB seed addresses |
| `SCHEDULER_ADDRS` | - | Optional comma-separated seed addresses for leader discovery |
| `WORKER_JWT` | - | Worker-connect JWT (required; claims are authoritative) |

Local dev / advanced (not injected by orchestrator):
Expand All @@ -289,11 +289,6 @@ Local dev / advanced (not injected by orchestrator):
| `SCHEDULER_JWT_ISSUER` | - | Optional: expected `iss` when verifying WORKER_JWT locally |
| `SCHEDULER_JWT_AUDIENCE` | - | Optional: expected `aud` when verifying WORKER_JWT locally |
| `USE_TLS` | `false` | Local-dev knob for plaintext vs TLS gRPC; production typically terminates TLS upstream |
| `LB_ONLY_RETRIES` | `true` | Retry via configured LB endpoint(s) only; ignore direct owner redirect hints |
| `RECONNECT_DELAY` | `0.1` | Base reconnect backoff in seconds (exponential) |
| `RECONNECT_MAX_DELAY` | `1.0` | Reconnect backoff cap in seconds |
| `RECONNECT_JITTER_SECONDS` | `0.1` | Added jitter upper bound in seconds, capped by `RECONNECT_MAX_DELAY` |
| `MAX_RECONNECT_ATTEMPTS` | `0` | Max reconnect attempts (`0` = infinite retries) |
| `WORKER_MAX_CONCURRENCY` | - | Max concurrent task executions |
| `WORKER_MAX_INPUT_BYTES` | - | Max input payload size |
| `WORKER_MAX_OUTPUT_BYTES` | - | Max output payload size |
Expand Down Expand Up @@ -456,7 +451,7 @@ cache.get_vram_models() # ["model-a"]
from gen_worker.errors import RetryableError, ValidationError, FatalError

@worker_function()
def process(ctx: RequestContext, payload: Input) -> Output:
def process(ctx: ActionContext, payload: Input) -> Output:
if not payload.prompt:
raise ValidationError("prompt is required") # 400, no retry

Expand Down
2 changes: 0 additions & 2 deletions docs/terminology.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,6 @@ This repo follows Cozy's canonical naming. There is no backward-compat layer for
- `function`: invokable unit inside an endpoint release.
- Invoke reference: `owner/endpoint/function[:tag]` (default tag is `prod`).
- `release_id`: immutable identifier for a published endpoint release.
- In control-plane release payloads, endpoint identity is endpoint-centric:
`endpoint.owner` / `endpoint.endpoint_name` are primary; `release.owner` is legacy compatibility.
- `invoker` / `invoker_id`: the identity performing an invocation.
- `artifacts`: uploaded source code and endpoint-owned build inputs/outputs.
- Cozy Hub stores endpoint source tarballs in the endpoint artifacts bucket (`s3.endpoint_artifacts.*` / `S3_ENDPOINT_ARTIFACTS_*`).
Expand Down
14 changes: 7 additions & 7 deletions examples/firered-image-edit/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,14 +1,13 @@
# Tenant-supplied Dockerfile example (GPU).
#
# - Installs torch in a stable cacheable layer.
# - Installs tenant deps from uv.lock without replacing torch.
# - Installs torch + gen-worker in stable cacheable layers.
# - Installs tenant deps from uv.lock without replacing torch/gen-worker.
# - Bakes discovered functions into /app/.tensorhub/endpoint.lock at build time.
#
ARG PYTHON_VERSION=3.12
FROM ghcr.io/astral-sh/uv:python${PYTHON_VERSION}-bookworm-slim AS cozy_base

ARG CUDA_VERSION=12.8
ARG UV_TORCH_BACKEND=
ARG UV_TORCH_BACKEND=cu126
ARG TORCH_SPEC="~=2.10.0"

WORKDIR /app
Expand All @@ -23,17 +22,18 @@ RUN --mount=type=cache,id=cozy-apt-cache,target=/var/cache/apt,sharing=locked \
&& apt-get clean

RUN --mount=type=cache,id=cozy-uv-cache,target=/var/cache/uv,sharing=locked \
backend="${UV_TORCH_BACKEND:-cu$(printf '%s' "${CUDA_VERSION}" | tr -d '.')}" \
&& uv pip install --system --break-system-packages --torch-backend "$backend" \
uv pip install --system --break-system-packages --torch-backend ${UV_TORCH_BACKEND} \
"torch${TORCH_SPEC}"
RUN --mount=type=cache,id=cozy-uv-cache,target=/var/cache/uv,sharing=locked \
uv pip install --system --break-system-packages gen-worker==0.3.0

FROM cozy_base AS cozy_final

COPY pyproject.toml uv.lock /app/

RUN --mount=type=cache,id=cozy-uv-cache,target=/var/cache/uv,sharing=locked \
uv export --no-dev --no-hashes --no-sources --no-emit-project --no-emit-local \
--no-emit-package torch \
--no-emit-package torch --no-emit-package gen-worker \
-o /tmp/requirements.all.txt \
&& grep -Ev '^(torch|triton|nvidia-|cuda-)' /tmp/requirements.all.txt > /tmp/requirements.txt \
&& uv pip install --system --break-system-packages --no-deps -r /tmp/requirements.txt
Expand Down
2 changes: 1 addition & 1 deletion examples/firered-image-edit/endpoint.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ main = "firered_image_edit.main"
firered_image_edit = { ref = "fireredteam/firered-image-edit-1.0", dtypes = ["bf16"] }

[host.requirements]
cuda = "12.8"
cuda = "12.6"

[resources]
vram_gb = 24
Expand Down
2 changes: 1 addition & 1 deletion examples/firered-image-edit/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ version = "0.1.0"
description = "FireRed Image Edit example (inference-only; Cozy manifest via endpoint.toml)"
requires-python = ">=3.12"
dependencies = [
"gen-worker[torch]",
"gen-worker[torch]==0.3.0",
"diffusers @ git+https://github.com/huggingface/diffusers",
"transformers>=4.51.3",
"accelerate",
Expand Down
4 changes: 2 additions & 2 deletions examples/firered-image-edit/src/firered_image_edit/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from diffusers import QwenImageEditPlusPipeline
from PIL import Image

from gen_worker import RequestContext, ResourceRequirements, worker_function
from gen_worker import ActionContext, ResourceRequirements, worker_function
from gen_worker.injection import ModelRef, ModelRefSource as Src
from gen_worker.types import Asset

Expand Down Expand Up @@ -47,7 +47,7 @@ class EditOutput(msgspec.Struct):

@worker_function(_firered_resources)
def edit(
ctx: RequestContext,
ctx: ActionContext,
pipeline: Annotated[
QwenImageEditPlusPipeline,
ModelRef(Src.FIXED, "firered_image_edit"),
Expand Down
2 changes: 1 addition & 1 deletion examples/firered-image-edit/uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

16 changes: 8 additions & 8 deletions examples/flux2-klein-4b/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Tenant-supplied Dockerfile example.
#
# - Installs torch in a stable cacheable layer.
# - Installs tenant deps from uv.lock without replacing torch.
# - Installs torch + gen-worker in stable cacheable layers.
# - Installs tenant deps from uv.lock without replacing torch/gen-worker.
# - Bakes discovered functions into /app/.tensorhub/endpoint.lock at build time.
#
# Local build (GPU):
Expand All @@ -10,8 +10,7 @@
ARG PYTHON_VERSION=3.12
FROM ghcr.io/astral-sh/uv:python${PYTHON_VERSION}-bookworm-slim AS cozy_base

ARG CUDA_VERSION=12.8
ARG UV_TORCH_BACKEND=
ARG UV_TORCH_BACKEND=cu126
ARG TORCH_SPEC="~=2.10.0"

WORKDIR /app
Expand All @@ -27,19 +26,20 @@ RUN --mount=type=cache,id=cozy-apt-cache,target=/var/cache/apt,sharing=locked \

# Stable runtime layers (avoid re-downloading torch/cu libs on every tenant build).
RUN --mount=type=cache,id=cozy-uv-cache,target=/var/cache/uv,sharing=locked \
backend="${UV_TORCH_BACKEND:-cu$(printf '%s' "${CUDA_VERSION}" | tr -d '.')}" \
&& uv pip install --system --break-system-packages --torch-backend "$backend" \
uv pip install --system --break-system-packages --torch-backend ${UV_TORCH_BACKEND} \
"torch${TORCH_SPEC}"
RUN --mount=type=cache,id=cozy-uv-cache,target=/var/cache/uv,sharing=locked \
uv pip install --system --break-system-packages gen-worker==0.3.0

FROM cozy_base AS cozy_final

# Copy lock metadata first so dependency layers are cacheable across source edits.
COPY pyproject.toml uv.lock /app/

# Install tenant dependencies into the global environment without replacing torch.
# Install tenant dependencies into the global environment without replacing torch/gen-worker.
RUN --mount=type=cache,id=cozy-uv-cache,target=/var/cache/uv,sharing=locked \
uv export --no-dev --no-hashes --no-sources --no-emit-project --no-emit-local \
--no-emit-package torch \
--no-emit-package torch --no-emit-package gen-worker \
-o /tmp/requirements.all.txt \
&& grep -Ev '^(torch|triton|nvidia-|cuda-)' /tmp/requirements.all.txt > /tmp/requirements.txt \
&& uv pip install --system --break-system-packages --no-deps -r /tmp/requirements.txt
Expand Down
45 changes: 30 additions & 15 deletions examples/flux2-klein-4b/README.md
Original file line number Diff line number Diff line change
@@ -1,25 +1,40 @@
# flux2-klein-4b

FLUX.2-klein 4B endpoint with separate base/turbo functions and dtype-specific variants.
FLUX.2-klein turbo example using Cozy’s injection pattern (4B + 9B variants).

Naming convention in this repo:
- The worker function only defines input/output + runs inference.
- Fixed model selection is declared in code via `ModelRef(Src.FIXED, "<model-key>")`.
- Model refs/dtypes are declared in `endpoint.toml [models]`.
- This model is treated as a turbo model: the worker clamps `num_inference_steps` to `[4, 8]`.

Steps:

- `num_inference_steps` is accepted in the payload, but it is clamped to `[4, 8]` (rounded) for predictable cost/latency.

- Base model ref: `black-forest-labs/flux.2-klein-4b-base`
- Turbo model ref: `black-forest-labs/flux.2-klein-4b-turbo`
Code uses:

This avoids ambiguity with upstream naming where `flux.2-klein-4b` is commonly used for turbo variants.
```py
pipeline: Annotated[
Flux2KleinPipeline,
ModelRef(Src.FIXED, "flux2-klein-4b"),
]
```

Functions:

- `generate`: base bf16
- `generate_turbo`: turbo bf16
- `generate_fp8`: base fp8
- `generate_turbo_fp8`: turbo fp8
- `generate_nvfp4`: base nvfp4
- `generate_turbo_nvfp4`: turbo nvfp4
- `generate`: 4B bf16 (regular turbo baseline)
- `generate_fp8`: 4B fp8
- `generate_9b`: 9B bf16 (regular turbo baseline)
- `generate_9b_fp8`: 9B fp8
- `generate_int8`: int8-only
- `generate_int4`: int4-only

Notes:
Notes on FP8:

- Fixed model selection is declared in code via `ModelRef(Src.FIXED, "<model-key>")`.
- Model refs/dtypes are declared in `endpoint.toml [models]`.
- `num_inference_steps` is accepted in the payload, but clamped to `[4, 8]`.
- FP8 support here is **weight-only** quantization via `torchao` (Diffusers TorchAoConfig).
- GPUs vary: FP8 acceleration typically requires newer NVIDIA GPUs (e.g. Ada/Hopper class).

Notes on INT8/INT4:

- INT8/INT4 support here is **weight-only** quantization via `torchao` (Diffusers TorchAoConfig).
- INT4 is experimental for diffusion; expect quality regressions or incompatibilities.
16 changes: 8 additions & 8 deletions examples/flux2-klein-4b/endpoint.toml
Original file line number Diff line number Diff line change
@@ -1,20 +1,20 @@
schema_version = 1

name = "flux.2-klein-4b"
name = "flux2-klein-4b"
# Python import path used for function discovery (not a Docker ENTRYPOINT).
main = "flux2_klein_4b.main"

[models]
flux2-klein-4b-base = { ref = "black-forest-labs/flux.2-klein-4b-base", dtypes = ["bf16"] }
flux2-klein-4b-turbo = { ref = "black-forest-labs/flux.2-klein-4b-turbo", dtypes = ["bf16"] }
flux2-klein-4b-base_fp8 = { ref = "black-forest-labs/flux.2-klein-4b-base", dtypes = ["fp8"] }
flux2-klein-4b-turbo_fp8 = { ref = "black-forest-labs/flux.2-klein-4b-turbo", dtypes = ["fp8"] }
flux2-klein-4b-base_nvfp4 = { ref = "black-forest-labs/flux.2-klein-4b-base", dtypes = ["nvfp4"] }
flux2-klein-4b-turbo_nvfp4 = { ref = "black-forest-labs/flux.2-klein-4b-turbo", dtypes = ["nvfp4"] }
flux2-klein-4b = { ref = "black-forest-labs/flux.2-klein-4b", dtypes = ["bf16"] }
flux2-klein-4b_fp8 = { ref = "black-forest-labs/flux.2-klein-4b", dtypes = ["fp8"] }
flux2-klein-9b = { ref = "black-forest-labs/flux.2-klein-9b", dtypes = ["bf16"] }
flux2-klein-9b_fp8 = { ref = "black-forest-labs/flux.2-klein-9b", dtypes = ["fp8"] }
flux2-klein-4b_int8 = { ref = "black-forest-labs/flux.2-klein-4b", dtypes = ["int8"] }
flux2-klein-4b_int4 = { ref = "black-forest-labs/flux.2-klein-4b", dtypes = ["int4"] }

[host.requirements]
# If cuda is set, the platform treats this worker as requiring an NVIDIA GPU.
cuda = "12.8"
cuda = "12.6"

[resources]
vram_gb = 24
Expand Down
6 changes: 3 additions & 3 deletions examples/flux2-klein-4b/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
[project]
name = "flux2-klein-4b"
version = "0.1.0"
description = "Flux2-klein-4B endpoint (base+turbo; inference-only; Cozy manifest via endpoint.toml)"
description = "Flux2-klein-4B example (inference-only; Cozy manifest via endpoint.toml)"
requires-python = ">=3.12"
dependencies = [
"gen-worker[torch]",
"gen-worker[torch]==0.3.0",
"diffusers @ git+https://github.com/huggingface/diffusers.git@99e2cfff27dec514a43e260e885c5e6eca038b36",
"transformers>=4.56,<5",
"accelerate",
"pillow",
# Needed for fp8/nvfp4 variants (diffusers TorchAoConfig and/or variant-aware loading).
# Needed for fp8/int8/int4 weight-only quantization variants (diffusers TorchAoConfig).
"torchao",
]

Expand Down
Loading
Loading