Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 13 additions & 18 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ The built image MUST:
2. Bake function discovery output (manifest) at build time:

```dockerfile
RUN mkdir -p /app/.tensorhub && python -m gen_worker.discover > /app/.tensorhub/endpoint.lock
RUN mkdir -p /app/.cozy && python -m gen_worker.discover > /app/.cozy/endpoint.lock
```

3. Use the Cozy worker runtime as the ENTRYPOINT:
Expand Down Expand Up @@ -50,7 +50,7 @@ uv add gen-worker[torch]

```python
import msgspec
from gen_worker import RequestContext, worker_function
from gen_worker import ActionContext, worker_function

class Input(msgspec.Struct):
prompt: str
Expand All @@ -59,7 +59,7 @@ class Output(msgspec.Struct):
text: str

@worker_function()
def generate(ctx: RequestContext, payload: Input) -> Output:
def generate(ctx: ActionContext, payload: Input) -> Output:
return Output(text=f"Hello, {payload.prompt}!")
```

Expand All @@ -69,7 +69,7 @@ def generate(ctx: RequestContext, payload: Input) -> Output:
- **Schema generation** - Input/output schemas extracted from msgspec types
- **Model injection** - Dependency injection for ML models with caching
- **Streaming output** - Support for incremental/streaming responses
- **Progress reporting** - Built-in progress events via `RequestContext`
- **Progress reporting** - Built-in progress events via `ActionContext`
- **Perf metrics** - Best-effort per-run metrics emitted to gen-orchestrator (`metrics.*` worker events)
- **Trainer runtime mode** - SDK-native trainer loop via `WORKER_MODE=trainer`
- **File handling** - Upload/download assets via Cozy hub file API
Expand All @@ -81,7 +81,7 @@ def generate(ctx: RequestContext, payload: Input) -> Output:

```python
import msgspec
from gen_worker import RequestContext, worker_function
from gen_worker import ActionContext, worker_function

class Input(msgspec.Struct):
prompt: str
Expand All @@ -90,7 +90,7 @@ class Output(msgspec.Struct):
result: str

@worker_function()
def my_function(ctx: RequestContext, payload: Input) -> Output:
def my_function(ctx: ActionContext, payload: Input) -> Output:
return Output(result=f"Processed: {payload.prompt}")
```

Expand All @@ -103,7 +103,7 @@ class Delta(msgspec.Struct):
chunk: str

@worker_function()
def stream(ctx: RequestContext, payload: Input) -> Iterator[Delta]:
def stream(ctx: ActionContext, payload: Input) -> Iterator[Delta]:
for word in payload.prompt.split():
if ctx.is_canceled():
raise InterruptedError("canceled")
Expand All @@ -126,7 +126,7 @@ from gen_worker.injection import ModelRef, ModelRefSource as Src

@worker_function()
def generate(
ctx: RequestContext,
ctx: ActionContext,
pipe: Annotated[DiffusionPipeline, ModelRef(Src.FIXED, "sd15")],
payload: Input,
) -> Output:
Expand All @@ -149,7 +149,7 @@ flux = { ref = "black-forest-labs/flux.2-klein-4b", dtypes = ["bf16"] }
from typing import Annotated
import msgspec
from diffusers import DiffusionPipeline
from gen_worker import RequestContext, worker_function
from gen_worker import ActionContext, worker_function
from gen_worker.injection import ModelRef, ModelRefSource as Src

class Input(msgspec.Struct):
Expand All @@ -158,7 +158,7 @@ class Input(msgspec.Struct):

@worker_function()
def generate(
ctx: RequestContext,
ctx: ActionContext,
pipe: Annotated[DiffusionPipeline, ModelRef(Src.PAYLOAD, "model")],
payload: Input,
):
Expand All @@ -173,7 +173,7 @@ arbitrary repo refs in the payload.

```python
@worker_function()
def process(ctx: RequestContext, payload: Input) -> Output:
def process(ctx: ActionContext, payload: Input) -> Output:
# Save bytes and get asset reference
asset = ctx.save_bytes(f"runs/{ctx.request_id}/outputs/output.png", image_bytes)
return Output(result=asset.ref)
Expand Down Expand Up @@ -278,7 +278,7 @@ Orchestrator-injected (production contract):
|----------|---------|-------------|
| `WORKER_MODE` | `inference` | Runtime mode selector (`inference` or `trainer`) |
| `SCHEDULER_PUBLIC_ADDR` | - | Scheduler address workers should dial |
| `SCHEDULER_ADDRS` | - | Optional comma-separated LB seed addresses |
| `SCHEDULER_ADDRS` | - | Optional comma-separated seed addresses for leader discovery |
| `WORKER_JWT` | - | Worker-connect JWT (required; claims are authoritative) |

Local dev / advanced (not injected by orchestrator):
Expand All @@ -289,11 +289,6 @@ Local dev / advanced (not injected by orchestrator):
| `SCHEDULER_JWT_ISSUER` | - | Optional: expected `iss` when verifying WORKER_JWT locally |
| `SCHEDULER_JWT_AUDIENCE` | - | Optional: expected `aud` when verifying WORKER_JWT locally |
| `USE_TLS` | `false` | Local-dev knob for plaintext vs TLS gRPC; production typically terminates TLS upstream |
| `LB_ONLY_RETRIES` | `true` | Retry via configured LB endpoint(s) only; ignore direct owner redirect hints |
| `RECONNECT_DELAY` | `0.1` | Base reconnect backoff in seconds (exponential) |
| `RECONNECT_MAX_DELAY` | `1.0` | Reconnect backoff cap in seconds |
| `RECONNECT_JITTER_SECONDS` | `0.1` | Added jitter upper bound in seconds, capped by `RECONNECT_MAX_DELAY` |
| `MAX_RECONNECT_ATTEMPTS` | `0` | Max reconnect attempts (`0` = infinite retries) |
| `WORKER_MAX_CONCURRENCY` | - | Max concurrent task executions |
| `WORKER_MAX_INPUT_BYTES` | - | Max input payload size |
| `WORKER_MAX_OUTPUT_BYTES` | - | Max output payload size |
Expand Down Expand Up @@ -456,7 +451,7 @@ cache.get_vram_models() # ["model-a"]
from gen_worker.errors import RetryableError, ValidationError, FatalError

@worker_function()
def process(ctx: RequestContext, payload: Input) -> Output:
def process(ctx: ActionContext, payload: Input) -> Output:
if not payload.prompt:
raise ValidationError("prompt is required") # 400, no retry

Expand Down
2 changes: 0 additions & 2 deletions docs/terminology.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,6 @@ This repo follows Cozy's canonical naming. There is no backward-compat layer for
- `function`: invokable unit inside an endpoint release.
- Invoke reference: `owner/endpoint/function[:tag]` (default tag is `prod`).
- `release_id`: immutable identifier for a published endpoint release.
- In control-plane release payloads, endpoint identity is endpoint-centric:
`endpoint.owner` / `endpoint.endpoint_name` are primary; `release.owner` is legacy compatibility.
- `invoker` / `invoker_id`: the identity performing an invocation.
- `artifacts`: uploaded source code and endpoint-owned build inputs/outputs.
- Cozy Hub stores endpoint source tarballs in the endpoint artifacts bucket (`s3.endpoint_artifacts.*` / `S3_ENDPOINT_ARTIFACTS_*`).
Expand Down
14 changes: 7 additions & 7 deletions examples/firered-image-edit/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,14 +1,13 @@
# Tenant-supplied Dockerfile example (GPU).
#
# - Installs torch in a stable cacheable layer.
# - Installs tenant deps from uv.lock without replacing torch.
# - Installs torch + gen-worker in stable cacheable layers.
# - Installs tenant deps from uv.lock without replacing torch/gen-worker.
# - Bakes discovered functions into /app/.tensorhub/endpoint.lock at build time.
#
ARG PYTHON_VERSION=3.12
FROM ghcr.io/astral-sh/uv:python${PYTHON_VERSION}-bookworm-slim AS cozy_base

ARG CUDA_VERSION=12.8
ARG UV_TORCH_BACKEND=
ARG UV_TORCH_BACKEND=cu126
ARG TORCH_SPEC="~=2.10.0"

WORKDIR /app
Expand All @@ -23,17 +22,18 @@ RUN --mount=type=cache,id=cozy-apt-cache,target=/var/cache/apt,sharing=locked \
&& apt-get clean

RUN --mount=type=cache,id=cozy-uv-cache,target=/var/cache/uv,sharing=locked \
backend="${UV_TORCH_BACKEND:-cu$(printf '%s' "${CUDA_VERSION}" | tr -d '.')}" \
&& uv pip install --system --break-system-packages --torch-backend "$backend" \
uv pip install --system --break-system-packages --torch-backend ${UV_TORCH_BACKEND} \
"torch${TORCH_SPEC}"
RUN --mount=type=cache,id=cozy-uv-cache,target=/var/cache/uv,sharing=locked \
uv pip install --system --break-system-packages gen-worker==0.3.0

FROM cozy_base AS cozy_final

COPY pyproject.toml uv.lock /app/

RUN --mount=type=cache,id=cozy-uv-cache,target=/var/cache/uv,sharing=locked \
uv export --no-dev --no-hashes --no-sources --no-emit-project --no-emit-local \
--no-emit-package torch \
--no-emit-package torch --no-emit-package gen-worker \
-o /tmp/requirements.all.txt \
&& grep -Ev '^(torch|triton|nvidia-|cuda-)' /tmp/requirements.all.txt > /tmp/requirements.txt \
&& uv pip install --system --break-system-packages --no-deps -r /tmp/requirements.txt
Expand Down
2 changes: 1 addition & 1 deletion examples/firered-image-edit/endpoint.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ main = "firered_image_edit.main"
firered_image_edit = { ref = "fireredteam/firered-image-edit-1.0", dtypes = ["bf16"] }

[host.requirements]
cuda = "12.8"
cuda = "12.6"

[resources]
vram_gb = 24
Expand Down
2 changes: 1 addition & 1 deletion examples/firered-image-edit/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ version = "0.1.0"
description = "FireRed Image Edit example (inference-only; Cozy manifest via endpoint.toml)"
requires-python = ">=3.12"
dependencies = [
"gen-worker[torch]",
"gen-worker[torch]==0.3.0",
"diffusers @ git+https://github.com/huggingface/diffusers",
"transformers>=4.51.3",
"accelerate",
Expand Down
4 changes: 2 additions & 2 deletions examples/firered-image-edit/src/firered_image_edit/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from diffusers import QwenImageEditPlusPipeline
from PIL import Image

from gen_worker import RequestContext, ResourceRequirements, worker_function
from gen_worker import ActionContext, ResourceRequirements, worker_function
from gen_worker.injection import ModelRef, ModelRefSource as Src
from gen_worker.types import Asset

Expand Down Expand Up @@ -47,7 +47,7 @@ class EditOutput(msgspec.Struct):

@worker_function(_firered_resources)
def edit(
ctx: RequestContext,
ctx: ActionContext,
pipeline: Annotated[
QwenImageEditPlusPipeline,
ModelRef(Src.FIXED, "firered_image_edit"),
Expand Down
2 changes: 1 addition & 1 deletion examples/firered-image-edit/uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

16 changes: 8 additions & 8 deletions examples/flux2-klein-4b/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Tenant-supplied Dockerfile example.
#
# - Installs torch in a stable cacheable layer.
# - Installs tenant deps from uv.lock without replacing torch.
# - Installs torch + gen-worker in stable cacheable layers.
# - Installs tenant deps from uv.lock without replacing torch/gen-worker.
# - Bakes discovered functions into /app/.tensorhub/endpoint.lock at build time.
#
# Local build (GPU):
Expand All @@ -10,8 +10,7 @@
ARG PYTHON_VERSION=3.12
FROM ghcr.io/astral-sh/uv:python${PYTHON_VERSION}-bookworm-slim AS cozy_base

ARG CUDA_VERSION=12.8
ARG UV_TORCH_BACKEND=
ARG UV_TORCH_BACKEND=cu126
ARG TORCH_SPEC="~=2.10.0"

WORKDIR /app
Expand All @@ -27,19 +26,20 @@ RUN --mount=type=cache,id=cozy-apt-cache,target=/var/cache/apt,sharing=locked \

# Stable runtime layers (avoid re-downloading torch/cu libs on every tenant build).
RUN --mount=type=cache,id=cozy-uv-cache,target=/var/cache/uv,sharing=locked \
backend="${UV_TORCH_BACKEND:-cu$(printf '%s' "${CUDA_VERSION}" | tr -d '.')}" \
&& uv pip install --system --break-system-packages --torch-backend "$backend" \
uv pip install --system --break-system-packages --torch-backend ${UV_TORCH_BACKEND} \
"torch${TORCH_SPEC}"
RUN --mount=type=cache,id=cozy-uv-cache,target=/var/cache/uv,sharing=locked \
uv pip install --system --break-system-packages gen-worker==0.3.0

FROM cozy_base AS cozy_final

# Copy lock metadata first so dependency layers are cacheable across source edits.
COPY pyproject.toml uv.lock /app/

# Install tenant dependencies into the global environment without replacing torch.
# Install tenant dependencies into the global environment without replacing torch/gen-worker.
RUN --mount=type=cache,id=cozy-uv-cache,target=/var/cache/uv,sharing=locked \
uv export --no-dev --no-hashes --no-sources --no-emit-project --no-emit-local \
--no-emit-package torch \
--no-emit-package torch --no-emit-package gen-worker \
-o /tmp/requirements.all.txt \
&& grep -Ev '^(torch|triton|nvidia-|cuda-)' /tmp/requirements.all.txt > /tmp/requirements.txt \
&& uv pip install --system --break-system-packages --no-deps -r /tmp/requirements.txt
Expand Down
45 changes: 30 additions & 15 deletions examples/flux2-klein-4b/README.md
Original file line number Diff line number Diff line change
@@ -1,25 +1,40 @@
# flux2-klein-4b

FLUX.2-klein 4B endpoint with separate base/turbo functions and dtype-specific variants.
FLUX.2-klein turbo example using Cozy’s injection pattern (4B + 9B variants).

Naming convention in this repo:
- The worker function only defines input/output + runs inference.
- Fixed model selection is declared in code via `ModelRef(Src.FIXED, "<model-key>")`.
- Model refs/dtypes are declared in `endpoint.toml [models]`.
- This model is treated as a turbo model: the worker clamps `num_inference_steps` to `[4, 8]`.

Steps:

- `num_inference_steps` is accepted in the payload, but it is clamped to `[4, 8]` (rounded) for predictable cost/latency.

- Base model ref: `black-forest-labs/flux.2-klein-4b-base`
- Turbo model ref: `black-forest-labs/flux.2-klein-4b-turbo`
Code uses:

This avoids ambiguity with upstream naming where `flux.2-klein-4b` is commonly used for turbo variants.
```py
pipeline: Annotated[
Flux2KleinPipeline,
ModelRef(Src.FIXED, "flux2-klein-4b"),
]
```

Functions:

- `generate`: base bf16
- `generate_turbo`: turbo bf16
- `generate_fp8`: base fp8
- `generate_turbo_fp8`: turbo fp8
- `generate_nvfp4`: base nvfp4
- `generate_turbo_nvfp4`: turbo nvfp4
- `generate`: 4B bf16 (regular turbo baseline)
- `generate_fp8`: 4B fp8
- `generate_9b`: 9B bf16 (regular turbo baseline)
- `generate_9b_fp8`: 9B fp8
- `generate_int8`: int8-only
- `generate_int4`: int4-only

Notes:
Notes on FP8:

- Fixed model selection is declared in code via `ModelRef(Src.FIXED, "<model-key>")`.
- Model refs/dtypes are declared in `endpoint.toml [models]`.
- `num_inference_steps` is accepted in the payload, but clamped to `[4, 8]`.
- FP8 support here is **weight-only** quantization via `torchao` (Diffusers TorchAoConfig).
- GPUs vary: FP8 acceleration typically requires newer NVIDIA GPUs (e.g. Ada/Hopper class).

Notes on INT8/INT4:

- INT8/INT4 support here is **weight-only** quantization via `torchao` (Diffusers TorchAoConfig).
- INT4 is experimental for diffusion; expect quality regressions or incompatibilities.
16 changes: 8 additions & 8 deletions examples/flux2-klein-4b/endpoint.toml
Original file line number Diff line number Diff line change
@@ -1,20 +1,20 @@
schema_version = 1

name = "flux.2-klein-4b"
name = "flux2-klein-4b"
# Python import path used for function discovery (not a Docker ENTRYPOINT).
main = "flux2_klein_4b.main"

[models]
flux2-klein-4b-base = { ref = "black-forest-labs/flux.2-klein-4b-base", dtypes = ["bf16"] }
flux2-klein-4b-turbo = { ref = "black-forest-labs/flux.2-klein-4b-turbo", dtypes = ["bf16"] }
flux2-klein-4b-base_fp8 = { ref = "black-forest-labs/flux.2-klein-4b-base", dtypes = ["fp8"] }
flux2-klein-4b-turbo_fp8 = { ref = "black-forest-labs/flux.2-klein-4b-turbo", dtypes = ["fp8"] }
flux2-klein-4b-base_nvfp4 = { ref = "black-forest-labs/flux.2-klein-4b-base", dtypes = ["nvfp4"] }
flux2-klein-4b-turbo_nvfp4 = { ref = "black-forest-labs/flux.2-klein-4b-turbo", dtypes = ["nvfp4"] }
flux2-klein-4b = { ref = "black-forest-labs/flux.2-klein-4b", dtypes = ["bf16"] }
flux2-klein-4b_fp8 = { ref = "black-forest-labs/flux.2-klein-4b", dtypes = ["fp8"] }
flux2-klein-9b = { ref = "black-forest-labs/flux.2-klein-9b", dtypes = ["bf16"] }
flux2-klein-9b_fp8 = { ref = "black-forest-labs/flux.2-klein-9b", dtypes = ["fp8"] }
flux2-klein-4b_int8 = { ref = "black-forest-labs/flux.2-klein-4b", dtypes = ["int8"] }
flux2-klein-4b_int4 = { ref = "black-forest-labs/flux.2-klein-4b", dtypes = ["int4"] }

[host.requirements]
# If cuda is set, the platform treats this worker as requiring an NVIDIA GPU.
cuda = "12.8"
cuda = "12.6"

[resources]
vram_gb = 24
Expand Down
6 changes: 3 additions & 3 deletions examples/flux2-klein-4b/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
[project]
name = "flux2-klein-4b"
version = "0.1.0"
description = "Flux2-klein-4B endpoint (base+turbo; inference-only; Cozy manifest via endpoint.toml)"
description = "Flux2-klein-4B example (inference-only; Cozy manifest via endpoint.toml)"
requires-python = ">=3.12"
dependencies = [
"gen-worker[torch]",
"gen-worker[torch]==0.3.0",
"diffusers @ git+https://github.com/huggingface/diffusers.git@99e2cfff27dec514a43e260e885c5e6eca038b36",
"transformers>=4.56,<5",
"accelerate",
"pillow",
# Needed for fp8/nvfp4 variants (diffusers TorchAoConfig and/or variant-aware loading).
# Needed for fp8/int8/int4 weight-only quantization variants (diffusers TorchAoConfig).
"torchao",
]

Expand Down
Loading
Loading