From 50778f03bab7a94899d64cb4237974a0c2b2e725 Mon Sep 17 00:00:00 2001
From: Eric Hartford <eric@quixi.ai>
Date: Sun, 21 Jun 2026 14:06:44 +0000
Subject: [PATCH 1/3] Add CDNA ROCm MFMA build path

---
 Makefile                 | 52 ++++++++++++++++++++---
 README.md                | 31 ++++++++++++--
 ds4_agent.c              |  9 ++++
 rocm/ds4_rocm_matmul.cuh | 28 ++++++++++++-
 rocm/ds4_rocm_q8.cuh     | 90 ++++++++++++++++++++++++++++++++++++++++
 5 files changed, 200 insertions(+), 10 deletions(-)

diff --git a/Makefile b/Makefile
index 9711dc1a4..9ae553626 100644
--- a/Makefile
+++ b/Makefile
@@ -33,14 +33,21 @@ CPU_CORE_OBJS = ds4_cpu.o ds4_distributed.o ds4_ssd.o
 CUDA_LDLIBS ?= -lm -Xcompiler -pthread -L$(CUDA_HOME)/targets/sbsa-linux/lib -L$(CUDA_HOME)/lib64 -lcudart -lcublas
 HIPCC ?= $(shell command -v hipcc 2>/dev/null || echo /opt/rocm/bin/hipcc)
 ROCM_ARCH ?= gfx1151
-ROCM_CFLAGS ?= -O3 -ffast-math -g -fno-finite-math-only -pthread -D__HIP_PLATFORM_AMD__ -Wno-unused-command-line-argument --offload-arch=$(ROCM_ARCH)
+ROCM_ARCHES = $(strip $(ROCM_ARCH))
+ROCM_PRIMARY_ARCH = $(firstword $(subst :, ,$(firstword $(ROCM_ARCHES))))
+ROCM_OFFLOAD_FLAGS = $(foreach arch,$(ROCM_ARCHES),--offload-arch=$(arch))
+ROCM_WMMA_W32 ?= $(if $(filter gfx11%,$(ROCM_PRIMARY_ARCH)),1,0)
+ROCM_MFMA_F16 ?= $(if $(filter gfx94% gfx95%,$(ROCM_PRIMARY_ARCH)),1,0)
+ROCM_CFLAGS ?= -O3 -ffast-math -g -fno-finite-math-only -pthread -D__HIP_PLATFORM_AMD__ -DDS4_ROCM_WMMA_W32=$(ROCM_WMMA_W32) -DDS4_ROCM_MFMA_F16=$(ROCM_MFMA_F16) -Wno-unused-command-line-argument $(ROCM_OFFLOAD_FLAGS)
 ROCM_LDLIBS ?= -lm -pthread -lhipblas -lhipblaslt
+ROCM_TARGETS := ds4 ds4-server ds4-bench ds4-eval ds4-agent
+ROCM_CORE_OBJS := ds4.o ds4_distributed.o ds4_ssd.o ds4_rocm.o
 DS4_LINK ?= $(NVCC) $(NVCCFLAGS)
 DS4_LINK_LIBS ?= $(CUDA_LDLIBS)
 METAL_LDLIBS := $(LDLIBS)
 endif
 
-.PHONY: all help clean test cpu cuda cuda-spark cuda-generic cuda-regression strix-halo rocm
+.PHONY: all help clean test cpu cuda cuda-spark cuda-generic cuda-regression strix-halo cdna cdna3 cdna4 mi300x mi325x mi350x mi355x rocm
 
 ifeq ($(UNAME_S),Darwin)
 all: ds4 ds4-server ds4-bench ds4-eval ds4-agent
@@ -85,7 +92,12 @@ help:
 	@echo "  make cuda-generic        Build CUDA for a generic local CUDA GPU"
 	@echo "  make cuda CUDA_ARCH=sm_N Build CUDA with an explicit nvcc -arch value"
 	@echo "  make strix-halo          Build ROCm for Strix Halo / gfx1151"
-	@echo "  make rocm                Alias for make strix-halo"
+	@echo "  make cdna                Build ROCm CDNA3+CDNA4 fat binary / gfx942+gfx950"
+	@echo "  make cdna3               Build ROCm for AMD Instinct MI300X/MI325X / gfx942"
+	@echo "  make cdna4               Build ROCm for AMD Instinct MI350X/MI355X / gfx950"
+	@echo "  make mi300x|mi325x       Alias for make cdna3"
+	@echo "  make mi350x|mi355x       Alias for make cdna4"
+	@echo "  make rocm ROCM_ARCH=gfxN Build ROCm with explicit AMD GPU target(s)"
 	@echo "  make cpu                 Build CPU-only ./ds4, ./ds4-server, ./ds4-bench, ./ds4-eval, and ./ds4-agent"
 	@echo "  make test                Build and run tests"
 	@echo "  make clean               Remove build outputs"
@@ -104,14 +116,42 @@ cuda:
 	fi
 	$(MAKE) -B ds4 ds4-server ds4-bench ds4-eval ds4-agent CUDA_ARCH="$(CUDA_ARCH)"
 
+strix-halo: ROCM_ARCH := gfx1151
 strix-halo:
-	$(MAKE) -B ds4 ds4-server ds4-bench ds4-eval ds4-agent \
-		CORE_OBJS="ds4.o ds4_distributed.o ds4_ssd.o ds4_rocm.o" \
+	$(MAKE) -B $(ROCM_TARGETS) \
+		HIPCC="$(HIPCC)" \
+		ROCM_ARCH="$(ROCM_ARCH)" \
+		ROCM_CFLAGS="$(ROCM_CFLAGS)" \
+		ROCM_LDLIBS="$(ROCM_LDLIBS)" \
+		CORE_OBJS="$(ROCM_CORE_OBJS)" \
 		CFLAGS="$(CFLAGS) -DDS4_ROCM_BUILD" \
 		DS4_LINK="$(HIPCC) $(ROCM_CFLAGS)" \
 		DS4_LINK_LIBS="$(ROCM_LDLIBS)"
 
-rocm: strix-halo
+cdna: ROCM_ARCH := gfx942 gfx950
+cdna3 mi300x mi325x: ROCM_ARCH := gfx942
+cdna4 mi350x mi355x: ROCM_ARCH := gfx950
+cdna cdna3 cdna4 mi300x mi325x mi350x mi355x:
+	$(MAKE) -B $(ROCM_TARGETS) \
+		HIPCC="$(HIPCC)" \
+		ROCM_ARCH="$(ROCM_ARCH)" \
+		ROCM_CFLAGS="$(ROCM_CFLAGS)" \
+		ROCM_LDLIBS="$(ROCM_LDLIBS)" \
+		CORE_OBJS="$(ROCM_CORE_OBJS)" \
+		CFLAGS="$(CFLAGS) -DDS4_ROCM_BUILD" \
+		DS4_LINK="$(HIPCC) $(ROCM_CFLAGS)" \
+		DS4_LINK_LIBS="$(ROCM_LDLIBS)"
+
+rocm:
+	$(MAKE) -B $(ROCM_TARGETS) \
+		HIPCC="$(HIPCC)" \
+		ROCM_ARCH="$(ROCM_ARCH)" \
+		ROCM_CFLAGS="$(ROCM_CFLAGS)" \
+		ROCM_LDLIBS="$(ROCM_LDLIBS)" \
+		CORE_OBJS="$(ROCM_CORE_OBJS)" \
+		CFLAGS="$(CFLAGS) -DDS4_ROCM_BUILD" \
+		DS4_LINK="$(HIPCC) $(ROCM_CFLAGS)" \
+		DS4_LINK_LIBS="$(ROCM_LDLIBS)"
 
 ds4: ds4_cli.o ds4_help.o linenoise.o $(CORE_OBJS)
 	$(DS4_LINK) -o $@ $^ $(DS4_LINK_LIBS)
diff --git a/README.md b/README.md
index 785695284..98d836913 100644
--- a/README.md
+++ b/README.md
@@ -14,7 +14,7 @@ and for quality and speed testing.
 We support the following backends:
 * **Metal** is our primary target. Starting from MacBooks with 96GB of RAM (or less, using SSD streaming).
 * **NVIDIA CUDA / DGX Spark**, CUDA with special care for the DGX Spark.
-* **Strix Halo (ROCm)**, systems like the Framework Desktop and other systems based on the same GPU and unified RAM design.
+* **AMD ROCm**, with explicit targets for AMD Instinct CDNA3/CDNA4 accelerators and Strix Halo (`gfx1151`).
 
 This project would not exist without **llama.cpp and GGML**, make sure to read
 the acknowledgements section, a big thank you to Georgi Gerganov and all the
@@ -41,7 +41,7 @@ That said, a few important things about this project:
 * This software is developed with **strong assistance from GPT 5.5** and with humans leading the ideas, testing, and debugging. We say this openly because it shaped how the project was built. If you are not happy with AI-developed code, this software is not for you. The acknowledgement below is equally important: this would not exist without `llama.cpp` and GGML, largely written by hand.
 * This implementation is based on the idea that compressed KV caches like the one of DeepSeek v4 and the fast SSD disks of modern MacBooks should change our idea that KV cache belongs to RAM. **The KV cache is actually a first-class disk citizen**. Fast SSD disks also changed the inference game from the point of view of "model needs to fit RAM": while having more RAM the the model size is still preferred, SSD streaming allows to turn the available amount of RAM from a hard cutoff (can I run this model or not?) to continuous spectrum of speed levels.
 * Our vision is that local inference should be a set of three things working well together, out of the box: A) inference engine with HTTP API + B) GGUF specially crafted to run well under a given engine and given assumptions + C) testing and validation with coding agents implementations. D) Purpose built agents for specific models and execution environments. DwarfStar only runs with the GGUF files provided. It gets tested against officially obtained logits at different context sizes. This project exists because we wanted to make one local model feel finished end to end, not just runnable. However this is beta quality code, so probably we are not still there, especially since recently we introduced large new features: distributed inference, SSD streaming, and other minor improvements.
-* The optimized graph path targets **Metal on macOS** and **CUDA on Linux**. The CPU path is only for correctness checks and model/tokenizer diagnostics. For CPU-only Linux builds, use `make cpu`; it builds the normal `./ds4` and `./ds4-server` binaries without CUDA or Metal. On macOS, **warning: current macOS versions have a bug in the virtual memory implementation that will crash the kernel** if you try to run the CPU code. Remember? Software sucks. It was not possible to fix the CPU inference to avoid crashing, since each time you have to restart the computer, which is not funny. Help us, if you have the guts.
+* The optimized graph path targets **Metal on macOS** and **CUDA or ROCm on Linux**. The CPU path is only for correctness checks and model/tokenizer diagnostics. For CPU-only Linux builds, use `make cpu`; it builds the normal `./ds4` and `./ds4-server` binaries without CUDA, ROCm, or Metal. On macOS, **warning: current macOS versions have a bug in the virtual memory implementation that will crash the kernel** if you try to run the CPU code. Remember? Software sucks. It was not possible to fix the CPU inference to avoid crashing, since each time you have to restart the computer, which is not funny. Help us, if you have the guts.
 
 ## Acknowledgements to llama.cpp and GGML
 
@@ -145,6 +145,10 @@ Then build:
 make                  # macOS Metal
 make cuda-spark       # Linux CUDA, DGX Spark / GB10
 make cuda-generic     # Linux CUDA, other local CUDA GPUs
+make cdna             # Linux ROCm, AMD Instinct CDNA3+CDNA4 / gfx942+gfx950
+make cdna3            # Linux ROCm, AMD Instinct MI300X/MI325X / gfx942
+make cdna4            # Linux ROCm, AMD Instinct MI350X/MI355X / gfx950
+make strix-halo       # Linux ROCm, Strix Halo / gfx1151
 make cpu              # CPU-only diagnostics build
 ```
 
@@ -1173,11 +1177,13 @@ the kv cache files include the verbatim prompt cached.
 
 ## Backends
 
-The default graph backend is Metal on macOS and CUDA in CUDA builds:
+The default graph backend is Metal on macOS, CUDA in CUDA builds, and ROCm in
+ROCm builds:
 
 ```sh
 ./ds4 -p "Hello" --metal
 ./ds4 -p "Hello" --cuda
+./ds4 -p "Hello" --rocm
 ```
 
 On Linux, plain `make` prints the available build targets instead of selecting a
@@ -1191,6 +1197,25 @@ make cuda CUDA_ARCH=sm_120
 make cuda CUDA_ARCH=native
 ```
 
+For AMD GPUs, use the matching ROCm target. `make cdna` builds a portable
+CDNA3/CDNA4 binary with both `gfx942` and `gfx950` code objects and enables the
+CDNA wave64 MFMA q8 prefill path. If you only need one product generation,
+`make cdna3`, `make mi300x`, and `make mi325x` target `gfx942`; `make cdna4`,
+`make mi350x`, and `make mi355x` target `gfx950`. Strix Halo systems use
+`make strix-halo` and keep the gfx11 wave32 WMMA q8 path. For another AMD GPU,
+set `ROCM_ARCH` explicitly. Multiple targets can be passed as a quoted
+space-separated list:
+
+```sh
+make cdna
+make mi300x
+make mi325x
+make mi355x
+make strix-halo
+make rocm ROCM_ARCH=gfx942
+make rocm ROCM_ARCH="gfx942 gfx950"
+```
+
 There is also a CPU reference/debug path:
 
 ```sh
diff --git a/ds4_agent.c b/ds4_agent.c
index 1c5209154..de8098763 100644
--- a/ds4_agent.c
+++ b/ds4_agent.c
@@ -475,7 +475,11 @@ static float parse_float_range(const char *s, const char *opt, float min, float
 
 static ds4_backend parse_backend(const char *s) {
     if (!strcmp(s, "metal")) return DS4_BACKEND_METAL;
+#ifdef DS4_ROCM_BUILD
+    if (!strcmp(s, "rocm")) return DS4_BACKEND_CUDA;
+#else
     if (!strcmp(s, "cuda")) return DS4_BACKEND_CUDA;
+#endif
     if (!strcmp(s, "cpu")) return DS4_BACKEND_CPU;
     fprintf(stderr, "ds4-agent: invalid backend: %s\n", s);
     exit(2);
@@ -592,8 +596,13 @@ static agent_config parse_options(int argc, char **argv) {
             c.engine.backend = parse_backend(need_arg(&i, argc, argv, arg));
         } else if (!strcmp(arg, "--metal")) {
             c.engine.backend = DS4_BACKEND_METAL;
+#ifdef DS4_ROCM_BUILD
+        } else if (!strcmp(arg, "--rocm")) {
+            c.engine.backend = DS4_BACKEND_CUDA;
+#else
         } else if (!strcmp(arg, "--cuda")) {
             c.engine.backend = DS4_BACKEND_CUDA;
+#endif
         } else if (!strcmp(arg, "--cpu")) {
             c.engine.backend = DS4_BACKEND_CPU;
         } else if (!strcmp(arg, "-t") || !strcmp(arg, "--threads")) {
diff --git a/rocm/ds4_rocm_matmul.cuh b/rocm/ds4_rocm_matmul.cuh
index 0f9e06625..0819b4b6e 100644
--- a/rocm/ds4_rocm_matmul.cuh
+++ b/rocm/ds4_rocm_matmul.cuh
@@ -274,7 +274,7 @@ static int cuda_matmul_q8_0_tensor_labeled(ds4_gpu_tensor *out, const void *mode
         return cuda_ok(cudaGetLastError(), "matmul_q8_0 f32 warp launch");
     }
     if (n_tok > 1) {
-#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
+#if (defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)) && DS4_ROCM_WMMA_W32
         if (!g_quality_mode && (in_dim % 32u) == 0u &&
             out_dim >= 1024u &&
             n_tok >= 256u &&
@@ -292,6 +292,32 @@ static int cuda_matmul_q8_0_tensor_labeled(ds4_gpu_tensor *out, const void *mode
                     blocks * 34u);
             return cuda_ok(cudaGetLastError(), "matmul_q8_0 f32 batch wmma 4w launch");
         }
+#endif
+#if (defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)) && DS4_ROCM_MFMA_F16
+        if (!g_quality_mode && (in_dim % 32u) == 0u &&
+            out_dim >= 1024u &&
+            n_tok >= 32u &&
+            in_dim <= UINT32_MAX && out_dim <= UINT32_MAX && n_tok <= UINT32_MAX) {
+            constexpr uint32_t tiles_n = 4u;
+            constexpr uint32_t bm = 16u;
+            constexpr uint32_t bn = 16u;
+            constexpr uint32_t bk = 16u;
+            const dim3 grid((uint32_t)((out_dim + tiles_n * bn - 1u) / (tiles_n * bn)),
+                            (uint32_t)((n_tok + bm - 1u) / bm),
+                            1u);
+            const size_t shmem =
+                ((size_t)bm * bk + (size_t)tiles_n * bk * bn) * sizeof(half) +
+                (size_t)tiles_n * bm * bn * sizeof(float);
+            matmul_q8_0_f32_batch_mfma_w64_onthefly_kernel<tiles_n,bm,bn,bk><<<grid, 256u, shmem>>>(
+                    (float *)out->ptr,
+                    reinterpret_cast<const unsigned char *>(wptr),
+                    (const float *)x->ptr,
+                    (uint32_t)n_tok,
+                    (uint32_t)in_dim,
+                    (uint32_t)out_dim,
+                    blocks * 34u);
+            return cuda_ok(cudaGetLastError(), "matmul_q8_0 f32 batch mfma w64 launch");
+        }
 #endif
         if ((in_dim & 31u) == 0u && out_dim <= UINT32_MAX && n_tok <= UINT32_MAX) {
             const uint32_t rows_per_block = 32u;
diff --git a/rocm/ds4_rocm_q8.cuh b/rocm/ds4_rocm_q8.cuh
index 5b2423de3..ac507fa01 100644
--- a/rocm/ds4_rocm_q8.cuh
+++ b/rocm/ds4_rocm_q8.cuh
@@ -7,6 +7,14 @@
 #include <rocwmma/rocwmma.hpp>
 #endif
 
+#ifndef DS4_ROCM_WMMA_W32
+#define DS4_ROCM_WMMA_W32 0
+#endif
+
+#ifndef DS4_ROCM_MFMA_F16
+#define DS4_ROCM_MFMA_F16 0
+#endif
+
 __device__ __forceinline__ static int32_t load_i8x4_i32_aligned(const int8_t *p) {
     return *(const int32_t *)p;
 }
@@ -670,6 +678,7 @@ __global__ static void matmul_q8_0_f32_batch_sharedx_warp_rows_w32_toktile_kerne
 }
 
 #if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
+#if DS4_ROCM_WMMA_W32
 typedef _Float16 __attribute__((ext_vector_type(16))) ds4_q8_half16_t;
 typedef float    __attribute__((ext_vector_type(8)))  ds4_q8_float8_t;
 
@@ -780,6 +789,7 @@ __global__ static void matmul_q8_0_f32_batch_wmma_4w_kernel(
         }
     }
 }
+#endif
 
 template <int TILES_N=8, int BM=16, int BN=16, int BK=16>
 __global__ static void matmul_q8_0_f32_batch_wmma_onthefly_kernel(
@@ -853,6 +863,86 @@ __global__ static void matmul_q8_0_f32_batch_wmma_onthefly_kernel(
         if (t < n_tokens && row < out_dim) out[(uint64_t)t * out_dim + row] = shC[j];
     }
 }
+
+#if DS4_ROCM_MFMA_F16
+/* CDNA/gfx9 large-batch Q8_0 GEMM path.  Each hardware wave64 owns one
+ * 16-token x 16-row output tile and rocWMMA lowers the fragment multiply to
+ * v_mfma_f32_16x16x16_f16. */
+template <int TILES_N=4, int BM=16, int BN=16, int BK=16>
+__launch_bounds__(256, 2)
+__global__ static void matmul_q8_0_f32_batch_mfma_w64_onthefly_kernel(
+        float *out,
+        const unsigned char *w,
+        const float *x,
+        uint32_t n_tokens,
+        uint32_t in_dim,
+        uint32_t out_dim,
+        uint64_t row_bytes) {
+    extern __shared__ unsigned char raw_sh[];
+    half *shA = reinterpret_cast<half *>(raw_sh);
+    half *shB = shA + BM * BK;
+    float *shC = reinterpret_cast<float *>(shB + TILES_N * BK * BN);
+
+    const uint32_t tid = threadIdx.x;
+    const uint32_t wave = tid >> 6u;
+    const uint32_t t0 = (uint32_t)blockIdx.y * BM;
+    const uint32_t row0 = (uint32_t)blockIdx.x * TILES_N * BN;
+
+    using frag_a = rocwmma::fragment<rocwmma::matrix_a, BM, BN, BK, half, rocwmma::row_major>;
+    using frag_b = rocwmma::fragment<rocwmma::matrix_b, BM, BN, BK, half, rocwmma::row_major>;
+    using frag_c = rocwmma::fragment<rocwmma::accumulator, BM, BN, BK, float>;
+    frag_a a;
+    frag_b b;
+    frag_c acc;
+    if (wave < TILES_N) rocwmma::fill_fragment(acc, 0.0f);
+
+    for (uint32_t k0 = 0; k0 < in_dim; k0 += BK) {
+        for (uint32_t j = tid; j < BM * BK; j += blockDim.x) {
+            const uint32_t m = j / BK;
+            const uint32_t kk = j - m * BK;
+            const uint32_t t = t0 + m;
+            shA[j] = (t < n_tokens && k0 + kk < in_dim)
+                ? __float2half(x[(uint64_t)t * in_dim + k0 + kk])
+                : __float2half(0.0f);
+        }
+        for (uint32_t j = tid; j < TILES_N * BK * BN; j += blockDim.x) {
+            const uint32_t tn = j / (BK * BN);
+            const uint32_t rem = j - tn * BK * BN;
+            const uint32_t kk = rem / BN;
+            const uint32_t nn = rem - kk * BN;
+            const uint32_t row = row0 + tn * BN + nn;
+            const uint32_t k = k0 + kk;
+            if (row < out_dim && k < in_dim) {
+                const unsigned char *blk = w + (uint64_t)row * row_bytes + (uint64_t)(k >> 5u) * 34u;
+                const float d = __half2float(*(const half *)blk);
+                const int8_t q = ((const int8_t *)(blk + 2u))[k & 31u];
+                shB[j] = __float2half(d * (float)q);
+            } else {
+                shB[j] = __float2half(0.0f);
+            }
+        }
+        __syncthreads();
+        if (wave < TILES_N) {
+            rocwmma::load_matrix_sync(a, shA, BK);
+            rocwmma::load_matrix_sync(b, shB + wave * BK * BN, BN);
+            rocwmma::mma_sync(acc, a, b, acc);
+        }
+        __syncthreads();
+    }
+
+    if (wave < TILES_N) rocwmma::store_matrix_sync(shC + wave * BM * BN, acc, BN, rocwmma::mem_row_major);
+    __syncthreads();
+    for (uint32_t j = tid; j < TILES_N * BM * BN; j += blockDim.x) {
+        const uint32_t tn = j / (BM * BN);
+        const uint32_t rem = j - tn * BM * BN;
+        const uint32_t m = rem / BN;
+        const uint32_t nn = rem - m * BN;
+        const uint32_t t = t0 + m;
+        const uint32_t row = row0 + tn * BN + nn;
+        if (t < n_tokens && row < out_dim) out[(uint64_t)t * out_dim + row] = shC[j];
+    }
+}
+#endif
 #endif
 
 __global__ static void matmul_q8_0_pair_f32_warp8_kernel(

From 725661b112f2f880291b013ac5c95145582e9901 Mon Sep 17 00:00:00 2001
From: Eric Hartford <eric@quixi.ai>
Date: Wed, 1 Jul 2026 07:11:17 +0000
Subject: [PATCH 2/3] enable mi300x

---
 Makefile                                  |  45 +-
 README.md                                 |  51 +-
 ds4.c                                     | 196 ++++-
 ds4_cli.c                                 | 909 +++++++++++++++++++++-
 ds4_cuda.cu                               |  12 +
 ds4_distributed.c                         | 376 +++++++--
 ds4_gpu.h                                 |   1 +
 ds4_help.c                                |  17 +-
 ds4_internal.h                            |  36 +
 rocm/ds4_rocm_attention.cuh               |   8 +-
 rocm/ds4_rocm_attention_launch.cuh        |  17 +-
 rocm/ds4_rocm_current_api_compat.cuh      |  12 +
 rocm/ds4_rocm_matmul.cuh                  |  11 +-
 rocm/ds4_rocm_mfma.cuh                    |  61 ++
 rocm/ds4_rocm_moe.cuh                     | 279 +++++--
 rocm/ds4_rocm_moe_launch.cuh              | 172 +++-
 rocm/ds4_rocm_norm_rope.cuh               |   1 +
 rocm/ds4_rocm_q8.cuh                      | 114 ++-
 rocm/ds4_rocm_router.cuh                  |  11 +-
 rocm/ds4_rocm_runtime.cuh                 |  83 +-
 tests/rocm_pro_q4_8gpu_multiturn_smoke.sh | 246 ++++++
 tests/rocm_pro_q4_8gpu_smoke.sh           | 119 +++
 tests/rocm_pro_q4_logits_compare.sh       | 162 ++++
 tests/rocm_q8_mfma_correctness.c          | 252 ++++++
 24 files changed, 2939 insertions(+), 252 deletions(-)
 create mode 100644 ds4_internal.h
 create mode 100644 rocm/ds4_rocm_mfma.cuh
 create mode 100755 tests/rocm_pro_q4_8gpu_multiturn_smoke.sh
 create mode 100755 tests/rocm_pro_q4_8gpu_smoke.sh
 create mode 100755 tests/rocm_pro_q4_logits_compare.sh
 create mode 100644 tests/rocm_q8_mfma_correctness.c

diff --git a/Makefile b/Makefile
index 9ae553626..5abaf4f81 100644
--- a/Makefile
+++ b/Makefile
@@ -36,9 +36,12 @@ ROCM_ARCH ?= gfx1151
 ROCM_ARCHES = $(strip $(ROCM_ARCH))
 ROCM_PRIMARY_ARCH = $(firstword $(subst :, ,$(firstword $(ROCM_ARCHES))))
 ROCM_OFFLOAD_FLAGS = $(foreach arch,$(ROCM_ARCHES),--offload-arch=$(arch))
+ROCM_Q8_MFMA_ARCH ?= $(if $(filter gfx1151,$(ROCM_ARCH)),gfx942,$(ROCM_ARCH))
 ROCM_WMMA_W32 ?= $(if $(filter gfx11%,$(ROCM_PRIMARY_ARCH)),1,0)
 ROCM_MFMA_F16 ?= $(if $(filter gfx94% gfx95%,$(ROCM_PRIMARY_ARCH)),1,0)
-ROCM_CFLAGS ?= -O3 -ffast-math -g -fno-finite-math-only -pthread -D__HIP_PLATFORM_AMD__ -DDS4_ROCM_WMMA_W32=$(ROCM_WMMA_W32) -DDS4_ROCM_MFMA_F16=$(ROCM_MFMA_F16) -Wno-unused-command-line-argument $(ROCM_OFFLOAD_FLAGS)
+ROCM_DIRECT_MFMA_F16 ?= $(ROCM_MFMA_F16)
+ROCM_ROCWMMA_F16_FALLBACK ?= 0
+ROCM_CFLAGS ?= -O3 -ffast-math -g -fno-finite-math-only -pthread -D__HIP_PLATFORM_AMD__ -DDS4_ROCM_WMMA_W32=$(ROCM_WMMA_W32) -DDS4_ROCM_MFMA_F16=$(ROCM_MFMA_F16) -DDS4_ROCM_DIRECT_MFMA_F16=$(ROCM_DIRECT_MFMA_F16) -DDS4_ROCM_ROCWMMA_F16_FALLBACK=$(ROCM_ROCWMMA_F16_FALLBACK) -Wno-unused-command-line-argument $(ROCM_OFFLOAD_FLAGS)
 ROCM_LDLIBS ?= -lm -pthread -lhipblas -lhipblaslt
 ROCM_TARGETS := ds4 ds4-server ds4-bench ds4-eval ds4-agent
 ROCM_CORE_OBJS := ds4.o ds4_distributed.o ds4_ssd.o ds4_rocm.o
@@ -47,7 +50,7 @@ DS4_LINK_LIBS ?= $(CUDA_LDLIBS)
 METAL_LDLIBS := $(LDLIBS)
 endif
 
-.PHONY: all help clean test cpu cuda cuda-spark cuda-generic cuda-regression strix-halo cdna cdna3 cdna4 mi300x mi325x mi350x mi355x rocm
+.PHONY: all help clean test cpu cuda cuda-spark cuda-generic cuda-regression strix-halo cdna cdna3 cdna4 mi300x mi325x mi350x mi355x rocm rocm-q8-mfma-correctness manual-rocm-pro-q4-smoke manual-rocm-pro-q4-multiturn-smoke manual-rocm-pro-q4-logits-compare
 
 ifeq ($(UNAME_S),Darwin)
 all: ds4 ds4-server ds4-bench ds4-eval ds4-agent
@@ -94,7 +97,7 @@ help:
 	@echo "  make strix-halo          Build ROCm for Strix Halo / gfx1151"
 	@echo "  make cdna                Build ROCm CDNA3+CDNA4 fat binary / gfx942+gfx950"
 	@echo "  make cdna3               Build ROCm for AMD Instinct MI300X/MI325X / gfx942"
-	@echo "  make cdna4               Build ROCm for AMD Instinct MI350X/MI355X / gfx950"
+	@echo "  make cdna4               Build ROCm for AMD Instinct MI350X/MI355X / gfx950 (runtime validation pending)"
 	@echo "  make mi300x|mi325x       Alias for make cdna3"
 	@echo "  make mi350x|mi355x       Alias for make cdna4"
 	@echo "  make rocm ROCM_ARCH=gfxN Build ROCm with explicit AMD GPU target(s)"
@@ -179,16 +182,16 @@ cuda-regression: tests/cuda_long_context_smoke
 	./tests/cuda_long_context_smoke
 endif
 
-ds4.o: ds4.c ds4.h ds4_ssd.h ds4_distributed.h ds4_gpu.h
+ds4.o: ds4.c ds4.h ds4_ssd.h ds4_distributed.h ds4_internal.h ds4_gpu.h
 	$(CC) $(CFLAGS) -c -o $@ ds4.c
 
 ds4_ssd.o: ds4_ssd.c ds4_ssd.h
 	$(CC) $(CFLAGS) -c -o $@ ds4_ssd.c
 
-ds4_cli.o: ds4_cli.c ds4.h ds4_ssd.h ds4_distributed.h ds4_help.h linenoise.h
+ds4_cli.o: ds4_cli.c ds4.h ds4_ssd.h ds4_distributed.h ds4_internal.h ds4_help.h linenoise.h
 	$(CC) $(CFLAGS) -c -o $@ ds4_cli.c
 
-ds4_distributed.o: ds4_distributed.c ds4_distributed.h ds4.h ds4_ssd.h
+ds4_distributed.o: ds4_distributed.c ds4_distributed.h ds4_internal.h ds4.h ds4_ssd.h
 	$(CC) $(CFLAGS) -c -o $@ ds4_distributed.c
 
 ds4_help.o: ds4_help.c ds4_help.h
@@ -212,6 +215,24 @@ ds4_web.o: ds4_web.c ds4_web.h
 ds4_kvstore.o: ds4_kvstore.c ds4_kvstore.h ds4.h ds4_ssd.h
 	$(CC) $(CFLAGS) -c -o $@ ds4_kvstore.c
 
+manual-rocm-pro-q4-smoke:
+	tests/rocm_pro_q4_8gpu_smoke.sh
+
+manual-rocm-pro-q4-multiturn-smoke:
+	tests/rocm_pro_q4_8gpu_multiturn_smoke.sh
+
+manual-rocm-pro-q4-logits-compare:
+	tests/rocm_pro_q4_logits_compare.sh
+
+rocm-q8-mfma-correctness:
+	$(MAKE) -B tests/rocm_q8_mfma_correctness \
+		HIPCC="$(HIPCC)" \
+		ROCM_ARCH="$(ROCM_Q8_MFMA_ARCH)" \
+		ROCM_LDLIBS="$(ROCM_LDLIBS)" \
+		CFLAGS="$(CFLAGS) -DDS4_ROCM_BUILD"
+	tests/rocm_q8_mfma_correctness
+	DS4_ROCM_DISABLE_Q8_BATCH_MFMA=1 tests/rocm_q8_mfma_correctness
+
 ds4_test.o: tests/ds4_test.c ds4_server.c ds4.h ds4_ssd.h ds4_distributed.h ds4_help.h ds4_kvstore.h rax.h
 	$(CC) $(CFLAGS) -Wno-unused-function -c -o $@ tests/ds4_test.c
 
@@ -221,16 +242,19 @@ ds4_agent_test.o: tests/ds4_agent_test.c ds4_agent.c ds4.h ds4_ssd.h ds4_distrib
 tests/cuda_long_context_smoke.o: tests/cuda_long_context_smoke.c ds4_gpu.h
 	$(CC) $(CFLAGS) -I. -c -o $@ tests/cuda_long_context_smoke.c
 
+tests/rocm_q8_mfma_correctness.o: tests/rocm_q8_mfma_correctness.c ds4_gpu.h
+	$(CC) $(CFLAGS) -I. -c -o $@ tests/rocm_q8_mfma_correctness.c
+
 rax.o: rax.c rax.h rax_malloc.h
 	$(CC) $(CFLAGS) -c -o $@ rax.c
 
 linenoise.o: linenoise.c linenoise.h
 	$(CC) $(CFLAGS) -c -o $@ linenoise.c
 
-ds4_cpu.o: ds4.c ds4.h ds4_ssd.h ds4_distributed.h ds4_gpu.h
+ds4_cpu.o: ds4.c ds4.h ds4_ssd.h ds4_distributed.h ds4_internal.h ds4_gpu.h
 	$(CC) $(CFLAGS) -DDS4_NO_GPU -c -o $@ ds4.c
 
-ds4_cli_cpu.o: ds4_cli.c ds4.h ds4_ssd.h ds4_distributed.h ds4_help.h linenoise.h
+ds4_cli_cpu.o: ds4_cli.c ds4.h ds4_ssd.h ds4_distributed.h ds4_internal.h ds4_help.h linenoise.h
 	$(CC) $(CFLAGS) -DDS4_NO_GPU -c -o $@ ds4_cli.c
 
 ds4_server_cpu.o: ds4_server.c ds4.h ds4_ssd.h ds4_distributed.h ds4_help.h ds4_kvstore.h rax.h
@@ -257,6 +281,9 @@ ds4_rocm.o: ds4_rocm.cu ds4_gpu.h ds4_iq2_tables_cuda.inc $(ROCM_SRCS)
 tests/cuda_long_context_smoke: tests/cuda_long_context_smoke.o ds4_cuda.o
 	$(NVCC) $(NVCCFLAGS) -o $@ $^ $(CUDA_LDLIBS)
 
+tests/rocm_q8_mfma_correctness: tests/rocm_q8_mfma_correctness.o ds4_rocm.o
+	$(HIPCC) $(ROCM_CFLAGS) -o $@ $^ $(ROCM_LDLIBS)
+
 ds4_test: ds4_test.o ds4_help.o ds4_kvstore.o rax.o $(CORE_OBJS)
 ifeq ($(UNAME_S),Darwin)
 	$(CC) $(CFLAGS) -o $@ ds4_test.o ds4_help.o ds4_kvstore.o rax.o $(CORE_OBJS) $(METAL_LDLIBS)
@@ -281,4 +308,4 @@ q4k-dot-test: tests/test_q4k_dot.c
 	./tests/test_q4k_dot
 
 clean:
-	rm -f ds4 ds4-server ds4-bench ds4-eval ds4-agent ds4_cpu ds4_native ds4_server_test ds4_test ds4_agent_test tests/test_q4k_dot *.o tests/cuda_long_context_smoke tests/cuda_long_context_smoke.o
+	rm -f ds4 ds4-server ds4-bench ds4-eval ds4-agent ds4_cpu ds4_native ds4_server_test ds4_test ds4_agent_test tests/test_q4k_dot *.o tests/cuda_long_context_smoke tests/cuda_long_context_smoke.o tests/rocm_q8_mfma_correctness tests/rocm_q8_mfma_correctness.o
diff --git a/README.md b/README.md
index 98d836913..29911c05a 100644
--- a/README.md
+++ b/README.md
@@ -14,7 +14,7 @@ and for quality and speed testing.
 We support the following backends:
 * **Metal** is our primary target. Starting from MacBooks with 96GB of RAM (or less, using SSD streaming).
 * **NVIDIA CUDA / DGX Spark**, CUDA with special care for the DGX Spark.
-* **AMD ROCm**, with explicit targets for AMD Instinct CDNA3/CDNA4 accelerators and Strix Halo (`gfx1151`).
+* **AMD ROCm**, validated on AMD Instinct CDNA3 / MI300X. CDNA4 (`gfx950`) build targets are included but still need runtime validation on CDNA4 hardware. Strix Halo uses the `gfx1151` target.
 
 This project would not exist without **llama.cpp and GGML**, make sure to read
 the acknowledgements section, a big thank you to Georgi Gerganov and all the
@@ -147,7 +147,7 @@ make cuda-spark       # Linux CUDA, DGX Spark / GB10
 make cuda-generic     # Linux CUDA, other local CUDA GPUs
 make cdna             # Linux ROCm, AMD Instinct CDNA3+CDNA4 / gfx942+gfx950
 make cdna3            # Linux ROCm, AMD Instinct MI300X/MI325X / gfx942
-make cdna4            # Linux ROCm, AMD Instinct MI350X/MI355X / gfx950
+make cdna4            # Linux ROCm, AMD Instinct MI350X/MI355X / gfx950 (runtime validation pending)
 make strix-halo       # Linux ROCm, Strix Halo / gfx1151
 make cpu              # CPU-only diagnostics build
 ```
@@ -294,6 +294,34 @@ To build an initial mental model, here are the high level concepts:
 4. Each worker keeps its slice of the KV cache.
 5. Communication is worker-to-worker, there is no need to use the coordinator as relay, so if your coordinator is `A`, and you make a request, activations will flow in `A -> B -> C -> back to A`.
 
+For multiple GPUs in one local Linux host, `./ds4 --gpus` starts the local
+workers for you, assigns one process per listed GPU, picks a localhost
+coordinator port, and splits layers contiguously. It uses the same distributed
+runtime as the manual `--role coordinator` / `--role worker` flow, but you do
+not need a shell loop:
+
+```sh
+./ds4 --rocm -m ds4flash.gguf --gpus 0,1,2,3,4,5,6,7 -p "Hello"
+./ds4 --cuda -m ds4flash.gguf --gpus 0,1,2,3 -p "Hello"
+```
+
+For distributed split GGUFs on one host, repeat `-m` for each shard in any
+order. The local launcher inspects and sorts the shards, assigns GPUs
+by model-cache footprint across the full layer range, and gives the output head
+to the final GPU. If a physical GPU's assigned range crosses a GGUF shard
+boundary, the launcher starts two adjacent local workers on that same GPU so
+each process can keep loading one shard file. For very tight VRAM splits, the
+launcher may default `--prefill-chunk` to 1024 to keep graph scratch below the
+remaining device memory; pass `--prefill-chunk` explicitly to override that:
+
+```sh
+./ds4 --rocm \
+  -m gguf/DeepSeek-V4-Pro-Q4K-Layers00-30.gguf \
+  -m gguf/DeepSeek-V4-Pro-Q4K-Layers-31-output.gguf \
+  --gpus 0,1,2,3,4,5,6,7 \
+  --ctx 262144
+```
+
 ### How it works and how to configure it
 
 The prefill path is pipelined (this is why it can go faster than in a single machine).
@@ -689,8 +717,8 @@ ds4>
 The interactive CLI is a real multi-turn chat. It keeps the rendered chat
 transcript and the live graph KV checkpoint, so each turn extends the previous
 conversation. Useful commands are `/help`, `/think`, `/think-max`, `/nothink`,
-`/ctx N`, `/read FILE`, and `/quit`. Ctrl+C interrupts the current generation
-and returns to `ds4>`.
+`/ctx N`, `/read FILE`, and `/quit`. Ctrl+C interrupts the current generation;
+at `ds4>`, it exits.
 
 The CLI defaults to thinking mode. Use `/nothink` or `--nothink` for direct
 answers. `--mtp MTP.gguf --mtp-draft 2` enables the optional MTP speculative
@@ -1197,9 +1225,11 @@ make cuda CUDA_ARCH=sm_120
 make cuda CUDA_ARCH=native
 ```
 
-For AMD GPUs, use the matching ROCm target. `make cdna` builds a portable
-CDNA3/CDNA4 binary with both `gfx942` and `gfx950` code objects and enables the
-CDNA wave64 MFMA q8 prefill path. If you only need one product generation,
+For AMD GPUs, use the matching ROCm target. `make cdna` builds a CDNA3/CDNA4
+binary with both `gfx942` and `gfx950` code objects and enables the CDNA wave64
+MFMA q8 prefill path. Runtime validation for this path has been performed on
+CDNA3 / MI300X only; the CDNA4 targets compile the `gfx950` path but still need
+validation on MI350X/MI355X hardware. If you only need one product generation,
 `make cdna3`, `make mi300x`, and `make mi325x` target `gfx942`; `make cdna4`,
 `make mi350x`, and `make mi355x` target `gfx950`. Strix Halo systems use
 `make strix-halo` and keep the gfx11 wave32 WMMA q8 path. For another AMD GPU,
@@ -1216,6 +1246,13 @@ make rocm ROCM_ARCH=gfx942
 make rocm ROCM_ARCH="gfx942 gfx950"
 ```
 
+The portable synthetic check for the CDNA MFMA Q8 path is:
+
+```sh
+make rocm-q8-mfma-correctness
+make rocm-q8-mfma-correctness ROCM_Q8_MFMA_ARCH=gfx950  # requires CDNA4 hardware to run
+```
+
 There is also a CPU reference/debug path:
 
 ```sh
diff --git a/ds4.c b/ds4.c
index 640511eb0..708398753 100644
--- a/ds4.c
+++ b/ds4.c
@@ -38,6 +38,7 @@
 
 #include "ds4.h"
 #include "ds4_distributed.h"
+#include "ds4_internal.h"
 
 #ifndef DS4_NO_GPU
 #include "ds4_gpu.h"
@@ -2123,6 +2124,32 @@ static uint64_t accelerator_cuda_preload_span_bytes(void) {
     return mb * 1048576ull;
 }
 
+static bool accelerator_model_cache_preflight(const char *accelerator_name,
+                                              uint64_t    required_bytes) {
+    if (required_bytes == 0 || required_bytes == UINT64_MAX) return true;
+    uint64_t free_bytes = 0;
+    uint64_t total_bytes = 0;
+    if (ds4_gpu_memory_info(&free_bytes, &total_bytes) == 0) return true;
+
+    const uint64_t reserve_bytes = 2ull * 1073741824ull;
+    const uint64_t available_bytes =
+        free_bytes > reserve_bytes ? free_bytes - reserve_bytes : 0;
+    if (required_bytes <= available_bytes) return true;
+
+    fprintf(stderr,
+            "ds4: %s non-streaming model tensor cache needs %.2f GiB, "
+            "but this device has %.2f GiB free (%.2f GiB total, %.2f GiB reserve)\n",
+            accelerator_name,
+            (double)required_bytes / 1073741824.0,
+            (double)free_bytes / 1073741824.0,
+            (double)total_bytes / 1073741824.0,
+            (double)reserve_bytes / 1073741824.0);
+    fprintf(stderr,
+            "ds4: use --ssd-streaming for single-GPU routed-expert streaming, "
+            "or --gpus to shard layers across local GPUs\n");
+    return false;
+}
+
 static bool accelerator_span_filter_contains(uint64_t off,
                                              uint64_t bytes,
                                              const uint64_t *span_offsets,
@@ -2186,6 +2213,34 @@ static bool accelerator_prepare_model_tensor_spans(const ds4_model *m,
     qsort(spans, (size_t)nspan, sizeof(spans[0]), accelerator_tensor_span_cmp);
 
     const uint64_t max_span = accelerator_cuda_preload_span_bytes();
+#ifdef DS4_ROCM_BUILD
+    const char *accelerator_name = "ROCm";
+#else
+    const char *accelerator_name = "CUDA";
+#endif
+    uint64_t required = 0;
+    for (uint64_t i = 0; i < nspan;) {
+        uint64_t off = spans[i].off;
+        uint64_t end = spans[i].end;
+        i++;
+        while (i < nspan &&
+               spans[i].off <= end + 65536u &&
+               spans[i].end - off <= max_span) {
+            if (spans[i].end > end) end = spans[i].end;
+            i++;
+        }
+        const uint64_t bytes = end - off;
+        if (required > UINT64_MAX - bytes) {
+            required = UINT64_MAX;
+            break;
+        }
+        required += bytes;
+    }
+    if (!accelerator_model_cache_preflight(accelerator_name, required)) {
+        free(spans);
+        return false;
+    }
+
     const int tty = ds4_log_is_tty(stderr);
     const uint64_t progress_step = (tty ? 2ull : 16ull) * 1073741824ull;
     uint64_t next_progress = progress_step;
@@ -2193,12 +2248,6 @@ static bool accelerator_prepare_model_tensor_spans(const ds4_model *m,
     uint64_t prepared = 0;
     uint64_t merged = 0;
 
-#ifdef DS4_ROCM_BUILD
-    const char *accelerator_name = "ROCm";
-#else
-    const char *accelerator_name = "CUDA";
-#endif
-
     fprintf(stderr, "%sds4: %s preparing model tensor mappings%s",
             tty ? "\r\033[K" : "",
             accelerator_name,
@@ -10360,6 +10409,8 @@ typedef struct {
     uint32_t spec_prefix1_n_index_comp[DS4_MAX_LAYER];
     bool spec_capture_prefix1;
     uint32_t raw_cap;
+    uint32_t layer_start;
+    uint32_t layer_end;
     /* Maximum compressed-row capacity across layers.  Shared work buffers use
      * this worst-case size because ratio-4 indexer layers can still reach it. */
     uint32_t comp_cap;
@@ -10961,9 +11012,16 @@ static bool metal_graph_alloc_raw_cap(
         uint32_t                raw_cap,
         uint32_t                ctx_size,
         uint32_t                prefill_cap,
+        uint32_t                layer_start,
+        uint32_t                layer_end,
         bool                    enable_mtp) {
     memset(g, 0, sizeof(*g));
     g->mtp_enabled = enable_mtp;
+    if (layer_start >= DS4_N_LAYER) return false;
+    if (layer_end == UINT32_MAX || layer_end >= DS4_N_LAYER) layer_end = DS4_N_LAYER - 1u;
+    if (layer_end < layer_start) return false;
+    g->layer_start = layer_start;
+    g->layer_end = layer_end;
     if (raw_cap == 0) raw_cap = 1;
     if (ctx_size == 0) ctx_size = raw_cap;
     if (prefill_cap == 0) prefill_cap = 1;
@@ -10977,7 +11035,7 @@ static bool metal_graph_alloc_raw_cap(
     g->raw_window = raw_window;
     g->prefill_cap = prefill_cap;
     uint32_t min_ratio = UINT32_MAX;
-    for (uint32_t il = 0; il < DS4_N_LAYER; il++) {
+    for (uint32_t il = layer_start; il <= layer_end; il++) {
         const uint32_t ratio = ds4_layer_compress_ratio(il);
         if (ratio != 0 && ratio < min_ratio) min_ratio = ratio;
     }
@@ -10988,7 +11046,7 @@ static bool metal_graph_alloc_raw_cap(
         g->attn_comp_stage_cap = prefill_cap / min_ratio + 2u;
         if (g->attn_comp_stage_cap < 2u) g->attn_comp_stage_cap = 2u;
     }
-    for (uint32_t il = 0; il < DS4_N_LAYER; il++) {
+    for (uint32_t il = layer_start; il <= layer_end; il++) {
         const uint32_t ratio = ds4_layer_compress_ratio(il);
         if (ratio == 0) {
             g->layer_comp_cap[il] = 0;
@@ -11027,7 +11085,7 @@ static bool metal_graph_alloc_raw_cap(
          * turning memory pressure into a machine-wide lockup.
          */
         fprintf(stderr,
-                "ds4: CUDA using managed KV cache for ctx=%u "
+                "ds4: GPU using managed KV cache for ctx=%u "
                 "(kv cache %.2f GiB, context buffers %.2f GiB); "
                 "this may degrade performance but is needed for very large contexts\n",
                 ctx_size,
@@ -11054,7 +11112,7 @@ static bool metal_graph_alloc_raw_cap(
     g->kv_raw = ds4_gpu_tensor_alloc((uint64_t)DS4_N_HEAD_DIM * sizeof(float));
     g->kv = ds4_gpu_tensor_alloc((uint64_t)DS4_N_HEAD_DIM * sizeof(float));
     bool state_init_ok = true;
-    for (uint32_t il = 0; il < DS4_N_LAYER; il++) {
+    for (uint32_t il = layer_start; il <= layer_end; il++) {
         g->layer_raw_cache[il] = metal_graph_alloc_kv_cache_tensor(
                 managed_kv_cache,
                 (uint64_t)raw_cap * DS4_N_HEAD_DIM * sizeof(float));
@@ -11216,7 +11274,7 @@ static bool metal_graph_alloc_raw_cap(
     g->batch_routed_out = ds4_gpu_tensor_alloc(pc * DS4_N_EMBD * sizeof(float));
 
     bool layer_cache_ok = true;
-    for (uint32_t il = 0; layer_cache_ok && il < DS4_N_LAYER; il++) {
+    for (uint32_t il = layer_start; layer_cache_ok && il <= layer_end; il++) {
         layer_cache_ok = g->layer_raw_cache[il] != NULL;
         const uint32_t ratio = ds4_layer_compress_ratio(il);
         if (layer_cache_ok && ratio != 0) {
@@ -11292,7 +11350,15 @@ static bool metal_graph_alloc(
         ds4_gpu_graph *g,
         const ds4_weights     *weights,
         const ds4_layer_weights *layer) {
-    return metal_graph_alloc_raw_cap(g, weights, layer, DS4_N_SWA, DS4_N_SWA, 1, false);
+    return metal_graph_alloc_raw_cap(g,
+                                     weights,
+                                     layer,
+                                     DS4_N_SWA,
+                                     DS4_N_SWA,
+                                     1,
+                                     0,
+                                     DS4_N_LAYER - 1u,
+                                     false);
 }
 
 static bool metal_graph_install_model_spans(
@@ -20272,7 +20338,11 @@ static bool metal_graph_reset_prefill_state(ds4_gpu_graph *g) {
     memset(g->layer_n_comp, 0, sizeof(g->layer_n_comp));
     memset(g->layer_n_index_comp, 0, sizeof(g->layer_n_index_comp));
     g->mtp_n_raw = 0;
-    for (uint32_t il = 0; il < DS4_N_LAYER; il++) {
+    uint32_t layer_start = g->layer_start;
+    uint32_t layer_end = g->layer_end;
+    if (layer_start >= DS4_N_LAYER) layer_start = 0;
+    if (layer_end >= DS4_N_LAYER || layer_end < layer_start) layer_end = DS4_N_LAYER - 1u;
+    for (uint32_t il = layer_start; il <= layer_end; il++) {
         const uint32_t ratio = ds4_layer_compress_ratio(il);
         if (ratio == 0) continue;
         const uint32_t coff = ratio == 4 ? 2u : 1u;
@@ -21510,7 +21580,8 @@ static int metal_graph_prompt_logits_test(
 
     ds4_gpu_graph g;
     bool ok = metal_graph_alloc_raw_cap(&g, weights, &weights->layer[0],
-                                        raw_cap, (uint32_t)ctx_size, (uint32_t)n_test, false);
+                                        raw_cap, (uint32_t)ctx_size, (uint32_t)n_test,
+                                        0, DS4_N_LAYER - 1u, false);
     if (!ok) {
         metal_graph_free(&g);
         fprintf(stderr, "ds4: failed to initialize Metal graph prompt test runtime\n");
@@ -21828,6 +21899,9 @@ struct ds4_engine {
     bool ssd_streaming;
     bool ssd_streaming_cold;
     ds4_distributed_options distributed;
+    bool load_slice;
+    uint32_t load_layer_start;
+    uint32_t load_layer_end;
     bool metal_ready;
     bool mtp_ready;
 };
@@ -22956,7 +23030,8 @@ static int generate_metal_graph_raw_swa(
     }
     ds4_gpu_graph g;
     bool ok = metal_graph_alloc_raw_cap(&g, weights, &weights->layer[0],
-                                        raw_cap, (uint32_t)ctx_size, prefill_cap, false);
+                                        raw_cap, (uint32_t)ctx_size, prefill_cap,
+                                        0, DS4_N_LAYER - 1u, false);
     if (!ok) {
         fprintf(stderr, "ds4: failed to allocate GPU graph runtime\n");
         return 1;
@@ -25023,7 +25098,8 @@ int ds4_engine_collect_imatrix(ds4_engine *e,
 
     ds4_gpu_graph g;
     bool ok = metal_graph_alloc_raw_cap(&g, weights, &weights->layer[0],
-                                        raw_cap, (uint32_t)ctx_size, prefill_cap, false);
+                                        raw_cap, (uint32_t)ctx_size, prefill_cap,
+                                        0, DS4_N_LAYER - 1u, false);
     if (!ok) {
         fprintf(stderr, "ds4: failed to allocate imatrix Metal graph runtime\n");
         free(dataset);
@@ -25552,6 +25628,7 @@ int ds4_engine_open(ds4_engine **out, const ds4_engine_options *opt) {
     e->ssd_streaming = opt->ssd_streaming;
     e->ssd_streaming_cold = opt->ssd_streaming_cold;
     e->distributed = opt->distributed;
+    ds4_internal_dist_copy_options_private(&opt->distributed, &e->distributed);
     e->power_percent = opt->power_percent > 0 ? opt->power_percent : 100;
     e->prefill_chunk = opt->prefill_chunk;
     e->ssd_streaming_cache_experts = opt->ssd_streaming_cache_experts;
@@ -25575,7 +25652,7 @@ int ds4_engine_open(ds4_engine **out, const ds4_engine_options *opt) {
         e->directional_steering_ffn_scale = opt->directional_steering_ffn;
     }
     if (opt->n_threads > 0) g_requested_threads = (uint32_t)opt->n_threads;
-    ds4_acquire_instance_lock();
+    if (opt->distributed.role == DS4_DISTRIBUTED_NONE) ds4_acquire_instance_lock();
 
     if (opt->simulate_used_memory_bytes != 0 &&
         !ds4_ssd_memory_lock_acquire(&e->simulated_memory,
@@ -25600,6 +25677,12 @@ int ds4_engine_open(ds4_engine **out, const ds4_engine_options *opt) {
         load_output = opt->distributed.layers.has_output;
         load_output_optional = opt->distributed.role == DS4_DISTRIBUTED_COORDINATOR;
     }
+    e->load_slice = load_slice;
+    e->load_layer_start = load_slice ? load_layer_start : 0;
+    e->load_layer_end = load_slice ? load_layer_end : DS4_N_LAYER - 1u;
+    if (e->load_layer_end == UINT32_MAX || e->load_layer_end >= DS4_N_LAYER) {
+        e->load_layer_end = DS4_N_LAYER - 1u;
+    }
 
     const bool graph_backend = ds4_backend_uses_graph(opt->backend);
     if (graph_backend) ds4_linux_graph_backend_set_oom_score(opt->backend);
@@ -25933,7 +26016,7 @@ int ds4_engine_open(ds4_engine **out, const ds4_engine_options *opt) {
         if (!accelerator_cache_model_tensors(e->backend, &e->model,
                                              load_offsets, load_sizes,
                                              load_span_count)) {
-            fprintf(stderr, "ds4: %s failed to prepare optional model cache\n",
+            fprintf(stderr, "ds4: %s failed to prepare model cache\n",
                     ds4_backend_name(e->backend));
             free(load_offsets);
             free(load_sizes);
@@ -25949,7 +26032,7 @@ int ds4_engine_open(ds4_engine **out, const ds4_engine_options *opt) {
             (void)ds4_gpu_set_model_fd_for_map(e->mtp_model.fd, e->mtp_model.map);
             if (!accelerator_cache_model_tensors(e->backend, &e->mtp_model,
                                                  NULL, NULL, 0)) {
-                fprintf(stderr, "ds4: %s failed to prepare optional MTP model cache\n",
+                fprintf(stderr, "ds4: %s failed to prepare MTP model cache\n",
                         ds4_backend_name(e->backend));
                 ds4_engine_close(e);
                 *out = NULL;
@@ -26013,6 +26096,65 @@ uint64_t ds4_engine_hidden_f32_values(ds4_engine *e) {
     return (uint64_t)DS4_N_HC * DS4_N_EMBD;
 }
 
+int ds4_internal_model_shard_info_from_file(
+        const char *model_path,
+        ds4_internal_model_shard_info *out) {
+    if (!out) return 1;
+    memset(out, 0, sizeof(*out));
+    if (DS4_N_LAYER > DS4_INTERNAL_MAX_LAYER) return 1;
+
+    ds4_model m;
+    model_open(&m, model_path, false, false);
+    config_validate_model(&m);
+
+    out->n_layers = DS4_N_LAYER;
+    out->has_token_embedding = model_find_tensor(&m, "token_embd.weight") != NULL;
+    out->has_output_head =
+        model_find_tensor(&m, "output_hc_base.weight") != NULL &&
+        model_find_tensor(&m, "output_hc_fn.weight") != NULL &&
+        model_find_tensor(&m, "output_hc_scale.weight") != NULL &&
+        model_find_tensor(&m, "output_norm.weight") != NULL &&
+        model_find_tensor(&m, "output.weight") != NULL;
+
+    for (uint32_t il = 0; il < DS4_N_LAYER; il++) {
+        if (!tensor_by_namef(&m, "blk.%u.attn_norm.weight", il)) continue;
+        if (!out->has_layers) {
+            out->first_layer = il;
+            out->last_layer = il;
+            out->has_layers = true;
+        } else {
+            out->last_layer = il;
+        }
+    }
+
+    if (out->has_layers) {
+        ds4_weights w;
+        weights_bind(&w,
+                     &m,
+                     true,
+                     out->first_layer,
+                     out->last_layer,
+                     out->has_output_head,
+                     false);
+        for (uint32_t start = out->first_layer; start <= out->last_layer; start++) {
+            for (uint32_t end = start; end <= out->last_layer; end++) {
+                ds4_model_map_span_vec spans;
+                const bool include_output = out->has_output_head && end == out->last_layer;
+                if (weights_model_map_spans(&w, start, end, include_output, &spans)) {
+                    out->range_cache_bytes[start][end] =
+                        model_map_span_vec_total_bytes(&spans);
+                    free(spans.v);
+                }
+            }
+        }
+        out->total_cache_bytes =
+            out->range_cache_bytes[out->first_layer][out->last_layer];
+    }
+
+    model_close(&m);
+    return 0;
+}
+
 int ds4_engine_model_id(ds4_engine *e) {
     (void)e;
     return (int)DS4_MODEL_VARIANT;
@@ -26030,6 +26172,7 @@ void ds4_engine_close(ds4_engine *e) {
     ds4_gpu_cleanup();
 #endif
     ds4_ssd_memory_lock_release(&e->simulated_memory);
+    ds4_internal_dist_clear_options_private(&e->distributed);
     ds4_release_instance_lock();
     free(e->directional_steering_dirs);
     free(e->directional_steering_file);
@@ -26072,7 +26215,8 @@ int ds4_session_create(ds4_session **out, ds4_engine *e, int ctx_size) {
         return 1;
     }
     if (!metal_graph_alloc_raw_cap(&s->graph, &e->weights, shape_layer,
-                                   raw_cap, (uint32_t)ctx_size, s->prefill_cap, e->mtp_ready))
+                                   raw_cap, (uint32_t)ctx_size, s->prefill_cap,
+                                   e->load_layer_start, e->load_layer_end, e->mtp_ready))
     {
         free(s);
         return 1;
@@ -26185,7 +26329,11 @@ static bool ds4_session_cancelled(ds4_session *s) {
     return s && s->cancel && s->cancel(s->cancel_ud);
 }
 
-static bool ds4_session_cancelled_cb(void *ud) {
+bool ds4_internal_session_cancel_requested(ds4_session *s) {
+    return ds4_session_cancelled(s);
+}
+
+static DS4_MAYBE_UNUSED bool ds4_session_cancelled_cb(void *ud) {
     return ds4_session_cancelled(ud);
 }
 
@@ -26381,6 +26529,12 @@ int ds4_session_eval_layer_slice(ds4_session *s,
 
     ds4_engine *e = s->engine;
     ds4_gpu_graph *g = &s->graph;
+    if (layer_start < g->layer_start || layer_end > g->layer_end) {
+        if (errlen) snprintf(err, errlen, "layer-slice %u:%u is outside allocated graph range %u:%u",
+                             layer_start, layer_end, g->layer_start, g->layer_end);
+        s->checkpoint_valid = false;
+        return 1;
+    }
     if (!input_hc && !output_hc && output_logits &&
         layer_start == 0 && layer_end + 1u == (uint32_t)DS4_N_LAYER) {
         bool ok = false;
diff --git a/ds4_cli.c b/ds4_cli.c
index 4ad2240e8..d91f33e59 100644
--- a/ds4_cli.c
+++ b/ds4_cli.c
@@ -1,6 +1,7 @@
 #include "ds4.h"
 #include "ds4_distributed.h"
 #include "ds4_help.h"
+#include "ds4_internal.h"
 #include "linenoise.h"
 
 /* ds4 CLI.
@@ -12,6 +13,7 @@
  * engine API. */
 
 #include <ctype.h>
+#include <dirent.h>
 #include <errno.h>
 #include <limits.h>
 #include <math.h>
@@ -24,6 +26,20 @@
 #include <stdarg.h>
 #include <time.h>
 #include <unistd.h>
+#include <arpa/inet.h>
+#include <netinet/in.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#ifdef __linux__
+#include <sys/prctl.h>
+#endif
+
+enum {
+    CLI_LOCAL_GPU_MAX = 64,
+    CLI_MODEL_PATH_MAX = 16,
+    CLI_LOCAL_ASSIGNMENT_MAX = CLI_LOCAL_GPU_MAX + CLI_MODEL_PATH_MAX
+};
 
 typedef struct {
     const char *prompt;
@@ -51,20 +67,55 @@ typedef struct {
     bool metal_graph_prompt_test;
 } cli_generation_options;
 
+typedef struct {
+    const char *path;
+    ds4_internal_model_shard_info info;
+} cli_model_shard;
+
+typedef struct {
+    const char *model_path;
+    int gpu_id;
+    uint32_t start;
+    uint32_t end;
+    bool has_output;
+    uint64_t cache_bytes;
+} cli_local_gpu_assignment;
+
+typedef struct {
+    bool enabled;
+    int ids[CLI_LOCAL_GPU_MAX];
+    uint32_t n;
+    uint32_t n_layers;
+    cli_model_shard shards[CLI_MODEL_PATH_MAX];
+    uint32_t n_shards;
+    cli_local_gpu_assignment assignments[CLI_LOCAL_ASSIGNMENT_MAX];
+    uint32_t assignment_count;
+    int coordinator_port;
+    int coordinator_listen_fd;
+    pid_t worker_pids[CLI_LOCAL_ASSIGNMENT_MAX];
+    uint32_t worker_count;
+} cli_local_gpus;
+
 typedef struct {
     ds4_engine_options engine;
     ds4_dist_options *dist;
     cli_generation_options gen;
+    cli_local_gpus local_gpus;
+    const char *model_paths[CLI_MODEL_PATH_MAX];
+    uint32_t model_path_count;
     char *prompt_owned;
     bool inspect;
+    bool prefill_chunk_set;
 } cli_config;
 
+static int cli_local_gpus_reap_exited(const cli_config *cfg);
+
 static volatile sig_atomic_t cli_interrupted;
 static volatile sig_atomic_t cli_dist_busy;
 static volatile sig_atomic_t cli_dist_notice_printed;
 
 static const char cli_dist_drain_msg[] =
-    "\nds4: stopping after the distributed cluster finishes the current token/chunk...\n";
+    "\nds4: interrupt received; stopping distributed generation...\n";
 
 static void cli_sigint_handler(int sig) {
     (void)sig;
@@ -82,11 +133,28 @@ static bool cli_interrupt_requested(void) {
     return cli_interrupted != 0;
 }
 
+static bool cli_session_cancel_cb(void *ud) {
+    (void)ud;
+    return cli_interrupt_requested();
+}
+
 static void cli_interrupt_clear(void) {
     cli_interrupted = 0;
     cli_dist_notice_printed = 0;
 }
 
+static void cli_local_gpus_close_listener(cli_local_gpus *g) {
+    if (!g || g->coordinator_listen_fd < 0) return;
+    close(g->coordinator_listen_fd);
+    g->coordinator_listen_fd = -1;
+}
+
+static void cli_local_gpus_release_listener(const cli_config *cfg) {
+    if (!cfg || !cfg->local_gpus.enabled) return;
+    cli_local_gpus *g = (cli_local_gpus *)&cfg->local_gpus;
+    cli_local_gpus_close_listener(g);
+}
+
 static bool cli_distributed_coordinator(const cli_config *cfg) {
     return cfg && cfg->engine.distributed.role == DS4_DISTRIBUTED_COORDINATOR;
 }
@@ -99,6 +167,7 @@ static void cli_dist_busy_set(const cli_config *cfg, bool busy) {
 
 static int cli_wait_distributed_route(const cli_config *cfg, ds4_session *session) {
     if (!cli_distributed_coordinator(cfg)) return 0;
+    cli_local_gpus_release_listener(cfg);
 
     char err[256] = {0};
     char last[256] = {0};
@@ -106,6 +175,13 @@ static int cli_wait_distributed_route(const cli_config *cfg, ds4_session *sessio
     const struct timespec delay = {0, 250000000L};
 
     for (;;) {
+        if (cli_interrupt_requested()) {
+            fprintf(stderr, "ds4: interrupted while waiting for distributed route\n");
+            return 1;
+        }
+        if (cli_local_gpus_reap_exited(cfg) != 0) {
+            return 1;
+        }
         int ready = ds4_session_distributed_route_ready(session, err, sizeof(err));
         if (ready > 0) {
             if (ticks) fprintf(stderr, "ds4: distributed route ready\n");
@@ -231,6 +307,719 @@ static double cli_now_sec(void) {
     return (double)ts.tv_sec + (double)ts.tv_nsec * 1.0e-9;
 }
 
+static void cli_local_gpus_parse(cli_local_gpus *g, const char *spec) {
+    if (!g || !spec || !spec[0]) {
+        fprintf(stderr, "ds4: --gpus requires a comma-separated GPU list\n");
+        exit(2);
+    }
+
+    char *copy = malloc(strlen(spec) + 1u);
+    if (!copy) {
+        fprintf(stderr, "ds4: out of memory parsing --gpus\n");
+        exit(1);
+    }
+    strcpy(copy, spec);
+
+    char *p = copy;
+    while (*p) {
+        while (*p == ',' || isspace((unsigned char)*p)) p++;
+        if (!*p) break;
+        char *start = p;
+        while (*p && *p != ',') p++;
+        char saved = *p;
+        *p = '\0';
+
+        char *end_trim = start + strlen(start);
+        while (end_trim > start && isspace((unsigned char)end_trim[-1])) *--end_trim = '\0';
+        errno = 0;
+        char *end = NULL;
+        long id = strtol(start, &end, 10);
+        if (errno != 0 || start[0] == '\0' || *end != '\0' || id < 0 || id > INT32_MAX) {
+            fprintf(stderr, "ds4: invalid GPU id in --gpus: %s\n", start);
+            free(copy);
+            exit(2);
+        }
+        if (g->n >= CLI_LOCAL_GPU_MAX) {
+            fprintf(stderr, "ds4: --gpus supports at most %u entries\n", (unsigned)CLI_LOCAL_GPU_MAX);
+            free(copy);
+            exit(2);
+        }
+        for (uint32_t i = 0; i < g->n; i++) {
+            if (g->ids[i] == (int)id) {
+                fprintf(stderr, "ds4: duplicate GPU id in --gpus: %ld\n", id);
+                free(copy);
+                exit(2);
+            }
+        }
+        g->ids[g->n++] = (int)id;
+        if (!saved) break;
+        p++;
+    }
+    free(copy);
+
+    if (g->n == 0) {
+        fprintf(stderr, "ds4: --gpus requires at least one GPU id\n");
+        exit(2);
+    }
+    g->enabled = true;
+}
+
+static void cli_local_gpus_open_listener(cli_local_gpus *g) {
+    int fd = socket(AF_INET, SOCK_STREAM, 0);
+    if (fd < 0) {
+        fprintf(stderr, "ds4: failed to create local launcher socket: %s\n", strerror(errno));
+        exit(1);
+    }
+
+    int one = 1;
+    (void)setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one));
+
+    struct sockaddr_in addr;
+    memset(&addr, 0, sizeof(addr));
+    addr.sin_family = AF_INET;
+    addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
+    addr.sin_port = 0;
+    if (bind(fd, (const struct sockaddr *)&addr, sizeof(addr)) != 0) {
+        fprintf(stderr, "ds4: failed to bind local distributed listener: %s\n", strerror(errno));
+        close(fd);
+        exit(1);
+    }
+    if (listen(fd, 64) != 0) {
+        fprintf(stderr, "ds4: failed to listen on local distributed socket: %s\n", strerror(errno));
+        close(fd);
+        exit(1);
+    }
+
+    socklen_t len = sizeof(addr);
+    if (getsockname(fd, (struct sockaddr *)&addr, &len) != 0) {
+        fprintf(stderr, "ds4: failed to read local distributed port: %s\n", strerror(errno));
+        close(fd);
+        exit(1);
+    }
+    int port = (int)ntohs(addr.sin_port);
+    if (port <= 0) {
+        fprintf(stderr, "ds4: failed to allocate a local distributed port\n");
+        close(fd);
+        exit(1);
+    }
+    g->coordinator_port = port;
+    g->coordinator_listen_fd = fd;
+}
+
+static uint32_t cli_model_shard_layer_count(const cli_model_shard *s) {
+    if (!s || !s->info.has_layers || s->info.last_layer < s->info.first_layer) return 0;
+    return s->info.last_layer - s->info.first_layer + 1u;
+}
+
+static uint64_t cli_gib_bytes(void) {
+    return 1073741824ull;
+}
+
+static bool cli_read_u64_file(const char *path, uint64_t *out) {
+    if (out) *out = 0;
+    if (!path || !out) return false;
+    FILE *fp = fopen(path, "r");
+    if (!fp) return false;
+    char buf[64];
+    if (!fgets(buf, sizeof(buf), fp)) {
+        fclose(fp);
+        return false;
+    }
+    fclose(fp);
+
+    errno = 0;
+    char *end = NULL;
+    unsigned long long v = strtoull(buf, &end, 10);
+    if (errno != 0 || end == buf || v == 0) return false;
+    *out = (uint64_t)v;
+    return true;
+}
+
+static uint64_t cli_local_gpus_min_vram_total_bytes(void) {
+#ifdef __linux__
+    DIR *dir = opendir("/sys/class/drm");
+    if (!dir) return 0;
+    uint64_t min_total = UINT64_MAX;
+    struct dirent *de = NULL;
+    while ((de = readdir(dir)) != NULL) {
+        if (strncmp(de->d_name, "card", 4) != 0) continue;
+        char path[PATH_MAX];
+        int n = snprintf(path,
+                         sizeof(path),
+                         "/sys/class/drm/%s/device/mem_info_vram_total",
+                         de->d_name);
+        if (n < 0 || (size_t)n >= sizeof(path)) continue;
+        uint64_t total = 0;
+        if (!cli_read_u64_file(path, &total) || total == 0) continue;
+        if (total < min_total) min_total = total;
+    }
+    closedir(dir);
+    return min_total == UINT64_MAX ? 0 : min_total;
+#else
+    return 0;
+#endif
+}
+
+static uint64_t cli_model_shard_range_cache_bytes(
+        const cli_model_shard *s,
+        uint32_t start,
+        uint32_t end) {
+    if (!s || !s->info.has_layers || end < start) return 0;
+    if (start < s->info.first_layer || end > s->info.last_layer) return 0;
+    if (start < DS4_INTERNAL_MAX_LAYER && end < DS4_INTERNAL_MAX_LAYER) {
+        const uint64_t exact = s->info.range_cache_bytes[start][end];
+        if (exact != 0) return exact;
+    }
+
+    const uint32_t shard_layers = cli_model_shard_layer_count(s);
+    const uint32_t range_layers = end - start + 1u;
+    if (s->info.total_cache_bytes != 0 && shard_layers != 0) {
+        return (s->info.total_cache_bytes * range_layers + shard_layers - 1u) /
+               shard_layers;
+    }
+    return (uint64_t)range_layers;
+}
+
+static uint64_t cli_model_range_cache_bytes(
+        const cli_local_gpus *g,
+        uint32_t start,
+        uint32_t end) {
+    if (!g || end < start) return 0;
+    uint64_t total = 0;
+    for (uint32_t si = 0; si < g->n_shards; si++) {
+        const cli_model_shard *s = &g->shards[si];
+        if (end < s->info.first_layer || start > s->info.last_layer) continue;
+        const uint32_t lo = start > s->info.first_layer ? start : s->info.first_layer;
+        const uint32_t hi = end < s->info.last_layer ? end : s->info.last_layer;
+        const uint64_t bytes = cli_model_shard_range_cache_bytes(s, lo, hi);
+        if (total > UINT64_MAX - bytes) return UINT64_MAX;
+        total += bytes;
+    }
+    return total;
+}
+
+static uint64_t cli_model_total_cache_bytes(const cli_local_gpus *g) {
+    if (!g || g->n_layers == 0) return 0;
+    return cli_model_range_cache_bytes(g, 0, g->n_layers - 1u);
+}
+
+static void cli_local_gpus_model_budgets(
+        const cli_config *cfg,
+        uint64_t budgets[CLI_LOCAL_GPU_MAX]) {
+    for (uint32_t i = 0; i < CLI_LOCAL_GPU_MAX; i++) budgets[i] = 0;
+    if (!cfg || !cfg->local_gpus.enabled) return;
+
+    const uint64_t total = cli_local_gpus_min_vram_total_bytes();
+    if (total == 0) return;
+
+    const uint64_t model_reserve = 2ull * cli_gib_bytes();
+    uint64_t base = total > model_reserve ? total - model_reserve : 0;
+    for (uint32_t i = 0; i < cfg->local_gpus.n; i++) budgets[i] = base;
+
+    ds4_context_memory ctx =
+        ds4_context_memory_estimate_with_prefill(cfg->engine.backend,
+                                                 cfg->gen.ctx_size,
+                                                 cfg->engine.prefill_chunk);
+    budgets[0] = budgets[0] > ctx.total_bytes ? budgets[0] - ctx.total_bytes : 0;
+    for (uint32_t i = 0; i < cfg->local_gpus.n; i++) {
+        if (budgets[i] == 0) budgets[i] = 1;
+    }
+}
+
+static int cli_model_shard_cmp(const void *a, const void *b) {
+    const cli_model_shard *sa = (const cli_model_shard *)a;
+    const cli_model_shard *sb = (const cli_model_shard *)b;
+    const uint32_t la = sa->info.has_layers ? sa->info.first_layer : UINT32_MAX;
+    const uint32_t lb = sb->info.has_layers ? sb->info.first_layer : UINT32_MAX;
+    if (la < lb) return -1;
+    if (la > lb) return 1;
+    if (sa->info.has_token_embedding != sb->info.has_token_embedding) {
+        return sa->info.has_token_embedding ? -1 : 1;
+    }
+    if (sa->info.has_output_head != sb->info.has_output_head) {
+        return sa->info.has_output_head ? 1 : -1;
+    }
+    return 0;
+}
+
+static void cli_add_model_path(cli_config *cfg, const char *path) {
+    if (!cfg || !path || !path[0]) {
+        fprintf(stderr, "ds4: -m/--model requires a non-empty path\n");
+        exit(2);
+    }
+    if (cfg->model_path_count >= CLI_MODEL_PATH_MAX) {
+        fprintf(stderr,
+                "ds4: at most %u model shards may be passed with repeated -m\n",
+                (unsigned)CLI_MODEL_PATH_MAX);
+        exit(2);
+    }
+    cfg->model_paths[cfg->model_path_count++] = path;
+    cfg->engine.model_path = path;
+}
+
+static void cli_local_gpus_set_visible(int gpu_id) {
+    char id[32];
+    snprintf(id, sizeof(id), "%d", gpu_id);
+#ifdef DS4_ROCM_BUILD
+    setenv("ROCR_VISIBLE_DEVICES", id, 1);
+    setenv("HIP_VISIBLE_DEVICES", "0", 1);
+    setenv("CUDA_VISIBLE_DEVICES", "0", 1);
+#else
+    setenv("CUDA_VISIBLE_DEVICES", id, 1);
+#endif
+}
+
+static void cli_local_gpus_load_shards(cli_config *cfg) {
+    cli_local_gpus *g = &cfg->local_gpus;
+    g->n_shards = cfg->model_path_count;
+    if (g->n_shards == 0 || g->n_shards > CLI_MODEL_PATH_MAX) {
+        fprintf(stderr, "ds4: invalid local GPU model shard count\n");
+        exit(2);
+    }
+
+    uint32_t expected_start = 0;
+    uint32_t n_layers = 0;
+    for (uint32_t i = 0; i < g->n_shards; i++) {
+        cli_model_shard *s = &g->shards[i];
+        s->path = cfg->model_paths[i];
+        if (ds4_internal_model_shard_info_from_file(s->path, &s->info) != 0) {
+            fprintf(stderr, "ds4: failed to inspect model shard: %s\n", s->path);
+            exit(2);
+        }
+        if (i == 0) {
+            n_layers = s->info.n_layers;
+        } else if (s->info.n_layers != n_layers) {
+            fprintf(stderr, "ds4: model shards disagree on layer count\n");
+            exit(2);
+        }
+        if (!s->info.has_layers) {
+            fprintf(stderr, "ds4: model shard contains no transformer layers: %s\n", s->path);
+            exit(2);
+        }
+    }
+
+    qsort(g->shards, g->n_shards, sizeof(g->shards[0]), cli_model_shard_cmp);
+    for (uint32_t i = 0; i < g->n_shards; i++) {
+        cli_model_shard *s = &g->shards[i];
+        if (i == 0 && !s->info.has_token_embedding) {
+            fprintf(stderr,
+                    "ds4: model shard containing layer 0 must contain token_embd.weight: %s\n",
+                    s->path);
+            exit(2);
+        }
+        if (s->info.first_layer != expected_start) {
+            fprintf(stderr,
+                    "ds4: model shards must be contiguous; expected layer %u but %s starts at %u\n",
+                    expected_start,
+                    s->path,
+                    s->info.first_layer);
+            exit(2);
+        }
+        if (i + 1u < g->n_shards && s->info.has_output_head) {
+            fprintf(stderr, "ds4: only the final model shard may contain the output head: %s\n", s->path);
+            exit(2);
+        }
+        expected_start = s->info.last_layer + 1u;
+    }
+
+    if (expected_start != n_layers) {
+        fprintf(stderr,
+                "ds4: model shards cover %u layers but model metadata requires %u\n",
+                expected_start,
+                n_layers);
+        exit(2);
+    }
+    if (!g->shards[g->n_shards - 1u].info.has_output_head) {
+        fprintf(stderr, "ds4: final model shard must contain the output head\n");
+        exit(2);
+    }
+    if (g->n > n_layers) {
+        fprintf(stderr, "ds4: --gpus has %u entries but the model has only %u layers\n",
+                g->n, n_layers);
+        exit(2);
+    }
+    g->n_layers = n_layers;
+}
+
+static void cli_local_gpus_add_assignment(
+        cli_local_gpus *g,
+        const cli_model_shard *s,
+        int gpu_id,
+        uint32_t start,
+        uint32_t end) {
+    if (g->assignment_count >= CLI_LOCAL_ASSIGNMENT_MAX) {
+        fprintf(stderr,
+                "ds4: local GPU split produced too many route entries (%u max)\n",
+                (unsigned)CLI_LOCAL_ASSIGNMENT_MAX);
+        exit(2);
+    }
+    cli_local_gpu_assignment *a = &g->assignments[g->assignment_count++];
+    memset(a, 0, sizeof(*a));
+    a->model_path = s->path;
+    a->gpu_id = gpu_id;
+    a->start = start;
+    a->end = end;
+    a->has_output = s->info.has_output_head && end == s->info.last_layer;
+    a->cache_bytes = cli_model_shard_range_cache_bytes(s, start, end);
+}
+
+static bool cli_local_gpus_plan_ranges(
+        const cli_local_gpus *g,
+        const uint64_t budgets[CLI_LOCAL_GPU_MAX],
+        uint32_t starts[CLI_LOCAL_GPU_MAX],
+        uint32_t ends[CLI_LOCAL_GPU_MAX]) {
+    const uint32_t n = g->n;
+    const uint32_t layers = g->n_layers;
+    double dp[CLI_LOCAL_GPU_MAX + 1u][DS4_INTERNAL_MAX_LAYER + 1u];
+    int split[CLI_LOCAL_GPU_MAX + 1u][DS4_INTERNAL_MAX_LAYER + 1u];
+
+    for (uint32_t p = 0; p <= CLI_LOCAL_GPU_MAX; p++) {
+        for (uint32_t e = 0; e <= DS4_INTERNAL_MAX_LAYER; e++) {
+            dp[p][e] = 1.0e300;
+            split[p][e] = -1;
+        }
+    }
+    dp[0][0] = 0.0;
+
+    for (uint32_t p = 1; p <= n; p++) {
+        for (uint32_t e = p; e <= layers; e++) {
+            for (uint32_t k = p - 1u; k < e; k++) {
+                if (dp[p - 1u][k] >= 1.0e299) continue;
+                const uint64_t cost = cli_model_range_cache_bytes(g, k, e - 1u);
+                if (budgets[p - 1u] != 0 && cost > budgets[p - 1u]) continue;
+                const double load = budgets[p - 1u] != 0 ?
+                    (double)cost / (double)budgets[p - 1u] :
+                    (double)cost;
+                const double candidate = dp[p - 1u][k] > load ? dp[p - 1u][k] : load;
+                if (candidate < dp[p][e]) {
+                    dp[p][e] = candidate;
+                    split[p][e] = (int)k;
+                }
+            }
+        }
+    }
+
+    if (split[n][layers] < 0) return false;
+
+    uint32_t e = layers;
+    for (uint32_t p = n; p > 0; p--) {
+        const int k = split[p][e];
+        if (k < 0) return false;
+        starts[p - 1u] = (uint32_t)k;
+        ends[p - 1u] = e - 1u;
+        e = (uint32_t)k;
+    }
+    return e == 0;
+}
+
+static void cli_local_gpus_build_assignments(cli_config *cfg) {
+    cli_local_gpus *g = &cfg->local_gpus;
+    uint64_t budgets[CLI_LOCAL_GPU_MAX];
+    uint32_t starts[CLI_LOCAL_GPU_MAX];
+    uint32_t ends[CLI_LOCAL_GPU_MAX];
+
+    cli_local_gpus_model_budgets(cfg, budgets);
+    if (!cli_local_gpus_plan_ranges(g, budgets, starts, ends)) {
+        const uint64_t total = cli_model_total_cache_bytes(g);
+        fprintf(stderr,
+                "ds4: --gpus could not fit a byte-balanced local split "
+                "within estimated per-GPU model-cache budgets\n");
+        fprintf(stderr,
+                "ds4: model cache %.2f GiB, coordinator budget %.2f GiB, worker budget %.2f GiB\n",
+                (double)total / (double)cli_gib_bytes(),
+                (double)budgets[0] / (double)cli_gib_bytes(),
+                (g->n > 1 ? (double)budgets[1] : (double)budgets[0]) /
+                    (double)cli_gib_bytes());
+        exit(2);
+    }
+
+    g->assignment_count = 0;
+    for (uint32_t pi = 0; pi < g->n; pi++) {
+        for (uint32_t si = 0; si < g->n_shards; si++) {
+            const cli_model_shard *s = &g->shards[si];
+            if (ends[pi] < s->info.first_layer || starts[pi] > s->info.last_layer) continue;
+            const uint32_t lo = starts[pi] > s->info.first_layer ? starts[pi] : s->info.first_layer;
+            const uint32_t hi = ends[pi] < s->info.last_layer ? ends[pi] : s->info.last_layer;
+            cli_local_gpus_add_assignment(g, s, g->ids[pi], lo, hi);
+        }
+    }
+}
+
+static bool cli_local_gpus_tight_model_split(const cli_local_gpus *g) {
+    if (!g) return false;
+    const uint64_t tight = 160ull * 1073741824ull;
+    for (uint32_t i = 0; i < g->assignment_count; i++) {
+        if (g->assignments[i].cache_bytes >= tight) return true;
+    }
+    return false;
+}
+
+static void cli_local_gpus_configure(cli_config *cfg) {
+    if (!cfg || !cfg->local_gpus.enabled) return;
+    cli_local_gpus *g = &cfg->local_gpus;
+
+    if (cfg->engine.backend != DS4_BACKEND_CUDA) {
+#ifdef DS4_ROCM_BUILD
+        fprintf(stderr, "ds4: --gpus requires the ROCm backend in this build\n");
+#else
+        fprintf(stderr, "ds4: --gpus requires the CUDA backend in this build\n");
+#endif
+        exit(2);
+    }
+    if (cfg->engine.mtp_path) {
+        fprintf(stderr, "ds4: --gpus is not compatible with --mtp yet\n");
+        exit(2);
+    }
+    if (cfg->inspect || cfg->gen.dump_tokens || cfg->gen.head_test ||
+        cfg->gen.first_token_test || cfg->gen.metal_graph_test ||
+        cfg->gen.metal_graph_full_test || cfg->gen.metal_graph_prompt_test ||
+        cfg->gen.perplexity_file_path || cfg->gen.imatrix_dataset_path ||
+        cfg->gen.imatrix_output_path) {
+        fprintf(stderr, "ds4: --gpus is for normal generation/server-style distributed runs, not diagnostics\n");
+        exit(2);
+    }
+    if (cfg->dist->role != DS4_DISTRIBUTED_NONE ||
+        cfg->dist->layers.set ||
+        cfg->dist->listen_host ||
+        cfg->dist->listen_port ||
+        cfg->dist->coordinator_host ||
+        cfg->dist->coordinator_port) {
+        fprintf(stderr, "ds4: --gpus cannot be combined with explicit --role/--layers/--listen/--coordinator\n");
+        exit(2);
+    }
+
+    cli_local_gpus_load_shards(cfg);
+    cli_local_gpus_build_assignments(cfg);
+    if (!cfg->prefill_chunk_set && cli_local_gpus_tight_model_split(g)) {
+        cfg->engine.prefill_chunk = 1024;
+        fprintf(stderr,
+                "ds4: local GPU launcher defaulting --prefill-chunk 1024 "
+                "for a tight VRAM split; pass --prefill-chunk to override\n");
+        cli_local_gpus_build_assignments(cfg);
+    }
+    if (g->assignment_count == 0) {
+        fprintf(stderr, "ds4: local GPU split produced no route entries\n");
+        exit(2);
+    }
+    if (g->assignment_count == 1) {
+        cfg->engine.model_path = g->assignments[0].model_path;
+        return;
+    }
+    cli_local_gpus_open_listener(g);
+
+    const cli_local_gpu_assignment *a = &g->assignments[0];
+    cfg->engine.model_path = a->model_path;
+    cfg->dist->role = DS4_DISTRIBUTED_COORDINATOR;
+    cfg->dist->layers.start = a->start;
+    cfg->dist->layers.end = a->end;
+    cfg->dist->layers.has_output = a->has_output;
+    cfg->dist->layers.set = true;
+    cfg->dist->listen_host = "127.0.0.1";
+    cfg->dist->listen_port = g->coordinator_port;
+    ds4_internal_dist_set_adopt_listen_fd(cfg->dist, g->coordinator_listen_fd);
+    if (!getenv("DS4_DIST_ROUTE_WAIT_SEC")) setenv("DS4_DIST_ROUTE_WAIT_SEC", "300", 0);
+}
+
+static int cli_local_gpu_worker_run(cli_config cfg, uint32_t idx) {
+    cli_local_gpus *g = &cfg.local_gpus;
+    const cli_local_gpu_assignment *a = &g->assignments[idx];
+
+    cli_local_gpus_close_listener(g);
+    ds4_internal_dist_set_adopt_listen_fd(cfg.dist, 0);
+
+    struct sigaction ignore_int;
+    memset(&ignore_int, 0, sizeof(ignore_int));
+    ignore_int.sa_handler = SIG_IGN;
+    sigemptyset(&ignore_int.sa_mask);
+    (void)sigaction(SIGINT, &ignore_int, NULL);
+
+#ifdef __linux__
+    (void)prctl(PR_SET_PDEATHSIG, SIGTERM);
+    if (getppid() == 1) return 1;
+#endif
+
+    cli_local_gpus_set_visible(a->gpu_id);
+    if (!getenv("DS4_DIST_ENABLE_WORKER_PREFETCH") &&
+        !getenv("DS4_DIST_DISABLE_WORKER_PREFETCH")) {
+        setenv("DS4_DIST_DISABLE_WORKER_PREFETCH", "1", 0);
+    }
+    if (!getenv("DS4_DIST_ENABLE_WORKER_FORWARD_PIPELINE") &&
+        !getenv("DS4_DIST_WORKER_FORWARD_SYNC")) {
+        setenv("DS4_DIST_WORKER_FORWARD_SYNC", "1", 0);
+    }
+    cfg.engine.model_path = a->model_path;
+    cfg.dist->role = DS4_DISTRIBUTED_WORKER;
+    cfg.dist->layers.start = a->start;
+    cfg.dist->layers.end = a->end;
+    cfg.dist->layers.has_output = a->has_output;
+    cfg.dist->layers.set = true;
+    cfg.dist->listen_host = NULL;
+    cfg.dist->listen_port = 0;
+    cfg.dist->coordinator_host = "127.0.0.1";
+    cfg.dist->coordinator_port = g->coordinator_port;
+    cfg.dist->prefill_chunk = 0;
+    cfg.dist->prefill_window = 0;
+    cfg.dist->activation_bits = 0;
+    cfg.dist->replay_check = false;
+    ds4_internal_dist_set_local_worker(cfg.dist, true);
+
+    char dist_err[256];
+    if (ds4_dist_prepare_engine_options(cfg.dist, &cfg.engine, dist_err, sizeof(dist_err)) != 0) {
+        fprintf(stderr, "ds4: local GPU worker %u: %s\n", idx, dist_err);
+        return 2;
+    }
+
+    char end_buf[32];
+    if (a->has_output) snprintf(end_buf, sizeof(end_buf), "output");
+    else snprintf(end_buf, sizeof(end_buf), "%u", a->end);
+    fprintf(stderr,
+            "ds4: local GPU worker %u pid=%ld gpu=%d model=%s layers %u:%s connecting to 127.0.0.1:%d\n",
+            idx,
+            (long)getpid(),
+            a->gpu_id,
+            a->model_path,
+            a->start,
+            end_buf,
+            g->coordinator_port);
+
+    ds4_engine *engine = NULL;
+    if (ds4_engine_open(&engine, &cfg.engine) != 0) return 1;
+    ds4_dist_generation_options dist_gen = {
+        .prompt = cfg.gen.prompt,
+        .system = cfg.gen.system,
+        .dump_logits_path = cfg.gen.dump_logits_path,
+        .dump_logprobs_path = cfg.gen.dump_logprobs_path,
+        .dump_logprobs_top_k = cfg.gen.dump_logprobs_top_k,
+        .n_predict = cfg.gen.n_predict,
+        .ctx_size = cfg.gen.ctx_size,
+        .temperature = cfg.gen.temperature,
+        .top_p = cfg.gen.top_p,
+        .min_p = cfg.gen.min_p,
+        .seed = cfg.gen.seed,
+        .think_mode = cfg.gen.think_mode,
+    };
+    int rc = ds4_dist_run(engine, cfg.dist, &dist_gen);
+    ds4_engine_close(engine);
+    return rc;
+}
+
+static void cli_local_gpus_stop(cli_local_gpus *g) {
+    if (!g) return;
+    cli_local_gpus_close_listener(g);
+    if (g->worker_count == 0) return;
+    for (uint32_t i = 0; i < g->worker_count; i++) {
+        if (g->worker_pids[i] > 0) kill(g->worker_pids[i], SIGTERM);
+    }
+
+    bool left = true;
+    for (int spin = 0; spin < 50 && left; spin++) {
+        left = false;
+        for (uint32_t i = 0; i < g->worker_count; i++) {
+            pid_t pid = g->worker_pids[i];
+            if (pid <= 0) continue;
+            int status = 0;
+            pid_t got = waitpid(pid, &status, WNOHANG);
+            if (got == pid) g->worker_pids[i] = 0;
+            else if (got == 0) left = true;
+        }
+        if (left) {
+            const struct timespec delay = {0, 100000000L};
+            nanosleep(&delay, NULL);
+        }
+    }
+
+    for (uint32_t i = 0; i < g->worker_count; i++) {
+        pid_t pid = g->worker_pids[i];
+        if (pid <= 0) continue;
+        kill(pid, SIGKILL);
+        (void)waitpid(pid, NULL, 0);
+        g->worker_pids[i] = 0;
+    }
+    g->worker_count = 0;
+}
+
+static void cli_local_gpus_status_text(int status, char *buf, size_t len) {
+    if (!buf || len == 0) return;
+    if (WIFEXITED(status)) {
+        snprintf(buf, len, "exit status %d", WEXITSTATUS(status));
+    } else if (WIFSIGNALED(status)) {
+        snprintf(buf, len, "signal %d", WTERMSIG(status));
+    } else {
+        snprintf(buf, len, "status %d", status);
+    }
+}
+
+static int cli_local_gpus_reap_exited(const cli_config *cfg) {
+    if (!cfg || !cfg->local_gpus.enabled || cfg->local_gpus.worker_count == 0) return 0;
+    cli_local_gpus *g = (cli_local_gpus *)&cfg->local_gpus;
+    for (uint32_t i = 0; i < g->worker_count; i++) {
+        const pid_t pid = g->worker_pids[i];
+        if (pid <= 0) continue;
+        int status = 0;
+        const pid_t got = waitpid(pid, &status, WNOHANG);
+        if (got == pid) {
+            char detail[64];
+            cli_local_gpus_status_text(status, detail, sizeof(detail));
+            fprintf(stderr,
+                    "ds4: local GPU worker %u pid=%ld exited unexpectedly during startup (%s)\n",
+                    i + 1u,
+                    (long)pid,
+                    detail);
+            g->worker_pids[i] = 0;
+            return 1;
+        }
+        if (got < 0 && errno == ECHILD) {
+            fprintf(stderr,
+                    "ds4: local GPU worker %u pid=%ld is no longer available\n",
+                    i + 1u,
+                    (long)pid);
+            g->worker_pids[i] = 0;
+            return 1;
+        }
+    }
+    return 0;
+}
+
+static int cli_local_gpus_start(cli_config *cfg) {
+    if (!cfg || !cfg->local_gpus.enabled) return 0;
+    cli_local_gpus *g = &cfg->local_gpus;
+    if (g->assignment_count == 0) return 0;
+    cli_local_gpus_set_visible(g->assignments[0].gpu_id);
+    if (g->assignment_count == 1) {
+        fprintf(stderr, "ds4: local GPU launcher using gpu=%d\n", g->assignments[0].gpu_id);
+        return 0;
+    }
+
+    const cli_local_gpu_assignment *a = &g->assignments[0];
+    char end_buf[32];
+    if (a->has_output) snprintf(end_buf, sizeof(end_buf), "output");
+    else snprintf(end_buf, sizeof(end_buf), "%u", a->end);
+    fprintf(stderr,
+            "ds4: local GPU launcher coordinator gpu=%d model=%s layers %u:%s listen=127.0.0.1:%d workers=%u\n",
+            a->gpu_id,
+            a->model_path,
+            a->start,
+            end_buf,
+            g->coordinator_port,
+            g->assignment_count - 1u);
+
+    fflush(NULL);
+    for (uint32_t i = 1; i < g->assignment_count; i++) {
+        pid_t pid = fork();
+        if (pid < 0) {
+            fprintf(stderr, "ds4: failed to fork local GPU worker %u: %s\n", i, strerror(errno));
+            cli_local_gpus_stop(g);
+            return 1;
+        }
+        if (pid == 0) {
+            int rc = cli_local_gpu_worker_run(*cfg, i);
+            exit(rc);
+        }
+        g->worker_pids[g->worker_count++] = pid;
+    }
+    return 0;
+}
+
 static char *read_prompt_file(const char *path, bool fatal);
 
 typedef struct {
@@ -968,7 +1757,7 @@ static void print_repl_help(void) {
     puts("  /power N       Set GPU duty cycle percentage, 1..100.");
     puts("  /read FILE     Read a prompt from FILE and run it.");
     puts("  /quit, /exit   Leave the prompt.");
-    puts("  Ctrl+C         Stop generation and return to the prompt.");
+    puts("  Ctrl+C         Stop generation; at the prompt, exit.");
 }
 
 static bool parse_power_percent(const char *arg, int *out) {
@@ -1038,12 +1827,19 @@ static void repl_chat_apply_max_prefix(ds4_engine *engine, repl_chat *chat, bool
     }
 }
 
-static int repl_chat_create_session(ds4_engine *engine, repl_chat *chat, int ctx_size) {
+static int repl_chat_create_session(ds4_engine *engine,
+                                    repl_chat *chat,
+                                    const cli_config *cfg,
+                                    int ctx_size) {
     ds4_session *session = NULL;
     if (ds4_session_create(&session, engine, ctx_size) != 0) {
         fprintf(stderr, "ds4: interactive chat KV cache requires a session backend\n");
         return 1;
     }
+    if (cli_wait_distributed_route(cfg, session) != 0) {
+        ds4_session_free(session);
+        return 1;
+    }
     if (chat->session) ds4_session_free(chat->session);
     chat->session = session;
     chat->ctx_size = ctx_size;
@@ -1058,7 +1854,7 @@ static int repl_chat_init(ds4_engine *engine, repl_chat *chat, const cli_config
     if (cfg->gen.system && cfg->gen.system[0]) {
         ds4_chat_append_message(engine, &chat->transcript, "system", cfg->gen.system);
     }
-    return repl_chat_create_session(engine, chat, cfg->gen.ctx_size);
+    return repl_chat_create_session(engine, chat, cfg, cfg->gen.ctx_size);
 }
 
 static void repl_chat_free(repl_chat *chat) {
@@ -1068,13 +1864,22 @@ static void repl_chat_free(repl_chat *chat) {
     memset(chat, 0, sizeof(*chat));
 }
 
-static int repl_chat_set_ctx(ds4_engine *engine, repl_chat *chat, int ctx_size) {
+static int repl_chat_set_ctx(ds4_engine *engine,
+                             repl_chat *chat,
+                             const cli_config *cfg,
+                             int ctx_size) {
     ds4_session_free(chat->session);
     chat->session = NULL;
     chat->ctx_size = 0;
-    return repl_chat_create_session(engine, chat, ctx_size);
+    return repl_chat_create_session(engine, chat, cfg, ctx_size);
 }
 
+enum {
+    CLI_CHAT_OK = 0,
+    CLI_CHAT_FATAL = 1,
+    CLI_CHAT_INTERRUPTED = 2,
+};
+
 /* Run one interactive turn.  The transcript is tentatively extended with user
  * and assistant markers, then ds4_session_sync() decides whether this is a KV
  * continuation.  If prompt processing fails, the transcript rolls back before
@@ -1082,8 +1887,9 @@ static int repl_chat_set_ctx(ds4_engine *engine, repl_chat *chat, int ctx_size)
 static int run_chat_turn(ds4_engine *engine, cli_config *cfg, repl_chat *chat, const char *user_text) {
     if (!chat->session) {
         fprintf(stderr, "ds4: no active interactive KV cache\n");
-        return 1;
+        return CLI_CHAT_FATAL;
     }
+    ds4_session_set_cancel(chat->session, cli_session_cancel_cb, NULL);
 
     ds4_think_mode think_mode = ds4_think_mode_for_context(cfg->gen.think_mode,
                                                            chat->ctx_size);
@@ -1115,8 +1921,17 @@ static int run_chat_turn(ds4_engine *engine, cli_config *cfg, repl_chat *chat, c
         ds4_session_set_progress(chat->session, NULL, NULL);
         ds4_session_set_display_progress(chat->session, NULL, NULL);
         chat->transcript.len = rollback_len;
-        fprintf(stderr, "ds4: prompt processing failed: %s\n", err);
-        return 1;
+        if (sync_rc == DS4_SESSION_SYNC_INTERRUPTED || cli_interrupt_requested()) {
+            fprintf(stderr, "ds4: prompt processing interrupted\n");
+            ds4_session_invalidate(chat->session);
+            cli_interrupt_clear();
+            ds4_session_set_cancel(chat->session, NULL, NULL);
+            return CLI_CHAT_INTERRUPTED;
+        } else {
+            fprintf(stderr, "ds4: prompt processing failed: %s\n", err);
+        }
+        ds4_session_set_cancel(chat->session, NULL, NULL);
+        return CLI_CHAT_FATAL;
     }
     ds4_session_set_progress(chat->session, NULL, NULL);
     ds4_session_set_display_progress(chat->session, NULL, NULL);
@@ -1164,16 +1979,38 @@ static int run_chat_turn(ds4_engine *engine, cli_config *cfg, repl_chat *chat, c
                                                        sizeof(err));
             cli_dist_busy_set(cfg, false);
             if (ntok < 0) {
-                fprintf(stderr, "ds4: decode failed: %s\n", err);
-                return 1;
+                if (cli_interrupt_requested()) {
+                    fprintf(stderr, "ds4: generation interrupted\n");
+                    if (generated == 0) chat->transcript.len = rollback_len;
+                    else ds4_tokens_push(&chat->transcript, ds4_token_eos(engine));
+                    ds4_session_invalidate(chat->session);
+                    cli_interrupt_clear();
+                    ds4_session_set_cancel(chat->session, NULL, NULL);
+                    return CLI_CHAT_INTERRUPTED;
+                } else {
+                    fprintf(stderr, "ds4: decode failed: %s\n", err);
+                }
+                ds4_session_set_cancel(chat->session, NULL, NULL);
+                return CLI_CHAT_FATAL;
             }
         } else {
             cli_dist_busy_set(cfg, true);
             int eval_rc = ds4_session_eval(chat->session, token, err, sizeof(err));
             cli_dist_busy_set(cfg, false);
             if (eval_rc != 0) {
-                fprintf(stderr, "ds4: decode failed: %s\n", err);
-                return 1;
+                if (cli_interrupt_requested()) {
+                    fprintf(stderr, "ds4: generation interrupted\n");
+                    if (generated == 0) chat->transcript.len = rollback_len;
+                    else ds4_tokens_push(&chat->transcript, ds4_token_eos(engine));
+                    ds4_session_invalidate(chat->session);
+                    cli_interrupt_clear();
+                    ds4_session_set_cancel(chat->session, NULL, NULL);
+                    return CLI_CHAT_INTERRUPTED;
+                } else {
+                    fprintf(stderr, "ds4: decode failed: %s\n", err);
+                }
+                ds4_session_set_cancel(chat->session, NULL, NULL);
+                return CLI_CHAT_FATAL;
             }
             toks[0] = token;
             ntok = 1;
@@ -1215,13 +2052,11 @@ static int run_chat_turn(ds4_engine *engine, cli_config *cfg, repl_chat *chat, c
             "ds4: prefill: %.2f t/s, generation: %.2f t/s\n",
             prefill_s > 0.0 ? (double)suffix / prefill_s : 0.0,
             decode_s > 0.0 ? (double)generated / decode_s : 0.0);
-    return 0;
+    ds4_session_set_cancel(chat->session, NULL, NULL);
+    return CLI_CHAT_OK;
 }
 
 static int run_repl(ds4_engine *engine, cli_config *cfg) {
-    repl_chat chat;
-    if (repl_chat_init(engine, &chat, cfg) != 0) return 1;
-
     struct sigaction old_int;
     struct sigaction sa;
     memset(&sa, 0, sizeof(sa));
@@ -1230,6 +2065,12 @@ static int run_repl(ds4_engine *engine, cli_config *cfg) {
     bool sigint_installed = sigaction(SIGINT, &sa, &old_int) == 0;
     cli_interrupt_clear();
 
+    repl_chat chat;
+    if (repl_chat_init(engine, &chat, cfg) != 0) {
+        if (sigint_installed) sigaction(SIGINT, &old_int, NULL);
+        return 1;
+    }
+
     char hist[PATH_MAX];
     history_file_path(hist, sizeof(hist));
     linenoiseSetMultiLine(1);
@@ -1244,7 +2085,7 @@ static int run_repl(ds4_engine *engine, cli_config *cfg) {
         if (!line) {
             if (errno == EAGAIN || cli_interrupt_requested()) {
                 cli_interrupt_clear();
-                continue;
+                break;
             }
             break;
         }
@@ -1297,7 +2138,7 @@ static int run_repl(ds4_engine *engine, cli_config *cfg) {
                 log_context_memory(cfg->engine.backend,
                                    cfg->gen.ctx_size,
                                    cfg->engine.prefill_chunk);
-                rc = repl_chat_set_ctx(engine, &chat, cfg->gen.ctx_size);
+                rc = repl_chat_set_ctx(engine, &chat, cfg, cfg->gen.ctx_size);
                 if (rc != 0) {
                     linenoiseFree(line);
                     break;
@@ -1328,6 +2169,11 @@ static int run_repl(ds4_engine *engine, cli_config *cfg) {
             rc = run_chat_turn(engine, cfg, &chat, cmd);
         }
         linenoiseFree(line);
+        if (rc == CLI_CHAT_INTERRUPTED) {
+            rc = 0;
+            continue;
+        }
+        if (rc != 0 && cfg->local_gpus.enabled) break;
     }
     if (sigint_installed) sigaction(SIGINT, &old_int, NULL);
     repl_chat_free(&chat);
@@ -1408,6 +2254,9 @@ static cli_config parse_options(int argc, char **argv) {
             .dump_logprobs_top_k = 20,
             .think_mode = DS4_THINK_HIGH,
         },
+        .local_gpus = {
+            .coordinator_listen_fd = -1,
+        },
     };
 
     c.dist = ds4_dist_options_create();
@@ -1455,7 +2304,7 @@ static cli_config parse_options(int argc, char **argv) {
         } else if (!strcmp(arg, "-sys") || !strcmp(arg, "--system")) {
             c.gen.system = need_arg(&i, argc, argv, arg);
         } else if (!strcmp(arg, "-m") || !strcmp(arg, "--model")) {
-            c.engine.model_path = need_arg(&i, argc, argv, arg);
+            cli_add_model_path(&c, need_arg(&i, argc, argv, arg));
         } else if (!strcmp(arg, "--mtp")) {
             c.engine.mtp_path = need_arg(&i, argc, argv, arg);
         } else if (!strcmp(arg, "--mtp-draft")) {
@@ -1466,6 +2315,8 @@ static cli_config parse_options(int argc, char **argv) {
             c.gen.n_predict = parse_int(need_arg(&i, argc, argv, arg), arg);
         } else if (!strcmp(arg, "-c") || !strcmp(arg, "--ctx")) {
             c.gen.ctx_size = parse_int(need_arg(&i, argc, argv, arg), arg);
+        } else if (!strcmp(arg, "--gpus")) {
+            cli_local_gpus_parse(&c.local_gpus, need_arg(&i, argc, argv, arg));
         } else if (!strcmp(arg, "--temp")) {
             c.gen.temperature = parse_float_range(need_arg(&i, argc, argv, arg), arg, 0.0f, 100.0f);
         } else if (!strcmp(arg, "--top-p")) {
@@ -1512,6 +2363,7 @@ static cli_config parse_options(int argc, char **argv) {
                 exit(2);
             }
             c.engine.prefill_chunk = (uint32_t)v;
+            c.prefill_chunk_set = true;
         } else if (!strcmp(arg, "--power")) {
             c.engine.power_percent = parse_int(need_arg(&i, argc, argv, arg), arg);
             if (c.engine.power_percent < 1 || c.engine.power_percent > 100) {
@@ -1625,6 +2477,14 @@ static cli_config parse_options(int argc, char **argv) {
         fprintf(stderr, "ds4: --perplexity-file does not use -p/--prompt-file\n");
         exit(2);
     }
+    if (c.model_path_count == 0) {
+        c.model_paths[c.model_path_count++] = c.engine.model_path;
+    }
+    if (c.model_path_count > 1 && !c.local_gpus.enabled) {
+        fprintf(stderr, "ds4: repeated -m/--model is currently supported only with --gpus\n");
+        exit(2);
+    }
+    cli_local_gpus_configure(&c);
     char dist_err[256];
     if (ds4_dist_prepare_engine_options(c.dist, &c.engine, dist_err, sizeof(dist_err)) != 0) {
         fprintf(stderr, "ds4: %s\n", dist_err);
@@ -1636,6 +2496,7 @@ static cli_config parse_options(int argc, char **argv) {
 
 int main(int argc, char **argv) {
     cli_config cfg = parse_options(argc, argv);
+    ds4_internal_dist_copy_options_private(cfg.dist, &cfg.engine.distributed);
     if (cfg.gen.dump_tokens) {
         if (cfg.gen.prompt == NULL) {
             fprintf(stderr, "ds4: --dump-tokens requires -p or --prompt-file\n");
@@ -1650,8 +2511,14 @@ int main(int argc, char **argv) {
         return rc;
     }
     cfg.engine.inspect_only = cfg.inspect;
+    if (cli_local_gpus_start(&cfg) != 0) {
+        ds4_dist_options_free(cfg.dist);
+        free(cfg.prompt_owned);
+        return 1;
+    }
     ds4_engine *engine = NULL;
     if (ds4_engine_open(&engine, &cfg.engine) != 0) {
+        cli_local_gpus_stop(&cfg.local_gpus);
         ds4_dist_options_free(cfg.dist);
         free(cfg.prompt_owned);
         return 1;
@@ -1673,6 +2540,7 @@ int main(int argc, char **argv) {
         };
         int rc = ds4_dist_run(engine, cfg.dist, &dist_gen);
         ds4_engine_close(engine);
+        cli_local_gpus_stop(&cfg.local_gpus);
         ds4_dist_options_free(cfg.dist);
         free(cfg.prompt_owned);
         return rc;
@@ -1701,6 +2569,7 @@ int main(int argc, char **argv) {
         rc = run_generation(engine, &cfg);
     }
     ds4_engine_close(engine);
+    cli_local_gpus_stop(&cfg.local_gpus);
     ds4_dist_options_free(cfg.dist);
     free(cfg.prompt_owned);
     return rc;
diff --git a/ds4_cuda.cu b/ds4_cuda.cu
index 188b341ad..ecdebcfe9 100644
--- a/ds4_cuda.cu
+++ b/ds4_cuda.cu
@@ -2784,6 +2784,18 @@ extern "C" uint64_t ds4_gpu_recommended_working_set_size(void) {
     return 0;
 }
 
+extern "C" int ds4_gpu_memory_info(uint64_t *free_bytes, uint64_t *total_bytes) {
+    size_t free_b = 0;
+    size_t total_b = 0;
+    if (cudaMemGetInfo(&free_b, &total_b) != cudaSuccess) {
+        (void)cudaGetLastError();
+        return 0;
+    }
+    if (free_bytes) *free_bytes = (uint64_t)free_b;
+    if (total_bytes) *total_bytes = (uint64_t)total_b;
+    return 1;
+}
+
 extern "C" uint32_t ds4_gpu_stream_expert_cache_configured_count(void) {
     if (!cuda_stream_expert_cache_budget_visible_to_shared()) return 0;
     return cuda_stream_expert_cache_configured_budget();
diff --git a/ds4_distributed.c b/ds4_distributed.c
index d31c8e2a6..236f7f953 100644
--- a/ds4_distributed.c
+++ b/ds4_distributed.c
@@ -14,6 +14,7 @@
  */
 
 #include "ds4_distributed.h"
+#include "ds4_internal.h"
 
 #include <arpa/inet.h>
 #include <errno.h>
@@ -69,6 +70,115 @@
 #define DS4_DIST_RECV_REMOTE_ERROR 2
 #define DS4_DIST_SNAPSHOT_CHUNK_BYTES (8u * 1024u * 1024u)
 
+typedef struct ds4_dist_options_private {
+    const ds4_distributed_options *opt;
+    int adopt_listen_fd;
+    bool local_worker;
+    struct ds4_dist_options_private *next;
+} ds4_dist_options_private;
+
+static pthread_mutex_t g_dist_options_private_mu = PTHREAD_MUTEX_INITIALIZER;
+static ds4_dist_options_private *g_dist_options_private;
+
+static ds4_dist_options_private *dist_options_private_find_locked(
+        const ds4_distributed_options *opt,
+        bool create) {
+    if (!opt) return NULL;
+    for (ds4_dist_options_private *it = g_dist_options_private; it; it = it->next) {
+        if (it->opt == opt) return it;
+    }
+    if (!create) return NULL;
+    ds4_dist_options_private *p = calloc(1, sizeof(*p));
+    if (!p) return NULL;
+    p->opt = opt;
+    p->next = g_dist_options_private;
+    g_dist_options_private = p;
+    return p;
+}
+
+static void dist_options_private_remove_locked(const ds4_distributed_options *opt) {
+    ds4_dist_options_private **link = &g_dist_options_private;
+    while (*link) {
+        if ((*link)->opt == opt) {
+            ds4_dist_options_private *old = *link;
+            *link = old->next;
+            free(old);
+            return;
+        }
+        link = &(*link)->next;
+    }
+}
+
+static void dist_options_private_prune_locked(const ds4_distributed_options *opt) {
+    ds4_dist_options_private *p = dist_options_private_find_locked(opt, false);
+    if (p && p->adopt_listen_fd <= 0 && !p->local_worker) {
+        dist_options_private_remove_locked(opt);
+    }
+}
+
+static int dist_options_adopt_listen_fd(const ds4_distributed_options *opt) {
+    pthread_mutex_lock(&g_dist_options_private_mu);
+    ds4_dist_options_private *p = dist_options_private_find_locked(opt, false);
+    const int fd = p ? p->adopt_listen_fd : 0;
+    pthread_mutex_unlock(&g_dist_options_private_mu);
+    return fd;
+}
+
+static bool dist_options_local_worker(const ds4_distributed_options *opt) {
+    pthread_mutex_lock(&g_dist_options_private_mu);
+    ds4_dist_options_private *p = dist_options_private_find_locked(opt, false);
+    const bool local_worker = p && p->local_worker;
+    pthread_mutex_unlock(&g_dist_options_private_mu);
+    return local_worker;
+}
+
+void ds4_internal_dist_set_adopt_listen_fd(ds4_distributed_options *opt, int fd) {
+    if (!opt) return;
+    pthread_mutex_lock(&g_dist_options_private_mu);
+    ds4_dist_options_private *p = dist_options_private_find_locked(opt, fd > 0);
+    if (p) {
+        p->adopt_listen_fd = fd;
+        dist_options_private_prune_locked(opt);
+    }
+    pthread_mutex_unlock(&g_dist_options_private_mu);
+}
+
+void ds4_internal_dist_set_local_worker(
+        ds4_distributed_options *opt,
+        bool local_worker) {
+    if (!opt) return;
+    pthread_mutex_lock(&g_dist_options_private_mu);
+    ds4_dist_options_private *p = dist_options_private_find_locked(opt, local_worker);
+    if (p) {
+        p->local_worker = local_worker;
+        dist_options_private_prune_locked(opt);
+    }
+    pthread_mutex_unlock(&g_dist_options_private_mu);
+}
+
+void ds4_internal_dist_copy_options_private(
+        const ds4_distributed_options *src,
+        const ds4_distributed_options *dst) {
+    if (!dst || src == dst) return;
+    pthread_mutex_lock(&g_dist_options_private_mu);
+    dist_options_private_remove_locked(dst);
+    ds4_dist_options_private *sp = dist_options_private_find_locked(src, false);
+    if (sp && (sp->adopt_listen_fd > 0 || sp->local_worker)) {
+        ds4_dist_options_private *dp = dist_options_private_find_locked(dst, true);
+        if (dp) {
+            dp->adopt_listen_fd = sp->adopt_listen_fd;
+            dp->local_worker = sp->local_worker;
+        }
+    }
+    pthread_mutex_unlock(&g_dist_options_private_mu);
+}
+
+void ds4_internal_dist_clear_options_private(const ds4_distributed_options *opt) {
+    pthread_mutex_lock(&g_dist_options_private_mu);
+    dist_options_private_remove_locked(opt);
+    pthread_mutex_unlock(&g_dist_options_private_mu);
+}
+
 typedef struct {
     uint32_t magic;
     uint32_t type;
@@ -492,6 +602,8 @@ static int dist_send_snapshot_file_chunks(
         uint64_t request_id,
         FILE *fp,
         uint64_t bytes);
+static bool dist_session_cancelled(ds4_session *session);
+static int dist_session_interrupted(char *err, size_t errlen);
 
 static int dist_worker_handle_work(
         ds4_dist_worker_state *state,
@@ -780,7 +892,9 @@ static bool dist_parse_positive_u32(
  *
  * The graph-slice APIs exchange float buffers. Distributed transport can leave
  * those buffers as 32-bit floats or pack them to 16/8 bits on the wire; workers
- * decode back to float before executing the next slice.
+ * decode back to float before executing the next slice. 16-bit transport uses
+ * BF16 rather than IEEE fp16 so layer-boundary activation outliers keep the
+ * float32 exponent range.
  */
 
 static uint32_t dist_activation_bits_or_default(uint32_t bits) {
@@ -815,62 +929,16 @@ static bool dist_activation_wire_bytes_from_f32_bytes(uint32_t bits, uint32_t f3
     return dist_activation_wire_bytes(bits, f32_bytes / (uint32_t)sizeof(float), out);
 }
 
-static uint16_t dist_f32_to_f16(float f) {
+static uint16_t dist_f32_to_bf16(float f) {
     uint32_t bits;
     memcpy(&bits, &f, sizeof(bits));
-
-    const uint32_t sign = (bits >> 16) & 0x8000u;
-    int32_t exp = (int32_t)((bits >> 23) & 0xffu) - 127 + 15;
-    uint32_t mant = bits & 0x7fffffu;
-
-    if (exp <= 0) {
-        if (exp < -10) return (uint16_t)sign;
-        mant |= 0x800000u;
-        const uint32_t shift = (uint32_t)(14 - exp);
-        uint32_t half_mant = mant >> shift;
-        const uint32_t round_bit = (mant >> (shift - 1)) & 1u;
-        const uint32_t sticky = mant & ((1u << (shift - 1)) - 1u);
-        if (round_bit && (sticky || (half_mant & 1u))) half_mant++;
-        return (uint16_t)(sign | half_mant);
-    }
-
-    if (exp >= 31) {
-        if (((bits >> 23) & 0xffu) == 0xffu && mant != 0) {
-            return (uint16_t)(sign | 0x7e00u);
-        }
-        return (uint16_t)(sign | 0x7c00u);
-    }
-
-    uint32_t half = sign | ((uint32_t)exp << 10) | (mant >> 13);
-    const uint32_t round = mant & 0x1fffu;
-    if (round > 0x1000u || (round == 0x1000u && (half & 1u))) half++;
-    return (uint16_t)half;
+    const uint32_t lsb = (bits >> 16) & 1u;
+    bits += 0x7fffu + lsb;
+    return (uint16_t)(bits >> 16);
 }
 
-static float dist_f16_to_f32(uint16_t h) {
-    uint32_t sign = (uint32_t)(h & 0x8000u) << 16;
-    int32_t exp = (int32_t)((h >> 10) & 0x1fu);
-    uint32_t mant = h & 0x03ffu;
-    uint32_t bits;
-
-    if (exp == 0) {
-        if (mant == 0) {
-            bits = sign;
-        } else {
-            exp = 1;
-            while ((mant & 0x0400u) == 0) {
-                mant <<= 1;
-                exp--;
-            }
-            mant &= 0x03ffu;
-            bits = sign | ((uint32_t)(exp + 127 - 15) << 23) | (mant << 13);
-        }
-    } else if (exp == 31) {
-        bits = sign | 0x7f800000u | (mant << 13);
-    } else {
-        bits = sign | ((uint32_t)(exp + 127 - 15) << 23) | (mant << 13);
-    }
-
+static float dist_bf16_to_f32(uint16_t h) {
+    const uint32_t bits = (uint32_t)h << 16;
     float f;
     memcpy(&f, &bits, sizeof(f));
     return f;
@@ -948,7 +1016,7 @@ static int dist_write_activation_payload(
         if (n > cap) n = cap;
         if (bits == 16u) {
             uint16_t *dst = buf;
-            for (uint64_t i = 0; i < n; i++) dst[i] = dist_f32_to_f16(src[done + i]);
+            for (uint64_t i = 0; i < n; i++) dst[i] = dist_f32_to_bf16(src[done + i]);
         } else {
             uint8_t *dst = buf;
             for (uint64_t i = 0; i < n; i++) dst[i] = dist_f32_to_f8_e4m3(src[done + i]);
@@ -1010,7 +1078,7 @@ static int dist_decode_activation_payload(
     }
     if (bits == 16u) {
         const uint16_t *src = wire;
-        for (uint64_t i = 0; i < values; i++) dst[i] = dist_f16_to_f32(src[i]);
+        for (uint64_t i = 0; i < values; i++) dst[i] = dist_bf16_to_f32(src[i]);
     } else {
         const uint8_t *src = wire;
         for (uint64_t i = 0; i < values; i++) dst[i] = dist_f8_e4m3_to_f32(src[i]);
@@ -1250,6 +1318,27 @@ static int dist_listener_port(int fd) {
     return (int)v;
 }
 
+static int dist_coordinator_listener_fd(
+        const ds4_dist_options *opt,
+        char *err,
+        size_t errlen) {
+    int adopted = dist_options_adopt_listen_fd(opt);
+    if (adopted > 0) {
+        int fd = dup(adopted);
+        if (fd < 0) {
+            if (errlen) {
+                snprintf(err, errlen,
+                         "failed to duplicate adopted listener fd: %s",
+                         strerror(errno));
+            }
+            return -1;
+        }
+        dist_set_socket_low_latency(fd);
+        return fd;
+    }
+    return dist_open_listener(opt->listen_host, opt->listen_port, err, errlen);
+}
+
 static bool dist_connect_errno_retryable(int e) {
     return e == ECONNREFUSED ||
            e == EHOSTUNREACH ||
@@ -2326,7 +2415,32 @@ static bool dist_coordinator_ensure_route(
         uint64_t *generation,
         char *err,
         size_t errlen) {
-    return dist_coordinator_build_route_plan(state, plan, generation, err, errlen);
+    const char *wait_s = getenv("DS4_DIST_ROUTE_WAIT_SEC");
+    char route_err[256];
+    char *errbuf = err ? err : route_err;
+    size_t errbuf_len = err ? errlen : sizeof(route_err);
+
+    double wait_sec = 0.0;
+    if (wait_s && wait_s[0]) {
+        errno = 0;
+        char *end = NULL;
+        double v = strtod(wait_s, &end);
+        if (errno == 0 && end != wait_s && *end == '\0' && v > 0.0) wait_sec = v;
+    }
+
+    const double deadline = dist_now_sec() + wait_sec;
+    for (;;) {
+        if (dist_coordinator_build_route_plan(state, plan, generation, errbuf, errbuf_len)) {
+            return true;
+        }
+        if (strncmp(errbuf, "distributed route incomplete:", 29) != 0 ||
+            wait_sec <= 0.0 ||
+            dist_now_sec() >= deadline) {
+            return false;
+        }
+        const struct timespec delay = {0, 50 * 1000 * 1000};
+        nanosleep(&delay, NULL);
+    }
 }
 
 static uint64_t dist_coordinator_generation(ds4_dist_coordinator_state *state) {
@@ -2571,6 +2685,7 @@ static int dist_coordinator_eval_remote_on_fd(
         size_t errlen) {
     const bool profile = dist_decode_profile_enabled() && n_tokens == 1;
     const double total_t0 = profile ? dist_now_sec() : 0.0;
+    if (dist_session_cancelled(session)) return dist_session_interrupted(err, errlen);
     const double send_t0 = profile ? dist_now_sec() : 0.0;
     int rc = dist_coordinator_send_remote_work_on_fd(state,
                                                      plan,
@@ -2615,6 +2730,10 @@ static int dist_coordinator_eval_remote_on_fd(
                     (double)payload_bytes / (1024.0 * 1024.0));
         }
     }
+    if (dist_session_cancelled(session)) {
+        free(payload);
+        return dist_session_interrupted(err, errlen);
+    }
     if (rc != 0) return rc;
     if (result_hash != expected_result_hash) {
         free(payload);
@@ -2682,6 +2801,7 @@ static int dist_coordinator_eval_span(
         size_t errlen) {
     const bool profile = dist_decode_profile_enabled() && n_tokens == 1;
     const double span_t0 = profile ? dist_now_sec() : 0.0;
+    if (dist_session_cancelled(session)) return dist_session_interrupted(err, errlen);
     const uint64_t hc_values = ds4_engine_hidden_f32_values(state->engine);
     const uint64_t hidden_bytes64 = (uint64_t)n_tokens * hc_values * sizeof(float);
     if (hidden_bytes64 > UINT32_MAX) {
@@ -2743,6 +2863,10 @@ static int dist_coordinator_eval_span(
                                           err,
                                           errlen);
     const double local_t1 = profile ? dist_now_sec() : 0.0;
+    if (dist_session_cancelled(session)) {
+        free(hidden);
+        return dist_session_interrupted(err, errlen);
+    }
     double remote_t0 = 0.0, remote_t1 = 0.0;
     if (rc == 0 && plan->count != 0) {
         remote_t0 = profile ? dist_now_sec() : 0.0;
@@ -3510,6 +3634,15 @@ static void dist_report_prefill_progress(ds4_session *session, uint32_t current,
     ds4_session_report_progress(session, "prefill_chunk", (int)current, (int)total);
 }
 
+static bool dist_session_cancelled(ds4_session *session) {
+    return ds4_internal_session_cancel_requested(session);
+}
+
+static int dist_session_interrupted(char *err, size_t errlen) {
+    if (errlen) snprintf(err, errlen, "interrupted");
+    return DS4_SESSION_SYNC_INTERRUPTED;
+}
+
 static int dist_coordinator_prefill_prompt_pipelined(
         ds4_dist_coordinator_state *state,
         ds4_session *session,
@@ -3525,6 +3658,7 @@ static int dist_coordinator_prefill_prompt_pipelined(
         char *err,
         size_t errlen) {
     const uint32_t total = n_tokens;
+    if (dist_session_cancelled(session)) return dist_session_interrupted(err, errlen);
     if (!prompt ||
         span_start > (uint32_t)prompt->len ||
         n_tokens == 0 ||
@@ -3634,6 +3768,7 @@ static int dist_coordinator_prefill_prompt_pipelined(
                      flow_window);
 
     int rc = 0;
+    bool interrupted = false;
     double local_eval_sec = 0.0;
     const double pipeline_t0 = dist_now_sec();
     uint32_t pos = span_start;
@@ -3641,6 +3776,9 @@ static int dist_coordinator_prefill_prompt_pipelined(
     uint32_t reported_chunks = 0;
     uint32_t submitted_chunks = 0;
     while (pos < span_end) {
+        if (dist_session_cancelled(session)) {
+            interrupted = true;
+        }
         if (!dist_prefill_reader_wait_flow_window(&reader,
                                                   submitted_chunks,
                                                   flow_window,
@@ -3681,6 +3819,9 @@ static int dist_coordinator_prefill_prompt_pipelined(
         const double local_t1 = dist_now_sec();
         local_eval_sec += local_t1 - local_t0;
         if (rc != 0) break;
+        if (dist_session_cancelled(session)) {
+            interrupted = true;
+        }
 
         slot->pos = pos;
         slot->n_tokens = chunk;
@@ -3714,6 +3855,9 @@ static int dist_coordinator_prefill_prompt_pipelined(
     }
     if (rc == 0) {
         while (!dist_prefill_reader_wait_emit_progress(&reader, &reported_chunks)) {
+            if (dist_session_cancelled(session)) {
+                interrupted = true;
+            }
             ;
         }
     }
@@ -3751,6 +3895,10 @@ static int dist_coordinator_prefill_prompt_pipelined(
         free(reader.final_payload);
         return 1;
     }
+    if (interrupted || dist_session_cancelled(session)) {
+        free(reader.final_payload);
+        return dist_session_interrupted(err, errlen);
+    }
     const uint32_t logits_bytes =
         (uint32_t)((uint64_t)ds4_engine_vocab_size(state->engine) * sizeof(float));
     if (reader.final_kind == DS4_DIST_RESULT_LOGITS &&
@@ -5406,7 +5554,7 @@ int ds4_dist_session_create(
     }
     if (dist_validate_options(opt, err, errlen) != 0) return 1;
 
-    int listen_fd = dist_open_listener(opt->listen_host, opt->listen_port, err, errlen);
+    int listen_fd = dist_coordinator_listener_fd(opt, err, errlen);
     if (listen_fd < 0) return 1;
 
     ds4_dist_session *d = calloc(1, sizeof(*d));
@@ -5513,7 +5661,9 @@ int ds4_dist_session_sync(
         if (errlen) snprintf(err, errlen, "invalid distributed sync request");
         return 1;
     }
+    if (dist_session_cancelled(owner)) return dist_session_interrupted(err, errlen);
     if (dist_session_ensure_route(d, err, errlen) != 0) return 1;
+    if (dist_session_cancelled(owner)) return dist_session_interrupted(err, errlen);
 
     if (checkpoint &&
         checkpoint->len >= 0 &&
@@ -5543,6 +5693,11 @@ int ds4_dist_session_sync(
                                                                        err,
                                                                        errlen);
             if (prefill_rc != 0) {
+                if (dist_session_cancelled(owner) || prefill_rc == DS4_SESSION_SYNC_INTERRUPTED) {
+                    d->plan_ready = false;
+                    d->plan_generation = 0;
+                    return dist_session_interrupted(err, errlen);
+                }
                 if (dist_coordinator_rebuild_from_transcript(&d->state,
                                                              owner,
                                                              &d->plan,
@@ -5565,6 +5720,7 @@ int ds4_dist_session_sync(
 
         uint32_t pos = pos0;
         while (pos < (uint32_t)prompt->len) {
+            if (dist_session_cancelled(owner)) return dist_session_interrupted(err, errlen);
             const uint32_t remaining = (uint32_t)prompt->len - pos;
             const uint32_t chunk = remaining < chunk_cap ? remaining : chunk_cap;
             int eval_rc = dist_coordinator_eval_span(&d->state,
@@ -5580,6 +5736,11 @@ int ds4_dist_session_sync(
                                                      err,
                                                      errlen);
             if (eval_rc != 0) {
+                if (dist_session_cancelled(owner) || eval_rc == DS4_SESSION_SYNC_INTERRUPTED) {
+                    d->plan_ready = false;
+                    d->plan_generation = 0;
+                    return dist_session_interrupted(err, errlen);
+                }
                 if (dist_coordinator_rebuild_from_transcript(&d->state,
                                                              owner,
                                                              &d->plan,
@@ -5614,6 +5775,11 @@ int ds4_dist_session_sync(
                                                      err,
                                                      errlen);
     if (prefill_rc != 0) {
+        if (dist_session_cancelled(owner) || prefill_rc == DS4_SESSION_SYNC_INTERRUPTED) {
+            d->plan_ready = false;
+            d->plan_generation = 0;
+            return dist_session_interrupted(err, errlen);
+        }
         if (dist_coordinator_rebuild_from_transcript(&d->state,
                                                      owner,
                                                      &d->plan,
@@ -5646,7 +5812,9 @@ int ds4_dist_session_eval(
         if (errlen) snprintf(err, errlen, "invalid distributed decode request");
         return 1;
     }
+    if (dist_session_cancelled(owner)) return dist_session_interrupted(err, errlen);
     if (dist_session_ensure_route(d, err, errlen) != 0) return 1;
+    if (dist_session_cancelled(owner)) return dist_session_interrupted(err, errlen);
 
     ds4_tokens transcript = {0};
     ds4_tokens_copy(&transcript, checkpoint);
@@ -5665,6 +5833,12 @@ int ds4_dist_session_eval(
                                         err,
                                         errlen);
     if (rc != 0) {
+        if (dist_session_cancelled(owner) || rc == DS4_SESSION_SYNC_INTERRUPTED) {
+            d->plan_ready = false;
+            d->plan_generation = 0;
+            ds4_tokens_free(&transcript);
+            return dist_session_interrupted(err, errlen);
+        }
         if (dist_coordinator_rebuild_from_transcript(&d->state,
                                                      owner,
                                                      &d->plan,
@@ -5694,7 +5868,7 @@ int ds4_dist_session_eval(
 
 static int dist_run_coordinator(ds4_engine *engine, const ds4_dist_options *opt, const ds4_dist_generation_options *gen) {
     char err[256];
-    int listen_fd = dist_open_listener(opt->listen_host, opt->listen_port, err, sizeof(err));
+    int listen_fd = dist_coordinator_listener_fd(opt, err, sizeof(err));
     if (listen_fd < 0) {
         fprintf(stderr, "ds4: distributed coordinator: %s\n", err);
         return 1;
@@ -6713,12 +6887,7 @@ static int dist_forward_work_to_next(
         const ds4_dist_telemetry_fixed *telemetry,
         const void *route_blob) {
     char err[256];
-    ds4_dist_worker_forwarder *forwarder =
-        dist_worker_get_forwarder(upstream, next->host, next->port, err, sizeof(err));
     const uint64_t request_id = dist_u64_from_halves(work->request_hi, work->request_lo);
-    if (!forwarder) {
-        return dist_worker_upstream_send_work_error(upstream, request_id, err);
-    }
 
     ds4_dist_work_fixed forwarded = *work;
     forwarded.layer_start = next->layer_start;
@@ -6739,6 +6908,74 @@ static int dist_forward_work_to_next(
         forwarded.flags &= ~DS4_DIST_WORK_F_OUTPUT_LOGITS;
     }
 
+    if (getenv("DS4_DIST_WORKER_FORWARD_SYNC") != NULL) {
+        int fd = dist_connect_endpoint(next->host, (int)next->port, err, sizeof(err));
+        if (fd < 0) {
+            return dist_worker_upstream_send_work_error(upstream, request_id, err);
+        }
+
+        const double send_t0 = dist_now_sec();
+        int rc = dist_send_work_frame(fd, &forwarded, tokens, hidden_hc, route_blob);
+        const double send_t1 = dist_now_sec();
+        if (rc != 0) {
+            close(fd);
+            return dist_worker_upstream_send_work_error(upstream,
+                                                        request_id,
+                                                        "failed to forward distributed work");
+        }
+
+        uint32_t kind = 0, payload_bytes = 0;
+        uint64_t result_hash = 0;
+        void *payload = NULL;
+        const double downstream_t0 = dist_now_sec();
+        rc = dist_recv_result_alloc(fd,
+                                    NULL,
+                                    request_id,
+                                    &kind,
+                                    &result_hash,
+                                    &payload,
+                                    &payload_bytes,
+                                    err,
+                                    sizeof(err));
+        const double downstream_t1 = dist_now_sec();
+        close(fd);
+        if (rc != 0) {
+            free(payload);
+            return dist_worker_upstream_send_work_error(upstream, request_id, err);
+        }
+
+        ds4_dist_telemetry_fixed local_telemetry;
+        const ds4_dist_telemetry_fixed *telemetry_out = NULL;
+        uint32_t telemetry_count = 0;
+        if (telemetry) {
+            local_telemetry = *telemetry;
+            local_telemetry.forward_send_usec = dist_usec_since(send_t0, send_t1);
+            local_telemetry.downstream_wait_usec =
+                dist_usec_since(downstream_t0, downstream_t1);
+            telemetry_out = &local_telemetry;
+            telemetry_count = 1;
+        }
+        const uint32_t payload_bits = 32u;
+        int send_rc = dist_worker_upstream_send_work_result(upstream,
+                                                            request_id,
+                                                            result_hash,
+                                                            0,
+                                                            kind,
+                                                            payload_bits,
+                                                            telemetry_out,
+                                                            telemetry_count,
+                                                            payload,
+                                                            payload_bytes);
+        free(payload);
+        return send_rc;
+    }
+
+    ds4_dist_worker_forwarder *forwarder =
+        dist_worker_get_forwarder(upstream, next->host, next->port, err, sizeof(err));
+    if (!forwarder) {
+        return dist_worker_upstream_send_work_error(upstream, request_id, err);
+    }
+
     pthread_mutex_lock(&forwarder->send_mu);
     const double send_t0 = dist_now_sec();
     if (!dist_worker_forwarder_enqueue_request(forwarder, request_id, telemetry, send_t0)) {
@@ -7993,6 +8230,10 @@ static int dist_run_worker(ds4_engine *engine, const ds4_dist_options *opt, int
         }
         fprintf(stderr, "ds4: distributed worker: coordinator disconnected%s; reconnecting\n",
                 rc ? " after error" : "");
+        if (dist_options_local_worker(opt)) {
+            fprintf(stderr, "ds4: local GPU worker: coordinator disconnected; exiting\n");
+            return rc ? 1 : 0;
+        }
         dist_sleep_reconnect();
     }
 }
@@ -8114,6 +8355,7 @@ ds4_dist_options *ds4_dist_options_create(void) {
 }
 
 void ds4_dist_options_free(ds4_dist_options *opt) {
+    ds4_internal_dist_clear_options_private(opt);
     free(opt);
 }
 
@@ -8287,7 +8529,8 @@ static int dist_validate_options(const ds4_dist_options *opt, char *err, size_t
         if (opt->layers.set || opt->listen_host || opt->listen_port ||
             opt->coordinator_host || opt->coordinator_port ||
             opt->prefill_chunk != 0 || opt->prefill_window != 0 ||
-            opt->activation_bits != 0) {
+            opt->activation_bits != 0 ||
+            dist_options_adopt_listen_fd(opt) > 0) {
             if (errlen) snprintf(err, errlen, "distributed options require --role coordinator or --role worker");
             return 1;
         }
@@ -8320,6 +8563,10 @@ static int dist_validate_options(const ds4_dist_options *opt, char *err, size_t
     }
 
     if (opt->role == DS4_DISTRIBUTED_WORKER) {
+        if (dist_options_adopt_listen_fd(opt) > 0) {
+            if (errlen) snprintf(err, errlen, "adopted listen fd requires --role coordinator");
+            return 1;
+        }
         if (!opt->coordinator_host || opt->coordinator_port <= 0) {
             if (errlen) snprintf(err, errlen, "--role worker requires --coordinator HOST PORT");
             return 1;
@@ -8355,6 +8602,7 @@ int ds4_dist_prepare_engine_options(
     }
     if (engine && opt) {
         engine->distributed = *opt;
+        ds4_internal_dist_copy_options_private(opt, &engine->distributed);
         if (ds4_dist_enabled(opt)) {
             engine->load_slice = true;
             engine->load_layer_start = opt->layers.start;
diff --git a/ds4_gpu.h b/ds4_gpu.h
index b58aca9bd..0ce667735 100644
--- a/ds4_gpu.h
+++ b/ds4_gpu.h
@@ -75,6 +75,7 @@ void ds4_gpu_set_ssd_streaming(bool enabled);
 void ds4_gpu_set_streaming_expert_cache_budget(uint32_t experts);
 void ds4_gpu_set_streaming_expert_cache_expert_bytes(uint64_t bytes);
 uint64_t ds4_gpu_recommended_working_set_size(void);
+int ds4_gpu_memory_info(uint64_t *free_bytes, uint64_t *total_bytes);
 uint32_t ds4_gpu_stream_expert_cache_configured_count(void);
 uint32_t ds4_gpu_stream_expert_cache_current_count(void);
 typedef struct ds4_gpu_stream_expert_table {
diff --git a/ds4_help.c b/ds4_help.c
index d32e088cf..d580234d4 100644
--- a/ds4_help.c
+++ b/ds4_help.c
@@ -146,7 +146,11 @@ static const char *tool_summary(ds4_help_tool tool) {
 static void print_model_runtime(FILE *fp, const help_colors *c,
                                 ds4_help_tool tool, bool full) {
     title(fp, c, "Model And Runtime");
-    opt(fp, c, "-m, --model FILE", "GGUF model path. Default: ds4flash.gguf");
+    if (tool == DS4_HELP_DS4) {
+        opt(fp, c, "-m, --model FILE", "GGUF model path. Repeat with --gpus for distributed split shards. Default: ds4flash.gguf");
+    } else {
+        opt(fp, c, "-m, --model FILE", "GGUF model path. Default: ds4flash.gguf");
+    }
 #ifdef DS4_ROCM_BUILD
     opt(fp, c, "--metal | --rocm | --cpu", "Select the backend explicitly.");
     opt(fp, c, "--backend NAME", "Backend name: metal, rocm, or cpu.");
@@ -211,7 +215,7 @@ static void print_steering(FILE *fp, const help_colors *c) {
     fputc('\n', fp);
 }
 
-static void print_distributed(FILE *fp, const help_colors *c) {
+static void print_distributed(FILE *fp, const help_colors *c, ds4_help_tool tool) {
     title(fp, c, "Distributed Inference");
     fputc('\n', fp);
     para(fp, c, "Distributed mode runs one logical session across several machines by assigning contiguous model layer ranges to workers. Workers own their layer slice and KV-cache shard; the coordinator owns the prompt, sampling loop, and client/API flow. Start workers first, then start the coordinator. The coordinator waits for a complete route and streams hidden states through the workers.");
@@ -225,6 +229,9 @@ static void print_distributed(FILE *fp, const help_colors *c) {
     opt(fp, c, "--dist-activation-bits N", "Hidden-state transport width: 32, 16, or 8. Default: 32");
     opt(fp, c, "--dist-replay-check", "Diagnostic: reset and replay prompt, then compare logits.");
     opt(fp, c, "--debug", "Print coordinator route/debug logs.");
+    if (tool == DS4_HELP_DS4) {
+        opt(fp, c, "--gpus LIST", "Local CUDA/ROCm launcher: split layers over comma-separated GPU ids, e.g. 0,1,2,3.");
+    }
     fputc('\n', fp);
 }
 
@@ -270,7 +277,7 @@ static void print_cli_commands(FILE *fp, const help_colors *c) {
     opt(fp, c, "/power N", "Set GPU duty cycle percentage, 1..100.");
     opt(fp, c, "/read FILE", "Read FILE and submit it as the next user message.");
     opt(fp, c, "/quit, /exit", "Leave the prompt.");
-    opt(fp, c, "Ctrl+C", "Stop current generation and return to ds4>.");
+    opt(fp, c, "Ctrl+C", "Stop current generation; at the prompt, exit.");
     fputc('\n', fp);
 }
 
@@ -480,7 +487,7 @@ static void print_topic(FILE *fp, const help_colors *c, ds4_help_tool tool, cons
         print_model_runtime(fp, c, tool, true);
         if (tool_has_topic(tool, "sampling")) print_sampling(fp, c, true);
         if (tool_has_topic(tool, "steering")) print_steering(fp, c);
-        print_distributed(fp, c);
+        print_distributed(fp, c, tool);
         if (tool == DS4_HELP_DS4) {
             print_cli_specific(fp, c, true);
             print_cli_commands(fp, c);
@@ -502,7 +509,7 @@ static void print_topic(FILE *fp, const help_colors *c, ds4_help_tool tool, cons
     if (streq(topic, "runtime")) print_model_runtime(fp, c, tool, true);
     else if (streq(topic, "sampling")) print_sampling(fp, c, true);
     else if (streq(topic, "steering")) print_steering(fp, c);
-    else if (streq(topic, "distributed")) print_distributed(fp, c);
+    else if (streq(topic, "distributed")) print_distributed(fp, c, tool);
     else if (tool == DS4_HELP_DS4 && streq(topic, "diagnostics")) print_cli_diagnostics(fp, c);
     else if (tool == DS4_HELP_DS4 && streq(topic, "commands")) print_cli_commands(fp, c);
     else if (tool == DS4_HELP_SERVER && streq(topic, "api")) print_server_api(fp, c);
diff --git a/ds4_internal.h b/ds4_internal.h
new file mode 100644
index 000000000..74efa3841
--- /dev/null
+++ b/ds4_internal.h
@@ -0,0 +1,36 @@
+#ifndef DS4_INTERNAL_H
+#define DS4_INTERNAL_H
+
+#include "ds4.h"
+
+#define DS4_INTERNAL_MAX_LAYER 61
+
+typedef struct {
+    uint32_t n_layers;
+    uint32_t first_layer;
+    uint32_t last_layer;
+    bool has_layers;
+    bool has_output_head;
+    bool has_token_embedding;
+    uint64_t total_cache_bytes;
+    uint64_t range_cache_bytes[DS4_INTERNAL_MAX_LAYER][DS4_INTERNAL_MAX_LAYER];
+} ds4_internal_model_shard_info;
+
+int ds4_internal_model_shard_info_from_file(
+        const char *model_path,
+        ds4_internal_model_shard_info *out);
+bool ds4_internal_session_cancel_requested(ds4_session *s);
+
+void ds4_internal_dist_set_adopt_listen_fd(
+        ds4_distributed_options *opt,
+        int fd);
+void ds4_internal_dist_set_local_worker(
+        ds4_distributed_options *opt,
+        bool local_worker);
+void ds4_internal_dist_copy_options_private(
+        const ds4_distributed_options *src,
+        const ds4_distributed_options *dst);
+void ds4_internal_dist_clear_options_private(
+        const ds4_distributed_options *opt);
+
+#endif
diff --git a/rocm/ds4_rocm_attention.cuh b/rocm/ds4_rocm_attention.cuh
index 0d688326b..5d2188b35 100644
--- a/rocm/ds4_rocm_attention.cuh
+++ b/rocm/ds4_rocm_attention.cuh
@@ -678,7 +678,13 @@ __global__ static void attention_decode_mixed_kernel(
     __syncthreads();
     uint32_t n_score = raw_count + visible_comp;
     float local_max = sinks[h];
-    if (visible_comp == 0 || n_tokens == 1u) {
+    const bool scalar_score_path =
+#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
+        true;
+#else
+        (visible_comp == 0 || n_tokens == 1u);
+#endif
+    if (scalar_score_path) {
         for (uint32_t r = threadIdx.x; r < raw_count; r += blockDim.x) {
             const float *kvrow = raw_kv + (uint64_t)raw_rows[r] * head_dim;
             float dot = 0.0f;
diff --git a/rocm/ds4_rocm_attention_launch.cuh b/rocm/ds4_rocm_attention_launch.cuh
index b9b43d958..fd9734658 100644
--- a/rocm/ds4_rocm_attention_launch.cuh
+++ b/rocm/ds4_rocm_attention_launch.cuh
@@ -123,7 +123,7 @@ extern "C" int ds4_gpu_attention_prefill_raw_heads_tensor(ds4_gpu_tensor *heads,
             model_map, sinks_offset, (uint64_t)n_head * sizeof(float), "attn_sinks");
     if (!sinks) return 0;
     if (n_tokens > 1 && head_dim == 512 &&
-        !g_quality_mode &&
+        cuda_runtime_config()->prefill_online_attention &&
         ((window != 0u ? window : n_tokens) <= 768u)) {
         dim3 grid(n_tokens, (n_head + 7u) / 8u, 1);
         attention_static_mixed_heads8_online_kernel<<<grid, 256>>>((float *)heads->ptr,
@@ -247,9 +247,9 @@ static int attention_decode_batch_launch(
     const float *sinks = (const float *)cuda_model_range_ptr(
             model_map, sinks_offset, (uint64_t)n_head * sizeof(float), "attn_sinks");
     if (!sinks) return 0;
-    const int fast_window_attention = !g_quality_mode;
+    const int fast_window_attention = cuda_runtime_config()->prefill_online_attention;
     if (!cuda_attention_score_buffer_fits(n_comp)) {
-        if (!use_comp_mask && head_dim == 512u) {
+        if (!use_comp_mask && head_dim == 512u && fast_window_attention) {
             dim3 online_grid(n_tokens, (n_head + 7u) / 8u, 1);
             attention_decode_mixed_heads8_online_kernel<<<online_grid, 256>>>((float *)heads->ptr,
                                                                               sinks,
@@ -647,7 +647,7 @@ static int attention_prefill_mixed_launch(
             model_map, sinks_offset, (uint64_t)n_head * sizeof(float), "attn_sinks");
     if (!sinks) return 0;
     if (!use_comp_mask && n_tokens > 1 && head_dim == 512 &&
-        !g_quality_mode &&
+        cuda_runtime_config()->prefill_online_attention &&
         ((window != 0u ? window : n_tokens) + n_comp <= 768u)) {
         dim3 grid(n_tokens, (n_head + 7u) / 8u, 1);
         attention_static_mixed_heads8_online_kernel<<<grid, 256>>>((float *)heads->ptr,
@@ -860,6 +860,12 @@ extern "C" int ds4_gpu_attention_output_q8_batch_f16_tensor(
         group_dim == 0 || rank == 0 || n_groups == 0 || out_dim == 0 || n_tokens == 0) {
         return 0;
     }
+    /*
+     * Prefill attention heads can exceed fp16 range after attention/inverse-rope
+     * on Pro.  Packing those rows to fp16 turns finite activations into Inf and
+     * poisons the output projection, so keep batched attention output in f32.
+     */
+    if (n_tokens > 1u) return 0;
     if (g_ssd_streaming_mode && n_tokens > 1u) return 0;
     const uint64_t low_dim = (uint64_t)n_groups * rank;
     const uint64_t blocks_a = (group_dim + 31) / 32;
@@ -992,7 +998,8 @@ extern "C" int ds4_gpu_attention_output_q8_batch_tensor(
 
     const int attn_output_cublas =
         cuda_runtime_config()->attention_output_cublas_all &&
-        (n_tokens == 1u || !g_ssd_streaming_mode);
+        n_tokens == 1u &&
+        !g_ssd_streaming_mode;
     if (!attn_output_cublas) {
         if ((group_dim & 31u) == 0u && rank <= UINT32_MAX && n_tokens <= UINT32_MAX) {
             const uint32_t rows_per_block = 32u;
diff --git a/rocm/ds4_rocm_current_api_compat.cuh b/rocm/ds4_rocm_current_api_compat.cuh
index 3fa7a0c29..b532ef38e 100644
--- a/rocm/ds4_rocm_current_api_compat.cuh
+++ b/rocm/ds4_rocm_current_api_compat.cuh
@@ -189,6 +189,18 @@ extern "C" uint64_t ds4_gpu_recommended_working_set_size(void) {
     return (uint64_t)total_b;
 }
 
+extern "C" int ds4_gpu_memory_info(uint64_t *free_bytes, uint64_t *total_bytes) {
+    size_t free_b = 0;
+    size_t total_b = 0;
+    if (cudaMemGetInfo(&free_b, &total_b) != cudaSuccess) {
+        (void)cudaGetLastError();
+        return 0;
+    }
+    if (free_bytes) *free_bytes = (uint64_t)free_b;
+    if (total_bytes) *total_bytes = (uint64_t)total_b;
+    return 1;
+}
+
 extern "C" uint32_t ds4_gpu_stream_expert_cache_configured_count(void) {
     return g_ssd_streaming_mode ? g_stream_expert_cache_budget : 0;
 }
diff --git a/rocm/ds4_rocm_matmul.cuh b/rocm/ds4_rocm_matmul.cuh
index 0819b4b6e..011c411b8 100644
--- a/rocm/ds4_rocm_matmul.cuh
+++ b/rocm/ds4_rocm_matmul.cuh
@@ -293,22 +293,23 @@ static int cuda_matmul_q8_0_tensor_labeled(ds4_gpu_tensor *out, const void *mode
             return cuda_ok(cudaGetLastError(), "matmul_q8_0 f32 batch wmma 4w launch");
         }
 #endif
-#if (defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)) && DS4_ROCM_MFMA_F16
-        if (!g_quality_mode && (in_dim % 32u) == 0u &&
+#if (defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)) && DS4_ROCM_MFMA_F16 && (DS4_ROCM_DIRECT_MFMA_F16 || DS4_ROCM_ROCWMMA_F16_FALLBACK)
+        if (!g_quality_mode && cuda_runtime_config()->q8_batch_mfma &&
+            (in_dim % 32u) == 0u &&
             out_dim >= 1024u &&
             n_tok >= 32u &&
             in_dim <= UINT32_MAX && out_dim <= UINT32_MAX && n_tok <= UINT32_MAX) {
             constexpr uint32_t tiles_n = 4u;
             constexpr uint32_t bm = 16u;
             constexpr uint32_t bn = 16u;
-            constexpr uint32_t bk = 16u;
+            constexpr uint32_t bk_max = 32u;
             const dim3 grid((uint32_t)((out_dim + tiles_n * bn - 1u) / (tiles_n * bn)),
                             (uint32_t)((n_tok + bm - 1u) / bm),
                             1u);
             const size_t shmem =
-                ((size_t)bm * bk + (size_t)tiles_n * bk * bn) * sizeof(half) +
+                ((size_t)bm * bk_max + (size_t)tiles_n * bk_max * bn) * sizeof(half) +
                 (size_t)tiles_n * bm * bn * sizeof(float);
-            matmul_q8_0_f32_batch_mfma_w64_onthefly_kernel<tiles_n,bm,bn,bk><<<grid, 256u, shmem>>>(
+            matmul_q8_0_f32_batch_mfma_w64_onthefly_kernel<tiles_n,bm,bn><<<grid, 256u, shmem>>>(
                     (float *)out->ptr,
                     reinterpret_cast<const unsigned char *>(wptr),
                     (const float *)x->ptr,
diff --git a/rocm/ds4_rocm_mfma.cuh b/rocm/ds4_rocm_mfma.cuh
new file mode 100644
index 000000000..1ba7bb5a0
--- /dev/null
+++ b/rocm/ds4_rocm_mfma.cuh
@@ -0,0 +1,61 @@
+// DS4 ROCm direct MFMA wrappers.
+
+#pragma once
+
+#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
+
+#include <stdint.h>
+
+typedef _Float16 __attribute__((ext_vector_type(4))) ds4_rocm_f16x4_t;
+typedef _Float16 __attribute__((ext_vector_type(8))) ds4_rocm_f16x8_t;
+typedef float    __attribute__((ext_vector_type(4))) ds4_rocm_f32x4_t;
+
+#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
+#define DS4_ROCM_ARCH_CDNA3 1
+#else
+#define DS4_ROCM_ARCH_CDNA3 0
+#endif
+
+#if defined(__gfx950__)
+#define DS4_ROCM_ARCH_CDNA4 1
+#else
+#define DS4_ROCM_ARCH_CDNA4 0
+#endif
+
+#if DS4_ROCM_ARCH_CDNA4
+#define DS4_ROCM_MFMA_F16_K 32u
+#define DS4_ROCM_MFMA_F16_K_PER_LANE 8u
+typedef ds4_rocm_f16x8_t ds4_rocm_mfma_f16_frag_t;
+#elif DS4_ROCM_ARCH_CDNA3
+#define DS4_ROCM_MFMA_F16_K 16u
+#define DS4_ROCM_MFMA_F16_K_PER_LANE 4u
+typedef ds4_rocm_f16x4_t ds4_rocm_mfma_f16_frag_t;
+#else
+#define DS4_ROCM_MFMA_F16_K 16u
+#define DS4_ROCM_MFMA_F16_K_PER_LANE 4u
+typedef ds4_rocm_f16x4_t ds4_rocm_mfma_f16_frag_t;
+#endif
+
+__device__ __forceinline__ static ds4_rocm_f32x4_t ds4_rocm_f32x4_zero(void) {
+    ds4_rocm_f32x4_t v;
+#pragma unroll
+    for (uint32_t i = 0; i < 4u; i++) v[i] = 0.0f;
+    return v;
+}
+
+__device__ __forceinline__ static ds4_rocm_f32x4_t ds4_rocm_mfma_f16_16x16(
+        ds4_rocm_mfma_f16_frag_t a,
+        ds4_rocm_mfma_f16_frag_t b,
+        ds4_rocm_f32x4_t c) {
+#if DS4_ROCM_ARCH_CDNA4
+    return __builtin_amdgcn_mfma_f32_16x16x32_f16(a, b, c, 0, 0, 0);
+#elif DS4_ROCM_ARCH_CDNA3
+    return __builtin_amdgcn_mfma_f32_16x16x16f16(a, b, c, 0, 0, 0);
+#else
+    (void)a;
+    (void)b;
+    return c;
+#endif
+}
+
+#endif
diff --git a/rocm/ds4_rocm_moe.cuh b/rocm/ds4_rocm_moe.cuh
index 956b1fb22..6209d3b22 100644
--- a/rocm/ds4_rocm_moe.cuh
+++ b/rocm/ds4_rocm_moe.cuh
@@ -16,20 +16,56 @@ __device__ __forceinline__ static uint32_t dev_pack_half2_bits(float x, float y)
     return *reinterpret_cast<const uint32_t *>(&h);
 }
 
+__device__ __forceinline__ static uint32_t dev_load_u32_le(const uint8_t *p) {
+    return (uint32_t)p[0] |
+           ((uint32_t)p[1] << 8) |
+           ((uint32_t)p[2] << 16) |
+           ((uint32_t)p[3] << 24);
+}
+
+__device__ __forceinline__ static int32_t dev_packed_i8_lane(int32_t v, uint32_t lane) {
+    return (int32_t)(int8_t)((uint32_t)v >> (lane * 8u));
+}
+
+__device__ __forceinline__ static int32_t dev_dp4a_i8(int32_t a, int32_t b, int32_t acc) {
+#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
+    #pragma unroll
+    for (uint32_t i = 0; i < 4u; i++) {
+        acc += dev_packed_i8_lane(a, i) * dev_packed_i8_lane(b, i);
+    }
+    return acc;
+#else
+    return __dp4a(a, b, acc);
+#endif
+}
+
 __device__ __forceinline__ static uint32_t dev_unpack_iq2_signs(uint32_t v) {
     const uint32_t p = __popc(v) & 1u;
     const uint32_t s = v ^ (p << 7u);
     return s * 0x01010101u;
 }
 
+__device__ __forceinline__ static int32_t dev_iq2_signed_grid_i8x4(
+        uint64_t grid,
+        uint32_t signs,
+        uint32_t first) {
+    uint32_t packed = 0;
+    #pragma unroll
+    for (uint32_t i = 0; i < 4u; i++) {
+        const uint32_t idx = first + i;
+        int32_t v = (int32_t)((grid >> (idx * 8u)) & 0xffu);
+        if (signs & (1u << idx)) v = -v;
+        packed |= ((uint32_t)(uint8_t)(int8_t)v) << (i * 8u);
+    }
+    return (int32_t)packed;
+}
+
 __device__ __forceinline__ static int32_t dev_iq2_dp4a_8(uint64_t grid, uint32_t sign, const int8_t *q8, int32_t acc) {
     const uint32_t signs = dev_unpack_iq2_signs(sign);
-    const int32_t sm0 = __vcmpne4(signs & 0x08040201u, 0);
-    const int32_t sm1 = __vcmpne4(signs & 0x80402010u, 0);
-    const int32_t g0 = __vsub4((int32_t)(uint32_t)grid ^ sm0, sm0);
-    const int32_t g1 = __vsub4((int32_t)(uint32_t)(grid >> 32) ^ sm1, sm1);
-    acc = __dp4a(g0, *(const int32_t *)(q8 + 0), acc);
-    acc = __dp4a(g1, *(const int32_t *)(q8 + 4), acc);
+    const int32_t g0 = dev_iq2_signed_grid_i8x4(grid, signs, 0u);
+    const int32_t g1 = dev_iq2_signed_grid_i8x4(grid, signs, 4u);
+    acc = dev_dp4a_i8(g0, (int32_t)dev_load_u32_le((const uint8_t *)(q8 + 0)), acc);
+    acc = dev_dp4a_i8(g1, (int32_t)dev_load_u32_le((const uint8_t *)(q8 + 4)), acc);
     return acc;
 }
 
@@ -37,8 +73,9 @@ __device__ static int32_t dev_dot_q2_16(const uint8_t *q2, const int8_t *q8, int
     int32_t sum = 0;
     #pragma unroll
     for (uint32_t i = 0; i < 16; i += 4) {
-        const int32_t v = (*(const int32_t *)(q2 + i) >> shift) & 0x03030303;
-        sum = __dp4a(v, *(const int32_t *)(q8 + i), sum);
+        const int32_t v = (int32_t)((dev_load_u32_le(q2 + i) >> shift) & 0x03030303);
+        const int32_t y = (int32_t)dev_load_u32_le((const uint8_t *)(q8 + i));
+        sum = dev_dp4a_i8(v, y, sum);
     }
     return sum;
 }
@@ -58,11 +95,16 @@ __device__ __forceinline__ static void dev_iq2_i8x8_lut(
         int32_t *w0,
         int32_t *w1) {
     const uint32_t s = dev_unpack_iq2_signs(signs[sign_idx]);
-    const int32_t sm0 = __vcmpne4(s & 0x08040201u, 0);
-    const int32_t sm1 = __vcmpne4(s & 0x80402010u, 0);
     const uint64_t g = grid[grid_idx];
-    *w0 = __vsub4((int32_t)(uint32_t)g ^ sm0, sm0);
-    *w1 = __vsub4((int32_t)(uint32_t)(g >> 32) ^ sm1, sm1);
+    *w0 = dev_iq2_signed_grid_i8x4(g, s, 0u);
+    *w1 = dev_iq2_signed_grid_i8x4(g, s, 4u);
+}
+
+__device__ __forceinline__ static void dev_iq2_load_shared_lut(
+        uint64_t *grid,
+        uint8_t *signs) {
+    for (uint32_t i = threadIdx.x; i < 256u; i += blockDim.x) grid[i] = cuda_iq2xxs_grid[i];
+    for (uint32_t i = threadIdx.x; i < 128u; i += blockDim.x) signs[i] = cuda_ksigns_iq2xs[i];
 }
 
 __device__ static float dev_dot_iq2_xxs_q8_K_block_lut(
@@ -85,14 +127,14 @@ __device__ static float dev_dot_iq2_xxs_q8_K_block_lut(
         dev_iq2_i8x8_lut(grid, signs, (uint8_t)((aux0 >> 16) & 0xffu),   (aux1 >> 14) & 127u, &w[4], &w[5]);
         dev_iq2_i8x8_lut(grid, signs, (uint8_t)((aux0 >> 24) & 0xffu),   (aux1 >> 21) & 127u, &w[6], &w[7]);
         int32_t sumi = 0;
-        sumi = __dp4a(w[0], *(const int32_t *)(q8 + ib32 * 32u + 0),  sumi);
-        sumi = __dp4a(w[1], *(const int32_t *)(q8 + ib32 * 32u + 4),  sumi);
-        sumi = __dp4a(w[2], *(const int32_t *)(q8 + ib32 * 32u + 8),  sumi);
-        sumi = __dp4a(w[3], *(const int32_t *)(q8 + ib32 * 32u + 12), sumi);
-        sumi = __dp4a(w[4], *(const int32_t *)(q8 + ib32 * 32u + 16), sumi);
-        sumi = __dp4a(w[5], *(const int32_t *)(q8 + ib32 * 32u + 20), sumi);
-        sumi = __dp4a(w[6], *(const int32_t *)(q8 + ib32 * 32u + 24), sumi);
-        sumi = __dp4a(w[7], *(const int32_t *)(q8 + ib32 * 32u + 28), sumi);
+        sumi = dev_dp4a_i8(w[0], (int32_t)dev_load_u32_le((const uint8_t *)(q8 + ib32 * 32u + 0)),  sumi);
+        sumi = dev_dp4a_i8(w[1], (int32_t)dev_load_u32_le((const uint8_t *)(q8 + ib32 * 32u + 4)),  sumi);
+        sumi = dev_dp4a_i8(w[2], (int32_t)dev_load_u32_le((const uint8_t *)(q8 + ib32 * 32u + 8)),  sumi);
+        sumi = dev_dp4a_i8(w[3], (int32_t)dev_load_u32_le((const uint8_t *)(q8 + ib32 * 32u + 12)), sumi);
+        sumi = dev_dp4a_i8(w[4], (int32_t)dev_load_u32_le((const uint8_t *)(q8 + ib32 * 32u + 16)), sumi);
+        sumi = dev_dp4a_i8(w[5], (int32_t)dev_load_u32_le((const uint8_t *)(q8 + ib32 * 32u + 20)), sumi);
+        sumi = dev_dp4a_i8(w[6], (int32_t)dev_load_u32_le((const uint8_t *)(q8 + ib32 * 32u + 24)), sumi);
+        sumi = dev_dp4a_i8(w[7], (int32_t)dev_load_u32_le((const uint8_t *)(q8 + ib32 * 32u + 28)), sumi);
         bsum += sumi * ls;
     }
     return 0.125f * xd * y->d * (float)bsum;
@@ -156,14 +198,14 @@ __device__ static void dev_dot_iq2_xxs_q8_K_block8_deq_lut(
         for (uint32_t p = 0; p < n; p++) {
             const int8_t *q = q8[p] + ib32 * 32;
             int32_t sumi = 0;
-            sumi = __dp4a(w[0], *(const int32_t *)(q + 0),  sumi);
-            sumi = __dp4a(w[1], *(const int32_t *)(q + 4),  sumi);
-            sumi = __dp4a(w[2], *(const int32_t *)(q + 8),  sumi);
-            sumi = __dp4a(w[3], *(const int32_t *)(q + 12), sumi);
-            sumi = __dp4a(w[4], *(const int32_t *)(q + 16), sumi);
-            sumi = __dp4a(w[5], *(const int32_t *)(q + 20), sumi);
-            sumi = __dp4a(w[6], *(const int32_t *)(q + 24), sumi);
-            sumi = __dp4a(w[7], *(const int32_t *)(q + 28), sumi);
+            sumi = dev_dp4a_i8(w[0], (int32_t)dev_load_u32_le((const uint8_t *)(q + 0)),  sumi);
+            sumi = dev_dp4a_i8(w[1], (int32_t)dev_load_u32_le((const uint8_t *)(q + 4)),  sumi);
+            sumi = dev_dp4a_i8(w[2], (int32_t)dev_load_u32_le((const uint8_t *)(q + 8)),  sumi);
+            sumi = dev_dp4a_i8(w[3], (int32_t)dev_load_u32_le((const uint8_t *)(q + 12)), sumi);
+            sumi = dev_dp4a_i8(w[4], (int32_t)dev_load_u32_le((const uint8_t *)(q + 16)), sumi);
+            sumi = dev_dp4a_i8(w[5], (int32_t)dev_load_u32_le((const uint8_t *)(q + 20)), sumi);
+            sumi = dev_dp4a_i8(w[6], (int32_t)dev_load_u32_le((const uint8_t *)(q + 24)), sumi);
+            sumi = dev_dp4a_i8(w[7], (int32_t)dev_load_u32_le((const uint8_t *)(q + 28)), sumi);
             bsum[p] += sumi * ls;
         }
     }
@@ -265,8 +307,9 @@ __device__ __forceinline__ static int32_t dev_dot_q4_32(const uint8_t *qs, const
     int32_t sum = 0;
     #pragma unroll
     for (uint32_t i = 0; i < 32u; i += 4u) {
-        const int32_t v = (*(const int32_t *)(qs + i) >> shift) & 0x0f0f0f0f;
-        sum = __dp4a(v, *(const int32_t *)(q8 + i), sum);
+        const int32_t v = (int32_t)((dev_load_u32_le(qs + i) >> shift) & 0x0f0f0f0f);
+        const int32_t y = (int32_t)dev_load_u32_le((const uint8_t *)(q8 + i));
+        sum = dev_dp4a_i8(v, y, sum);
     }
     return sum;
 }
@@ -465,7 +508,11 @@ __device__ static void dev_dot_q2_K_q8_K_block8(
 __device__ static float half_warp_sum_f32(float v, uint32_t lane16) {
     uint32_t mask = 0xffffu << (threadIdx.x & 16u);
     for (int offset = 8; offset > 0; offset >>= 1) {
+#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
+        v += __shfl_down(v, offset, 16);
+#else
         v += __shfl_down_sync(static_cast<MASK_T>(mask), v, offset, 16);
+#endif
     }
     (void)lane16;
     return v;
@@ -474,7 +521,11 @@ __device__ static float half_warp_sum_f32(float v, uint32_t lane16) {
 __device__ static float quarter_warp_sum_f32(float v, uint32_t lane8) {
     uint32_t mask = 0xffu << (threadIdx.x & 24u);
     for (int offset = 4; offset > 0; offset >>= 1) {
+#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
+        v += __shfl_down(v, offset, 8);
+#else
         v += __shfl_down_sync(static_cast<MASK_T>(mask), v, offset, 8);
+#endif
     }
     (void)lane8;
     return v;
@@ -881,13 +932,12 @@ __global__ static void moe_gate_up_mid_decode_lut_qwarp32_kernel(
     __shared__ cuda_block_q8_K sxq[16];
     __shared__ uint64_t s_iq2_grid[256];
     __shared__ uint8_t s_iq2_signs[128];
+    dev_iq2_load_shared_lut(s_iq2_grid, s_iq2_signs);
     if (xq_blocks <= 16u) {
         for (uint32_t i = threadIdx.x; i < xq_blocks; i += blockDim.x) sxq[i] = xqb[i];
-        for (uint32_t i = threadIdx.x; i < 256u; i += blockDim.x) s_iq2_grid[i] = cuda_iq2xxs_grid[i];
-        for (uint32_t i = threadIdx.x; i < 128u; i += blockDim.x) s_iq2_signs[i] = cuda_ksigns_iq2xs[i];
-        __syncthreads();
-        xqb = sxq;
     }
+    __syncthreads();
+    if (xq_blocks <= 16u) xqb = sxq;
     for (uint32_t rr = 0; rr < 4u; rr++) {
         uint32_t row = blockIdx.x * 128u + row_lane + rr * 32u;
         if (row >= expert_mid_dim) continue;
@@ -949,13 +999,12 @@ __global__ static void moe_gate_up_mid_decode_lut_qwarp32_ptrs_kernel(
     __shared__ cuda_block_q8_K sxq[16];
     __shared__ uint64_t s_iq2_grid[256];
     __shared__ uint8_t s_iq2_signs[128];
+    dev_iq2_load_shared_lut(s_iq2_grid, s_iq2_signs);
     if (xq_blocks <= 16u) {
         for (uint32_t i = threadIdx.x; i < xq_blocks; i += blockDim.x) sxq[i] = xqb[i];
-        for (uint32_t i = threadIdx.x; i < 256u; i += blockDim.x) s_iq2_grid[i] = cuda_iq2xxs_grid[i];
-        for (uint32_t i = threadIdx.x; i < 128u; i += blockDim.x) s_iq2_signs[i] = cuda_ksigns_iq2xs[i];
-        __syncthreads();
-        xqb = sxq;
     }
+    __syncthreads();
+    if (xq_blocks <= 16u) xqb = sxq;
     for (uint32_t rr = 0; rr < 4u; rr++) {
         uint32_t row = blockIdx.x * 128u + row_lane + rr * 32u;
         if (row >= expert_mid_dim) continue;
@@ -1321,15 +1370,16 @@ __global__ static void moe_gate_up_mid_expert_tile8_row32_kernel(
         slot[np] = pair[np] - tok[np] * n_expert;
         xqb[np] = xq + (uint64_t)tok[np] * xq_blocks;
     }
+    dev_iq2_load_shared_lut(s_iq2_grid, s_iq2_signs);
     if (xq_blocks <= 16u) {
         for (uint32_t i = threadIdx.x; i < np * xq_blocks; i += blockDim.x) {
             uint32_t p = i / xq_blocks;
             uint32_t b = i - p * xq_blocks;
             sxq[p][b] = xqb[p][b];
         }
-        for (uint32_t i = threadIdx.x; i < 256u; i += blockDim.x) s_iq2_grid[i] = cuda_iq2xxs_grid[i];
-        for (uint32_t i = threadIdx.x; i < 128u; i += blockDim.x) s_iq2_signs[i] = cuda_ksigns_iq2xs[i];
-        __syncthreads();
+    }
+    __syncthreads();
+    if (xq_blocks <= 16u) {
         for (uint32_t p = 0; p < np; p++) xqb[p] = sxq[p];
     }
     if (row >= expert_mid_dim) return;
@@ -1414,15 +1464,16 @@ __global__ static void moe_gate_up_mid_expert_tile8_row2048_kernel(
         slot[np] = pair[np] - tok[np] * n_expert;
         xqb[np] = xq + (uint64_t)tok[np] * xq_blocks;
     }
+    dev_iq2_load_shared_lut(s_iq2_grid, s_iq2_signs);
     if (xq_blocks <= 16u) {
         for (uint32_t i = threadIdx.x; i < np * xq_blocks; i += blockDim.x) {
             uint32_t p = i / xq_blocks;
             uint32_t b = i - p * xq_blocks;
             sxq[p][b] = xqb[p][b];
         }
-        for (uint32_t i = threadIdx.x; i < 256u; i += blockDim.x) s_iq2_grid[i] = cuda_iq2xxs_grid[i];
-        for (uint32_t i = threadIdx.x; i < 128u; i += blockDim.x) s_iq2_signs[i] = cuda_ksigns_iq2xs[i];
-        __syncthreads();
+    }
+    __syncthreads();
+    if (xq_blocks <= 16u) {
         for (uint32_t p = 0; p < np; p++) xqb[p] = sxq[p];
     }
     for (uint32_t rr = 0; rr < 64u; rr++) {
@@ -1511,15 +1562,16 @@ __global__ static void moe_gate_up_mid_expert_tile8_rowspan_kernel(
         slot[np] = pair[np] - tok[np] * n_expert;
         xqb[np] = xq + (uint64_t)tok[np] * xq_blocks;
     }
+    dev_iq2_load_shared_lut(s_iq2_grid, s_iq2_signs);
     if (xq_blocks <= 16u) {
         for (uint32_t i = threadIdx.x; i < np * xq_blocks; i += blockDim.x) {
             uint32_t p = i / xq_blocks;
             uint32_t b = i - p * xq_blocks;
             sxq[p][b] = xqb[p][b];
         }
-        for (uint32_t i = threadIdx.x; i < 256u; i += blockDim.x) s_iq2_grid[i] = cuda_iq2xxs_grid[i];
-        for (uint32_t i = threadIdx.x; i < 128u; i += blockDim.x) s_iq2_signs[i] = cuda_ksigns_iq2xs[i];
-        __syncthreads();
+    }
+    __syncthreads();
+    if (xq_blocks <= 16u) {
         for (uint32_t p = 0; p < np; p++) xqb[p] = sxq[p];
     }
     for (uint32_t rr = 0; rr < ROW_SPAN / 32u; rr++) {
@@ -1996,6 +2048,71 @@ __global__ static void moe_gate_up_mid_decode_q4K_qwarp32_kernel(
     }
 }
 
+__global__ static void moe_gate_up_mid_decode_q4K_qwarp32_ptrs_kernel(
+        float *gate_out,
+        float *up_out,
+        float *mid_out,
+        const char * const *gate_slots,
+        const char * const *up_slots,
+        const cuda_block_q8_K *xq,
+        const int32_t *selected,
+        const float *weights,
+        uint64_t gate_row_bytes,
+        uint32_t xq_blocks,
+        uint32_t expert_mid_dim,
+        uint32_t n_expert,
+        uint32_t write_aux,
+        float clamp) {
+    __shared__ float s_gate[256];
+    __shared__ float s_up[256];
+    uint32_t lane = threadIdx.x & 7u;
+    uint32_t row_lane = threadIdx.x >> 3u;
+    uint32_t pair = blockIdx.y;
+    uint32_t tok = pair / n_expert;
+    uint32_t slot = pair - tok * n_expert;
+    int32_t expert_i = selected[(uint64_t)tok * n_expert + slot];
+    if (expert_i < 0) expert_i = 0;
+    const char *gate_base = gate_slots[(uint32_t)expert_i];
+    const char *up_base = up_slots[(uint32_t)expert_i];
+    if (!gate_base || !up_base) return;
+    const cuda_block_q8_K *xqb = xq + (uint64_t)tok * xq_blocks;
+    for (uint32_t rr = 0; rr < 4u; rr++) {
+        uint32_t row = blockIdx.x * 128u + row_lane + rr * 32u;
+        float gate = 0.0f;
+        float up = 0.0f;
+        if (row < expert_mid_dim) {
+            const cuda_block_q4_K *gr = (const cuda_block_q4_K *)(gate_base + (uint64_t)row * gate_row_bytes);
+            const cuda_block_q4_K *ur = (const cuda_block_q4_K *)(up_base + (uint64_t)row * gate_row_bytes);
+            for (uint32_t b = lane; b < xq_blocks; b += 8u) {
+                gate += dev_dot_q4_K_q8_K_block(gr + b, xqb + b);
+                up += dev_dot_q4_K_q8_K_block(ur + b, xqb + b);
+            }
+        }
+        s_gate[threadIdx.x] = gate;
+        s_up[threadIdx.x] = up;
+        __syncthreads();
+        if (row < expert_mid_dim && lane == 0) {
+            #pragma unroll
+            for (uint32_t i = 1u; i < 8u; i++) {
+                gate += s_gate[threadIdx.x + i];
+                up += s_up[threadIdx.x + i];
+            }
+            if (clamp > 1.0e-6f) {
+                if (gate > clamp) gate = clamp;
+                if (up > clamp) up = clamp;
+                if (up < -clamp) up = -clamp;
+            }
+            const uint64_t off = (uint64_t)pair * expert_mid_dim + row;
+            if (write_aux) {
+                gate_out[off] = gate;
+                up_out[off] = up;
+            }
+            mid_out[off] = (gate / (1.0f + expf(-gate))) * up * weights[(uint64_t)tok * n_expert + slot];
+        }
+        __syncthreads();
+    }
+}
+
 __global__ static void moe_gate_up_mid_q2K_decode_q8_qwarp32_kernel(
         float *gate_out,
         float *up_out,
@@ -2166,6 +2283,41 @@ __global__ static void moe_down_q4K_sum6_qwarp32_kernel(
     if (lane == 0) out[row] = total;
 }
 
+__global__ static void moe_down_q4K_sum6_qwarp32_ptrs_kernel(
+        float *out,
+        const char * const *down_slots,
+        const cuda_block_q8_K *midq,
+        const int32_t *selected,
+        uint64_t down_row_bytes,
+        uint32_t midq_blocks,
+        uint32_t out_dim) {
+    __shared__ float s_acc[256];
+    uint32_t lane = threadIdx.x & 7u;
+    uint32_t row = blockIdx.x * 32u + (threadIdx.x >> 3u);
+    float total = 0.0f;
+    #pragma unroll
+    for (uint32_t slot = 0; slot < 6u; slot++) {
+        int32_t expert_i = selected[slot];
+        if (expert_i < 0) expert_i = 0;
+        const char *down_base = down_slots[(uint32_t)expert_i];
+        float acc = 0.0f;
+        if (row < out_dim && down_base) {
+            const cuda_block_q4_K *wr = (const cuda_block_q4_K *)(down_base + (uint64_t)row * down_row_bytes);
+            const cuda_block_q8_K *xq = midq + (uint64_t)slot * midq_blocks;
+            for (uint32_t b = lane; b < midq_blocks; b += 8u) acc += dev_dot_q4_K_q8_K_block(wr + b, xq + b);
+        }
+        s_acc[threadIdx.x] = acc;
+        __syncthreads();
+        if (row < out_dim && lane == 0) {
+            #pragma unroll
+            for (uint32_t i = 1u; i < 8u; i++) acc += s_acc[threadIdx.x + i];
+            total += acc;
+        }
+        __syncthreads();
+    }
+    if (row < out_dim && lane == 0) out[row] = total;
+}
+
 __global__ static void moe_down_q4K_qwarp32_kernel(
         float *down_out,
         const char *down_base,
@@ -2192,6 +2344,39 @@ __global__ static void moe_down_q4K_qwarp32_kernel(
     if (lane == 0) down_out[(uint64_t)pair * out_dim + row] = acc;
 }
 
+__global__ static void moe_down_q4K_qwarp32_ptrs_kernel(
+        float *down_out,
+        const char * const *down_slots,
+        const cuda_block_q8_K *midq,
+        const int32_t *selected,
+        uint64_t down_row_bytes,
+        uint32_t midq_blocks,
+        uint32_t out_dim,
+        uint32_t n_expert) {
+    __shared__ float s_acc[256];
+    uint32_t lane = threadIdx.x & 7u;
+    uint32_t row = blockIdx.x * 32u + (threadIdx.x >> 3u);
+    uint32_t pair = blockIdx.y;
+    uint32_t tok = pair / n_expert;
+    uint32_t slot = pair - tok * n_expert;
+    int32_t expert_i = selected[(uint64_t)tok * n_expert + slot];
+    if (expert_i < 0) expert_i = 0;
+    const char *down_base = down_slots[(uint32_t)expert_i];
+    float acc = 0.0f;
+    if (row < out_dim && down_base) {
+        const cuda_block_q4_K *wr = (const cuda_block_q4_K *)(down_base + (uint64_t)row * down_row_bytes);
+        const cuda_block_q8_K *xq = midq + (uint64_t)pair * midq_blocks;
+        for (uint32_t b = lane; b < midq_blocks; b += 8u) acc += dev_dot_q4_K_q8_K_block(wr + b, xq + b);
+    }
+    s_acc[threadIdx.x] = acc;
+    __syncthreads();
+    if (row < out_dim && lane == 0) {
+        #pragma unroll
+        for (uint32_t i = 1u; i < 8u; i++) acc += s_acc[threadIdx.x + i];
+        down_out[(uint64_t)pair * out_dim + row] = acc;
+    }
+}
+
 __global__ static void moe_down_q4K_sorted_qwarp32_kernel(
         float *down_out,
         const char *down_base,
diff --git a/rocm/ds4_rocm_moe_launch.cuh b/rocm/ds4_rocm_moe_launch.cuh
index 6eafc377b..ae02b5960 100644
--- a/rocm/ds4_rocm_moe_launch.cuh
+++ b/rocm/ds4_rocm_moe_launch.cuh
@@ -47,13 +47,10 @@ static int routed_moe_q2_float_down_launch(
     uint32_t hot_count = 0u;
     uint32_t hot_max = 0u;
     const uint32_t hot_threshold = 8u;
-#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
-    const int use_wmma_hot = hot_experts_dev &&
-        !g_quality_mode &&
-        (expert_mid_dim % 16u) == 0u && (out_dim % 16u) == 0u;
-#else
+    /* The IQ2/Q2 float-down hotlist WMMA overlay can write huge invalid
+     * activations on CDNA, which later overflow into NaN logits.  Keep this
+     * path on the scalar expert-batch kernel until the overlay has coverage. */
     const int use_wmma_hot = 0;
-#endif
     uint32_t h_hot[DS4_ROCM_MAX_N_EXPERT] = {0};
     if (use_wmma_hot) {
         for (uint32_t e = 0; e < n_total_expert; e++) {
@@ -372,6 +369,9 @@ static int routed_moe_launch(
     const char **gate_slot_ptrs = NULL;
     const char **up_slot_ptrs = NULL;
     const char **down_slot_ptrs = NULL;
+    const char **q4_gate_slot_ptrs = NULL;
+    const char **q4_up_slot_ptrs = NULL;
+    const char **q4_down_slot_ptrs = NULL;
     const char **resident_gate_slot_ptrs = NULL;
     const char **resident_up_slot_ptrs = NULL;
     const char **missing_gate_slot_ptrs = NULL;
@@ -520,6 +520,52 @@ static int routed_moe_launch(
     }
 
     int ok = 1;
+    if (q4k_path &&
+        !compact_selected &&
+        !batch_stream_selected &&
+        !batch_stream_split_selected) {
+        if (!gate_w || !up_w || !down_w ||
+            n_total_expert == 0 ||
+            n_total_expert > DS4_ROCM_MAX_N_EXPERT) {
+            return 0;
+        }
+        const uint64_t ptr_bytes = (uint64_t)n_total_expert * sizeof(char *);
+        uint8_t *ptr_scratch = (uint8_t *)cuda_tmp_alloc(ptr_bytes * 3u,
+                                                         "routed_moe q4 expert ptr table");
+        if (!ptr_scratch) return 0;
+        q4_gate_slot_ptrs = (const char **)ptr_scratch;
+        q4_up_slot_ptrs = (const char **)(ptr_scratch + ptr_bytes);
+        q4_down_slot_ptrs = (const char **)(ptr_scratch + ptr_bytes * 2u);
+        const char *h_gate[DS4_ROCM_MAX_N_EXPERT] = {0};
+        const char *h_up[DS4_ROCM_MAX_N_EXPERT] = {0};
+        const char *h_down[DS4_ROCM_MAX_N_EXPERT] = {0};
+        for (uint32_t e = 0; e < n_total_expert; e++) {
+            uint64_t gate_rel = 0;
+            uint64_t down_rel = 0;
+            if (!cuda_u64_mul_checked(e, gate_expert_bytes, &gate_rel) ||
+                !cuda_u64_mul_checked(e, down_expert_bytes, &down_rel)) {
+                return 0;
+            }
+            h_gate[e] = (const char *)((uintptr_t)gate_w + (uintptr_t)gate_rel);
+            h_up[e] = (const char *)((uintptr_t)up_w + (uintptr_t)gate_rel);
+            h_down[e] = (const char *)((uintptr_t)down_w + (uintptr_t)down_rel);
+        }
+        ok = cuda_ok(cudaMemcpy(q4_gate_slot_ptrs, h_gate, (size_t)ptr_bytes,
+                                cudaMemcpyHostToDevice),
+                     "routed_moe q4 gate ptr table upload");
+        if (ok) {
+            ok = cuda_ok(cudaMemcpy(q4_up_slot_ptrs, h_up, (size_t)ptr_bytes,
+                                    cudaMemcpyHostToDevice),
+                         "routed_moe q4 up ptr table upload");
+        }
+        if (ok) {
+            ok = cuda_ok(cudaMemcpy(q4_down_slot_ptrs, h_down, (size_t)ptr_bytes,
+                                    cudaMemcpyHostToDevice),
+                         "routed_moe q4 down ptr table upload");
+        }
+        if (!ok) return 0;
+    }
+
     const uint32_t xq_blocks = expert_in_dim / CUDA_QK_K;
     const uint32_t midq_blocks = expert_mid_dim / CUDA_QK_K;
     const uint64_t xq_count = (uint64_t)n_tokens * xq_blocks;
@@ -530,8 +576,7 @@ static int routed_moe_launch(
         cuda_block_q8_K *xq = (cuda_block_q8_K *)down->ptr;
         cuda_block_q8_K *midq = (cuda_block_q8_K *)gate->ptr;
         const uint32_t pair_count = n_tokens * n_expert;
-        const uint32_t use_sorted_pairs = n_tokens > 1u &&
-            (!q4k_path || n_tokens >= 32u);
+        const uint32_t use_sorted_pairs = n_tokens > 1u && !q4k_path;
         const uint32_t use_expert_tiles = use_sorted_pairs;
         const uint32_t expert_tile_m = 8u;
         const uint32_t write_gate_up = 0u;
@@ -1002,22 +1047,40 @@ static int routed_moe_launch(
             } else if (ok) {
                 dim3 qgrid((expert_mid_dim + 127u) / 128u, n_tokens * n_expert, 1);
                 if (q4k_path) {
-                    moe_gate_up_mid_decode_q4K_qwarp32_kernel<<<qgrid, 256>>>(
-                        (float *)gate->ptr,
-                        (float *)up->ptr,
-                        (float *)mid->ptr,
-                        gate_w,
-                        up_w,
-                        xq,
-                        (const int32_t *)selected_exec->ptr,
-                        (const float *)weights->ptr,
-                        gate_expert_bytes,
-                        gate_row_bytes,
-                        xq_blocks,
-                        expert_mid_dim,
-                        n_expert,
-                        write_gate_up,
-                        clamp);
+                    if (q4_gate_slot_ptrs && q4_up_slot_ptrs) {
+                        moe_gate_up_mid_decode_q4K_qwarp32_ptrs_kernel<<<qgrid, 256>>>(
+                            (float *)gate->ptr,
+                            (float *)up->ptr,
+                            (float *)mid->ptr,
+                            q4_gate_slot_ptrs,
+                            q4_up_slot_ptrs,
+                            xq,
+                            (const int32_t *)selected_exec->ptr,
+                            (const float *)weights->ptr,
+                            gate_row_bytes,
+                            xq_blocks,
+                            expert_mid_dim,
+                            n_expert,
+                            write_gate_up,
+                            clamp);
+                    } else {
+                        moe_gate_up_mid_decode_q4K_qwarp32_kernel<<<qgrid, 256>>>(
+                            (float *)gate->ptr,
+                            (float *)up->ptr,
+                            (float *)mid->ptr,
+                            gate_w,
+                            up_w,
+                            xq,
+                            (const int32_t *)selected_exec->ptr,
+                            (const float *)weights->ptr,
+                            gate_expert_bytes,
+                            gate_row_bytes,
+                            xq_blocks,
+                            expert_mid_dim,
+                            n_expert,
+                            write_gate_up,
+                            clamp);
+                    }
                 } else if (use_decode_lut_gate) {
                     moe_gate_up_mid_decode_lut_qwarp32_kernel<<<qgrid, 256>>>(
                         (float *)gate->ptr,
@@ -1175,15 +1238,26 @@ static int routed_moe_launch(
             if (use_direct_down_sum6) {
                 dim3 sgrid((out_dim + 31u) / 32u, 1, 1);
                 if (q4k_path) {
-                    moe_down_q4K_sum6_qwarp32_kernel<<<sgrid, 256>>>(
-                        (float *)out->ptr,
-                        down_w,
-                        midq,
-                        (const int32_t *)selected_exec->ptr,
-                        down_expert_bytes,
-                        down_row_bytes,
-                        midq_blocks,
-                        out_dim);
+                    if (q4_down_slot_ptrs) {
+                        moe_down_q4K_sum6_qwarp32_ptrs_kernel<<<sgrid, 256>>>(
+                            (float *)out->ptr,
+                            q4_down_slot_ptrs,
+                            midq,
+                            (const int32_t *)selected_exec->ptr,
+                            down_row_bytes,
+                            midq_blocks,
+                            out_dim);
+                    } else {
+                        moe_down_q4K_sum6_qwarp32_kernel<<<sgrid, 256>>>(
+                            (float *)out->ptr,
+                            down_w,
+                            midq,
+                            (const int32_t *)selected_exec->ptr,
+                            down_expert_bytes,
+                            down_row_bytes,
+                            midq_blocks,
+                            out_dim);
+                    }
                 } else {
                     moe_down_sum6_qwarp32_kernel<<<sgrid, 256>>>(
                         (float *)out->ptr,
@@ -1306,16 +1380,28 @@ static int routed_moe_launch(
                 }
             } else {
                 if (q4k_path) {
-                    moe_down_q4K_qwarp32_kernel<<<dgrid, 256>>>(
-                        (float *)down->ptr,
-                        down_w,
-                        midq,
-                        (const int32_t *)selected_exec->ptr,
-                        down_expert_bytes,
-                        down_row_bytes,
-                        midq_blocks,
-                        out_dim,
-                        n_expert);
+                    if (q4_down_slot_ptrs) {
+                        moe_down_q4K_qwarp32_ptrs_kernel<<<dgrid, 256>>>(
+                            (float *)down->ptr,
+                            q4_down_slot_ptrs,
+                            midq,
+                            (const int32_t *)selected_exec->ptr,
+                            down_row_bytes,
+                            midq_blocks,
+                            out_dim,
+                            n_expert);
+                    } else {
+                        moe_down_q4K_qwarp32_kernel<<<dgrid, 256>>>(
+                            (float *)down->ptr,
+                            down_w,
+                            midq,
+                            (const int32_t *)selected_exec->ptr,
+                            down_expert_bytes,
+                            down_row_bytes,
+                            midq_blocks,
+                            out_dim,
+                            n_expert);
+                    }
                 } else {
                     moe_down_qwarp32_kernel<<<dgrid, 256>>>(
                         (float *)down->ptr,
diff --git a/rocm/ds4_rocm_norm_rope.cuh b/rocm/ds4_rocm_norm_rope.cuh
index 57eabdab6..671720330 100644
--- a/rocm/ds4_rocm_norm_rope.cuh
+++ b/rocm/ds4_rocm_norm_rope.cuh
@@ -531,6 +531,7 @@ extern "C" int ds4_gpu_attn_q_b_f16_head_rms_rope_tail_tensor(
         float                 beta_fast,
         float                 beta_slow,
         float                 eps) {
+    if (g_quality_mode) return 0;
     if (!g_cublas_ready || !out || !q_half || !x || !model_map || n_tok == 0 ||
         n_rot > head_dim || (n_rot & 1u) || out_dim != (uint64_t)n_head * head_dim ||
         x->bytes < (uint64_t)n_tok * in_dim * sizeof(float) ||
diff --git a/rocm/ds4_rocm_q8.cuh b/rocm/ds4_rocm_q8.cuh
index ac507fa01..163e9139f 100644
--- a/rocm/ds4_rocm_q8.cuh
+++ b/rocm/ds4_rocm_q8.cuh
@@ -5,6 +5,7 @@
 
 #if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
 #include <rocwmma/rocwmma.hpp>
+#include "ds4_rocm_mfma.cuh"
 #endif
 
 #ifndef DS4_ROCM_WMMA_W32
@@ -15,6 +16,14 @@
 #define DS4_ROCM_MFMA_F16 0
 #endif
 
+#ifndef DS4_ROCM_DIRECT_MFMA_F16
+#define DS4_ROCM_DIRECT_MFMA_F16 DS4_ROCM_MFMA_F16
+#endif
+
+#ifndef DS4_ROCM_ROCWMMA_F16_FALLBACK
+#define DS4_ROCM_ROCWMMA_F16_FALLBACK 0
+#endif
+
 __device__ __forceinline__ static int32_t load_i8x4_i32_aligned(const int8_t *p) {
     return *(const int32_t *)p;
 }
@@ -865,10 +874,107 @@ __global__ static void matmul_q8_0_f32_batch_wmma_onthefly_kernel(
 }
 
 #if DS4_ROCM_MFMA_F16
+#if DS4_ROCM_DIRECT_MFMA_F16
 /* CDNA/gfx9 large-batch Q8_0 GEMM path.  Each hardware wave64 owns one
- * 16-token x 16-row output tile and rocWMMA lowers the fragment multiply to
- * v_mfma_f32_16x16x16_f16. */
-template <int TILES_N=4, int BM=16, int BN=16, int BK=16>
+ * 16-token x 16-row output tile.  The high-level dispatch is shared across
+ * CDNA3/CDNA4; the low-level wrapper selects the native MFMA shape. */
+template <int TILES_N=4, int BM=16, int BN=16>
+__launch_bounds__(256, 2)
+__global__ static void matmul_q8_0_f32_batch_mfma_w64_onthefly_kernel(
+        float *out,
+        const unsigned char *w,
+        const float *x,
+        uint32_t n_tokens,
+        uint32_t in_dim,
+        uint32_t out_dim,
+        uint64_t row_bytes) {
+    constexpr uint32_t BK = DS4_ROCM_MFMA_F16_K;
+    constexpr uint32_t K_PER_LANE = DS4_ROCM_MFMA_F16_K_PER_LANE;
+    static_assert(BM == 16 && BN == 16, "direct MFMA kernel expects 16x16 output tiles");
+    static_assert(BK == 16u || BK == 32u, "unsupported CDNA MFMA K tile");
+
+    extern __shared__ unsigned char raw_sh[];
+    _Float16 *shA = reinterpret_cast<_Float16 *>(raw_sh);
+    _Float16 *shB = shA + BM * BK;
+    float *shC = reinterpret_cast<float *>(shB + TILES_N * BK * BN);
+
+    const uint32_t tid = threadIdx.x;
+    const uint32_t wave = tid >> 6u;
+    const uint32_t lane = tid & 63u;
+    const uint32_t t0 = (uint32_t)blockIdx.y * BM;
+    const uint32_t row0 = (uint32_t)blockIdx.x * TILES_N * BN;
+
+    ds4_rocm_f32x4_t acc = ds4_rocm_f32x4_zero();
+
+    for (uint32_t k0 = 0; k0 < in_dim; k0 += BK) {
+        for (uint32_t j = tid; j < BM * BK; j += blockDim.x) {
+            const uint32_t m = j / BK;
+            const uint32_t kk = j - m * BK;
+            const uint32_t t = t0 + m;
+            shA[j] = (t < n_tokens && k0 + kk < in_dim)
+                ? (_Float16)x[(uint64_t)t * in_dim + k0 + kk]
+                : (_Float16)0.0f;
+        }
+        for (uint32_t j = tid; j < TILES_N * BK * BN; j += blockDim.x) {
+            const uint32_t tn = j / (BK * BN);
+            const uint32_t rem = j - tn * BK * BN;
+            const uint32_t kk = rem / BN;
+            const uint32_t nn = rem - kk * BN;
+            const uint32_t row = row0 + tn * BN + nn;
+            const uint32_t k = k0 + kk;
+            if (row < out_dim && k < in_dim) {
+                const unsigned char *blk = w + (uint64_t)row * row_bytes + (uint64_t)(k >> 5u) * 34u;
+                uint16_t scale_bits;
+                _Float16 d;
+                __builtin_memcpy(&scale_bits, blk, 2);
+                __builtin_memcpy(&d, &scale_bits, 2);
+                const int8_t q = ((const int8_t *)(blk + 2u))[k & 31u];
+                shB[j] = d * (_Float16)(float)(int)q;
+            } else {
+                shB[j] = (_Float16)0.0f;
+            }
+        }
+        __syncthreads();
+        if (wave < TILES_N) {
+            ds4_rocm_mfma_f16_frag_t a;
+            ds4_rocm_mfma_f16_frag_t b;
+            const uint32_t frag_m = lane & 15u;
+            const uint32_t frag_n = lane & 15u;
+            const uint32_t frag_k = (lane >> 4u) * K_PER_LANE;
+            const _Float16 *tile_b = shB + wave * BK * BN;
+#pragma unroll
+            for (uint32_t i = 0; i < K_PER_LANE; i++) {
+                a[i] = shA[frag_m * BK + frag_k + i];
+                b[i] = tile_b[(frag_k + i) * BN + frag_n];
+            }
+            acc = ds4_rocm_mfma_f16_16x16(a, b, acc);
+        }
+        __syncthreads();
+    }
+
+    if (wave < TILES_N) {
+        const uint32_t frag_n = lane & 15u;
+        const uint32_t frag_m0 = (lane >> 4u) * 4u;
+        float *tile_c = shC + wave * BM * BN;
+#pragma unroll
+        for (uint32_t i = 0; i < 4u; i++) {
+            tile_c[(frag_m0 + i) * BN + frag_n] = acc[i];
+        }
+    }
+    __syncthreads();
+    for (uint32_t j = tid; j < TILES_N * BM * BN; j += blockDim.x) {
+        const uint32_t tn = j / (BM * BN);
+        const uint32_t rem = j - tn * BM * BN;
+        const uint32_t m = rem / BN;
+        const uint32_t nn = rem - m * BN;
+        const uint32_t t = t0 + m;
+        const uint32_t row = row0 + tn * BN + nn;
+        if (t < n_tokens && row < out_dim) out[(uint64_t)t * out_dim + row] = shC[j];
+    }
+}
+#elif DS4_ROCM_ROCWMMA_F16_FALLBACK
+/* Rollback path for comparing the direct MFMA implementation against rocWMMA. */
+template <int TILES_N=4, int BM=16, int BN=16>
 __launch_bounds__(256, 2)
 __global__ static void matmul_q8_0_f32_batch_mfma_w64_onthefly_kernel(
         float *out,
@@ -878,6 +984,7 @@ __global__ static void matmul_q8_0_f32_batch_mfma_w64_onthefly_kernel(
         uint32_t in_dim,
         uint32_t out_dim,
         uint64_t row_bytes) {
+    constexpr uint32_t BK = 16u;
     extern __shared__ unsigned char raw_sh[];
     half *shA = reinterpret_cast<half *>(raw_sh);
     half *shB = shA + BM * BK;
@@ -944,6 +1051,7 @@ __global__ static void matmul_q8_0_f32_batch_mfma_w64_onthefly_kernel(
 }
 #endif
 #endif
+#endif
 
 __global__ static void matmul_q8_0_pair_f32_warp8_kernel(
         float *out0,
diff --git a/rocm/ds4_rocm_router.cuh b/rocm/ds4_rocm_router.cuh
index 8e04613ec..4d102e611 100644
--- a/rocm/ds4_rocm_router.cuh
+++ b/rocm/ds4_rocm_router.cuh
@@ -56,8 +56,9 @@ __global__ static void router_select_warp_topk_kernel(
             #pragma unroll
             for (uint32_t j = 0; j < DS4_ROCM_N_EXPERT_USED; j++) {
                 const int32_t e = row[j];
-                sel[j] = e;
-                const float v = (e >= 0 && e < N_EXPERT) ? sprob[row_in_block][(uint32_t)e] : 0.0f;
+                const int valid = e >= 0 && e < (int32_t)N_EXPERT;
+                sel[j] = valid ? e : 0;
+                const float v = valid ? sprob[row_in_block][(uint32_t)e] : 0.0f;
                 w[j] = v;
                 sum += v;
             }
@@ -87,9 +88,15 @@ __global__ static void router_select_warp_topk_kernel(
         }
         #pragma unroll
         for (uint32_t mask = 16u; mask > 0u; mask >>= 1u) {
+#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
+            const float other_score = __shfl_xor(best_score, mask, 32);
+            const float other_prob = __shfl_xor(best_prob, mask, 32);
+            const uint32_t other_idx = __shfl_xor(best_idx, mask, 32);
+#else
             const float other_score = __shfl_xor_sync(FULL_WARP_MASK, best_score, mask);
             const float other_prob = __shfl_xor_sync(FULL_WARP_MASK, best_prob, mask);
             const uint32_t other_idx = __shfl_xor_sync(FULL_WARP_MASK, best_idx, mask);
+#endif
             if (router_score_better(other_score, other_idx, best_score, best_idx)) {
                 best_score = other_score;
                 best_prob = other_prob;
diff --git a/rocm/ds4_rocm_runtime.cuh b/rocm/ds4_rocm_runtime.cuh
index 3bd786f8e..636664e34 100644
--- a/rocm/ds4_rocm_runtime.cuh
+++ b/rocm/ds4_rocm_runtime.cuh
@@ -3522,6 +3522,8 @@ struct ds4_rocm_runtime_config {
     int disable_shared_gate_up_fused_w32;
     int attention_output_cublas_all;
     int shared_down_cublas;
+    int q8_batch_mfma;
+    int prefill_online_attention;
     int graph_dump;
     uint32_t q8_decode_rpb;
     uint32_t q8_hc_decode_rpb;
@@ -3539,12 +3541,17 @@ static const ds4_rocm_runtime_config *cuda_runtime_config(void) {
         g_rocm_cfg.disable_shared_gate_up_fused_w32 = !g_quality_mode;
         g_rocm_cfg.attention_output_cublas_all = !g_quality_mode;
         g_rocm_cfg.shared_down_cublas = !g_quality_mode;
+        g_rocm_cfg.q8_batch_mfma = !g_quality_mode;
+        g_rocm_cfg.prefill_online_attention = 0;
         g_rocm_cfg.graph_dump = cuda_env_present(getenv("DS4_METAL_GRAPH_DUMP_PREFIX"));
         g_rocm_cfg.q8_decode_rpb = g_quality_mode ? 8u : 1u;
         g_rocm_cfg.q8_hc_decode_rpb = g_quality_mode ? 8u : 16u;
         g_rocm_cfg.attn_out_low_decode_rpb = g_quality_mode ? 8u : 32u;
         g_rocm_cfg.moe_decode_rpb = g_quality_mode ? 8u : 1u;
         g_rocm_cfg.oldhip_attention_decode = !g_quality_mode;
+        if (cuda_env_present(getenv("DS4_ROCM_DISABLE_Q8_BATCH_MFMA"))) {
+            g_rocm_cfg.q8_batch_mfma = 0;
+        }
         g_rocm_cfg.initialized = 1;
     }
     return &g_rocm_cfg;
@@ -3559,7 +3566,12 @@ static uint64_t cuda_q8_f16_cache_reserve_bytes(uint64_t total_bytes) {
         return cuda_stream_resident_free_reserve_bytes();
     }
     if (total_bytes >= 112ull * 1024ull * 1024ull * 1024ull) {
-        return 512ull * 1048576ull;
+        /*
+         * On large discrete cards, Q8->F16 expansion is still optional.  Leave
+         * enough room for long-context KV and normal runtime scratch instead of
+         * forcing otherwise-fitting contexts into managed memory.
+         */
+        return 40ull * 1024ull * 1024ull * 1024ull;
     }
 
     /* The expanded Q8->F16 cache is only an acceleration path.  Keep enough
@@ -4131,15 +4143,22 @@ static uint64_t cuda_model_cache_limit_bytes(void) {
     return UINT64_MAX;
 }
 
-static uint64_t cuda_model_arena_chunk_bytes(uint64_t need) {
-    uint64_t bytes = 1792ull * 1048576ull;
-    if (bytes < need) {
-        const uint64_t align = 256ull * 1048576ull;
-        bytes = (need + align - 1u) & ~(align - 1u);
+static uint64_t cuda_model_arena_allocated_bytes(void) {
+    uint64_t bytes = 0;
+    for (const cuda_model_arena &a : g_model_arenas) {
+        if (a.bytes > UINT64_MAX - bytes) return UINT64_MAX;
+        bytes += a.bytes;
     }
     return bytes;
 }
 
+static uint64_t cuda_model_arena_chunk_bytes(uint64_t need) {
+    const uint64_t min_chunk = 64ull * 1048576ull;
+    const uint64_t align = 64ull * 1048576ull;
+    uint64_t bytes = need < min_chunk ? min_chunk : need;
+    return cuda_round_up(bytes, align);
+}
+
 static char *cuda_model_arena_alloc(uint64_t bytes, const char *what) {
     if (bytes == 0) return NULL;
     if (g_model_cache_full) return NULL;
@@ -4162,10 +4181,34 @@ static char *cuda_model_arena_alloc(uint64_t bytes, const char *what) {
     void *dev = NULL;
     cudaError_t err = cudaMalloc(&dev, (size_t)chunk);
     if (err != cudaSuccess) {
-        fprintf(stderr, DS4_GPU_LOG_PREFIX "model arena alloc failed for %s (%.2f MiB chunk): %s\n",
-                what ? what : "weights",
-                (double)chunk / 1048576.0,
-                cudaGetErrorString(err));
+        size_t free_b = 0;
+        size_t total_b = 0;
+        const cudaError_t mem_err = cudaMemGetInfo(&free_b, &total_b);
+        if (mem_err == cudaSuccess) {
+            fprintf(stderr,
+                    DS4_GPU_LOG_PREFIX "model arena alloc failed for %s "
+                    "(request=%.2f MiB chunk=%.2f MiB cached=%.2f GiB "
+                    "arena=%.2f GiB free=%.2f GiB total=%.2f GiB): %s\n",
+                    what ? what : "weights",
+                    (double)bytes / 1048576.0,
+                    (double)chunk / 1048576.0,
+                    (double)g_model_range_bytes / 1073741824.0,
+                    (double)cuda_model_arena_allocated_bytes() / 1073741824.0,
+                    (double)free_b / 1073741824.0,
+                    (double)total_b / 1073741824.0,
+                    cudaGetErrorString(err));
+        } else {
+            fprintf(stderr,
+                    DS4_GPU_LOG_PREFIX "model arena alloc failed for %s "
+                    "(request=%.2f MiB chunk=%.2f MiB cached=%.2f GiB "
+                    "arena=%.2f GiB): %s\n",
+                    what ? what : "weights",
+                    (double)bytes / 1048576.0,
+                    (double)chunk / 1048576.0,
+                    (double)g_model_range_bytes / 1073741824.0,
+                    (double)cuda_model_arena_allocated_bytes() / 1073741824.0,
+                    cudaGetErrorString(err));
+        }
         (void)cudaGetLastError();
         g_model_cache_full = 1;
         return NULL;
@@ -4387,7 +4430,8 @@ extern "C" int ds4_gpu_init(void) {
     }
     if (!g_cublas_ready) {
         if (!cublas_ok(cublasCreate(&g_cublas), "create handle")) return 0;
-        const cublasMath_t math_mode = g_quality_mode ? CUBLAS_DEFAULT_MATH : CUBLAS_TF32_TENSOR_OP_MATH;
+        const cublasMath_t math_mode = g_quality_mode ?
+            CUBLAS_DEFAULT_MATH : CUBLAS_TF32_TENSOR_OP_MATH;
         (void)cublasSetMathMode(g_cublas, math_mode);
         g_cublas_ready = 1;
     }
@@ -4502,8 +4546,8 @@ extern "C" ds4_gpu_tensor *ds4_gpu_tensor_alloc_managed(uint64_t bytes) {
 
 static uint64_t cuda_managed_kv_reserve_bytes(uint64_t total_bytes) {
     const uint64_t min_reserve = 8ull * 1073741824ull;
-    const uint64_t max_reserve = 40ull * 1073741824ull;
-    uint64_t reserve = total_bytes / 4u;
+    const uint64_t max_reserve = 16ull * 1073741824ull;
+    uint64_t reserve = total_bytes / 12u;
     if (reserve < min_reserve) reserve = min_reserve;
     if (reserve > max_reserve) reserve = max_reserve;
     return reserve;
@@ -4512,12 +4556,10 @@ static uint64_t cuda_managed_kv_reserve_bytes(uint64_t total_bytes) {
 extern "C" int ds4_gpu_should_use_managed_kv_cache(uint64_t kv_cache_bytes, uint64_t context_bytes) {
     if (kv_cache_bytes == 0) return 0;
 
-    /* Very large KV caches are where device-only cudaMalloc() can make a
-     * unified-memory machine unresponsive.  Managed memory restores the old
-     * demand-paged behavior for this one long-lived allocation class only. */
-    const uint64_t huge_kv = 8ull * 1073741824ull;
-    if (kv_cache_bytes >= huge_kv) return 1;
-
+    /* On discrete CDNA devices, large KV caches should stay in normal VRAM when
+     * they fit.  ROCm managed memory is a pressure valve, not the default path:
+     * use it only when the estimated context footprint would exhaust free device
+     * memory or leave too little reserve for transient runtime allocations. */
     const uint64_t large_context = 8ull * 1073741824ull;
     if (context_bytes < large_context) return 0;
 
@@ -4800,7 +4842,8 @@ extern "C" void ds4_gpu_set_quality(bool quality) {
     }
     g_quality_mode = new_quality_mode;
     if (g_cublas_ready) {
-        const cublasMath_t math_mode = g_quality_mode ? CUBLAS_DEFAULT_MATH : CUBLAS_TF32_TENSOR_OP_MATH;
+        const cublasMath_t math_mode = g_quality_mode ?
+            CUBLAS_DEFAULT_MATH : CUBLAS_TF32_TENSOR_OP_MATH;
         (void)cublasSetMathMode(g_cublas, math_mode);
     }
 }
diff --git a/tests/rocm_pro_q4_8gpu_multiturn_smoke.sh b/tests/rocm_pro_q4_8gpu_multiturn_smoke.sh
new file mode 100755
index 000000000..9f768c80f
--- /dev/null
+++ b/tests/rocm_pro_q4_8gpu_multiturn_smoke.sh
@@ -0,0 +1,246 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Manual hardware validation for the interactive local multi-GPU ROCm path.
+# This depends on private Pro Q4 shard names by default and self-skips outside
+# that environment.
+
+skip() {
+    echo "SKIP rocm pro q4 multiturn smoke: $*"
+    exit 0
+}
+
+DS4_BIN=${DS4_BIN:-./ds4}
+MODEL0=${DS4_PRO_Q4_MODEL0:-gguf/DeepSeek-V4-Pro-Q4K-Layers00-30.gguf}
+MODEL1=${DS4_PRO_Q4_MODEL1:-gguf/DeepSeek-V4-Pro-Q4K-Layers-31-output.gguf}
+GPUS=${DS4_GPUS:-0,1,2,3,4,5,6,7}
+CTX=${DS4_CTX:-262144}
+PREFILL_CHUNK=${DS4_PREFILL_CHUNK:-4096}
+DIST_PREFILL_CHUNK=${DS4_DIST_PREFILL_CHUNK:-4096}
+DIST_PREFILL_WINDOW=${DS4_DIST_PREFILL_WINDOW:-8}
+TOKENS=${DS4_MULTITURN_SMOKE_TOKENS:-64}
+TIMEOUT=${DS4_MULTITURN_SMOKE_TIMEOUT:-360}
+
+[[ -x "$DS4_BIN" ]] || skip "$DS4_BIN is not executable"
+[[ -f "$MODEL0" ]] || skip "$MODEL0 is missing"
+[[ -f "$MODEL1" ]] || skip "$MODEL1 is missing"
+if [[ ! -e /dev/kfd && ! -e /dev/dri ]]; then
+    skip "ROCm device nodes are not available"
+fi
+
+REQ_GPUS=$(python3 - "$GPUS" <<'PY'
+import sys
+print(len([p for p in sys.argv[1].split(",") if p.strip()]))
+PY
+)
+if command -v rocminfo >/dev/null 2>&1; then
+    HAVE_GPUS=$(rocminfo 2>/dev/null | grep -E 'Name:[[:space:]]+gfx' | wc -l | tr -d ' ')
+    if [[ "$HAVE_GPUS" != "0" && "$HAVE_GPUS" -lt "$REQ_GPUS" ]]; then
+        skip "requested $REQ_GPUS GPUs but rocminfo reports $HAVE_GPUS"
+    fi
+fi
+
+LOG=${DS4_MULTITURN_SMOKE_LOG:-$(mktemp -t ds4-rocm-pro-q4-multiturn.XXXXXX.log)}
+
+python3 - "$LOG" "$TIMEOUT" "$REQ_GPUS" "$DS4_BIN" "$MODEL0" "$MODEL1" "$GPUS" "$CTX" \
+    "$PREFILL_CHUNK" "$DIST_PREFILL_CHUNK" "$DIST_PREFILL_WINDOW" "$TOKENS" <<'PY'
+import os
+import pty
+import re
+import select
+import signal
+import subprocess
+import sys
+import time
+
+(
+    log_path,
+    timeout_s,
+    req_gpus_s,
+    ds4_bin,
+    model0,
+    model1,
+    gpus,
+    ctx,
+    prefill_chunk,
+    dist_prefill_chunk,
+    dist_prefill_window,
+    tokens,
+) = sys.argv[1:13]
+
+timeout = float(timeout_s)
+req_gpus = int(req_gpus_s)
+argv = [
+    ds4_bin,
+    "--rocm",
+    "-m", model0,
+    "-m", model1,
+    "--gpus", gpus,
+    "--ctx", ctx,
+    "--prefill-chunk", prefill_chunk,
+    "--dist-prefill-chunk", dist_prefill_chunk,
+    "--dist-prefill-window", dist_prefill_window,
+    "--tokens", tokens,
+    "--nothink",
+    "--temp", "0",
+]
+
+master, slave = pty.openpty()
+proc = subprocess.Popen(
+    argv,
+    stdin=slave,
+    stdout=slave,
+    stderr=slave,
+    close_fds=True,
+    start_new_session=True,
+)
+os.close(slave)
+
+buf = bytearray()
+scan_pos = 0
+deadline = time.monotonic() + timeout
+
+def write_log():
+    with open(log_path, "wb") as f:
+        f.write(buf)
+
+def fail(msg):
+    try:
+        os.killpg(proc.pid, signal.SIGTERM)
+    except ProcessLookupError:
+        pass
+    try:
+        proc.wait(timeout=5)
+    except subprocess.TimeoutExpired:
+        try:
+            os.killpg(proc.pid, signal.SIGKILL)
+        except ProcessLookupError:
+            pass
+        proc.wait(timeout=5)
+    write_log()
+    print(f"FAIL multiturn smoke: {msg}", file=sys.stderr)
+    print(f"log: {log_path}", file=sys.stderr)
+    sys.exit(1)
+
+def drain_once(label):
+    remaining = deadline - time.monotonic()
+    if remaining <= 0:
+        fail(f"timed out waiting for {label}")
+    r, _, _ = select.select([master], [], [], min(1.0, remaining))
+    if not r:
+        if proc.poll() is not None:
+            fail(f"process exited while waiting for {label} (rc={proc.returncode})")
+        return
+    try:
+        chunk = os.read(master, 65536)
+    except OSError as exc:
+        if proc.poll() is not None:
+            fail(f"process exited while waiting for {label} (rc={proc.returncode})")
+        fail(f"pty read failed while waiting for {label}: {exc}")
+    if chunk:
+        buf.extend(chunk)
+        if b"\x1b[6n" in chunk:
+            os.write(master, b"\x1b[1;1R")
+
+def read_until(pattern, label):
+    global scan_pos
+    pat = pattern if isinstance(pattern, bytes) else pattern.encode()
+    start = scan_pos
+    while True:
+        found = buf.find(pat, scan_pos)
+        if found >= 0:
+            scan_pos = found + len(pat)
+            return bytes(buf[start:scan_pos])
+        drain_once(label)
+
+def send_line(text):
+    os.write(master, text.encode() + b"\r")
+
+ansi = re.compile(r"\x1b\[[0-9;?]*[A-Za-z]")
+
+def plain(data):
+    return ansi.sub("", data.decode("utf-8", errors="replace"))
+
+def meaningful_output(segment, prompt):
+    text = plain(segment).replace(prompt, "")
+    lines = []
+    for raw in text.splitlines():
+        line = raw.strip().replace("ds4> ", "").strip()
+        if not line or line.startswith("ds4:"):
+            continue
+        lines.append(line)
+    visible = re.sub(r"[^A-Za-z0-9]+", "", " ".join(lines))
+    return len(visible) >= 8
+
+read_until(b"ds4> ", "initial prompt")
+def run_turn(prompt, label):
+    start = len(buf)
+    send_line(prompt)
+    read_until(b"ds4: prefill:", f"{label} timing")
+    read_until(b"ds4> ", f"{label} prompt")
+    return bytes(buf[start:scan_pos])
+
+first = "tell me about yourself"
+seg1 = run_turn(first, "first answer")
+if not meaningful_output(seg1, first):
+    fail("first turn produced no meaningful output")
+
+second = "tell me a short story about a lighthouse"
+seg2 = run_turn(second, "second answer")
+if not meaningful_output(seg2, second):
+    fail("second turn produced no meaningful output")
+
+send_line("/exit")
+
+while proc.poll() is None:
+    remaining = deadline - time.monotonic()
+    if remaining <= 0:
+        fail("timed out waiting for process exit")
+    r, _, _ = select.select([master], [], [], min(1.0, remaining))
+    if r:
+        try:
+            chunk = os.read(master, 65536)
+        except OSError:
+            break
+        if chunk:
+            buf.extend(chunk)
+            if b"\x1b[6n" in chunk:
+                os.write(master, b"\x1b[1;1R")
+
+if proc.returncode is None:
+    try:
+        proc.wait(timeout=30)
+    except subprocess.TimeoutExpired:
+        fail("process did not exit after /exit")
+
+write_log()
+text = plain(buf)
+bad = [
+    "Kernel Name:",
+    "HSA_STATUS_ERROR_EXCEPTION",
+    "unspecified launch failure",
+    "prompt processing failed",
+    "decode failed",
+    "Aborted",
+    "unable to connect to 127.0.0.1",
+]
+for needle in bad:
+    if needle in text:
+        print(f"FAIL multiturn smoke saw {needle!r}", file=sys.stderr)
+        print(f"log: {log_path}", file=sys.stderr)
+        sys.exit(1)
+
+if req_gpus > 1 and "local GPU worker: coordinator disconnected; exiting" not in text:
+    print("FAIL multiturn smoke did not observe local worker shutdown", file=sys.stderr)
+    print(f"log: {log_path}", file=sys.stderr)
+    sys.exit(1)
+
+if proc.returncode != 0:
+    print(f"FAIL multiturn smoke exited rc={proc.returncode}", file=sys.stderr)
+    print(f"log: {log_path}", file=sys.stderr)
+    sys.exit(1)
+
+print("PASS multiturn smoke")
+PY
+
+echo "log: $LOG"
diff --git a/tests/rocm_pro_q4_8gpu_smoke.sh b/tests/rocm_pro_q4_8gpu_smoke.sh
new file mode 100755
index 000000000..db956607e
--- /dev/null
+++ b/tests/rocm_pro_q4_8gpu_smoke.sh
@@ -0,0 +1,119 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Manual hardware validation for a local multi-GPU ROCm Pro Q4 setup. This
+# script self-skips when the private shard files or requested GPU count are not
+# present; it is not intended as a portable CI test.
+
+skip() {
+    echo "SKIP rocm pro q4 smoke: $*"
+    exit 0
+}
+
+DS4_BIN=${DS4_BIN:-./ds4}
+MODEL0=${DS4_PRO_Q4_MODEL0:-gguf/DeepSeek-V4-Pro-Q4K-Layers00-30.gguf}
+MODEL1=${DS4_PRO_Q4_MODEL1:-gguf/DeepSeek-V4-Pro-Q4K-Layers-31-output.gguf}
+GPUS=${DS4_GPUS:-0,1,2,3,4,5,6,7}
+CTX=${DS4_CTX:-262144}
+PREFILL_CHUNK=${DS4_PREFILL_CHUNK:-4096}
+DIST_PREFILL_CHUNK=${DS4_DIST_PREFILL_CHUNK:-4096}
+DIST_PREFILL_WINDOW=${DS4_DIST_PREFILL_WINDOW:-8}
+PROMPT=${DS4_SMOKE_PROMPT:-tell me about yourself}
+
+[[ -x "$DS4_BIN" ]] || skip "$DS4_BIN is not executable"
+[[ -f "$MODEL0" ]] || skip "$MODEL0 is missing"
+[[ -f "$MODEL1" ]] || skip "$MODEL1 is missing"
+if [[ ! -e /dev/kfd && ! -e /dev/dri ]]; then
+    skip "ROCm device nodes are not available"
+fi
+
+REQ_GPUS=$(python3 - "$GPUS" <<'PY'
+import sys
+print(len([p for p in sys.argv[1].split(",") if p.strip()]))
+PY
+)
+if command -v rocminfo >/dev/null 2>&1; then
+    HAVE_GPUS=$(rocminfo 2>/dev/null | grep -E 'Name:[[:space:]]+gfx' | wc -l | tr -d ' ')
+    if [[ "$HAVE_GPUS" != "0" && "$HAVE_GPUS" -lt "$REQ_GPUS" ]]; then
+        skip "requested $REQ_GPUS GPUs but rocminfo reports $HAVE_GPUS"
+    fi
+fi
+
+OUT=${DS4_SMOKE_LOGPROBS:-$(mktemp -t ds4-rocm-pro-q4-logprobs.XXXXXX.json)}
+LOG=${DS4_SMOKE_LOG:-$(mktemp -t ds4-rocm-pro-q4.XXXXXX.log)}
+
+if ! "$DS4_BIN" --rocm \
+    -m "$MODEL0" \
+    -m "$MODEL1" \
+    --gpus "$GPUS" \
+    --ctx "$CTX" \
+    --prefill-chunk "$PREFILL_CHUNK" \
+    --dist-prefill-chunk "$DIST_PREFILL_CHUNK" \
+    --dist-prefill-window "$DIST_PREFILL_WINDOW" \
+    --nothink \
+    --temp 0 \
+    --tokens 1 \
+    --dump-logprobs "$OUT" \
+    -p "$PROMPT" >"$LOG" 2>&1; then
+    echo "FAIL rocm pro q4 smoke run failed; log: $LOG" >&2
+    tail -80 "$LOG" >&2 || true
+    exit 1
+fi
+
+python3 - "$OUT" "$LOG" "$REQ_GPUS" <<'PY'
+import json
+import math
+import sys
+from pathlib import Path
+
+out_path, log_path, req_gpus_s = sys.argv[1:4]
+req_gpus = int(req_gpus_s)
+log = Path(log_path).read_text(encoding="utf-8", errors="replace")
+bad = [
+    "Kernel Name:",
+    "HSA_STATUS_ERROR_EXCEPTION",
+    "unspecified launch failure",
+    "prompt processing failed",
+    "decode failed",
+    "Aborted",
+    "unable to connect to 127.0.0.1",
+]
+for needle in bad:
+    if needle in log:
+        print(f"FAIL smoke saw {needle!r}", file=sys.stderr)
+        print(f"log: {log_path}", file=sys.stderr)
+        sys.exit(1)
+
+if req_gpus > 1 and "local GPU worker: coordinator disconnected; exiting" not in log:
+    print("FAIL smoke did not observe local worker shutdown", file=sys.stderr)
+    print(f"log: {log_path}", file=sys.stderr)
+    sys.exit(1)
+
+with open(out_path, "r", encoding="utf-8") as f:
+    data = json.load(f)
+root = data[0] if isinstance(data, list) else data
+steps = root.get("steps")
+if not isinstance(steps, list) or not steps:
+    print("FAIL smoke logprobs has no steps", file=sys.stderr)
+    sys.exit(1)
+step = steps[0]
+selected = step.get("selected", {})
+top = step.get("top_logprobs", [])
+if not isinstance(selected.get("id"), int):
+    print("FAIL smoke selected token id missing", file=sys.stderr)
+    sys.exit(1)
+if not isinstance(top, list) or not top:
+    print("FAIL smoke top_logprobs missing", file=sys.stderr)
+    sys.exit(1)
+for item in top[:5]:
+    for key in ("logit", "logprob"):
+        value = item.get(key)
+        if not isinstance(value, (int, float)) or not math.isfinite(value):
+            print(f"FAIL smoke non-finite {key}", file=sys.stderr)
+            sys.exit(1)
+
+print(f"PASS first token id={selected['id']} top_k={len(top)}")
+PY
+
+echo "logprobs: $OUT"
+echo "log: $LOG"
diff --git a/tests/rocm_pro_q4_logits_compare.sh b/tests/rocm_pro_q4_logits_compare.sh
new file mode 100755
index 000000000..4e9a85ce6
--- /dev/null
+++ b/tests/rocm_pro_q4_logits_compare.sh
@@ -0,0 +1,162 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Manual regression smoke for the Pro Q4 local multi-GPU path. It compares the
+# new MFMA route with the fallback route for top-token stability; the synthetic
+# rocm-q8-mfma-correctness test is the numerical correctness test.
+
+skip() {
+    echo "SKIP rocm pro q4 logits smoke: $*"
+    exit 0
+}
+
+DS4_BIN=${DS4_BIN:-./ds4}
+MODEL0=${DS4_PRO_Q4_MODEL0:-gguf/DeepSeek-V4-Pro-Q4K-Layers00-30.gguf}
+MODEL1=${DS4_PRO_Q4_MODEL1:-gguf/DeepSeek-V4-Pro-Q4K-Layers-31-output.gguf}
+GPUS=${DS4_GPUS:-0,1,2,3,4,5,6,7}
+CTX=${DS4_CTX:-262144}
+PREFILL_CHUNK=${DS4_PREFILL_CHUNK:-4096}
+DIST_PREFILL_CHUNK=${DS4_DIST_PREFILL_CHUNK:-4096}
+DIST_PREFILL_WINDOW=${DS4_DIST_PREFILL_WINDOW:-8}
+
+[[ -x "$DS4_BIN" ]] || skip "$DS4_BIN is not executable"
+[[ -f "$MODEL0" ]] || skip "$MODEL0 is missing"
+[[ -f "$MODEL1" ]] || skip "$MODEL1 is missing"
+if [[ ! -e /dev/kfd && ! -e /dev/dri ]]; then
+    skip "ROCm device nodes are not available"
+fi
+
+REQ_GPUS=$(python3 - "$GPUS" <<'PY'
+import sys
+print(len([p for p in sys.argv[1].split(",") if p.strip()]))
+PY
+)
+if command -v rocminfo >/dev/null 2>&1; then
+    HAVE_GPUS=$(rocminfo 2>/dev/null | grep -E 'Name:[[:space:]]+gfx' | wc -l | tr -d ' ')
+    if [[ "$HAVE_GPUS" != "0" && "$HAVE_GPUS" -lt "$REQ_GPUS" ]]; then
+        skip "requested $REQ_GPUS GPUs but rocminfo reports $HAVE_GPUS"
+    fi
+fi
+
+OUT_A=${DS4_LOGITS_COMPARE_A:-$(mktemp -t ds4-rocm-pro-q4-mfma.XXXXXX.json)}
+OUT_B=${DS4_LOGITS_COMPARE_B:-$(mktemp -t ds4-rocm-pro-q4-fallback.XXXXXX.json)}
+LOG_A=${DS4_LOGITS_COMPARE_LOG_A:-$(mktemp -t ds4-rocm-pro-q4-mfma.XXXXXX.log)}
+LOG_B=${DS4_LOGITS_COMPARE_LOG_B:-$(mktemp -t ds4-rocm-pro-q4-fallback.XXXXXX.log)}
+PROMPT_FILE=${DS4_LOGITS_COMPARE_PROMPT_FILE:-$(mktemp -t ds4-rocm-pro-q4-prompt.XXXXXX.txt)}
+
+if [[ -z "${DS4_LOGITS_COMPARE_PROMPT_FILE:-}" ]]; then
+    python3 - "$PROMPT_FILE" <<'PY'
+from pathlib import Path
+text = (
+    "Summarize the design tradeoffs in local distributed inference. "
+    "Focus on memory placement, activation transport, and deterministic testing. "
+)
+Path(__import__("sys").argv[1]).write_text(text * 96, encoding="utf-8")
+PY
+fi
+
+run_case() {
+    local label=$1
+    local out=$2
+    local log=$3
+    shift 3
+    if ! env "$@" "$DS4_BIN" --rocm \
+        -m "$MODEL0" \
+        -m "$MODEL1" \
+        --gpus "$GPUS" \
+        --ctx "$CTX" \
+        --prefill-chunk "$PREFILL_CHUNK" \
+        --dist-prefill-chunk "$DIST_PREFILL_CHUNK" \
+        --dist-prefill-window "$DIST_PREFILL_WINDOW" \
+        --nothink \
+        --temp 0 \
+        --dump-logits "$out" \
+        --prompt-file "$PROMPT_FILE" >"$log" 2>&1; then
+        echo "FAIL $label logits smoke run failed; log: $log" >&2
+        tail -80 "$log" >&2 || true
+        exit 1
+    fi
+}
+
+run_case "mfma" "$OUT_A" "$LOG_A"
+run_case "fallback" "$OUT_B" "$LOG_B" DS4_ROCM_DISABLE_Q8_BATCH_MFMA=1
+
+python3 - "$OUT_A" "$OUT_B" "$LOG_A" "$LOG_B" <<'PY'
+import json
+import math
+import os
+import sys
+from pathlib import Path
+
+out_a, out_b, log_a, log_b = sys.argv[1:5]
+
+bad_needles = [
+    "Kernel Name:",
+    "HSA_STATUS_ERROR_EXCEPTION",
+    "unspecified launch failure",
+    "prompt processing failed",
+    "Aborted",
+]
+for log_path in (log_a, log_b):
+    text = Path(log_path).read_text(encoding="utf-8", errors="replace")
+    for needle in bad_needles:
+        if needle in text:
+            print(f"FAIL logits smoke saw {needle!r} in {log_path}", file=sys.stderr)
+            sys.exit(1)
+
+def load(path):
+    data = json.loads(Path(path).read_text(encoding="utf-8"))
+    logits = data.get("logits")
+    if not isinstance(logits, list) or not logits:
+        raise SystemExit(f"FAIL {path} has no logits array")
+    vals = []
+    for i, v in enumerate(logits):
+        if not isinstance(v, (int, float)) or not math.isfinite(v):
+            raise SystemExit(f"FAIL {path} has non-finite logit at {i}")
+        vals.append(float(v))
+    arg = data.get("argmax_token", {}).get("id")
+    if not isinstance(arg, int):
+        raise SystemExit(f"FAIL {path} has no argmax token id")
+    return vals, arg
+
+a, arg_a = load(out_a)
+b, arg_b = load(out_b)
+if len(a) != len(b):
+    print(f"FAIL vocab mismatch {len(a)} != {len(b)}", file=sys.stderr)
+    sys.exit(1)
+
+diffs = [abs(x - y) for x, y in zip(a, b)]
+max_abs = max(diffs)
+rms = math.sqrt(sum(d * d for d in diffs) / len(diffs))
+top_a = sorted(range(len(a)), key=a.__getitem__, reverse=True)[:5]
+top_b = sorted(range(len(b)), key=b.__getitem__, reverse=True)[:5]
+overlap = len(set(top_a) & set(top_b))
+
+max_abs_limit = float(os.getenv("DS4_LOGITS_COMPARE_MAX_ABS", "5.0"))
+rms_limit = float(os.getenv("DS4_LOGITS_COMPARE_RMS", "0.75"))
+min_top5_overlap = int(os.getenv("DS4_LOGITS_COMPARE_MIN_TOP5", "3"))
+
+if arg_a != arg_b or overlap < min_top5_overlap or max_abs > max_abs_limit or rms > rms_limit:
+    print(
+        "FAIL logits smoke "
+        f"argmax={arg_a}/{arg_b} top5_overlap={overlap}/5 "
+        f"max_abs={max_abs:.6g} rms={rms:.6g}",
+        file=sys.stderr,
+    )
+    print(f"mfma logits: {out_a}", file=sys.stderr)
+    print(f"fallback logits: {out_b}", file=sys.stderr)
+    print(f"mfma log: {log_a}", file=sys.stderr)
+    print(f"fallback log: {log_b}", file=sys.stderr)
+    sys.exit(1)
+
+print(
+    "PASS logits smoke "
+    f"argmax={arg_a} top5_overlap={overlap}/5 "
+    f"max_abs={max_abs:.6g} rms={rms:.6g}"
+)
+PY
+
+echo "mfma logits: $OUT_A"
+echo "fallback logits: $OUT_B"
+echo "mfma log: $LOG_A"
+echo "fallback log: $LOG_B"
diff --git a/tests/rocm_q8_mfma_correctness.c b/tests/rocm_q8_mfma_correctness.c
new file mode 100644
index 000000000..3f8940465
--- /dev/null
+++ b/tests/rocm_q8_mfma_correctness.c
@@ -0,0 +1,252 @@
+#include "ds4_gpu.h"
+
+/* Synthetic ROCm Q8_0 matmul correctness check. The default build runs the
+ * CDNA MFMA path, then the Makefile target reruns with that path disabled.
+ */
+
+#include <math.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+static uint64_t round_up_u64(uint64_t n, uint64_t align) {
+    return (n + align - 1u) & ~(align - 1u);
+}
+
+static uint16_t float_to_f16(float f) {
+    union {
+        float f;
+        uint32_t u;
+    } v = { .f = f };
+
+    uint32_t sign = (v.u >> 16) & 0x8000u;
+    int32_t exp = (int32_t)((v.u >> 23) & 0xffu) - 127 + 15;
+    uint32_t mant = v.u & 0x7fffffu;
+
+    if (exp <= 0) {
+        if (exp < -10) return (uint16_t)sign;
+        mant |= 0x800000u;
+        uint32_t shift = (uint32_t)(14 - exp);
+        uint32_t half_mant = mant >> shift;
+        if ((mant >> (shift - 1)) & 1u) half_mant++;
+        return (uint16_t)(sign | half_mant);
+    }
+    if (exp >= 31) return (uint16_t)(sign | 0x7c00u);
+
+    uint32_t half = sign | ((uint32_t)exp << 10) | (mant >> 13);
+    if (mant & 0x1000u) half++;
+    return (uint16_t)half;
+}
+
+static float f16_to_f32(uint16_t h) {
+    uint32_t sign = (uint32_t)(h & 0x8000u) << 16;
+    uint32_t exp = (h >> 10) & 0x1fu;
+    uint32_t mant = h & 0x03ffu;
+    uint32_t bits;
+
+    if (exp == 0) {
+        if (mant == 0) {
+            bits = sign;
+        } else {
+            exp = 1;
+            while ((mant & 0x0400u) == 0) {
+                mant <<= 1;
+                exp--;
+            }
+            mant &= 0x03ffu;
+            bits = sign | ((exp + 127u - 15u) << 23) | (mant << 13);
+        }
+    } else if (exp == 31) {
+        bits = sign | 0x7f800000u | (mant << 13);
+    } else {
+        bits = sign | ((exp + 127u - 15u) << 23) | (mant << 13);
+    }
+
+    float f;
+    memcpy(&f, &bits, sizeof(f));
+    return f;
+}
+
+static void fill_q8_0_weights(uint8_t *weights,
+                              uint32_t in_dim,
+                              uint32_t out_dim) {
+    const uint32_t blocks = in_dim / 32u;
+    const uint64_t row_bytes = (uint64_t)blocks * 34u;
+    for (uint32_t o = 0; o < out_dim; o++) {
+        uint8_t *row = weights + (uint64_t)o * row_bytes;
+        for (uint32_t b = 0; b < blocks; b++) {
+            float vals[32];
+            float amax = 0.0f;
+            for (uint32_t i = 0; i < 32; i++) {
+                const uint32_t k = b * 32u + i;
+                const int v = (int)((o * 17u + k * 23u + (o ^ k) * 3u) % 67u) - 33;
+                vals[i] = (float)v / 96.0f;
+                const float av = fabsf(vals[i]);
+                if (av > amax) amax = av;
+            }
+            const uint16_t scale_bits = float_to_f16(amax / 127.0f);
+            const float scale = f16_to_f32(scale_bits);
+            memcpy(row + b * 34u, &scale_bits, sizeof(scale_bits));
+            int8_t *qs = (int8_t *)(row + b * 34u + 2u);
+            for (uint32_t i = 0; i < 32; i++) {
+                int q = scale != 0.0f ? (int)lrintf(vals[i] / scale) : 0;
+                if (q > 127) q = 127;
+                if (q < -128) q = -128;
+                qs[i] = (int8_t)q;
+            }
+        }
+    }
+}
+
+static void fill_activations(float *x, uint32_t n_tok, uint32_t in_dim) {
+    for (uint32_t t = 0; t < n_tok; t++) {
+        for (uint32_t i = 0; i < in_dim; i++) {
+            const int v = (int)((t * 19u + i * 7u + (t ^ i)) % 71u) - 35;
+            x[(uint64_t)t * in_dim + i] = (float)v / 80.0f;
+        }
+    }
+}
+
+static void reference_q8_0(const uint8_t *weights,
+                           const float *x,
+                           float *ref_f32,
+                           float *ref_f16,
+                           uint32_t n_tok,
+                           uint32_t in_dim,
+                           uint32_t out_dim) {
+    const uint32_t blocks = in_dim / 32u;
+    const uint64_t row_bytes = (uint64_t)blocks * 34u;
+    for (uint32_t t = 0; t < n_tok; t++) {
+        for (uint32_t o = 0; o < out_dim; o++) {
+            const uint8_t *row = weights + (uint64_t)o * row_bytes;
+            float acc_f32 = 0.0f;
+            float acc_f16 = 0.0f;
+            for (uint32_t b = 0; b < blocks; b++) {
+                uint16_t scale_bits;
+                memcpy(&scale_bits, row + b * 34u, sizeof(scale_bits));
+                const float scale = f16_to_f32(scale_bits);
+                const int8_t *qs = (const int8_t *)(row + b * 34u + 2u);
+                for (uint32_t i = 0; i < 32; i++) {
+                    const float xv = x[(uint64_t)t * in_dim + b * 32u + i];
+                    const float w_f32 = scale * (float)qs[i];
+                    const float w_f16 = f16_to_f32(float_to_f16(w_f32));
+                    const float x_f16 = f16_to_f32(float_to_f16(xv));
+                    acc_f32 += w_f32 * xv;
+                    acc_f16 += w_f16 * x_f16;
+                }
+            }
+            ref_f32[(uint64_t)t * out_dim + o] = acc_f32;
+            ref_f16[(uint64_t)t * out_dim + o] = acc_f16;
+        }
+    }
+}
+
+static int check_errors(const float *got,
+                        const float *ref,
+                        uint64_t n,
+                        float *out_max_abs,
+                        float *out_rms) {
+    double sumsq = 0.0;
+    float max_abs = 0.0f;
+    int bad = 0;
+    for (uint64_t i = 0; i < n; i++) {
+        if (!isfinite(got[i])) {
+            bad = 1;
+            continue;
+        }
+        const float err = fabsf(got[i] - ref[i]);
+        if (err > max_abs) max_abs = err;
+        sumsq += (double)err * (double)err;
+    }
+    *out_max_abs = max_abs;
+    *out_rms = (float)sqrt(sumsq / (double)n);
+    return bad ? 1 : 0;
+}
+
+int main(void) {
+    const uint32_t in_dim = 1024;
+    const uint32_t out_dim = 1024;
+    const uint32_t n_tok = 32;
+    const uint64_t row_bytes = (uint64_t)(in_dim / 32u) * 34u;
+    const uint64_t weight_bytes = (uint64_t)out_dim * row_bytes;
+    const uint64_t weight_alloc = round_up_u64(weight_bytes, (uint64_t)getpagesize());
+    const uint64_t x_bytes = (uint64_t)n_tok * in_dim * sizeof(float);
+    const uint64_t out_bytes = (uint64_t)n_tok * out_dim * sizeof(float);
+
+    void *weights_raw = NULL;
+    if (posix_memalign(&weights_raw, (size_t)getpagesize(), (size_t)weight_alloc) != 0 ||
+        !weights_raw) {
+        fprintf(stderr, "rocm-q8-mfma-correctness: failed to allocate weights\n");
+        return 1;
+    }
+    memset(weights_raw, 0, (size_t)weight_alloc);
+    fill_q8_0_weights((uint8_t *)weights_raw, in_dim, out_dim);
+
+    float *x_host = (float *)malloc((size_t)x_bytes);
+    float *out_host = (float *)malloc((size_t)out_bytes);
+    float *ref_f32 = (float *)malloc((size_t)out_bytes);
+    float *ref_f16 = (float *)malloc((size_t)out_bytes);
+    if (!x_host || !out_host || !ref_f32 || !ref_f16) {
+        fprintf(stderr, "rocm-q8-mfma-correctness: failed to allocate host buffers\n");
+        free(ref_f16);
+        free(ref_f32);
+        free(out_host);
+        free(x_host);
+        free(weights_raw);
+        return 1;
+    }
+    fill_activations(x_host, n_tok, in_dim);
+    for (uint64_t i = 0; i < (uint64_t)n_tok * out_dim; i++) out_host[i] = 12345.0f;
+    reference_q8_0((const uint8_t *)weights_raw, x_host, ref_f32, ref_f16,
+                   n_tok, in_dim, out_dim);
+
+    if (!ds4_gpu_init()) {
+        fprintf(stderr, "rocm-q8-mfma-correctness: ROCm backend unavailable\n");
+        free(ref_f16);
+        free(ref_f32);
+        free(out_host);
+        free(x_host);
+        free(weights_raw);
+        return 1;
+    }
+
+    ds4_gpu_tensor *x = ds4_gpu_tensor_alloc(x_bytes);
+    ds4_gpu_tensor *out = ds4_gpu_tensor_alloc(out_bytes);
+    int rc = 1;
+    if (x && out &&
+        ds4_gpu_tensor_write(x, 0, x_host, x_bytes) &&
+        ds4_gpu_tensor_write(out, 0, out_host, out_bytes) &&
+        ds4_gpu_set_model_map(weights_raw, weight_alloc)) {
+        ds4_gpu_set_quality(false);
+        if (ds4_gpu_matmul_q8_0_tensor(out, weights_raw, weight_alloc, 0,
+                                       in_dim, out_dim, x, n_tok) &&
+            ds4_gpu_tensor_read(out, 0, out_host, out_bytes)) {
+            float max_f32 = 0.0f, rms_f32 = 0.0f;
+            float max_f16 = 0.0f, rms_f16 = 0.0f;
+            const uint64_t n = (uint64_t)n_tok * out_dim;
+            int bad_f32 = check_errors(out_host, ref_f32, n, &max_f32, &rms_f32);
+            int bad_f16 = check_errors(out_host, ref_f16, n, &max_f16, &rms_f16);
+            const int use_f16 = rms_f16 < rms_f32;
+            const float best_max = use_f16 ? max_f16 : max_f32;
+            const float best_rms = use_f16 ? rms_f16 : rms_f32;
+            fprintf(stderr,
+                    "rocm-q8-mfma-correctness: n_tok=%u in=%u out=%u ref=%s "
+                    "max_abs=%g rms=%g f32(max=%g rms=%g) f16(max=%g rms=%g)\n",
+                    n_tok, in_dim, out_dim, use_f16 ? "f16" : "f32",
+                    best_max, best_rms, max_f32, rms_f32, max_f16, rms_f16);
+            if (!bad_f32 && !bad_f16 && best_max < 0.12f && best_rms < 0.02f) rc = 0;
+        }
+    }
+
+    ds4_gpu_tensor_free(out);
+    ds4_gpu_tensor_free(x);
+    ds4_gpu_cleanup();
+    free(ref_f16);
+    free(ref_f32);
+    free(out_host);
+    free(x_host);
+    free(weights_raw);
+    return rc;
+}

From ed6360513c42b8d64b988dfe57c410437c6f26fd Mon Sep 17 00:00:00 2001
From: Eric Hartford <eric@quixi.ai>
Date: Thu, 2 Jul 2026 14:11:35 +0000
Subject: [PATCH 3/3] fix README.md mention of Strix Halo

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 29911c05a..f17e1b3d4 100644
--- a/README.md
+++ b/README.md
@@ -14,7 +14,7 @@ and for quality and speed testing.
 We support the following backends:
 * **Metal** is our primary target. Starting from MacBooks with 96GB of RAM (or less, using SSD streaming).
 * **NVIDIA CUDA / DGX Spark**, CUDA with special care for the DGX Spark.
-* **AMD ROCm**, validated on AMD Instinct CDNA3 / MI300X. CDNA4 (`gfx950`) build targets are included but still need runtime validation on CDNA4 hardware. Strix Halo uses the `gfx1151` target.
+* **AMD ROCm / Strix Halo**, systems like the Framework Desktop and other systems based on the same GPU and unified RAM design, and CDNA3/MI300X.
 
 This project would not exist without **llama.cpp and GGML**, make sure to read
 the acknowledgements section, a big thank you to Georgi Gerganov and all the