antirez · ehartford · Jun 21, 2026 · Jul 1, 2026 · Jul 2, 2026
diff --git a/Makefile b/Makefile
@@ -33,14 +33,24 @@ CPU_CORE_OBJS = ds4_cpu.o ds4_distributed.o ds4_ssd.o
 CUDA_LDLIBS ?= -lm -Xcompiler -pthread -L$(CUDA_HOME)/targets/sbsa-linux/lib -L$(CUDA_HOME)/lib64 -lcudart -lcublas
 HIPCC ?= $(shell command -v hipcc 2>/dev/null || echo /opt/rocm/bin/hipcc)
 ROCM_ARCH ?= gfx1151
-ROCM_CFLAGS ?= -O3 -ffast-math -g -fno-finite-math-only -pthread -D__HIP_PLATFORM_AMD__ -Wno-unused-command-line-argument --offload-arch=$(ROCM_ARCH)
+ROCM_ARCHES = $(strip $(ROCM_ARCH))
+ROCM_PRIMARY_ARCH = $(firstword $(subst :, ,$(firstword $(ROCM_ARCHES))))
+ROCM_OFFLOAD_FLAGS = $(foreach arch,$(ROCM_ARCHES),--offload-arch=$(arch))
+ROCM_Q8_MFMA_ARCH ?= $(if $(filter gfx1151,$(ROCM_ARCH)),gfx942,$(ROCM_ARCH))
+ROCM_WMMA_W32 ?= $(if $(filter gfx11%,$(ROCM_PRIMARY_ARCH)),1,0)
+ROCM_MFMA_F16 ?= $(if $(filter gfx94% gfx95%,$(ROCM_PRIMARY_ARCH)),1,0)
+ROCM_DIRECT_MFMA_F16 ?= $(ROCM_MFMA_F16)
+ROCM_ROCWMMA_F16_FALLBACK ?= 0
+ROCM_CFLAGS ?= -O3 -ffast-math -g -fno-finite-math-only -pthread -D__HIP_PLATFORM_AMD__ -DDS4_ROCM_WMMA_W32=$(ROCM_WMMA_W32) -DDS4_ROCM_MFMA_F16=$(ROCM_MFMA_F16) -DDS4_ROCM_DIRECT_MFMA_F16=$(ROCM_DIRECT_MFMA_F16) -DDS4_ROCM_ROCWMMA_F16_FALLBACK=$(ROCM_ROCWMMA_F16_FALLBACK) -Wno-unused-command-line-argument $(ROCM_OFFLOAD_FLAGS)
 ROCM_LDLIBS ?= -lm -pthread -lhipblas -lhipblaslt
+ROCM_TARGETS := ds4 ds4-server ds4-bench ds4-eval ds4-agent
+ROCM_CORE_OBJS := ds4.o ds4_distributed.o ds4_ssd.o ds4_rocm.o
 DS4_LINK ?= $(NVCC) $(NVCCFLAGS)
 DS4_LINK_LIBS ?= $(CUDA_LDLIBS)
 METAL_LDLIBS := $(LDLIBS)
 endif
 
-.PHONY: all help clean test cpu cuda cuda-spark cuda-generic cuda-regression strix-halo rocm
+.PHONY: all help clean test cpu cuda cuda-spark cuda-generic cuda-regression strix-halo cdna cdna3 cdna4 mi300x mi325x mi350x mi355x rocm rocm-q8-mfma-correctness manual-rocm-pro-q4-smoke manual-rocm-pro-q4-multiturn-smoke manual-rocm-pro-q4-logits-compare
 
 ifeq ($(UNAME_S),Darwin)
 all: ds4 ds4-server ds4-bench ds4-eval ds4-agent
@@ -85,7 +95,12 @@ help:
 	@echo "  make cuda-generic        Build CUDA for a generic local CUDA GPU"
 	@echo "  make cuda CUDA_ARCH=sm_N Build CUDA with an explicit nvcc -arch value"
 	@echo "  make strix-halo          Build ROCm for Strix Halo / gfx1151"
-	@echo "  make rocm                Alias for make strix-halo"
+	@echo "  make cdna                Build ROCm CDNA3+CDNA4 fat binary / gfx942+gfx950"
+	@echo "  make cdna3               Build ROCm for AMD Instinct MI300X/MI325X / gfx942"
+	@echo "  make cdna4               Build ROCm for AMD Instinct MI350X/MI355X / gfx950 (runtime validation pending)"
+	@echo "  make mi300x|mi325x       Alias for make cdna3"
+	@echo "  make mi350x|mi355x       Alias for make cdna4"
+	@echo "  make rocm ROCM_ARCH=gfxN Build ROCm with explicit AMD GPU target(s)"
 	@echo "  make cpu                 Build CPU-only ./ds4, ./ds4-server, ./ds4-bench, ./ds4-eval, and ./ds4-agent"
 	@echo "  make test                Build and run tests"
 	@echo "  make clean               Remove build outputs"
@@ -104,14 +119,42 @@ cuda:
 	fi
 	$(MAKE) -B ds4 ds4-server ds4-bench ds4-eval ds4-agent CUDA_ARCH="$(CUDA_ARCH)"
 
+strix-halo: ROCM_ARCH := gfx1151
 strix-halo:
-	$(MAKE) -B ds4 ds4-server ds4-bench ds4-eval ds4-agent \
-		CORE_OBJS="ds4.o ds4_distributed.o ds4_ssd.o ds4_rocm.o" \
+	$(MAKE) -B $(ROCM_TARGETS) \
+		HIPCC="$(HIPCC)" \
+		ROCM_ARCH="$(ROCM_ARCH)" \
+		ROCM_CFLAGS="$(ROCM_CFLAGS)" \
+		ROCM_LDLIBS="$(ROCM_LDLIBS)" \
+		CORE_OBJS="$(ROCM_CORE_OBJS)" \
 		CFLAGS="$(CFLAGS) -DDS4_ROCM_BUILD" \
 		DS4_LINK="$(HIPCC) $(ROCM_CFLAGS)" \
 		DS4_LINK_LIBS="$(ROCM_LDLIBS)"
 
-rocm: strix-halo
+cdna: ROCM_ARCH := gfx942 gfx950
+cdna3 mi300x mi325x: ROCM_ARCH := gfx942
+cdna4 mi350x mi355x: ROCM_ARCH := gfx950
+cdna cdna3 cdna4 mi300x mi325x mi350x mi355x:
+	$(MAKE) -B $(ROCM_TARGETS) \
+		HIPCC="$(HIPCC)" \
+		ROCM_ARCH="$(ROCM_ARCH)" \
+		ROCM_CFLAGS="$(ROCM_CFLAGS)" \
+		ROCM_LDLIBS="$(ROCM_LDLIBS)" \
+		CORE_OBJS="$(ROCM_CORE_OBJS)" \
+		CFLAGS="$(CFLAGS) -DDS4_ROCM_BUILD" \
+		DS4_LINK="$(HIPCC) $(ROCM_CFLAGS)" \
+		DS4_LINK_LIBS="$(ROCM_LDLIBS)"
+
+rocm:
+	$(MAKE) -B $(ROCM_TARGETS) \
+		HIPCC="$(HIPCC)" \
+		ROCM_ARCH="$(ROCM_ARCH)" \
+		ROCM_CFLAGS="$(ROCM_CFLAGS)" \
+		ROCM_LDLIBS="$(ROCM_LDLIBS)" \
+		CORE_OBJS="$(ROCM_CORE_OBJS)" \
+		CFLAGS="$(CFLAGS) -DDS4_ROCM_BUILD" \
+		DS4_LINK="$(HIPCC) $(ROCM_CFLAGS)" \
+		DS4_LINK_LIBS="$(ROCM_LDLIBS)"
 
 ds4: ds4_cli.o ds4_help.o linenoise.o $(CORE_OBJS)
 	$(DS4_LINK) -o $@ $^ $(DS4_LINK_LIBS)
@@ -139,16 +182,16 @@ cuda-regression: tests/cuda_long_context_smoke
 	./tests/cuda_long_context_smoke
 endif
 
-ds4.o: ds4.c ds4.h ds4_ssd.h ds4_distributed.h ds4_gpu.h
+ds4.o: ds4.c ds4.h ds4_ssd.h ds4_distributed.h ds4_internal.h ds4_gpu.h
 	$(CC) $(CFLAGS) -c -o $@ ds4.c
 
 ds4_ssd.o: ds4_ssd.c ds4_ssd.h
 	$(CC) $(CFLAGS) -c -o $@ ds4_ssd.c
 
-ds4_cli.o: ds4_cli.c ds4.h ds4_ssd.h ds4_distributed.h ds4_help.h linenoise.h
+ds4_cli.o: ds4_cli.c ds4.h ds4_ssd.h ds4_distributed.h ds4_internal.h ds4_help.h linenoise.h
 	$(CC) $(CFLAGS) -c -o $@ ds4_cli.c
 
-ds4_distributed.o: ds4_distributed.c ds4_distributed.h ds4.h ds4_ssd.h
+ds4_distributed.o: ds4_distributed.c ds4_distributed.h ds4_internal.h ds4.h ds4_ssd.h
 	$(CC) $(CFLAGS) -c -o $@ ds4_distributed.c
 
 ds4_help.o: ds4_help.c ds4_help.h
@@ -172,6 +215,24 @@ ds4_web.o: ds4_web.c ds4_web.h
 ds4_kvstore.o: ds4_kvstore.c ds4_kvstore.h ds4.h ds4_ssd.h
 	$(CC) $(CFLAGS) -c -o $@ ds4_kvstore.c
 
+manual-rocm-pro-q4-smoke:
+	tests/rocm_pro_q4_8gpu_smoke.sh
+
+manual-rocm-pro-q4-multiturn-smoke:
+	tests/rocm_pro_q4_8gpu_multiturn_smoke.sh
+
+manual-rocm-pro-q4-logits-compare:
+	tests/rocm_pro_q4_logits_compare.sh
+
+rocm-q8-mfma-correctness:
+	$(MAKE) -B tests/rocm_q8_mfma_correctness \
+		HIPCC="$(HIPCC)" \
+		ROCM_ARCH="$(ROCM_Q8_MFMA_ARCH)" \
+		ROCM_LDLIBS="$(ROCM_LDLIBS)" \
+		CFLAGS="$(CFLAGS) -DDS4_ROCM_BUILD"
+	tests/rocm_q8_mfma_correctness
+	DS4_ROCM_DISABLE_Q8_BATCH_MFMA=1 tests/rocm_q8_mfma_correctness
+
 ds4_test.o: tests/ds4_test.c ds4_server.c ds4.h ds4_ssd.h ds4_distributed.h ds4_help.h ds4_kvstore.h rax.h
 	$(CC) $(CFLAGS) -Wno-unused-function -c -o $@ tests/ds4_test.c
 
@@ -181,16 +242,19 @@ ds4_agent_test.o: tests/ds4_agent_test.c ds4_agent.c ds4.h ds4_ssd.h ds4_distrib
 tests/cuda_long_context_smoke.o: tests/cuda_long_context_smoke.c ds4_gpu.h
 	$(CC) $(CFLAGS) -I. -c -o $@ tests/cuda_long_context_smoke.c
 
+tests/rocm_q8_mfma_correctness.o: tests/rocm_q8_mfma_correctness.c ds4_gpu.h
+	$(CC) $(CFLAGS) -I. -c -o $@ tests/rocm_q8_mfma_correctness.c
+
 rax.o: rax.c rax.h rax_malloc.h
 	$(CC) $(CFLAGS) -c -o $@ rax.c
 
 linenoise.o: linenoise.c linenoise.h
 	$(CC) $(CFLAGS) -c -o $@ linenoise.c
 
-ds4_cpu.o: ds4.c ds4.h ds4_ssd.h ds4_distributed.h ds4_gpu.h
+ds4_cpu.o: ds4.c ds4.h ds4_ssd.h ds4_distributed.h ds4_internal.h ds4_gpu.h
 	$(CC) $(CFLAGS) -DDS4_NO_GPU -c -o $@ ds4.c
 
-ds4_cli_cpu.o: ds4_cli.c ds4.h ds4_ssd.h ds4_distributed.h ds4_help.h linenoise.h
+ds4_cli_cpu.o: ds4_cli.c ds4.h ds4_ssd.h ds4_distributed.h ds4_internal.h ds4_help.h linenoise.h
 	$(CC) $(CFLAGS) -DDS4_NO_GPU -c -o $@ ds4_cli.c
 
 ds4_server_cpu.o: ds4_server.c ds4.h ds4_ssd.h ds4_distributed.h ds4_help.h ds4_kvstore.h rax.h
@@ -217,6 +281,9 @@ ds4_rocm.o: ds4_rocm.cu ds4_gpu.h ds4_iq2_tables_cuda.inc $(ROCM_SRCS)
 tests/cuda_long_context_smoke: tests/cuda_long_context_smoke.o ds4_cuda.o
 	$(NVCC) $(NVCCFLAGS) -o $@ $^ $(CUDA_LDLIBS)
 
+tests/rocm_q8_mfma_correctness: tests/rocm_q8_mfma_correctness.o ds4_rocm.o
+	$(HIPCC) $(ROCM_CFLAGS) -o $@ $^ $(ROCM_LDLIBS)
+
 ds4_test: ds4_test.o ds4_help.o ds4_kvstore.o rax.o $(CORE_OBJS)
 ifeq ($(UNAME_S),Darwin)
 	$(CC) $(CFLAGS) -o $@ ds4_test.o ds4_help.o ds4_kvstore.o rax.o $(CORE_OBJS) $(METAL_LDLIBS)
@@ -241,4 +308,4 @@ q4k-dot-test: tests/test_q4k_dot.c
 	./tests/test_q4k_dot
 
 clean:
-	rm -f ds4 ds4-server ds4-bench ds4-eval ds4-agent ds4_cpu ds4_native ds4_server_test ds4_test ds4_agent_test tests/test_q4k_dot *.o tests/cuda_long_context_smoke tests/cuda_long_context_smoke.o
+	rm -f ds4 ds4-server ds4-bench ds4-eval ds4-agent ds4_cpu ds4_native ds4_server_test ds4_test ds4_agent_test tests/test_q4k_dot *.o tests/cuda_long_context_smoke tests/cuda_long_context_smoke.o tests/rocm_q8_mfma_correctness tests/rocm_q8_mfma_correctness.o
diff --git a/README.md b/README.md
@@ -14,7 +14,7 @@ and for quality and speed testing.
 We support the following backends:
 * **Metal** is our primary target. Starting from MacBooks with 96GB of RAM (or less, using SSD streaming).
 * **NVIDIA CUDA / DGX Spark**, CUDA with special care for the DGX Spark.
-* **Strix Halo (ROCm)**, systems like the Framework Desktop and other systems based on the same GPU and unified RAM design.
+* **AMD ROCm / Strix Halo**, systems like the Framework Desktop and other systems based on the same GPU and unified RAM design, and CDNA3/MI300X.
 
 This project would not exist without **llama.cpp and GGML**, make sure to read
 the acknowledgements section, a big thank you to Georgi Gerganov and all the
@@ -41,7 +41,7 @@ That said, a few important things about this project:
 * This software is developed with **strong assistance from GPT 5.5** and with humans leading the ideas, testing, and debugging. We say this openly because it shaped how the project was built. If you are not happy with AI-developed code, this software is not for you. The acknowledgement below is equally important: this would not exist without `llama.cpp` and GGML, largely written by hand.
 * This implementation is based on the idea that compressed KV caches like the one of DeepSeek v4 and the fast SSD disks of modern MacBooks should change our idea that KV cache belongs to RAM. **The KV cache is actually a first-class disk citizen**. Fast SSD disks also changed the inference game from the point of view of "model needs to fit RAM": while having more RAM the the model size is still preferred, SSD streaming allows to turn the available amount of RAM from a hard cutoff (can I run this model or not?) to continuous spectrum of speed levels.
 * Our vision is that local inference should be a set of three things working well together, out of the box: A) inference engine with HTTP API + B) GGUF specially crafted to run well under a given engine and given assumptions + C) testing and validation with coding agents implementations. D) Purpose built agents for specific models and execution environments. DwarfStar only runs with the GGUF files provided. It gets tested against officially obtained logits at different context sizes. This project exists because we wanted to make one local model feel finished end to end, not just runnable. However this is beta quality code, so probably we are not still there, especially since recently we introduced large new features: distributed inference, SSD streaming, and other minor improvements.
-* The optimized graph path targets **Metal on macOS** and **CUDA on Linux**. The CPU path is only for correctness checks and model/tokenizer diagnostics. For CPU-only Linux builds, use `make cpu`; it builds the normal `./ds4` and `./ds4-server` binaries without CUDA or Metal. On macOS, **warning: current macOS versions have a bug in the virtual memory implementation that will crash the kernel** if you try to run the CPU code. Remember? Software sucks. It was not possible to fix the CPU inference to avoid crashing, since each time you have to restart the computer, which is not funny. Help us, if you have the guts.
+* The optimized graph path targets **Metal on macOS** and **CUDA or ROCm on Linux**. The CPU path is only for correctness checks and model/tokenizer diagnostics. For CPU-only Linux builds, use `make cpu`; it builds the normal `./ds4` and `./ds4-server` binaries without CUDA, ROCm, or Metal. On macOS, **warning: current macOS versions have a bug in the virtual memory implementation that will crash the kernel** if you try to run the CPU code. Remember? Software sucks. It was not possible to fix the CPU inference to avoid crashing, since each time you have to restart the computer, which is not funny. Help us, if you have the guts.
 
 ## Acknowledgements to llama.cpp and GGML
 
@@ -145,6 +145,10 @@ Then build:
 make                  # macOS Metal
 make cuda-spark       # Linux CUDA, DGX Spark / GB10
 make cuda-generic     # Linux CUDA, other local CUDA GPUs
+make cdna             # Linux ROCm, AMD Instinct CDNA3+CDNA4 / gfx942+gfx950
+make cdna3            # Linux ROCm, AMD Instinct MI300X/MI325X / gfx942
+make cdna4            # Linux ROCm, AMD Instinct MI350X/MI355X / gfx950 (runtime validation pending)
+make strix-halo       # Linux ROCm, Strix Halo / gfx1151
 make cpu              # CPU-only diagnostics build
 ```
 
@@ -290,6 +294,34 @@ To build an initial mental model, here are the high level concepts:
 4. Each worker keeps its slice of the KV cache.
 5. Communication is worker-to-worker, there is no need to use the coordinator as relay, so if your coordinator is `A`, and you make a request, activations will flow in `A -> B -> C -> back to A`.
 
+For multiple GPUs in one local Linux host, `./ds4 --gpus` starts the local
+workers for you, assigns one process per listed GPU, picks a localhost
+coordinator port, and splits layers contiguously. It uses the same distributed
+runtime as the manual `--role coordinator` / `--role worker` flow, but you do
+not need a shell loop:
+
+```sh
+./ds4 --rocm -m ds4flash.gguf --gpus 0,1,2,3,4,5,6,7 -p "Hello"
+./ds4 --cuda -m ds4flash.gguf --gpus 0,1,2,3 -p "Hello"
+```
+
+For distributed split GGUFs on one host, repeat `-m` for each shard in any
+order. The local launcher inspects and sorts the shards, assigns GPUs
+by model-cache footprint across the full layer range, and gives the output head
+to the final GPU. If a physical GPU's assigned range crosses a GGUF shard
+boundary, the launcher starts two adjacent local workers on that same GPU so
+each process can keep loading one shard file. For very tight VRAM splits, the
+launcher may default `--prefill-chunk` to 1024 to keep graph scratch below the
+remaining device memory; pass `--prefill-chunk` explicitly to override that:
+
+```sh
+./ds4 --rocm \
+  -m gguf/DeepSeek-V4-Pro-Q4K-Layers00-30.gguf \
+  -m gguf/DeepSeek-V4-Pro-Q4K-Layers-31-output.gguf \
+  --gpus 0,1,2,3,4,5,6,7 \
+  --ctx 262144
+```
+
 ### How it works and how to configure it
 
 The prefill path is pipelined (this is why it can go faster than in a single machine).
@@ -685,8 +717,8 @@ ds4>
 The interactive CLI is a real multi-turn chat. It keeps the rendered chat
 transcript and the live graph KV checkpoint, so each turn extends the previous
 conversation. Useful commands are `/help`, `/think`, `/think-max`, `/nothink`,
-`/ctx N`, `/read FILE`, and `/quit`. Ctrl+C interrupts the current generation
-and returns to `ds4>`.
+`/ctx N`, `/read FILE`, and `/quit`. Ctrl+C interrupts the current generation;
+at `ds4>`, it exits.
 
 The CLI defaults to thinking mode. Use `/nothink` or `--nothink` for direct
 answers. `--mtp MTP.gguf --mtp-draft 2` enables the optional MTP speculative
@@ -1173,11 +1205,13 @@ the kv cache files include the verbatim prompt cached.
 
 ## Backends
 
-The default graph backend is Metal on macOS and CUDA in CUDA builds:
+The default graph backend is Metal on macOS, CUDA in CUDA builds, and ROCm in
+ROCm builds:
 
 ```sh
 ./ds4 -p "Hello" --metal
 ./ds4 -p "Hello" --cuda
+./ds4 -p "Hello" --rocm
 ```
 
 On Linux, plain `make` prints the available build targets instead of selecting a
@@ -1191,6 +1225,34 @@ make cuda CUDA_ARCH=sm_120
 make cuda CUDA_ARCH=native
 ```
 
+For AMD GPUs, use the matching ROCm target. `make cdna` builds a CDNA3/CDNA4
+binary with both `gfx942` and `gfx950` code objects and enables the CDNA wave64
+MFMA q8 prefill path. Runtime validation for this path has been performed on
+CDNA3 / MI300X only; the CDNA4 targets compile the `gfx950` path but still need
+validation on MI350X/MI355X hardware. If you only need one product generation,
+`make cdna3`, `make mi300x`, and `make mi325x` target `gfx942`; `make cdna4`,
+`make mi350x`, and `make mi355x` target `gfx950`. Strix Halo systems use
+`make strix-halo` and keep the gfx11 wave32 WMMA q8 path. For another AMD GPU,
+set `ROCM_ARCH` explicitly. Multiple targets can be passed as a quoted
+space-separated list:
+
+```sh
+make cdna
+make mi300x
+make mi325x
+make mi355x
+make strix-halo
+make rocm ROCM_ARCH=gfx942
+make rocm ROCM_ARCH="gfx942 gfx950"
+```
+
+The portable synthetic check for the CDNA MFMA Q8 path is:
+
+```sh
+make rocm-q8-mfma-correctness
+make rocm-q8-mfma-correctness ROCM_Q8_MFMA_ARCH=gfx950  # requires CDNA4 hardware to run
+```
+
 There is also a CPU reference/debug path:
 
 ```sh