Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
91 changes: 79 additions & 12 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -33,14 +33,24 @@ CPU_CORE_OBJS = ds4_cpu.o ds4_distributed.o ds4_ssd.o
CUDA_LDLIBS ?= -lm -Xcompiler -pthread -L$(CUDA_HOME)/targets/sbsa-linux/lib -L$(CUDA_HOME)/lib64 -lcudart -lcublas
HIPCC ?= $(shell command -v hipcc 2>/dev/null || echo /opt/rocm/bin/hipcc)
ROCM_ARCH ?= gfx1151
ROCM_CFLAGS ?= -O3 -ffast-math -g -fno-finite-math-only -pthread -D__HIP_PLATFORM_AMD__ -Wno-unused-command-line-argument --offload-arch=$(ROCM_ARCH)
ROCM_ARCHES = $(strip $(ROCM_ARCH))
ROCM_PRIMARY_ARCH = $(firstword $(subst :, ,$(firstword $(ROCM_ARCHES))))
ROCM_OFFLOAD_FLAGS = $(foreach arch,$(ROCM_ARCHES),--offload-arch=$(arch))
ROCM_Q8_MFMA_ARCH ?= $(if $(filter gfx1151,$(ROCM_ARCH)),gfx942,$(ROCM_ARCH))
ROCM_WMMA_W32 ?= $(if $(filter gfx11%,$(ROCM_PRIMARY_ARCH)),1,0)
ROCM_MFMA_F16 ?= $(if $(filter gfx94% gfx95%,$(ROCM_PRIMARY_ARCH)),1,0)
ROCM_DIRECT_MFMA_F16 ?= $(ROCM_MFMA_F16)
ROCM_ROCWMMA_F16_FALLBACK ?= 0
ROCM_CFLAGS ?= -O3 -ffast-math -g -fno-finite-math-only -pthread -D__HIP_PLATFORM_AMD__ -DDS4_ROCM_WMMA_W32=$(ROCM_WMMA_W32) -DDS4_ROCM_MFMA_F16=$(ROCM_MFMA_F16) -DDS4_ROCM_DIRECT_MFMA_F16=$(ROCM_DIRECT_MFMA_F16) -DDS4_ROCM_ROCWMMA_F16_FALLBACK=$(ROCM_ROCWMMA_F16_FALLBACK) -Wno-unused-command-line-argument $(ROCM_OFFLOAD_FLAGS)
ROCM_LDLIBS ?= -lm -pthread -lhipblas -lhipblaslt
ROCM_TARGETS := ds4 ds4-server ds4-bench ds4-eval ds4-agent
ROCM_CORE_OBJS := ds4.o ds4_distributed.o ds4_ssd.o ds4_rocm.o
DS4_LINK ?= $(NVCC) $(NVCCFLAGS)
DS4_LINK_LIBS ?= $(CUDA_LDLIBS)
METAL_LDLIBS := $(LDLIBS)
endif

.PHONY: all help clean test cpu cuda cuda-spark cuda-generic cuda-regression strix-halo rocm
.PHONY: all help clean test cpu cuda cuda-spark cuda-generic cuda-regression strix-halo cdna cdna3 cdna4 mi300x mi325x mi350x mi355x rocm rocm-q8-mfma-correctness manual-rocm-pro-q4-smoke manual-rocm-pro-q4-multiturn-smoke manual-rocm-pro-q4-logits-compare

ifeq ($(UNAME_S),Darwin)
all: ds4 ds4-server ds4-bench ds4-eval ds4-agent
Expand Down Expand Up @@ -85,7 +95,12 @@ help:
@echo " make cuda-generic Build CUDA for a generic local CUDA GPU"
@echo " make cuda CUDA_ARCH=sm_N Build CUDA with an explicit nvcc -arch value"
@echo " make strix-halo Build ROCm for Strix Halo / gfx1151"
@echo " make rocm Alias for make strix-halo"
@echo " make cdna Build ROCm CDNA3+CDNA4 fat binary / gfx942+gfx950"
@echo " make cdna3 Build ROCm for AMD Instinct MI300X/MI325X / gfx942"
@echo " make cdna4 Build ROCm for AMD Instinct MI350X/MI355X / gfx950 (runtime validation pending)"
@echo " make mi300x|mi325x Alias for make cdna3"
@echo " make mi350x|mi355x Alias for make cdna4"
@echo " make rocm ROCM_ARCH=gfxN Build ROCm with explicit AMD GPU target(s)"
@echo " make cpu Build CPU-only ./ds4, ./ds4-server, ./ds4-bench, ./ds4-eval, and ./ds4-agent"
@echo " make test Build and run tests"
@echo " make clean Remove build outputs"
Expand All @@ -104,14 +119,42 @@ cuda:
fi
$(MAKE) -B ds4 ds4-server ds4-bench ds4-eval ds4-agent CUDA_ARCH="$(CUDA_ARCH)"

strix-halo: ROCM_ARCH := gfx1151
strix-halo:
$(MAKE) -B ds4 ds4-server ds4-bench ds4-eval ds4-agent \
CORE_OBJS="ds4.o ds4_distributed.o ds4_ssd.o ds4_rocm.o" \
$(MAKE) -B $(ROCM_TARGETS) \
HIPCC="$(HIPCC)" \
ROCM_ARCH="$(ROCM_ARCH)" \
ROCM_CFLAGS="$(ROCM_CFLAGS)" \
ROCM_LDLIBS="$(ROCM_LDLIBS)" \
CORE_OBJS="$(ROCM_CORE_OBJS)" \
CFLAGS="$(CFLAGS) -DDS4_ROCM_BUILD" \
DS4_LINK="$(HIPCC) $(ROCM_CFLAGS)" \
DS4_LINK_LIBS="$(ROCM_LDLIBS)"

rocm: strix-halo
cdna: ROCM_ARCH := gfx942 gfx950
cdna3 mi300x mi325x: ROCM_ARCH := gfx942
cdna4 mi350x mi355x: ROCM_ARCH := gfx950
cdna cdna3 cdna4 mi300x mi325x mi350x mi355x:
$(MAKE) -B $(ROCM_TARGETS) \
HIPCC="$(HIPCC)" \
ROCM_ARCH="$(ROCM_ARCH)" \
ROCM_CFLAGS="$(ROCM_CFLAGS)" \
ROCM_LDLIBS="$(ROCM_LDLIBS)" \
CORE_OBJS="$(ROCM_CORE_OBJS)" \
CFLAGS="$(CFLAGS) -DDS4_ROCM_BUILD" \
DS4_LINK="$(HIPCC) $(ROCM_CFLAGS)" \
DS4_LINK_LIBS="$(ROCM_LDLIBS)"

rocm:
$(MAKE) -B $(ROCM_TARGETS) \
HIPCC="$(HIPCC)" \
ROCM_ARCH="$(ROCM_ARCH)" \
ROCM_CFLAGS="$(ROCM_CFLAGS)" \
ROCM_LDLIBS="$(ROCM_LDLIBS)" \
CORE_OBJS="$(ROCM_CORE_OBJS)" \
CFLAGS="$(CFLAGS) -DDS4_ROCM_BUILD" \
DS4_LINK="$(HIPCC) $(ROCM_CFLAGS)" \
DS4_LINK_LIBS="$(ROCM_LDLIBS)"

ds4: ds4_cli.o ds4_help.o linenoise.o $(CORE_OBJS)
$(DS4_LINK) -o $@ $^ $(DS4_LINK_LIBS)
Expand Down Expand Up @@ -139,16 +182,16 @@ cuda-regression: tests/cuda_long_context_smoke
./tests/cuda_long_context_smoke
endif

ds4.o: ds4.c ds4.h ds4_ssd.h ds4_distributed.h ds4_gpu.h
ds4.o: ds4.c ds4.h ds4_ssd.h ds4_distributed.h ds4_internal.h ds4_gpu.h
$(CC) $(CFLAGS) -c -o $@ ds4.c

ds4_ssd.o: ds4_ssd.c ds4_ssd.h
$(CC) $(CFLAGS) -c -o $@ ds4_ssd.c

ds4_cli.o: ds4_cli.c ds4.h ds4_ssd.h ds4_distributed.h ds4_help.h linenoise.h
ds4_cli.o: ds4_cli.c ds4.h ds4_ssd.h ds4_distributed.h ds4_internal.h ds4_help.h linenoise.h
$(CC) $(CFLAGS) -c -o $@ ds4_cli.c

ds4_distributed.o: ds4_distributed.c ds4_distributed.h ds4.h ds4_ssd.h
ds4_distributed.o: ds4_distributed.c ds4_distributed.h ds4_internal.h ds4.h ds4_ssd.h
$(CC) $(CFLAGS) -c -o $@ ds4_distributed.c

ds4_help.o: ds4_help.c ds4_help.h
Expand All @@ -172,6 +215,24 @@ ds4_web.o: ds4_web.c ds4_web.h
ds4_kvstore.o: ds4_kvstore.c ds4_kvstore.h ds4.h ds4_ssd.h
$(CC) $(CFLAGS) -c -o $@ ds4_kvstore.c

manual-rocm-pro-q4-smoke:
tests/rocm_pro_q4_8gpu_smoke.sh

manual-rocm-pro-q4-multiturn-smoke:
tests/rocm_pro_q4_8gpu_multiturn_smoke.sh

manual-rocm-pro-q4-logits-compare:
tests/rocm_pro_q4_logits_compare.sh

rocm-q8-mfma-correctness:
$(MAKE) -B tests/rocm_q8_mfma_correctness \
HIPCC="$(HIPCC)" \
ROCM_ARCH="$(ROCM_Q8_MFMA_ARCH)" \
ROCM_LDLIBS="$(ROCM_LDLIBS)" \
CFLAGS="$(CFLAGS) -DDS4_ROCM_BUILD"
tests/rocm_q8_mfma_correctness
DS4_ROCM_DISABLE_Q8_BATCH_MFMA=1 tests/rocm_q8_mfma_correctness

ds4_test.o: tests/ds4_test.c ds4_server.c ds4.h ds4_ssd.h ds4_distributed.h ds4_help.h ds4_kvstore.h rax.h
$(CC) $(CFLAGS) -Wno-unused-function -c -o $@ tests/ds4_test.c

Expand All @@ -181,16 +242,19 @@ ds4_agent_test.o: tests/ds4_agent_test.c ds4_agent.c ds4.h ds4_ssd.h ds4_distrib
tests/cuda_long_context_smoke.o: tests/cuda_long_context_smoke.c ds4_gpu.h
$(CC) $(CFLAGS) -I. -c -o $@ tests/cuda_long_context_smoke.c

tests/rocm_q8_mfma_correctness.o: tests/rocm_q8_mfma_correctness.c ds4_gpu.h
$(CC) $(CFLAGS) -I. -c -o $@ tests/rocm_q8_mfma_correctness.c

rax.o: rax.c rax.h rax_malloc.h
$(CC) $(CFLAGS) -c -o $@ rax.c

linenoise.o: linenoise.c linenoise.h
$(CC) $(CFLAGS) -c -o $@ linenoise.c

ds4_cpu.o: ds4.c ds4.h ds4_ssd.h ds4_distributed.h ds4_gpu.h
ds4_cpu.o: ds4.c ds4.h ds4_ssd.h ds4_distributed.h ds4_internal.h ds4_gpu.h
$(CC) $(CFLAGS) -DDS4_NO_GPU -c -o $@ ds4.c

ds4_cli_cpu.o: ds4_cli.c ds4.h ds4_ssd.h ds4_distributed.h ds4_help.h linenoise.h
ds4_cli_cpu.o: ds4_cli.c ds4.h ds4_ssd.h ds4_distributed.h ds4_internal.h ds4_help.h linenoise.h
$(CC) $(CFLAGS) -DDS4_NO_GPU -c -o $@ ds4_cli.c

ds4_server_cpu.o: ds4_server.c ds4.h ds4_ssd.h ds4_distributed.h ds4_help.h ds4_kvstore.h rax.h
Expand All @@ -217,6 +281,9 @@ ds4_rocm.o: ds4_rocm.cu ds4_gpu.h ds4_iq2_tables_cuda.inc $(ROCM_SRCS)
tests/cuda_long_context_smoke: tests/cuda_long_context_smoke.o ds4_cuda.o
$(NVCC) $(NVCCFLAGS) -o $@ $^ $(CUDA_LDLIBS)

tests/rocm_q8_mfma_correctness: tests/rocm_q8_mfma_correctness.o ds4_rocm.o
$(HIPCC) $(ROCM_CFLAGS) -o $@ $^ $(ROCM_LDLIBS)

ds4_test: ds4_test.o ds4_help.o ds4_kvstore.o rax.o $(CORE_OBJS)
ifeq ($(UNAME_S),Darwin)
$(CC) $(CFLAGS) -o $@ ds4_test.o ds4_help.o ds4_kvstore.o rax.o $(CORE_OBJS) $(METAL_LDLIBS)
Expand All @@ -241,4 +308,4 @@ q4k-dot-test: tests/test_q4k_dot.c
./tests/test_q4k_dot

clean:
rm -f ds4 ds4-server ds4-bench ds4-eval ds4-agent ds4_cpu ds4_native ds4_server_test ds4_test ds4_agent_test tests/test_q4k_dot *.o tests/cuda_long_context_smoke tests/cuda_long_context_smoke.o
rm -f ds4 ds4-server ds4-bench ds4-eval ds4-agent ds4_cpu ds4_native ds4_server_test ds4_test ds4_agent_test tests/test_q4k_dot *.o tests/cuda_long_context_smoke tests/cuda_long_context_smoke.o tests/rocm_q8_mfma_correctness tests/rocm_q8_mfma_correctness.o
72 changes: 67 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ and for quality and speed testing.
We support the following backends:
* **Metal** is our primary target. Starting from MacBooks with 96GB of RAM (or less, using SSD streaming).
* **NVIDIA CUDA / DGX Spark**, CUDA with special care for the DGX Spark.
* **Strix Halo (ROCm)**, systems like the Framework Desktop and other systems based on the same GPU and unified RAM design.
* **AMD ROCm / Strix Halo**, systems like the Framework Desktop and other systems based on the same GPU and unified RAM design, and CDNA3/MI300X.

This project would not exist without **llama.cpp and GGML**, make sure to read
the acknowledgements section, a big thank you to Georgi Gerganov and all the
Expand All @@ -41,7 +41,7 @@ That said, a few important things about this project:
* This software is developed with **strong assistance from GPT 5.5** and with humans leading the ideas, testing, and debugging. We say this openly because it shaped how the project was built. If you are not happy with AI-developed code, this software is not for you. The acknowledgement below is equally important: this would not exist without `llama.cpp` and GGML, largely written by hand.
* This implementation is based on the idea that compressed KV caches like the one of DeepSeek v4 and the fast SSD disks of modern MacBooks should change our idea that KV cache belongs to RAM. **The KV cache is actually a first-class disk citizen**. Fast SSD disks also changed the inference game from the point of view of "model needs to fit RAM": while having more RAM the the model size is still preferred, SSD streaming allows to turn the available amount of RAM from a hard cutoff (can I run this model or not?) to continuous spectrum of speed levels.
* Our vision is that local inference should be a set of three things working well together, out of the box: A) inference engine with HTTP API + B) GGUF specially crafted to run well under a given engine and given assumptions + C) testing and validation with coding agents implementations. D) Purpose built agents for specific models and execution environments. DwarfStar only runs with the GGUF files provided. It gets tested against officially obtained logits at different context sizes. This project exists because we wanted to make one local model feel finished end to end, not just runnable. However this is beta quality code, so probably we are not still there, especially since recently we introduced large new features: distributed inference, SSD streaming, and other minor improvements.
* The optimized graph path targets **Metal on macOS** and **CUDA on Linux**. The CPU path is only for correctness checks and model/tokenizer diagnostics. For CPU-only Linux builds, use `make cpu`; it builds the normal `./ds4` and `./ds4-server` binaries without CUDA or Metal. On macOS, **warning: current macOS versions have a bug in the virtual memory implementation that will crash the kernel** if you try to run the CPU code. Remember? Software sucks. It was not possible to fix the CPU inference to avoid crashing, since each time you have to restart the computer, which is not funny. Help us, if you have the guts.
* The optimized graph path targets **Metal on macOS** and **CUDA or ROCm on Linux**. The CPU path is only for correctness checks and model/tokenizer diagnostics. For CPU-only Linux builds, use `make cpu`; it builds the normal `./ds4` and `./ds4-server` binaries without CUDA, ROCm, or Metal. On macOS, **warning: current macOS versions have a bug in the virtual memory implementation that will crash the kernel** if you try to run the CPU code. Remember? Software sucks. It was not possible to fix the CPU inference to avoid crashing, since each time you have to restart the computer, which is not funny. Help us, if you have the guts.

## Acknowledgements to llama.cpp and GGML

Expand Down Expand Up @@ -145,6 +145,10 @@ Then build:
make # macOS Metal
make cuda-spark # Linux CUDA, DGX Spark / GB10
make cuda-generic # Linux CUDA, other local CUDA GPUs
make cdna # Linux ROCm, AMD Instinct CDNA3+CDNA4 / gfx942+gfx950
make cdna3 # Linux ROCm, AMD Instinct MI300X/MI325X / gfx942
make cdna4 # Linux ROCm, AMD Instinct MI350X/MI355X / gfx950 (runtime validation pending)
make strix-halo # Linux ROCm, Strix Halo / gfx1151
make cpu # CPU-only diagnostics build
```

Expand Down Expand Up @@ -290,6 +294,34 @@ To build an initial mental model, here are the high level concepts:
4. Each worker keeps its slice of the KV cache.
5. Communication is worker-to-worker, there is no need to use the coordinator as relay, so if your coordinator is `A`, and you make a request, activations will flow in `A -> B -> C -> back to A`.

For multiple GPUs in one local Linux host, `./ds4 --gpus` starts the local
workers for you, assigns one process per listed GPU, picks a localhost
coordinator port, and splits layers contiguously. It uses the same distributed
runtime as the manual `--role coordinator` / `--role worker` flow, but you do
not need a shell loop:

```sh
./ds4 --rocm -m ds4flash.gguf --gpus 0,1,2,3,4,5,6,7 -p "Hello"
./ds4 --cuda -m ds4flash.gguf --gpus 0,1,2,3 -p "Hello"
```

For distributed split GGUFs on one host, repeat `-m` for each shard in any
order. The local launcher inspects and sorts the shards, assigns GPUs
by model-cache footprint across the full layer range, and gives the output head
to the final GPU. If a physical GPU's assigned range crosses a GGUF shard
boundary, the launcher starts two adjacent local workers on that same GPU so
each process can keep loading one shard file. For very tight VRAM splits, the
launcher may default `--prefill-chunk` to 1024 to keep graph scratch below the
remaining device memory; pass `--prefill-chunk` explicitly to override that:

```sh
./ds4 --rocm \
-m gguf/DeepSeek-V4-Pro-Q4K-Layers00-30.gguf \
-m gguf/DeepSeek-V4-Pro-Q4K-Layers-31-output.gguf \
--gpus 0,1,2,3,4,5,6,7 \
--ctx 262144
```

### How it works and how to configure it

The prefill path is pipelined (this is why it can go faster than in a single machine).
Expand Down Expand Up @@ -685,8 +717,8 @@ ds4>
The interactive CLI is a real multi-turn chat. It keeps the rendered chat
transcript and the live graph KV checkpoint, so each turn extends the previous
conversation. Useful commands are `/help`, `/think`, `/think-max`, `/nothink`,
`/ctx N`, `/read FILE`, and `/quit`. Ctrl+C interrupts the current generation
and returns to `ds4>`.
`/ctx N`, `/read FILE`, and `/quit`. Ctrl+C interrupts the current generation;
at `ds4>`, it exits.

The CLI defaults to thinking mode. Use `/nothink` or `--nothink` for direct
answers. `--mtp MTP.gguf --mtp-draft 2` enables the optional MTP speculative
Expand Down Expand Up @@ -1173,11 +1205,13 @@ the kv cache files include the verbatim prompt cached.

## Backends

The default graph backend is Metal on macOS and CUDA in CUDA builds:
The default graph backend is Metal on macOS, CUDA in CUDA builds, and ROCm in
ROCm builds:

```sh
./ds4 -p "Hello" --metal
./ds4 -p "Hello" --cuda
./ds4 -p "Hello" --rocm
```

On Linux, plain `make` prints the available build targets instead of selecting a
Expand All @@ -1191,6 +1225,34 @@ make cuda CUDA_ARCH=sm_120
make cuda CUDA_ARCH=native
```

For AMD GPUs, use the matching ROCm target. `make cdna` builds a CDNA3/CDNA4
binary with both `gfx942` and `gfx950` code objects and enables the CDNA wave64
MFMA q8 prefill path. Runtime validation for this path has been performed on
CDNA3 / MI300X only; the CDNA4 targets compile the `gfx950` path but still need
validation on MI350X/MI355X hardware. If you only need one product generation,
`make cdna3`, `make mi300x`, and `make mi325x` target `gfx942`; `make cdna4`,
`make mi350x`, and `make mi355x` target `gfx950`. Strix Halo systems use
`make strix-halo` and keep the gfx11 wave32 WMMA q8 path. For another AMD GPU,
set `ROCM_ARCH` explicitly. Multiple targets can be passed as a quoted
space-separated list:

```sh
make cdna
make mi300x
make mi325x
make mi355x
make strix-halo
make rocm ROCM_ARCH=gfx942
make rocm ROCM_ARCH="gfx942 gfx950"
```

The portable synthetic check for the CDNA MFMA Q8 path is:

```sh
make rocm-q8-mfma-correctness
make rocm-q8-mfma-correctness ROCM_Q8_MFMA_ARCH=gfx950 # requires CDNA4 hardware to run
```

There is also a CPU reference/debug path:

```sh
Expand Down
Loading