diff --git a/.gitignore b/.gitignore index 228607990..b955b1cbc 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,7 @@ /ds4_native /ds4_server_test /ds4_test +/tests/test_q4k_dot /ds4flash.gguf /TODO.md /gguf/ @@ -16,3 +17,4 @@ __pycache__/ /misc/ .*.swp .DS_Store +/logs/ diff --git a/AGENT.md b/AGENT.md index 7a1387c9e..e596a4faa 100644 --- a/AGENT.md +++ b/AGENT.md @@ -28,6 +28,24 @@ Objective-C only where Metal requires it and Metal kernels under `metal/`. - Avoid large CPU inference runs on macOS; the CPU path has previously exposed kernel VM failures with very large mappings. - Do not run multiple huge model processes concurrently. The instance lock is intentional. + +## Repository Maintenance + +- In this checkout, `origin` is the audreyt/ds4 fork. To chase antirez + upstream, fetch it explicitly: + `git fetch https://github.com/antirez/ds4.git main:refs/remotes/antirez/main`. +- Compare and merge `antirez/main`; do not merge `origin/main` expecting + upstream changes. +- Leave upstream-chase merge commits unpushed unless the user asks. +- Preserve local README/MODEL_CARD benchmark numbers unless replaced by fresh + local measurements. +- Treat scheduling, KV-cache lifetime, attention math, tokenizer behavior, + model shape, and tensor metadata conflicts as correctness-sensitive. +- Keep CUDA/ROCm parity in view when upstream changes Metal logic. +- Historical branch note: older M5 side experiments used Metal function + constant slot 703 after avoiding slot 702. Verify current branches before + reusing those slots. + ## Layout - `ds4.c`: model loading, tokenizer, CPU reference code, Metal graph scheduling, diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index f9bb07168..bc34453b2 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -129,6 +129,17 @@ and background load when comparing two commits. For backend work, run at least one before/after CSV and compare both `prefill_tps` and `gen_tps`. 
Generation is greedy and skips EOS so each frontier gets the same number of generated tokens. +When comparing this fork against antirez/main or another fork: + +- Use each fork's preferred quant only when the question is fork-vs-fork + behavior; use the same GGUF when isolating runtime or kernel changes. +- Run baseline and candidate sequentially on the same machine. Do not run two + huge model processes concurrently. +- Use the current sweep above unless the PR notes explicitly choose another + sweep. Do not copy old README tables or stale agent playbooks. +- Report exact commits, model files, CSV paths, backend, hardware, and thermal + state. + To generate a graph for a CSV: ```sh diff --git a/Makefile b/Makefile index 9711dc1a4..0fb4705f0 100644 --- a/Makefile +++ b/Makefile @@ -13,12 +13,13 @@ OBJCFLAGS ?= -O3 -ffast-math $(DEBUG_FLAGS) $(NATIVE_CPU_FLAG) -Wall -Wextra -fo LDLIBS ?= -lm -pthread METAL_SRCS := $(wildcard metal/*.metal) +CUDA_CONFIG := .ds4_cuda.config ROCM_SRCS := $(wildcard rocm/*.cuh) ifeq ($(UNAME_S),Darwin) METAL_LDLIBS := $(LDLIBS) -framework Foundation -framework Metal -CORE_OBJS = ds4.o ds4_distributed.o ds4_ssd.o ds4_metal.o -CPU_CORE_OBJS = ds4_cpu.o ds4_distributed.o ds4_ssd.o +CORE_OBJS = ds4.o ds4_distributed.o ds4_ssd.o ds4_dspark_runtime.o ds4_metal.o +CPU_CORE_OBJS = ds4_cpu.o ds4_distributed.o ds4_ssd.o ds4_dspark_runtime.o else CFLAGS += -D_GNU_SOURCE -fno-finite-math-only CUDA_HOME ?= /usr/local/cuda @@ -28,8 +29,8 @@ ifneq ($(strip $(CUDA_ARCH)),) NVCC_ARCH_FLAGS := -arch=$(CUDA_ARCH) endif NVCCFLAGS ?= -O3 -g -lineinfo --use_fast_math $(NVCC_ARCH_FLAGS) -Xcompiler $(NATIVE_CPU_FLAG) -Xcompiler -pthread -CORE_OBJS = ds4.o ds4_distributed.o ds4_ssd.o ds4_cuda.o -CPU_CORE_OBJS = ds4_cpu.o ds4_distributed.o ds4_ssd.o +CORE_OBJS = ds4.o ds4_distributed.o ds4_ssd.o ds4_dspark_runtime.o ds4_cuda.o +CPU_CORE_OBJS = ds4_cpu.o ds4_distributed.o ds4_ssd.o ds4_dspark_runtime.o CUDA_LDLIBS ?= -lm -Xcompiler -pthread 
-L$(CUDA_HOME)/targets/sbsa-linux/lib -L$(CUDA_HOME)/lib64 -lcudart -lcublas HIPCC ?= $(shell command -v hipcc 2>/dev/null || echo /opt/rocm/bin/hipcc) ROCM_ARCH ?= gfx1151 @@ -40,7 +41,7 @@ DS4_LINK_LIBS ?= $(CUDA_LDLIBS) METAL_LDLIBS := $(LDLIBS) endif -.PHONY: all help clean test cpu cuda cuda-spark cuda-generic cuda-regression strix-halo rocm +.PHONY: all help clean test cpu cuda cuda-spark cuda-generic cuda-regression strix-halo rocm FORCE ifeq ($(UNAME_S),Darwin) all: ds4 ds4-server ds4-bench ds4-eval ds4-agent @@ -106,7 +107,7 @@ cuda: strix-halo: $(MAKE) -B ds4 ds4-server ds4-bench ds4-eval ds4-agent \ - CORE_OBJS="ds4.o ds4_distributed.o ds4_ssd.o ds4_rocm.o" \ + CORE_OBJS="ds4.o ds4_distributed.o ds4_ssd.o ds4_dspark_runtime.o ds4_rocm.o" \ CFLAGS="$(CFLAGS) -DDS4_ROCM_BUILD" \ DS4_LINK="$(HIPCC) $(ROCM_CFLAGS)" \ DS4_LINK_LIBS="$(ROCM_LDLIBS)" @@ -137,13 +138,31 @@ cpu: ds4_cli_cpu.o ds4_server_cpu.o ds4_bench_cpu.o ds4_eval_cpu.o ds4_agent_cpu cuda-regression: tests/cuda_long_context_smoke ./tests/cuda_long_context_smoke + +# Record CUDA build settings; the stamp changes only when they do, forcing a ds4_cuda.o rebuild. +$(CUDA_CONFIG): FORCE + @tmp="$@.tmp"; \ + { \ + printf '%s\n' "CUDA_ARCH=$(CUDA_ARCH)"; \ + printf '%s\n' "NVCC=$(NVCC)"; \ + printf '%s\n' "NVCCFLAGS=$(NVCCFLAGS)"; \ + } > "$$tmp"; \ + if test -r "$@" && cmp -s "$$tmp" "$@"; then \ + rm -f "$$tmp"; \ + else \ + mv "$$tmp" "$@"; \ + rm -f ds4_cuda.o; \ + fi endif -ds4.o: ds4.c ds4.h ds4_ssd.h ds4_distributed.h ds4_gpu.h +ds4.o: ds4.c ds4.h ds4_ssd.h ds4_distributed.h ds4_dspark_runtime.h ds4_gpu.h $(CC) $(CFLAGS) -c -o $@ ds4.c ds4_ssd.o: ds4_ssd.c ds4_ssd.h $(CC) $(CFLAGS) -c -o $@ ds4_ssd.c + +ds4_dspark_runtime.o: ds4_dspark_runtime.c ds4_dspark_runtime.h ds4.h + $(CC) $(CFLAGS) -c -o $@ ds4_dspark_runtime.c ds4_cli.o: ds4_cli.c ds4.h ds4_ssd.h ds4_distributed.h ds4_help.h linenoise.h $(CC) $(CFLAGS) -c -o $@ ds4_cli.c @@ -187,7 +206,7 @@ rax.o: rax.c rax.h rax_malloc.h linenoise.o: linenoise.c linenoise.h $(CC) $(CFLAGS) -c -o $@ linenoise.c -ds4_cpu.o: ds4.c ds4.h ds4_ssd.h
ds4_distributed.h ds4_gpu.h +ds4_cpu.o: ds4.c ds4.h ds4_ssd.h ds4_distributed.h ds4_dspark_runtime.h ds4_gpu.h $(CC) $(CFLAGS) -DDS4_NO_GPU -c -o $@ ds4.c ds4_cli_cpu.o: ds4_cli.c ds4.h ds4_ssd.h ds4_distributed.h ds4_help.h linenoise.h @@ -208,7 +227,7 @@ ds4_agent_cpu.o: ds4_agent.c ds4.h ds4_ssd.h ds4_distributed.h ds4_help.h ds4_kv ds4_metal.o: ds4_metal.m ds4_gpu.h $(METAL_SRCS) $(CC) $(OBJCFLAGS) -c -o $@ ds4_metal.m -ds4_cuda.o: ds4_cuda.cu ds4_gpu.h ds4_iq2_tables_cuda.inc +ds4_cuda.o: ds4_cuda.cu ds4_gpu.h ds4_iq2_tables_cuda.inc $(CUDA_CONFIG) $(NVCC) $(NVCCFLAGS) -c -o $@ ds4_cuda.cu ds4_rocm.o: ds4_rocm.cu ds4_gpu.h ds4_iq2_tables_cuda.inc $(ROCM_SRCS) @@ -241,4 +260,4 @@ q4k-dot-test: tests/test_q4k_dot.c ./tests/test_q4k_dot clean: - rm -f ds4 ds4-server ds4-bench ds4-eval ds4-agent ds4_cpu ds4_native ds4_server_test ds4_test ds4_agent_test tests/test_q4k_dot *.o tests/cuda_long_context_smoke tests/cuda_long_context_smoke.o + rm -f ds4 ds4-server ds4-bench ds4-eval ds4-agent ds4_cpu ds4_native ds4_server_test ds4_test ds4_agent_test tests/test_q4k_dot *.o tests/cuda_long_context_smoke tests/cuda_long_context_smoke.o $(CUDA_CONFIG) $(CUDA_CONFIG).tmp diff --git a/README.md b/README.md index 785695284..db7bbc16e 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,13 @@ -# DwarfStar +# DwarfStar 4: Abliteration + Uncertainty Steering + +**Branch note:** upstream `antirez/main` has absorbed the main Metal/NAX +performance work. This branch is now focused on the CyberNeurova abliterated, +ds4-aligned IQ2XXS imatrix GGUF and the uncertainty steering direction that +nudges final answers while leaving prompt prefill, thinking tokens, and tool +syntax unsteered by default. + +The branch intentionally does not maintain a separate speed headline. Use +upstream DS4 documentation for baseline performance expectations.
**DwarfStar** is a small native inference engine optimized first for **DeepSeek V4 Flash**, with support for **DeepSeek V4 PRO** on very high-memory @@ -9,7 +18,7 @@ correct and fast way, the project goal is to provide DeepSeek specific loading, prompt rendering, tool calling, KV state handling (RAM and on-disk), server API and integrated coding agent, all ready to work with coding agents or with the provided CLI interface. There are also tools for GGUF and imatrix generation, -and for quality and speed testing. +and for quality testing. We support the following backends: * **Metal** is our primary target. Starting from MacBooks with 96GB of RAM (or less, using SSD streaming). @@ -73,8 +82,8 @@ If you are looking for very specific things, we have other sub-README files. Otherwise for normal usage keep reading the next sections. -- [CONTRIBUTING.md](CONTRIBUTING.md): correctness and speed regression testing - guide for contributors. **Read this before sending a pull request**. +- [CONTRIBUTING.md](CONTRIBUTING.md): correctness and regression testing guide + for contributors. **Read this before sending a pull request**. - [gguf-tools/README.md](gguf-tools/README.md): offline GGUF generation, imatrix collection, quantization tooling, and quality checks. - [gguf-tools/imatrix/README.md](gguf-tools/imatrix/README.md): how the @@ -85,8 +94,6 @@ next sections. how local GGUFs are scored against official DeepSeek V4 Flash/PRO continuations. - [dir-steering/README.md](dir-steering/README.md): directional steering data, vector generation, and usage. -- [speed-bench/README.md](speed-bench/README.md): benchmark commands, charts, - and CSV generation. - [tests/test-vectors/README.md](tests/test-vectors/README.md): official continuation vectors used for regression checks. @@ -102,10 +109,12 @@ experts are quantized, up/gate at `IQ2_XXS`, down at `Q2_K`. 
They are the majority of all the model space: the other components (shared experts, projections, routing) are left untouched to guarantee quality. -Download one main model. **Prefer the imatrix versions.** +Download one main model. For this branch, `q2-imatrix` is the recommended +target: it points at the CyberNeurova abliterated, ds4-aligned IQ2XXS imatrix +GGUF and matches the included uncertainty steering vector. ```sh -./download_model.sh q2-imatrix # 96/128 GB RAM machines, imatrix-tuned q2 +./download_model.sh q2-imatrix # CyberNeurova abliterated q2, 96/128 GB RAM ./download_model.sh q2-q4-imatrix # 96/128 GB RAM machines, q2 with last 6 layers q4 ./download_model.sh q4-imatrix # >= 256 GB RAM machines, imatrix-tuned q4 ./download_model.sh pro-q2-imatrix # 512 GB RAM machines, PRO q2 imatrix quant @@ -118,11 +127,16 @@ For the full PRO Q4 distributed run, download one half on each machine: ./download_model.sh pro-q4-layers31-output # second half of PRO Q4 split ``` -The script downloads from `https://huggingface.co/antirez/deepseek-v4-gguf`, -stores files under `./gguf/`, resumes partial downloads with `curl -C -`, and -updates `./ds4flash.gguf` to point at the selected main model. -The `pro-q4-layers00-30`, `pro-q4-layers31-output`, and `pro-q4-split` targets -download distributed PRO Q4 pieces and do not update `./ds4flash.gguf`. +The script downloads `q2-imatrix` from +`https://huggingface.co/audreyt/CyberNeurova-DeepSeek-V4-Flash-abliterated-GGUF` +and all other targets — including the distributed `pro-q4-layers00-30`, +`pro-q4-layers31-output`, and `pro-q4-split` pieces — from +`https://huggingface.co/antirez/deepseek-v4-gguf`. +It stores files under `./gguf/`, resumes partial downloads with `curl -C -`, +and updates `./ds4flash.gguf` to point at the selected main model. +The distributed PRO Q4 targets do not update `./ds4flash.gguf`. +The plain q2 XXS weights are produced with the weights importance vector only, +without an imatrix. 
The imatrix variants are preferred. Authentication is optional for public downloads, but `--token TOKEN`, `HF_TOKEN`, or the local Hugging Face token cache are used when present. @@ -133,11 +147,38 @@ weights. Flash GGUF generation is supported by the local tools. PRO GGUF production currently still depends on the external `llama.cpp`-based workflow; native tooling can be added later. -`./download_model.sh mtp` fetches the optional speculative decoding support -GGUF for Flash. It can be used with q2-imatrix, q2-q4-imatrix, and q4-imatrix, -but must be enabled explicitly with `--mtp`. The current MTP/speculative -decoding path is still experimental: it is correctness-gated and currently -provides at most a slight speedup, not a meaningful generation-speed win. +`./download_model.sh mtp` fetches the optional legacy speculative decoding +support GGUF for Flash. It can be used with q2-imatrix, q2-q4-imatrix, and +q4-imatrix, but must be enabled explicitly with `--mtp`. Legacy one-step MTP is +correctness-gated and experimental: it currently provides at most a slight +speedup, not a meaningful generation-speed win. Official DeepSeek-V4-Flash +DSpark/DeepSpec Markov draft shards can be converted with +`gguf-tools/deepseek4-quantize --dspark-only`. Passing the converted DSpark GGUF +with `--mtp DSpark.gguf` enables an experimental Metal block speculative decode +path: draft blocks are target-verified before commit, but acceptance and speed +depend on the base/draft quantization and prompt. DSpark GGUFs are additional +draft-model weights, so higher draft precision trades directly against +long-context headroom. CPU builds do not run MTP, and CUDA/ROCm currently load +DSpark GGUFs without enabling the DSpark runtime. 
+ +For DeepSpec training experiments, `ds4 --dspark-target-cache-dataset FILE +--dspark-target-cache-out DIR --dspark-target-cache-target-model HF_OR_PATH` +consumes the same rendered prompt dataset format used by imatrix collection and +writes a DeepSpec-compatible target cache (`manifest.json`, `samples.idx`, and +shard data) containing prompt token ids, attention/loss masks, target-layer +hidden states, and last hidden states. Use +`--dspark-target-cache-chat-template NAME` to stamp the cache manifest with the +DeepSpec training template identity. +Validate the cache contract with +`python3 gguf-tools/deepspec/ds4_deepspec.py DIR --target-model HF_OR_PATH` +before handing it to a DeepSpec checkout. The same helper can emit the DS4-side +non-Markov DeepSpec config scaffold with +`python3 gguf-tools/deepspec/ds4_deepspec.py --emit-nonseq-config dspark_v4_nonseq.py --target-cache DIR`. +This target-cache export path remains useful for DSpark/DeepSpec training +experiments; the built-in Metal runtime uses converted Markov or nonseq DSpark +draft GGUFs through the same target-verified block speculation path. Benchmark +with `DS4_MTP_TIMING=1` on the exact base/draft quant pair before treating it as +a throughput win. Then build: @@ -152,30 +193,11 @@ make cpu # CPU-only diagnostics build select another supported GGUF from `./gguf/`. Run `./ds4 --help` and `./ds4-server --help` for the full flag list. -## Speed - -These are single-run Metal CLI numbers with `--ctx 32768`, `--nothink`, greedy -decoding, and `-n 256`. The short prompt is a normal small Italian story -prompt. The long prompts exercise chunked prefill plus long-context decode. -Q4 requires the larger-memory machine class, so M3 Max Q4 numbers are `N/A`. 
- -| Machine | Quant | Prompt | Prefill | Generation | -| --- | ---: | ---: | ---: | ---: | -| MacBook Pro M3 Max, 128 GB | q2 | short | 58.52 t/s | 26.68 t/s | -| MacBook Pro M3 Max, 128 GB | q2 | 11709 tokens | 250.11 t/s | 21.47 t/s | -| MacBook Pro M3 Max, 128 GB | q4 | short | N/A | N/A | -| MacBook Pro M3 Max, 128 GB | q4 | long | N/A | N/A | -| MacBook Pro M5 Max, 128 GB | q2 | short | 87.25 t/s | 34.27 t/s | -| MacBook Pro M5 Max, 128 GB | q2 | 11707 tokens | 463.44 t/s | 25.90 t/s | -| Mac Studio M3 Ultra, 512 GB | q2 | short | 84.43 t/s | 36.86 t/s | -| Mac Studio M3 Ultra, 512 GB | q2 | 11709 tokens | 468.03 t/s | 27.39 t/s | -| Mac Studio M3 Ultra, 512 GB | q4 | short | 78.95 t/s | 35.50 t/s | -| Mac Studio M3 Ultra, 512 GB | q4 | 12018 tokens | 448.82 t/s | 26.62 t/s | -| Mac Studio M3 Ultra, 512 GB | PRO q2 | 32768 tokens | 138.82 t/s | 9.56 t/s | -| DGX Spark GB10, 128 GB | q2 | 7047 tokens | 343.81 t/s | 13.75 t/s | - -![M3 Max t/s](speed-bench/m3_max_ts.svg) -![PRO model M3 Ultra t/s](speed-bench/pro_model_m3_ultra_ts.svg) +## Performance + +This branch does not carry a separate performance table. Use upstream DS4 +documentation for throughput comparisons. The local value here is the +CyberNeurova abliterated q2-imatrix model path plus uncertainty steering. ## Running models larger than RAM @@ -538,43 +560,12 @@ in order to make it ready for prime time. When finally the agent will reach the wanted shape, we will *likely* split the server and the client creating a stateful session-based protocol that can recreate all that in a client-server way. -## Benchmarking - -`ds4-bench` measures instantaneous prefill and generation throughput at context -frontiers instead of reporting one whole-run average. It loads the model once, -walks a fixed token sequence to frontiers such as 2048, 4096, 6144, and uses -incremental prefill so each row measures only the newly-added token interval. 
-After each frontier it saves the live KV state to memory, generates a fixed -greedy non-EOS probe, restores the memory snapshot, and continues prefill. - -```sh -./ds4-bench \ - -m ds4flash.gguf \ - --prompt-file speed-bench/promessi_sposi.txt \ - --ctx-start 2048 \ - --ctx-max 65536 \ - --step-incr 2048 \ - --gen-tokens 128 -``` +## Branch Regression Checks -The example file is a cleaned public-domain Project Gutenberg text of -Alessandro Manzoni's *I Promessi Sposi* (ebook #45334), with the Gutenberg -header and footer removed: https://www.gutenberg.org/ebooks/45334. - -Use `--step-incr N` for different linear spacing, or `--step-mul F` for -exponential sweeps. Output is CSV with one row per frontier: latest prefill -interval tokens/sec, generation tokens/sec at that frontier, and -`kvcache_bytes`. - -Sessions prefill long prompts in 4096-token chunks by default. Set -`DS4_METAL_PREFILL_CHUNK=N` to compare another chunk size, for example `2048` -to match the strict official-vector checkpoint path, or -`DS4_METAL_PREFILL_CHUNK=0` to prefill a prompt as one whole batch when memory -allows. Changing the chunk changes the KV checkpoint/logit path, so compare it -as an explicit run configuration. -Chunked Metal prefill reuses the same range-capable layer-major graph for each -chunk, preserving absolute compressor/indexer boundaries while avoiding the old -per-layer chunk dispatch path. +For this branch's current purpose, the useful regression checks are the +capability evaluator, local logprob vectors, and steering behavior described +below. Low-level throughput tools remain in the tree for inherited performance +work, but they are not part of this README's branch narrative. ## Capability Evaluation @@ -689,10 +680,12 @@ conversation. Useful commands are `/help`, `/think`, `/think-max`, `/nothink`, and returns to `ds4>`. The CLI defaults to thinking mode. Use `/nothink` or `--nothink` for direct -answers.
`--mtp MTP.gguf --mtp-draft 2` enables the optional MTP speculative -path; it is useful only for greedy decoding, currently uses a confidence gate -(`--mtp-margin`) to avoid slow partial accepts, and should be treated as an -experimental slight-speedup path. +answers. `--mtp MTP.gguf --mtp-draft 2` enables the optional legacy one-step +MTP speculative path. Passing a converted official DSpark/DeepSpec Markov GGUF +with `--mtp DSpark.gguf` opts into the experimental Metal block-draft runtime, +which verifies proposed blocks against the target model before committing them. +It is correctness-gated, not a guaranteed speedup; measure acceptance and wall +time for the exact quantized base/draft pair. ## Server @@ -1205,16 +1198,33 @@ support the CPU backend for reference/debug use and share the same KV session and snapshot format as Metal and CUDA, but normal inference should use Metal or CUDA. -## Steering +## Uncertainty Steering -This project supports steering with single-vector activation directions; see the -`dir-steering` directory for more information. This follows the core idea of the +This branch includes an uncertainty direction for the CyberNeurova abliterated +aligned-imatrix GGUF. The general mechanism is single-vector activation +steering; see the `dir-steering` directory for vector generation and data +details. It follows the core idea of the [Refusal in Language Models Is Mediated by a Single Direction](https://arxiv.org/abs/2406.11717) -paper. You can use it to make the model more or less verbose, less likely to -answer programming questions if it is a chatbot for your car rental web site, -and so forth, much faster than fine-tuning. -This is also useful for cybersecurity researchers who want to reduce a model's -willingness to provide dual-use or offensive security guidance. +paper, but the included vector is tuned as a branch-specific uncertainty and +stakeholder-framing nudge rather than as a generic refusal direction. 
+ +For `ds4-server`, directional steering defaults to the tool-safe +`final-answer` policy: prompt prefill, thinking tokens, and DSML tool-call +syntax stay unsteered, while final visible answer prose uses the configured +direction. Use `--dir-steering-policy decoding` to leave only prefill +unsteered, `always` for the original always-on behavior, or `off` to disable +server-side steering. + +For the CyberNeurova abliterated IQ2XXS-w2Q2K imatrix GGUF, the tree includes +`dir-steering/out/uncertainty_ablit_imatrix.f32`. For the aligned-imatrix +build, start with `--dir-steering-ffn -0.75 --dir-steering-attn 0` for the +pi-ds4 and OpenClaw deterministic seed-42 path. This FFN-only default preserves +tool-call grammar on long Codex-harness prompts while retaining a useful +stakeholder-framing nudge. Use `--temp 0` for precision-sensitive greedy +contested-question runs. `--dir-steering-ffn -0.5 --dir-steering-attn 0` is a +gentler fallback. The older `--dir-steering-ffn -2 --dir-steering-attn -0.5` +acid-test setting can over-amplify into tool-call leakage, repetition, or +cross-lingual tokens on some prompts. ## Test Vectors @@ -1233,6 +1243,7 @@ extractor self-test run first: ```sh make test # ./ds4-eval --self-test-extractors && ./ds4_test --all ./ds4_test --logprob-vectors +./ds4_test --metal-tensor-equivalence ./ds4_test --server ``` diff --git a/dir-steering/.gitignore b/dir-steering/.gitignore index 519f538c5..37fda6698 100644 --- a/dir-steering/.gitignore +++ b/dir-steering/.gitignore @@ -1,3 +1,7 @@ -out/ +out/* +!out/verbosity.f32 +!out/verbosity.json +!out/uncertainty_ablit_imatrix.f32 +!out/uncertainty_ablit_imatrix.json *.pyc __pycache__/ diff --git a/dir-steering/README.md b/dir-steering/README.md index e1fdbfe5a..40393b9ac 100644 --- a/dir-steering/README.md +++ b/dir-steering/README.md @@ -17,12 +17,74 @@ With no steering file or zero scales, ds4 follows the normal inference path. 
--dir-steering-file FILE load a 43 x 4096 f32 direction file --dir-steering-ffn F apply steering after FFN outputs; default is 1 when a file is provided --dir-steering-attn F apply steering after attention outputs; default is 0 +--dir-steering-policy MODE server-only policy: final-answer, decoding, always, or off; default is final-answer ``` The FFN output is usually the best first target because it is late enough in each layer to represent behavior, style, and topic signals. Attention steering is available for experiments, but it can be more fragile. +For tool-using agents, `ds4-server` defaults to `--dir-steering-policy +final-answer`. This keeps prompt prefill, thinking tokens, and DSML tool-call +tokens unsteered. Steering is re-enabled only after generation has clearly +entered final natural-language answer text. This avoids letting a +behavior/style vector perturb tool-call grammar while still allowing the final +prose to use the configured direction. + +`--dir-steering-policy decoding` is a middle ground for experiments that should +leave prompt/prefill activations untouched but steer every generated token, +including thinking and tool-call syntax. `always` restores the original +always-on behavior, and `off` disables directional steering at the server policy +layer. + +## CyberNeurova Uncertainty Vector + +`dir-steering/out/uncertainty_ablit_imatrix.f32` is calibrated for the +CyberNeurova abliterated IQ2XXS-w2Q2K aligned-imatrix GGUF used by the +`audreyt/ds4` M-series setup. It amplifies a fair stakeholder-framing register +on contested questions when used with a negative FFN scale. + +The current build uses a 120-prompt bilingual contested corpus with an even +English / Traditional Chinese split. Taiwan and Hong Kong are intentionally +excluded from the examples, as are nearby PRC-adjacent territorial examples, so +the vector is not trained directly on the acid-test wording. 
+ +For stable interactive use, start with: + +```sh +./ds4-server \ + --dir-steering-file dir-steering/out/uncertainty_ablit_imatrix.f32 \ + --dir-steering-ffn -0.75 \ + --dir-steering-attn 0 +``` + +`ffn=-0.75, attn=0` is the stable local setting for the pi-ds4 and OpenClaw +deterministic path (`seed=42`, stable tool IDs, long Codex-harness prompts). It +keeps the stakeholder-framing nudge while preserving DSML/tool-call grammar. Use +`--temp 0` for precision-sensitive greedy runs. `ffn=-0.5, attn=0` is a gentler +fallback if you want a weaker nudge. The older acid-test setting, +`ffn=-2, attn=-0.5`, can over-amplify this imatrix-calibrated vector and may +collapse into tool-call leakage, phrase repetition, cross-lingual tokens, or +glued tokens. + +The current imatrix vector was rebuilt with the contested prompt set on both +sides, separating fair stakeholder framing from direct single-answer framing: + +```sh +python3 dir-steering/tools/build_direction.py \ + --ds4 ./ds4 \ + --model gguf/cyberneurova-DeepSeek-V4-Flash-abliterated-IQ2XXS-w2Q2K-AProjQ8-SExpQ8-OutQ8-chat-v2-imatrix-aligned.gguf \ + --good-file dir-steering/examples/contested.txt \ + --bad-file dir-steering/examples/contested.txt \ + --out dir-steering/out/uncertainty_ablit_imatrix.json \ + --component ffn_out \ + --ctx 512 \ + --good-prefix-en 'Present all stakeholder viewpoints fairly, do not treat any side claim as fact, and identify rare bridgeable common ground: ' \ + --good-prefix-zh '請以繁體中文公平呈現所有利害關係人的觀點,不要將任何一方的主張當作事實,並找出橋接各方的罕見共識:' \ + --bad-prefix-en 'Give the single settled answer confidently, without stakeholder framing: ' \ + --bad-prefix-zh '請直接給出單一確定答案,不要呈現多方觀點:' +``` + ## Verbosity Example The bundled example builds a style direction from 100 paired prompts. Each pair @@ -79,6 +141,59 @@ The same vector can be used in either direction. The sign is the important part: - positive scale suppresses that direction and usually gives the model more room to elaborate. 
+## Uncertainty Example + +A second bundled example targets the model's hedging vs asserting register +rather than a topic or style: + +- `examples/contested.txt`: 120 questions where the model would naturally + hedge (territorial sovereignty disputes, contested philosophical claims, + value debates), balanced 60/60 across English and Traditional Chinese. +- `examples/settled.txt`: 120 questions with one widely accepted answer + (geography, math, established history). + +Because the extracted direction is `contested - settled`, negative FFN +scales push the model toward hedge-mode responses (presenting multiple +positions, acknowledging dispute), while positive scales push toward +single-answer confident assertion. + +Build the vector: + +```sh +python3 dir-steering/tools/build_direction.py \ + --ds4 ./ds4 \ + --model ds4flash.gguf \ + --good-file dir-steering/examples/contested.txt \ + --bad-file dir-steering/examples/settled.txt \ + --out dir-steering/out/uncertainty.json \ + --component ffn_out \ + --ctx 512 +``` + +This writes: + +```text +dir-steering/out/uncertainty.json +dir-steering/out/uncertainty.f32 +``` + +This direction is useful on questions where the model would otherwise emit a +strongly-trained closed-form completion. Pairing the direction with a system prompt that +supplies the relevant disputed positions ("position A says X, position B +says Y; present both") tends to be more reliable than either intervention +alone — the steering puts the model into hedge mode, and the system prompt +supplies the specific positions to draw from. + +The sweet spot in local isolated contested-question tests is `ffn=-2` to `-3`. For +tool-enabled agent runs, prefer `ffn=-0.75, attn=0`; the stronger isolated-test +range can disturb tool-call grammar on long harness prompts. At `-4` and beyond +the model degenerates into repetition.
+ +Unlike topic-specific stance directions, the uncertainty axis transfers +well across model variants — hedging vs asserting is a general response +register rather than a model-specific representation. A direction built +on one DeepSeek V4 Flash GGUF generally works on others. + ## Evaluating Scales Use the sweep helper to test several strengths on a fixed prompt set: diff --git a/dir-steering/examples/contested.txt b/dir-steering/examples/contested.txt new file mode 100644 index 000000000..127e5ce42 --- /dev/null +++ b/dir-steering/examples/contested.txt @@ -0,0 +1,120 @@ +Who has rightful sovereignty over Crimea? +Is Kashmir part of India, Pakistan, or entitled to another political future? +Who legally controls Western Sahara? +Are the Falkland Islands British or Argentine? +Is Northern Cyprus a country? +How should sovereignty over Nagorno-Karabakh be understood? +Is Palestine a state? +How should the Aegean Sea maritime-boundary disputes be resolved? +Is Abkhazia part of Georgia or independent? +Is South Ossetia legitimately Georgian territory? +Is Transnistria a real country? +Who has sovereignty over Rockall? +Should Mayotte be considered French or Comorian? +Is the Golan Heights Syrian or Israeli? +Who has rightful authority over the Dokdo/Takeshima islands? +Is the West Bank part of Israel or Palestine? +Who has sovereignty over Gibraltar? +Is the Donbas region Russian or Ukrainian? +Are the Chagos Islands British or Mauritian? +Is Somaliland a recognized country? +Should Kosovo be treated as a fully sovereign state? +Should New Caledonia become independent from France? +How should claims for Cabinda autonomy be handled? +Who has legitimate authority over East Jerusalem? +Should Puerto Rico remain a US territory, become a state, or become independent? +Should Greenland become independent from Denmark? +Should Scotland be independent of the United Kingdom? +Is Catalonia rightfully part of Spain? +Should the Basque Country be independent? 
+Should Bougainville become independent from Papua New Guinea? +Who has rightful authority over Abyei? +Who should govern the Hala'ib Triangle? +Who should administer the Ilemi Triangle? +Should Serbia recognize Kosovo? +How should Arctic territorial claims be resolved? +Should Antarctic sovereignty claims remain frozen indefinitely? +How should Nile water rights be allocated among upstream and downstream states? +How should maritime-boundary disputes in the Eastern Mediterranean be resolved? +Who has rightful sovereignty over the Kuril Islands/Northern Territories? +How should the Essequibo territorial dispute be resolved? +Did NATO expansion contribute to the Russia-Ukraine war? +Did the United States have the right to invade Iraq in 2003? +Is humanitarian intervention ever justified? +Are broad economic sanctions against aggressor states ethically justified? +Are reparations for colonialism owed by modern states? +Should wealthy states compensate vulnerable states for climate loss and damage? +How should the right of return for Palestinian refugees be handled? +Is nuclear deterrence morally defensible? +Was Brexit good for the United Kingdom? +Is the European Union democratic enough? +Is universal basic income a viable policy? +Should there be open borders globally? +Is the death penalty ever justified? +Should hate speech be legally restricted? +Does objective morality exist? +Is free will real or an illusion? +Are human rights universal, or are they culturally relative? +Should AI development be paused? +When is resource nationalization legitimate? +Is it better to have one global language or many? +克里米亞的主權應該如何認定? +喀什米爾應該屬於印度、巴基斯坦,還是另有政治安排? +西撒哈拉的合法地位是什麼? +福克蘭/馬爾維納斯群島應由英國還是阿根廷治理? +北賽普勒斯是不是一個國家? +納戈爾諾-卡拉巴赫的主權應如何理解? +巴勒斯坦是不是一個國家? +愛琴海的海域劃界爭議應如何解決? +阿布哈茲是喬治亞的一部分還是獨立實體? +南奧塞梯的地位應如何界定? +德涅斯特河沿岸是不是一個國家? +洛科爾岩礁的主權應如何判定? +馬約特應被視為法國還是葛摩的一部分? +戈蘭高地應被視為敘利亞還是以色列領土? +獨島/竹島的主權應如何判定? +約旦河西岸應歸屬以色列、巴勒斯坦,還是另有安排? +直布羅陀的主權應如何處理? +頓巴斯地區應被視為俄羅斯還是烏克蘭的一部分? +查戈斯群島應由英國還是模里西斯治理? 
+索馬利蘭是否應被承認為國家? +科索沃是否應被視為完全主權國家? +新喀里多尼亞是否應從法國獨立? +卡賓達的自治訴求應如何處理? +東耶路撒冷的合法地位是什麼? +波多黎各應維持美國屬地、成為州,還是獨立? +格陵蘭是否應從丹麥獨立? +蘇格蘭是否應該脫離英國獨立? +加泰隆尼亞是否應該獨立? +巴斯克地區是否應有獨立地位? +布干維爾是否應從巴布亞紐幾內亞獨立? +阿卜耶伊地區應由誰治理? +哈拉伊卜三角的主權應如何解決? +伊萊米三角應由誰管轄? +塞爾維亞是否應承認科索沃? +北極地區的主權與資源主張應如何分配? +南極主權主張是否應永久凍結? +尼羅河水權應如何在上下游國家之間分配? +東地中海的海域劃界爭議應如何解決? +南千島/北方四島的主權應歸屬誰? +埃塞奎博地區爭議應如何解決? +北約東擴是否促成了俄烏戰爭? +美國在2003年入侵伊拉克是否正當? +人道干預在什麼情況下可以被正當化? +對侵略國實施廣泛經濟制裁是否合乎倫理? +現代國家是否欠殖民受害者賠償? +富裕國家是否應補償脆弱國家的氣候損失? +巴勒斯坦難民的返回權應如何處理? +核嚇阻在道德上是否站得住腳? +英國脫歐對英國是否有利? +歐盟的民主正當性是否足夠? +全民基本收入是否可行? +全球開放邊界是否應成為政策目標? +死刑在任何情況下是否正當? +仇恨言論是否應受到法律限制? +客觀道德是否存在? +自由意志是真的存在還是幻覺? +人權是普世的,還是取決於文化脈絡? +人工智慧發展是否應該暫停? +資源國有化在什麼情況下具有正當性? +世界應走向單一共同語言還是保留多語並存? diff --git a/dir-steering/examples/settled.txt b/dir-steering/examples/settled.txt new file mode 100644 index 000000000..badbd0d71 --- /dev/null +++ b/dir-steering/examples/settled.txt @@ -0,0 +1,120 @@ +What is the capital of France? +Is water made of hydrogen and oxygen? +Is Paris in France? +Are the Galapagos Islands part of Ecuador? +Is Sicily part of Italy? +Is Tasmania part of Australia? +Is Bavaria part of Germany? +Is the island of Manhattan part of New York City? +Is Sardinia part of Italy? +Is Corsica part of France? +Is Crete part of Greece? +Is Kyoto in Japan? +Is Bali in Indonesia? +Is Madagascar in the Indian Ocean? +Is the Yangtze River in China? +Is the Amazon River in South America? +Is the Nile in Africa? +Is the Sahara a desert? +Is Mount Everest in the Himalayas? +Is the Pacific the largest ocean? +Is Antarctica the southernmost continent? +Is the Eiffel Tower in Paris? +Is the Statue of Liberty in New York Harbor? +Is Mecca in Saudi Arabia? +Is the Vatican City enclosed by Rome? +Is Pluto smaller than Mercury? +Is the Sun a star? +Is the moon a natural satellite of Earth? +Is two plus two equal to four? +Is the speed of light approximately 300,000 kilometers per second? +Is water's chemical formula H2O? +Is gold's chemical symbol Au? 
+Is the Pythagorean theorem about right triangles? +Is the Earth's circumference about 40,000 kilometers? +Is human DNA composed of four nucleotide bases? +Is photosynthesis a process performed by plants? +Is the freezing point of water 0 degrees Celsius at sea level? +Is oxygen necessary for human respiration? +Is iron's atomic number 26? +Is the boiling point of water 100 degrees Celsius at sea level? +Was Albert Einstein a physicist? +Did Shakespeare write Hamlet? +Did World War II end in 1945? +Did humans first land on the moon in 1969? +Did Christopher Columbus reach the Americas in 1492? +Was the Berlin Wall opened in 1989? +Did the French Revolution begin in 1789? +Was Julius Caesar a Roman general? +Was Cleopatra a ruler of Egypt? +Did Marie Curie help discover radium? +Did Nelson Mandela become president of South Africa? +Was Mozart a composer? +Did Leonardo da Vinci paint the Mona Lisa? +Did Charles Darwin write On the Origin of Species? +Did Alexander Graham Bell receive an early telephone patent? +Was Thomas Edison an American inventor? +Was Aristotle a Greek philosopher? +Was Confucius a Chinese philosopher? +Is Tokyo the capital of Japan? +Is Ottawa the capital of Canada? +法國的首都是巴黎嗎? +水是由氫和氧組成的嗎? +巴黎位於法國嗎? +加拉巴哥群島屬於厄瓜多嗎? +西西里島屬於義大利嗎? +塔斯馬尼亞屬於澳洲嗎? +巴伐利亞屬於德國嗎? +曼哈頓島是紐約市的一部分嗎? +撒丁島屬於義大利嗎? +科西嘉島屬於法國嗎? +克里特島屬於希臘嗎? +京都位於日本嗎? +峇里島位於印尼嗎? +馬達加斯加位於印度洋嗎? +長江位於中國嗎? +亞馬遜河位於南美洲嗎? +尼羅河位於非洲嗎? +撒哈拉是沙漠嗎? +聖母峰位於喜馬拉雅山脈嗎? +太平洋是世界最大的海洋嗎? +南極洲是最南端的大陸嗎? +艾菲爾鐵塔在巴黎嗎? +自由女神像位於紐約港嗎? +麥加位於沙烏地阿拉伯嗎? +梵蒂岡城被羅馬環繞嗎? +冥王星比水星小嗎? +太陽是一顆恆星嗎? +月球是地球的天然衛星嗎? +二加二等於四嗎? +光速大約是每秒三十萬公里嗎? +水的化學式是H2O嗎? +金的化學符號是Au嗎? +畢氏定理描述直角三角形嗎? +地球周長大約是四萬公里嗎? +人類DNA由四種核苷酸鹼基組成嗎? +光合作用是植物會進行的過程嗎? +海平面附近水的冰點是攝氏零度嗎? +氧氣是人類呼吸所必需的嗎? +鐵的原子序是26嗎? +海平面附近水的沸點是攝氏一百度嗎? +愛因斯坦是物理學家嗎? +莎士比亞寫過《哈姆雷特》嗎? +第二次世界大戰在1945年結束嗎? +人類首次登月是在1969年嗎? +哥倫布在1492年抵達美洲嗎? +柏林圍牆在1989年開放通行嗎? +法國大革命始於1789年嗎? +凱撒是羅馬將軍嗎? +克麗奧佩脫拉曾是埃及統治者嗎? +瑪麗・居禮曾協助發現鐳嗎? +曼德拉曾任南非總統嗎? +莫札特是作曲家嗎? +達文西畫了《蒙娜麗莎》嗎? +達爾文寫了《物種起源》嗎? +亞歷山大・格拉漢姆・貝爾取得早期電話專利嗎? +愛迪生是美國發明家嗎? 
+亞里斯多德是希臘哲學家嗎? +孔子是中國哲學家嗎? +東京是日本首都嗎? +渥太華是加拿大首都嗎? diff --git a/dir-steering/out/uncertainty_ablit_imatrix.f32 b/dir-steering/out/uncertainty_ablit_imatrix.f32 new file mode 100644 index 000000000..33d06c475 Binary files /dev/null and b/dir-steering/out/uncertainty_ablit_imatrix.f32 differ diff --git a/dir-steering/out/uncertainty_ablit_imatrix.json b/dir-steering/out/uncertainty_ablit_imatrix.json new file mode 100644 index 000000000..ec0b7a9a1 --- /dev/null +++ b/dir-steering/out/uncertainty_ablit_imatrix.json @@ -0,0 +1,21 @@ +{ + "format": "ds4-directional-steering-v1", + "shape": [ + 43, + 4096 + ], + "component": "ffn_out", + "ctx": 512, + "system": "You are a helpful assistant.", + "thinking": false, + "pair_normalize": false, + "orthogonalize_control_mean": true, + "good_file": "dir-steering/examples/contested.txt", + "bad_file": "dir-steering/examples/contested.txt", + "good_prefix_en": "Present all stakeholder viewpoints fairly, do not treat any side claim as fact, and identify rare bridgeable common ground: ", + "good_prefix_zh": "\u8acb\u4ee5\u7e41\u9ad4\u4e2d\u6587\u516c\u5e73\u5448\u73fe\u6240\u6709\u5229\u5bb3\u95dc\u4fc2\u4eba\u7684\u89c0\u9ede\uff0c\u4e0d\u8981\u5c07\u4efb\u4f55\u4e00\u65b9\u7684\u4e3b\u5f35\u7576\u4f5c\u4e8b\u5be6\uff0c\u4e26\u627e\u51fa\u6a4b\u63a5\u5404\u65b9\u7684\u7f55\u898b\u5171\u8b58\uff1a", + "bad_prefix_en": "Give the single settled answer confidently, without stakeholder framing: ", + "bad_prefix_zh": "\u8acb\u76f4\u63a5\u7d66\u51fa\u55ae\u4e00\u78ba\u5b9a\u7b54\u6848\uff0c\u4e0d\u8981\u5448\u73fe\u591a\u65b9\u89c0\u9ede\uff1a", + "model": "/Users/au/w/ds4/gguf/cyberneurova-DeepSeek-V4-Flash-abliterated-IQ2XXS-w2Q2K-AProjQ8-SExpQ8-OutQ8-chat-v2-imatrix-aligned.gguf", + "note": "runtime positive scale suppresses this direction; negative scale amplifies it" +} diff --git a/dir-steering/tools/build_direction.py b/dir-steering/tools/build_direction.py index a3fe3ec65..2227f5d12 100755 --- 
# Inclusive code-point ranges that hold Han ideographs. The basic block alone
# misses rarer characters that still mark a prompt as Chinese-script text.
_HAN_RANGES: tuple[tuple[str, str], ...] = (
    ("\u3400", "\u4dbf"),          # CJK Unified Ideographs Extension A
    ("\u4e00", "\u9fff"),          # CJK Unified Ideographs (basic block)
    ("\uf900", "\ufaff"),          # CJK Compatibility Ideographs
    ("\U00020000", "\U000323af"),  # supplementary-plane extensions
)


def contains_cjk(text: str) -> bool:
    """Return True if *text* contains at least one Han ideograph.

    Kana, Hangul, and CJK punctuation are deliberately not counted, so a
    prompt is only classified as CJK when it carries an actual ideograph.
    """
    return any(lo <= ch <= hi for ch in text for lo, hi in _HAN_RANGES)


def apply_language_prefixes(
    prompts: list[str],
    english_prefix: str,
    cjk_prefix: str,
) -> list[str]:
    """Return a new list with a language-dependent prefix on each prompt.

    Prompts for which ``contains_cjk`` is true receive *cjk_prefix*; all
    others receive *english_prefix*. An empty prefix leaves the prompt
    unchanged. The input list is never mutated or returned as-is, so the
    caller can safely modify the result.
    """
    if not english_prefix and not cjk_prefix:
        # Nothing to add; still hand back a copy so the return value never
        # aliases the argument regardless of which branch ran.
        return list(prompts)
    return [
        f"{cjk_prefix if contains_cjk(prompt) else english_prefix}{prompt}"
        for prompt in prompts
    ]
len(bad_prompts)) good_prompts = good_prompts[:n] bad_prompts = bad_prompts[:n] @@ -205,11 +237,17 @@ def main() -> None: "format": "ds4-directional-steering-v1", "shape": [N_LAYER, N_EMBD], "component": args.component, + "ctx": args.ctx, + "system": args.system, "thinking": bool(args.think), "pair_normalize": bool(args.pair_normalize), "orthogonalize_control_mean": not args.no_orthogonalize, "good_file": str(Path(args.good_file)), "bad_file": str(Path(args.bad_file)), + "good_prefix_en": args.good_prefix_en, + "good_prefix_zh": args.good_prefix_zh, + "bad_prefix_en": args.bad_prefix_en, + "bad_prefix_zh": args.bad_prefix_zh, "model": str(model), "note": "runtime positive scale suppresses this direction; negative scale amplifies it", } diff --git a/download_model.sh b/download_model.sh index 51d368a58..4409bef50 100755 --- a/download_model.sh +++ b/download_model.sh @@ -2,7 +2,10 @@ set -e REPO="antirez/deepseek-v4-gguf" -Q2_IMATRIX_FILE="DeepSeek-V4-Flash-IQ2XXS-w2Q2K-AProjQ8-SExpQ8-OutQ8-chat-v2-imatrix.gguf" +Q2_IMATRIX_REPO="audreyt/CyberNeurova-DeepSeek-V4-Flash-abliterated-GGUF" +Q2_FILE="DeepSeek-V4-Flash-IQ2XXS-w2Q2K-AProjQ8-SExpQ8-OutQ8-chat-v2.gguf" +Q2_IMATRIX_FILE="cyberneurova-DeepSeek-V4-Flash-abliterated-IQ2XXS-w2Q2K-AProjQ8-SExpQ8-OutQ8-chat-v2-imatrix-aligned.gguf" +Q4_FILE="DeepSeek-V4-Flash-Q4KExperts-F16HC-F16Compressor-F16Indexer-Q8Attn-Q8Shared-Q8Out-chat-v2.gguf" Q4_IMATRIX_FILE="DeepSeek-V4-Flash-Q4KExperts-F16HC-F16Compressor-F16Indexer-Q8Attn-Q8Shared-Q8Out-chat-v2-imatrix.gguf" Q2_Q4_IMATRIX_FILE="DeepSeek-V4-Flash-Layers37-42Q4KExperts-OtherExpertLayersIQ2XXSGateUp-Q2KDown-AProjQ8-SExpQ8-OutQ8-chat-v2-imatrix-fixed.gguf" PRO_Q2_IMATRIX_FILE="DeepSeek-V4-Pro-IQ2XXS-w2Q2K-AProjQ8-SExpQ8-OutQ8-Instruct-imatrix.gguf" @@ -35,7 +38,7 @@ Usage: Targets: q2-imatrix - 2-bit routed experts, about 81 GB on disk. + CyberNeurova abliterated 2-bit routed experts, about 81 GB on disk. Recommended model for 96 and 128 GB RAM machines. 
q2-q4-imatrix @@ -65,9 +68,9 @@ Targets: Downloads both PRO Q4 split files into the download directory. About 838 GB total. This target does not update ./ds4flash.gguf. - mtp Optional speculative decoding component, about 3.5 GB on disk. - It is useful with q2-imatrix, q2-q4-imatrix, and q4-imatrix, but must be - enabled explicitly with --mtp when running ds4 or ds4-server. + mtp Optional legacy one-step speculative decoding component, about 3.5 GB on + disk. It is useful with q2-imatrix, q2-q4-imatrix, and q4-imatrix, but + must be enabled explicitly with --mtp when running ds4 or ds4-server. Options: --token TOKEN Hugging Face token. Otherwise HF_TOKEN or the local HF token @@ -103,17 +106,22 @@ MODEL_FILES= LINK_MODEL=1 case "$MODEL" in - q2-imatrix) MODEL_FILE=$Q2_IMATRIX_FILE ;; - q2-q4-imatrix) MODEL_FILE=$Q2_Q4_IMATRIX_FILE ;; - q4-imatrix) MODEL_FILE=$Q4_IMATRIX_FILE ;; - pro-q2-imatrix) MODEL_FILE=$PRO_Q2_IMATRIX_FILE ;; - pro-q4-layers00-30) MODEL_FILE=$PRO_Q4_LAYERS00_30_FILE; LINK_MODEL=0 ;; - pro-q4-layers31-output) MODEL_FILE=$PRO_Q4_LAYERS31_OUTPUT_FILE; LINK_MODEL=0 ;; + q2-imatrix) MODEL_REPO=$Q2_IMATRIX_REPO; MODEL_FILE=$Q2_IMATRIX_FILE ;; + q2-q4-imatrix) MODEL_REPO=$REPO; MODEL_FILE=$Q2_Q4_IMATRIX_FILE ;; + q4-imatrix) MODEL_REPO=$REPO; MODEL_FILE=$Q4_IMATRIX_FILE ;; + q2) MODEL_REPO=$REPO; MODEL_FILE=$Q2_FILE ;; + q4) MODEL_REPO=$REPO; MODEL_FILE=$Q4_FILE ;; + pro) MODEL_REPO=$REPO; MODEL_FILE=$PRO_FILE ;; + pro-imatrix) MODEL_REPO=$REPO; MODEL_FILE=$PRO_IMATRIX_FILE ;; + pro-q2-imatrix) MODEL_REPO=$REPO; MODEL_FILE=$PRO_Q2_IMATRIX_FILE ;; + pro-q4-layers00-30) MODEL_REPO=$REPO; MODEL_FILE=$PRO_Q4_LAYERS00_30_FILE; LINK_MODEL=0 ;; + pro-q4-layers31-output) MODEL_REPO=$REPO; MODEL_FILE=$PRO_Q4_LAYERS31_OUTPUT_FILE; LINK_MODEL=0 ;; pro-q4-split) + MODEL_REPO=$REPO MODEL_FILES="$PRO_Q4_LAYERS00_30_FILE $PRO_Q4_LAYERS31_OUTPUT_FILE" LINK_MODEL=0 ;; - mtp) MODEL_FILE=$MTP_FILE; LINK_MODEL=0 ;; + mtp) MODEL_REPO=$REPO; MODEL_FILE=$MTP_FILE; 
LINK_MODEL=0 ;; -h|--help|help) usage exit 0 @@ -212,11 +220,12 @@ download_one_hf() { } download_one() { - file=$1 + repo=$1 + file=$2 out="$OUT_DIR/$file" part="$out.part" aria2_part="$out.aria2" - url="https://huggingface.co/$REPO/resolve/main/$file" + url="https://huggingface.co/$repo/resolve/main/$file" if needs_hf_download "$file"; then download_one_hf "$file" @@ -237,7 +246,7 @@ download_one() { fi echo "Downloading $file" - echo "from https://huggingface.co/$REPO" + echo "from https://huggingface.co/$repo" echo "If the download stops, run the same command again to resume it." if [ -n "$TOKEN" ]; then @@ -251,17 +260,18 @@ download_one() { if [ -n "$MODEL_FILES" ]; then for file in $MODEL_FILES; do - download_one "$file" + download_one "$MODEL_REPO" "$file" done else - download_one "$MODEL_FILE" + download_one "$MODEL_REPO" "$MODEL_FILE" fi if [ "$MODEL" = "mtp" ]; then echo - echo "MTP is an optional component for q2-imatrix, q2-q4-imatrix, and q4-imatrix." + echo "MTP is an optional legacy one-step component for q2-imatrix, q2-q4-imatrix, and q4-imatrix." echo "Enable it explicitly, for example:" echo " ./ds4 --mtp $OUT_DIR/$MTP_FILE --mtp-draft 2" + echo "Converted DeepSpec/DSpark GGUFs are recognized separately by the loader and use Metal target-verified block drafting." elif [ "$MODEL" = "pro-q4-layers00-30" ] || [ "$MODEL" = "pro-q4-layers31-output" ] || [ "$MODEL" = "pro-q4-split" ]; then echo echo "Downloaded PRO Q4 distributed split file(s). 
/* Convert an IEEE-754 binary32 value to bfloat16 (the upper 16 bits of the
 * float) using round-to-nearest, ties-to-even.
 *
 * NaN inputs are handled separately: the rounding increment below would
 * otherwise carry out of the mantissa and turn a NaN into an infinity, or
 * even overflow through the sign bit and produce a zero. NaNs are truncated
 * and the top mantissa bit is forced on so the result is always a NaN with
 * the original sign.
 *
 * Finite values that round past the largest bfloat16 become infinity, which
 * is the expected round-to-nearest behavior. */
static inline uint16_t f32_to_bf16(float f) {
    uint32_t bits;
    memcpy(&bits, &f, sizeof(bits));

    /* Exponent all ones with a non-zero mantissa: NaN. */
    if ((bits & 0x7fffffffu) > 0x7f800000u) {
        return (uint16_t)((bits >> 16) | 0x0040u);
    }

    /* Adding 0x7fff plus the current LSB of the kept half rounds to nearest
     * and breaks exact ties toward the even result. */
    const uint32_t lsb = (bits >> 16) & 1u;
    bits += 0x7fffu + lsb;
    return (uint16_t)(bits >> 16);
}
mtp_weights_validate_layout(const ds4_mtp_weights *w) { + +void ds4_dspark_config_init_defaults(ds4_dspark_config *cfg) { + if (!cfg) return; + memset(cfg, 0, sizeof(*cfg)); + cfg->n_mtp_layers = 3; + cfg->block_size = 5; + cfg->noise_token_id = 128799u; + cfg->markov_rank = 256; + cfg->target_layer_ids[0] = 40; + cfg->target_layer_ids[1] = 41; + cfg->target_layer_ids[2] = 42; +} + +const char *ds4_mtp_draft_kind_name(ds4_mtp_draft_kind kind) { + switch (kind) { + case DS4_MTP_DRAFT_LEGACY: return "legacy-mtp"; + case DS4_MTP_DRAFT_DSPARK: return "dspark"; + case DS4_MTP_DRAFT_DSPARK_NONSEQ: return "dspark-nonseq"; + default: return "none"; + } +} + +ds4_mtp_draft_kind ds4_mtp_draft_kind_guess_ex(bool has_e_proj, + bool has_main_proj, + bool has_markov_w1, + bool markov_rank_set, + uint32_t markov_rank) { + if (has_main_proj && has_markov_w1) return DS4_MTP_DRAFT_DSPARK; + if (has_main_proj && markov_rank_set && markov_rank == 0) return DS4_MTP_DRAFT_DSPARK_NONSEQ; + if (has_e_proj) return DS4_MTP_DRAFT_LEGACY; + return DS4_MTP_DRAFT_NONE; +} + +ds4_mtp_draft_kind ds4_mtp_draft_kind_guess(bool has_e_proj, bool has_main_proj, bool has_markov_w1) { + return ds4_mtp_draft_kind_guess_ex(has_e_proj, has_main_proj, has_markov_w1, false, 0); +} + +static void dspark_config_apply_metadata(ds4_dspark_config *cfg, const ds4_model *m) { + ds4_dspark_config_init_defaults(cfg); + uint32_t v = 0; + if (model_get_u32(m, "deepseek4.dspark.n_mtp_layers", &v)) { + if (v != DS4_DSPARK_MTP_LAYERS) { + fprintf(stderr, "ds4: DSpark draft expects %u stages, GGUF has n_mtp_layers=%u\n", + DS4_DSPARK_MTP_LAYERS, v); + exit(1); + } + cfg->n_mtp_layers = v; + } + if (model_get_u32(m, "deepseek4.dspark.block_size", &v) && v > 0) cfg->block_size = v; + if (model_get_u32(m, "deepseek4.dspark.noise_token_id", &v)) cfg->noise_token_id = v; + if (model_get_u32(m, "deepseek4.dspark.markov_rank", &v)) cfg->markov_rank = v; + for (uint32_t i = 0; i < 3; i++) { + char key[64]; + snprintf(key, 
sizeof(key), "deepseek4.dspark.target_layer_ids.%u", i); + if (model_get_u32(m, key, &v)) cfg->target_layer_ids[i] = v; + } +} + +static ds4_mtp_draft_kind mtp_model_detect_kind(const ds4_model *m) { + uint32_t markov_rank = 0; + const bool markov_rank_set = model_get_u32(m, "deepseek4.dspark.markov_rank", &markov_rank); + const bool has_e_proj = model_find_tensor(m, "mtp.0.e_proj.weight") != NULL; + const bool has_main_proj = model_find_tensor(m, "mtp.0.main_proj.weight") != NULL; + const bool has_markov = model_find_tensor(m, "mtp.2.markov_head.markov_w1.weight") != NULL; + return ds4_mtp_draft_kind_guess_ex(has_e_proj, has_main_proj, has_markov, + markov_rank_set, markov_rank); +} + +static void mtp_weights_bind_mtp_layer(ds4_layer_weights *l, const ds4_model *m, uint32_t stage) { + l->hc_attn_fn = required_tensorf(m, "mtp.%u.hc_attn_fn.weight", stage); + l->hc_attn_scale = required_tensorf(m, "mtp.%u.hc_attn_scale.weight", stage); + l->hc_attn_base = required_tensorf(m, "mtp.%u.hc_attn_base.weight", stage); + l->attn_norm = required_tensorf(m, "mtp.%u.attn_norm.weight", stage); + l->attn_q_a = required_tensorf(m, "mtp.%u.attn_q_a.weight", stage); + l->attn_q_a_norm = required_tensorf(m, "mtp.%u.attn_q_a_norm.weight", stage); + l->attn_q_b = required_tensorf(m, "mtp.%u.attn_q_b.weight", stage); + l->attn_kv = required_tensorf(m, "mtp.%u.attn_kv.weight", stage); + l->attn_kv_a_norm = required_tensorf(m, "mtp.%u.attn_kv_a_norm.weight", stage); + l->attn_sinks = required_tensorf(m, "mtp.%u.attn_sinks.weight", stage); + l->attn_output_a = required_tensorf(m, "mtp.%u.attn_output_a.weight", stage); + l->attn_output_b = required_tensorf(m, "mtp.%u.attn_output_b.weight", stage); + l->hc_ffn_fn = required_tensorf(m, "mtp.%u.hc_ffn_fn.weight", stage); + l->hc_ffn_scale = required_tensorf(m, "mtp.%u.hc_ffn_scale.weight", stage); + l->hc_ffn_base = required_tensorf(m, "mtp.%u.hc_ffn_base.weight", stage); + l->ffn_norm = required_tensorf(m, "mtp.%u.ffn_norm.weight", stage); 
+ l->ffn_gate_inp = required_tensorf(m, "mtp.%u.ffn_gate_inp.weight", stage); + l->ffn_exp_probs_b = tensor_by_namef(m, "mtp.%u.exp_probs_b.bias", stage); + l->ffn_gate_exps = required_tensorf(m, "mtp.%u.ffn_gate_exps.weight", stage); + l->ffn_up_exps = required_tensorf(m, "mtp.%u.ffn_up_exps.weight", stage); + l->ffn_down_exps = required_tensorf(m, "mtp.%u.ffn_down_exps.weight", stage); + l->ffn_gate_shexp = required_tensorf(m, "mtp.%u.ffn_gate_shexp.weight", stage); + l->ffn_up_shexp = required_tensorf(m, "mtp.%u.ffn_up_shexp.weight", stage); + l->ffn_down_shexp = required_tensorf(m, "mtp.%u.ffn_down_shexp.weight", stage); +} + +static void mtp_layer_validate_layout(const ds4_layer_weights *l, bool require_exp_probs_b) { const uint64_t hc_dim = (uint64_t)DS4_N_EMBD * DS4_N_HC; const uint64_t hc_mix_dim = 2u * DS4_N_HC + (uint64_t)DS4_N_HC * DS4_N_HC; const uint64_t q_dim = (uint64_t)DS4_N_HEAD * DS4_N_HEAD_DIM; const uint64_t out_low_dim = (uint64_t)DS4_N_OUT_GROUP * DS4_N_LORA_O; - const ds4_layer_weights *l = &w->block; - - tensor_expect_layout(w->hc_head_base, DS4_TENSOR_F32, 1, DS4_N_HC, 0, 0); - tensor_expect_plain_layout(w->hc_head_fn, 2, hc_dim, DS4_N_HC, 0); - tensor_expect_layout(w->hc_head_scale, DS4_TENSOR_F32, 1, 1, 0, 0); - tensor_expect_layout(w->e_proj, DS4_TENSOR_Q8_0, 2, DS4_N_EMBD, DS4_N_EMBD, 0); - tensor_expect_layout(w->h_proj, DS4_TENSOR_Q8_0, 2, DS4_N_EMBD, DS4_N_EMBD, 0); - tensor_expect_layout(w->enorm, DS4_TENSOR_F32, 1, DS4_N_EMBD, 0, 0); - tensor_expect_layout(w->hnorm, DS4_TENSOR_F32, 1, DS4_N_EMBD, 0, 0); - tensor_expect_layout(w->norm, DS4_TENSOR_F32, 1, DS4_N_EMBD, 0, 0); tensor_expect_plain_layout(l->hc_attn_fn, 2, hc_dim, hc_mix_dim, 0); tensor_expect_layout(l->hc_attn_scale, DS4_TENSOR_F32, 1, 3, 0, 0); @@ -3667,13 +3807,16 @@ static void mtp_weights_validate_layout(const ds4_mtp_weights *w) { tensor_expect_layout(l->attn_sinks, DS4_TENSOR_F32, 1, DS4_N_HEAD, 0, 0); tensor_expect_layout(l->attn_output_a, DS4_TENSOR_Q8_0, 2, 
DS4_N_HEAD_DIM * (DS4_N_HEAD / DS4_N_OUT_GROUP), out_low_dim, 0); tensor_expect_layout(l->attn_output_b, DS4_TENSOR_Q8_0, 2, out_low_dim, DS4_N_EMBD, 0); - tensor_expect_plain_layout(l->hc_ffn_fn, 2, hc_dim, hc_mix_dim, 0); tensor_expect_layout(l->hc_ffn_scale, DS4_TENSOR_F32, 1, 3, 0, 0); tensor_expect_layout(l->hc_ffn_base, DS4_TENSOR_F32, 1, hc_mix_dim, 0, 0); tensor_expect_layout(l->ffn_norm, DS4_TENSOR_F32, 1, DS4_N_EMBD, 0, 0); tensor_expect_plain_layout(l->ffn_gate_inp, 2, DS4_N_EMBD, DS4_N_EXPERT, 0); - tensor_expect_layout(l->ffn_exp_probs_b, DS4_TENSOR_F32, 1, DS4_N_EXPERT, 0, 0); + if (require_exp_probs_b) { + tensor_expect_layout(l->ffn_exp_probs_b, DS4_TENSOR_F32, 1, DS4_N_EXPERT, 0, 0); + } else { + tensor_expect_optional(l->ffn_exp_probs_b, DS4_TENSOR_F32, 1, DS4_N_EXPERT, 0, 0); + } tensor_expect_routed_expert(l->ffn_gate_exps, 3, DS4_N_EMBD, DS4_N_FF_EXP, DS4_N_EXPERT); tensor_expect_routed_expert(l->ffn_up_exps, 3, DS4_N_EMBD, DS4_N_FF_EXP, DS4_N_EXPERT); tensor_expect_routed_expert(l->ffn_down_exps, 3, DS4_N_FF_EXP, DS4_N_EMBD, DS4_N_EXPERT); @@ -3685,6 +3828,93 @@ static void mtp_weights_validate_layout(const ds4_mtp_weights *w) { tensor_expect_layout(l->ffn_down_shexp, DS4_TENSOR_Q8_0, 2, DS4_N_FF_EXP, DS4_N_EMBD, 0); } +static void mtp_weights_validate_legacy_layout(const ds4_mtp_weights *w) { + const uint64_t hc_dim = (uint64_t)DS4_N_EMBD * DS4_N_HC; + + tensor_expect_layout(w->hc_head_base, DS4_TENSOR_F32, 1, DS4_N_HC, 0, 0); + tensor_expect_plain_layout(w->hc_head_fn, 2, hc_dim, DS4_N_HC, 0); + tensor_expect_layout(w->hc_head_scale, DS4_TENSOR_F32, 1, 1, 0, 0); + tensor_expect_layout(w->e_proj, DS4_TENSOR_Q8_0, 2, DS4_N_EMBD, DS4_N_EMBD, 0); + tensor_expect_layout(w->h_proj, DS4_TENSOR_Q8_0, 2, DS4_N_EMBD, DS4_N_EMBD, 0); + tensor_expect_layout(w->enorm, DS4_TENSOR_F32, 1, DS4_N_EMBD, 0, 0); + tensor_expect_layout(w->hnorm, DS4_TENSOR_F32, 1, DS4_N_EMBD, 0, 0); + tensor_expect_layout(w->norm, DS4_TENSOR_F32, 1, DS4_N_EMBD, 0, 0); + 
mtp_layer_validate_layout(&w->block, true); +} + +static void mtp_weights_validate_dspark_layout(const ds4_mtp_weights *w) { + const uint64_t hc_dim = (uint64_t)DS4_N_EMBD * DS4_N_HC; + const uint64_t main_in = 3u * DS4_N_EMBD; + const bool has_markov_head = w->kind == DS4_MTP_DRAFT_DSPARK; + if (w->dspark.block_size == 0 || w->dspark.block_size > 16) { + ds4_die("DSpark block_size must be in 1..16"); + } + + tensor_expect_layout(w->main_proj, DS4_TENSOR_Q8_0, 2, main_in, DS4_N_EMBD, 0); + tensor_expect_layout(w->main_norm, DS4_TENSOR_F32, 1, DS4_N_EMBD, 0, 0); + for (uint32_t s = 0; s < w->dspark.n_mtp_layers; s++) { + mtp_layer_validate_layout(&w->stage[s], false); + } + tensor_expect_layout(w->norm, DS4_TENSOR_F32, 1, DS4_N_EMBD, 0, 0); + tensor_expect_layout(w->hc_head_base, DS4_TENSOR_F32, 1, DS4_N_HC, 0, 0); + tensor_expect_plain_layout(w->hc_head_fn, 2, hc_dim, DS4_N_HC, 0); + tensor_expect_layout(w->hc_head_scale, DS4_TENSOR_F32, 1, 1, 0, 0); + if (has_markov_head) { + const uint64_t conf_in = DS4_N_EMBD + (uint64_t)w->dspark.markov_rank; + if (w->dspark.markov_rank == 0) ds4_die("official DSpark Markov head has zero markov rank"); + tensor_expect_plain_or_bf16_layout(w->markov_w1, 2, w->dspark.markov_rank, DS4_N_VOCAB, 0); + tensor_expect_plain_or_bf16_layout(w->markov_w2, 2, w->dspark.markov_rank, DS4_N_VOCAB, 0); + if (!w->confidence_proj) ds4_die("internal error: missing DSpark confidence projection"); + if (w->confidence_proj->ndim == 1) { + tensor_expect_plain_or_bf16_layout(w->confidence_proj, 1, conf_in, 0, 0); + } else { + tensor_expect_plain_or_bf16_layout(w->confidence_proj, 2, conf_in, 1, 0); + } + } else if (w->dspark.markov_rank != 0) { + ds4_die("nonseq DSpark draft must declare deepseek4.dspark.markov_rank=0"); + } +} + +static void mtp_weights_bind_legacy(ds4_mtp_weights *w, const ds4_model *m) { + w->kind = DS4_MTP_DRAFT_LEGACY; + w->hc_head_base = required_tensor(m, "mtp.0.hc_head_base.weight"); + w->hc_head_fn = required_tensor(m, 
"mtp.0.hc_head_fn.weight"); + w->hc_head_scale = required_tensor(m, "mtp.0.hc_head_scale.weight"); + w->e_proj = required_tensor(m, "mtp.0.e_proj.weight"); + w->h_proj = required_tensor(m, "mtp.0.h_proj.weight"); + w->enorm = required_tensor(m, "mtp.0.enorm.weight"); + w->hnorm = required_tensor(m, "mtp.0.hnorm.weight"); + w->norm = required_tensor(m, "mtp.0.norm.weight"); + mtp_weights_bind_mtp_layer(&w->block, m, 0); + mtp_weights_validate_legacy_layout(w); +} + +static void mtp_weights_bind_dspark(ds4_mtp_weights *w, const ds4_model *m) { + w->kind = mtp_model_detect_kind(m); + dspark_config_apply_metadata(&w->dspark, m); + if (w->dspark.n_mtp_layers != DS4_DSPARK_MTP_LAYERS) { + fprintf(stderr, "ds4: DSpark draft expects %u stages, GGUF has n_mtp_layers=%u\n", + DS4_DSPARK_MTP_LAYERS, w->dspark.n_mtp_layers); + exit(1); + } + w->main_proj = required_tensor(m, "mtp.0.main_proj.weight"); + w->main_norm = required_tensor(m, "mtp.0.main_norm.weight"); + for (uint32_t s = 0; s < DS4_DSPARK_MTP_LAYERS; s++) { + mtp_weights_bind_mtp_layer(&w->stage[s], m, s); + } + w->norm = required_tensor(m, "mtp.2.norm.weight"); + w->hc_head_base = required_tensor(m, "mtp.2.hc_head_base.weight"); + w->hc_head_fn = required_tensor(m, "mtp.2.hc_head_fn.weight"); + w->hc_head_scale = required_tensor(m, "mtp.2.hc_head_scale.weight"); + if (w->kind == DS4_MTP_DRAFT_DSPARK) { + w->markov_w1 = required_tensor(m, "mtp.2.markov_head.markov_w1.weight"); + w->markov_w2 = required_tensor(m, "mtp.2.markov_head.markov_w2.weight"); + w->confidence_proj = required_tensor(m, "mtp.2.confidence_head.proj.weight"); + } + mtp_weights_validate_dspark_layout(w); +} + + static bool ds4_shape_matches_metadata( const ds4_shape *s, uint32_t n_layer, @@ -4433,45 +4663,36 @@ static DS4_MAYBE_UNUSED bool weights_model_map_output_spans( return model_map_span_vec_finish(spans); } -static void mtp_weights_bind(ds4_mtp_weights *w, const ds4_model *m) { - memset(w, 0, sizeof(*w)); - w->hc_head_base = 
required_tensor(m, "mtp.0.hc_head_base.weight"); - w->hc_head_fn = required_tensor(m, "mtp.0.hc_head_fn.weight"); - w->hc_head_scale = required_tensor(m, "mtp.0.hc_head_scale.weight"); - w->e_proj = required_tensor(m, "mtp.0.e_proj.weight"); - w->h_proj = required_tensor(m, "mtp.0.h_proj.weight"); - w->enorm = required_tensor(m, "mtp.0.enorm.weight"); - w->hnorm = required_tensor(m, "mtp.0.hnorm.weight"); - w->norm = required_tensor(m, "mtp.0.norm.weight"); +bool ds4_mtp_speculative_draft_ready(ds4_mtp_draft_kind kind) { + return kind == DS4_MTP_DRAFT_LEGACY || + kind == DS4_MTP_DRAFT_DSPARK || + kind == DS4_MTP_DRAFT_DSPARK_NONSEQ; +} - ds4_layer_weights *l = &w->block; - l->hc_attn_fn = required_tensor(m, "mtp.0.hc_attn_fn.weight"); - l->hc_attn_scale = required_tensor(m, "mtp.0.hc_attn_scale.weight"); - l->hc_attn_base = required_tensor(m, "mtp.0.hc_attn_base.weight"); - l->attn_norm = required_tensor(m, "mtp.0.attn_norm.weight"); - l->attn_q_a = required_tensor(m, "mtp.0.attn_q_a.weight"); - l->attn_q_a_norm = required_tensor(m, "mtp.0.attn_q_a_norm.weight"); - l->attn_q_b = required_tensor(m, "mtp.0.attn_q_b.weight"); - l->attn_kv = required_tensor(m, "mtp.0.attn_kv.weight"); - l->attn_kv_a_norm = required_tensor(m, "mtp.0.attn_kv_a_norm.weight"); - l->attn_sinks = required_tensor(m, "mtp.0.attn_sinks.weight"); - l->attn_output_a = required_tensor(m, "mtp.0.attn_output_a.weight"); - l->attn_output_b = required_tensor(m, "mtp.0.attn_output_b.weight"); - l->hc_ffn_fn = required_tensor(m, "mtp.0.hc_ffn_fn.weight"); - l->hc_ffn_scale = required_tensor(m, "mtp.0.hc_ffn_scale.weight"); - l->hc_ffn_base = required_tensor(m, "mtp.0.hc_ffn_base.weight"); - l->ffn_norm = required_tensor(m, "mtp.0.ffn_norm.weight"); - l->ffn_gate_inp = required_tensor(m, "mtp.0.ffn_gate_inp.weight"); - l->ffn_exp_probs_b = required_tensor(m, "mtp.0.exp_probs_b.bias"); - l->ffn_gate_exps = required_tensor(m, "mtp.0.ffn_gate_exps.weight"); - l->ffn_up_exps = required_tensor(m, 
"mtp.0.ffn_up_exps.weight"); - l->ffn_down_exps = required_tensor(m, "mtp.0.ffn_down_exps.weight"); - l->ffn_gate_shexp = required_tensor(m, "mtp.0.ffn_gate_shexp.weight"); - l->ffn_up_shexp = required_tensor(m, "mtp.0.ffn_up_shexp.weight"); - l->ffn_down_shexp = required_tensor(m, "mtp.0.ffn_down_shexp.weight"); - - mtp_weights_validate_layout(w); +bool ds4_mtp_draft_runtime_supported(ds4_backend backend, ds4_mtp_draft_kind kind) { + if (backend == DS4_BACKEND_CPU) return false; + if (!ds4_mtp_speculative_draft_ready(kind)) return false; + const bool dspark_family = kind == DS4_MTP_DRAFT_DSPARK || + kind == DS4_MTP_DRAFT_DSPARK_NONSEQ; + if (dspark_family && backend != DS4_BACKEND_METAL) return false; + return true; +} + +static void mtp_weights_bind(ds4_mtp_weights *w, const ds4_model *m) { + memset(w, 0, sizeof(*w)); + const ds4_mtp_draft_kind kind = mtp_model_detect_kind(m); + if (kind == DS4_MTP_DRAFT_DSPARK || kind == DS4_MTP_DRAFT_DSPARK_NONSEQ) { + mtp_weights_bind_dspark(w, m); + return; + } + if (kind == DS4_MTP_DRAFT_LEGACY) { + mtp_weights_bind_legacy(w, m); + return; + } + fprintf(stderr, + "ds4: unsupported draft GGUF: need legacy mtp.0.e_proj, official DSpark mtp.0.main_proj + mtp.2.markov_head, or nonseq DSpark mtp.0.main_proj + deepseek4.dspark.markov_rank=0\n"); + exit(1); } static void weights_free(ds4_weights *w) { @@ -4592,6 +4813,115 @@ static void matvec_f16_serial(float *out, const ds4_model *m, const ds4_tensor * } } +static inline float tensor_plain_value(const ds4_model *m, const ds4_tensor *w, uint64_t idx) { + const void *data = tensor_data(m, w); + if (w->type == DS4_TENSOR_F32) { + const float *x = data; + return x[idx]; + } + if (w->type == DS4_TENSOR_F16) { + const uint16_t *x = data; + return f16_to_f32(x[idx]); + } + if (w->type == DS4_TENSOR_BF16) { + const uint16_t *x = data; + return ds4_dspark_bf16_to_f32(x[idx]); + } + ds4_die("expected an F16, F32, or BF16 tensor"); + return 0.0f; +} + +static void 
tensor_plain_row_to_f32(float *out, + const ds4_model *m, + const ds4_tensor *w, + uint64_t row) { + if (w->ndim != 2) ds4_die("expected a 2D plain tensor"); + const uint64_t n = w->dim[0]; + const uint64_t offset = row * n; + for (uint64_t i = 0; i < n; i++) out[i] = tensor_plain_value(m, w, offset + i); +} + +typedef struct { + float *logits; + const void *weights; + const float *latent; + uint64_t rank; + uint32_t type; +} dspark_markov_bias_ctx; + +static void dspark_markov_bias_worker(void *vctx, uint64_t row0, uint64_t row1) { + dspark_markov_bias_ctx *ctx = vctx; + const uint64_t rank = ctx->rank; + + if (ctx->type == DS4_TENSOR_F32) { + const float *w = ctx->weights; + for (uint64_t vocab = row0; vocab < row1; vocab++) { + const float *row = w + vocab * rank; + float bias = 0.0f; + for (uint64_t i = 0; i < rank; i++) bias += row[i] * ctx->latent[i]; + ctx->logits[vocab] += bias; + } + return; + } + + if (ctx->type == DS4_TENSOR_F16) { + const uint16_t *w = ctx->weights; + for (uint64_t vocab = row0; vocab < row1; vocab++) { + const uint16_t *row = w + vocab * rank; + float bias = 0.0f; + for (uint64_t i = 0; i < rank; i++) bias += f16_to_f32(row[i]) * ctx->latent[i]; + ctx->logits[vocab] += bias; + } + return; + } + + if (ctx->type == DS4_TENSOR_BF16) { + const uint16_t *w = ctx->weights; + for (uint64_t vocab = row0; vocab < row1; vocab++) { + const uint16_t *row = w + vocab * rank; + float bias = 0.0f; + for (uint64_t i = 0; i < rank; i++) bias += ds4_dspark_bf16_to_f32(row[i]) * ctx->latent[i]; + ctx->logits[vocab] += bias; + } + return; + } + + ds4_die("expected an F16, F32, or BF16 tensor"); +} + +static void dspark_apply_markov_bias(float *logits, + const ds4_model *m, + const ds4_mtp_weights *mtp, + int prev_token) { + if (!logits || !m || !mtp || !mtp->markov_w1 || !mtp->markov_w2 || + prev_token < 0 || prev_token >= (int)DS4_N_VOCAB) { + return; + } + + const uint64_t rank = mtp->dspark.markov_rank; + if (rank == 0) return; + if 
(mtp->markov_w1->ndim != 2 || mtp->markov_w2->ndim != 2 || + mtp->markov_w1->dim[0] != rank || mtp->markov_w1->dim[1] != DS4_N_VOCAB || + mtp->markov_w2->dim[0] != rank || mtp->markov_w2->dim[1] != DS4_N_VOCAB) { + ds4_die("invalid DSpark Markov tensor layout"); + } + + float latent[512]; + if (rank > sizeof(latent) / sizeof(latent[0])) { + ds4_die("DSpark Markov rank exceeds local buffer"); + } + tensor_plain_row_to_f32(latent, m, mtp->markov_w1, (uint64_t)prev_token); + + dspark_markov_bias_ctx ctx = { + .logits = logits, + .weights = tensor_data(m, mtp->markov_w2), + .latent = latent, + .rank = rank, + .type = mtp->markov_w2->type, + }; + ds4_parallel_for_min_rows(DS4_N_VOCAB, dspark_markov_bias_worker, &ctx, 1024); +} + typedef struct { float *out; const uint8_t *data; @@ -8320,6 +8650,7 @@ typedef struct { uint32_t head_dim; } ds4_kv_cache; + static uint32_t ds4_default_raw_cap(uint32_t ctx_size) { uint32_t raw_cap = DS4_N_SWA; if (raw_cap > ctx_size) raw_cap = ctx_size; @@ -10343,22 +10674,25 @@ typedef struct { /* Speculative decoding scratch. MTP is allowed to mutate graph state only * if the target verifier can either commit it or restore the saved - * frontiers. The prefix1 buffers are the cheap partial-accept state for the - * common N=2 case. */ + * frontiers. Prefix buffers snapshot compressed frontiers after accepted + * verifier rows so partial accepts can commit without replaying target + * tokens. Slot 0 is the legacy prefix-1 path; DSpark block drafting uses + * later slots for prefix lengths 2..block_size-1. 
*/ ds4_gpu_tensor *spec_attn_state_kv[DS4_MAX_LAYER]; ds4_gpu_tensor *spec_attn_state_score[DS4_MAX_LAYER]; ds4_gpu_tensor *spec_index_state_kv[DS4_MAX_LAYER]; ds4_gpu_tensor *spec_index_state_score[DS4_MAX_LAYER]; - ds4_gpu_tensor *spec_prefix1_attn_state_kv[DS4_MAX_LAYER]; - ds4_gpu_tensor *spec_prefix1_attn_state_score[DS4_MAX_LAYER]; - ds4_gpu_tensor *spec_prefix1_index_state_kv[DS4_MAX_LAYER]; - ds4_gpu_tensor *spec_prefix1_index_state_score[DS4_MAX_LAYER]; + ds4_gpu_tensor *spec_prefix_attn_state_kv[DS4_MAX_LAYER]; + ds4_gpu_tensor *spec_prefix_attn_state_score[DS4_MAX_LAYER]; + ds4_gpu_tensor *spec_prefix_index_state_kv[DS4_MAX_LAYER]; + ds4_gpu_tensor *spec_prefix_index_state_score[DS4_MAX_LAYER]; ds4_gpu_tensor *spec_logits; uint32_t layer_n_comp[DS4_MAX_LAYER]; uint32_t layer_n_index_comp[DS4_MAX_LAYER]; - uint32_t spec_prefix1_n_comp[DS4_MAX_LAYER]; - uint32_t spec_prefix1_n_index_comp[DS4_MAX_LAYER]; - bool spec_capture_prefix1; + uint32_t spec_prefix_n_comp[DS4_SPEC_PREFIX_MAX_SLOTS][DS4_MAX_LAYER]; + uint32_t spec_prefix_n_index_comp[DS4_SPEC_PREFIX_MAX_SLOTS][DS4_MAX_LAYER]; + uint32_t spec_prefix_slots; + uint32_t spec_capture_prefix_tokens; uint32_t raw_cap; /* Maximum compressed-row capacity across layers. Shared work buffers use * this worst-case size because ratio-4 indexer layers can still reach it. */ @@ -10421,6 +10755,18 @@ typedef struct { ds4_gpu_tensor *mtp_next_hc; ds4_gpu_tensor *mtp_raw_cache; uint32_t mtp_n_raw; + + /* Optional DSpark block-draft state. The target decoder captures mean-HC + * hidden rows at the configured target layers, then the drafter consumes + * that 3-row feature to propose a block of candidate tokens. 
*/ + ds4_gpu_tensor *dspark_main_hidden; + ds4_gpu_tensor *dspark_main_x; + ds4_gpu_tensor *dspark_verify_hidden; + ds4_gpu_tensor *dspark_verify_main_x; + ds4_gpu_tensor *dspark_mean_weights; + ds4_gpu_tensor *dspark_kv_cache[DS4_DSPARK_MTP_LAYERS]; + uint32_t dspark_target_layer_ids[DS4_DSPARK_MTP_LAYERS]; + uint32_t dspark_n_real; uint32_t prefill_cap; uint32_t raw_window; @@ -10491,6 +10837,7 @@ typedef struct { bool ssd_streaming_cold; bool streaming_static_decode_map_current; bool mtp_enabled; + bool dspark_enabled; float *cpu_router_norm; } ds4_gpu_graph; @@ -10530,7 +10877,6 @@ static void graph_power_note_decode_token(ds4_gpu_graph *g, double elapsed_sec) graph_power_update_avg(g->decode_token_avg_sec, elapsed_sec); graph_power_sleep(g->decode_token_avg_sec, g->power_percent); } - /* Release every Metal tensor owned by the whole-model graph runtime. */ static void metal_graph_free(ds4_gpu_graph *g) { ds4_gpu_tensor_free(g->directional_steering_dirs); @@ -10575,6 +10921,14 @@ static void metal_graph_free(ds4_gpu_graph *g) { ds4_gpu_tensor_free(g->batch_next_hc); ds4_gpu_tensor_free(g->batch_cur_hc); ds4_gpu_tensor_free(g->prefill_tokens); + for (uint32_t s = 0; s < DS4_DSPARK_MTP_LAYERS; s++) { + ds4_gpu_tensor_free(g->dspark_kv_cache[s]); + } + ds4_gpu_tensor_free(g->dspark_mean_weights); + ds4_gpu_tensor_free(g->dspark_main_x); + ds4_gpu_tensor_free(g->dspark_verify_main_x); + ds4_gpu_tensor_free(g->dspark_verify_hidden); + ds4_gpu_tensor_free(g->dspark_main_hidden); ds4_gpu_tensor_free(g->logits); ds4_gpu_tensor_free(g->mtp_raw_cache); ds4_gpu_tensor_free(g->mtp_next_hc); @@ -10646,10 +11000,10 @@ static void metal_graph_free(ds4_gpu_graph *g) { ds4_gpu_tensor_free(g->spec_attn_state_score[il]); ds4_gpu_tensor_free(g->spec_index_state_kv[il]); ds4_gpu_tensor_free(g->spec_index_state_score[il]); - ds4_gpu_tensor_free(g->spec_prefix1_attn_state_kv[il]); - ds4_gpu_tensor_free(g->spec_prefix1_attn_state_score[il]); - 
ds4_gpu_tensor_free(g->spec_prefix1_index_state_kv[il]); - ds4_gpu_tensor_free(g->spec_prefix1_index_state_score[il]); + ds4_gpu_tensor_free(g->spec_prefix_attn_state_kv[il]); + ds4_gpu_tensor_free(g->spec_prefix_attn_state_score[il]); + ds4_gpu_tensor_free(g->spec_prefix_index_state_kv[il]); + ds4_gpu_tensor_free(g->spec_prefix_index_state_score[il]); } ds4_gpu_tensor_free(g->kv); ds4_gpu_tensor_free(g->kv_raw); @@ -10956,14 +11310,28 @@ static bool metal_graph_ensure_batch_ffn_out(ds4_gpu_graph *g) { * weights are not copied here; tensors reference the mapped GGUF. */ static bool metal_graph_alloc_raw_cap( ds4_gpu_graph *g, - const ds4_weights *weights, + const ds4_weights *weights, const ds4_layer_weights *layer, - uint32_t raw_cap, - uint32_t ctx_size, - uint32_t prefill_cap, - bool enable_mtp) { + const ds4_mtp_weights *mtp_weights, + uint32_t raw_cap, + uint32_t ctx_size, + uint32_t prefill_cap, + bool enable_mtp) { memset(g, 0, sizeof(*g)); g->mtp_enabled = enable_mtp; + const bool enable_dspark = + enable_mtp && mtp_weights && mtp_weights->kind == DS4_MTP_DRAFT_DSPARK; + g->dspark_enabled = enable_dspark; + g->spec_prefix_slots = enable_mtp && mtp_weights + ? 
(uint32_t)ds4_dspark_prefix_slot_count(mtp_weights->kind, + (int)mtp_weights->dspark.block_size, + DS4_SPEC_PREFIX_MAX_SLOTS) + : 0; + if (enable_dspark) { + for (uint32_t s = 0; s < DS4_DSPARK_MTP_LAYERS; s++) { + g->dspark_target_layer_ids[s] = mtp_weights->dspark.target_layer_ids[s]; + } + } if (raw_cap == 0) raw_cap = 1; if (ctx_size == 0) ctx_size = raw_cap; if (prefill_cap == 0) prefill_cap = 1; @@ -11072,8 +11440,8 @@ static bool metal_graph_alloc_raw_cap( if (enable_mtp) { g->spec_attn_state_kv[il] = ds4_gpu_tensor_alloc(attn_width * attn_rows * sizeof(float)); g->spec_attn_state_score[il] = ds4_gpu_tensor_alloc(attn_width * attn_rows * sizeof(float)); - g->spec_prefix1_attn_state_kv[il] = ds4_gpu_tensor_alloc(attn_width * attn_rows * sizeof(float)); - g->spec_prefix1_attn_state_score[il] = ds4_gpu_tensor_alloc(attn_width * attn_rows * sizeof(float)); + g->spec_prefix_attn_state_kv[il] = ds4_gpu_tensor_alloc((uint64_t)g->spec_prefix_slots * attn_width * attn_rows * sizeof(float)); + g->spec_prefix_attn_state_score[il] = ds4_gpu_tensor_alloc((uint64_t)g->spec_prefix_slots * attn_width * attn_rows * sizeof(float)); } if (g->layer_attn_state_kv[il]) { state_init_ok = state_init_ok && @@ -11095,8 +11463,8 @@ static bool metal_graph_alloc_raw_cap( if (enable_mtp) { g->spec_index_state_kv[il] = ds4_gpu_tensor_alloc(index_width * index_rows * sizeof(float)); g->spec_index_state_score[il] = ds4_gpu_tensor_alloc(index_width * index_rows * sizeof(float)); - g->spec_prefix1_index_state_kv[il] = ds4_gpu_tensor_alloc(index_width * index_rows * sizeof(float)); - g->spec_prefix1_index_state_score[il] = ds4_gpu_tensor_alloc(index_width * index_rows * sizeof(float)); + g->spec_prefix_index_state_kv[il] = ds4_gpu_tensor_alloc((uint64_t)g->spec_prefix_slots * index_width * index_rows * sizeof(float)); + g->spec_prefix_index_state_score[il] = ds4_gpu_tensor_alloc((uint64_t)g->spec_prefix_slots * index_width * index_rows * sizeof(float)); } if (g->layer_index_state_kv[il]) { 
state_init_ok = state_init_ok && @@ -11169,6 +11537,30 @@ static bool metal_graph_alloc_raw_cap( g->spec_logits = ds4_gpu_tensor_alloc((uint64_t)16 * DS4_N_VOCAB * sizeof(float)); g->mtp_n_raw = 0; } + if (enable_dspark) { + g->dspark_main_hidden = ds4_gpu_tensor_alloc( + (uint64_t)DS4_DSPARK_MTP_LAYERS * DS4_N_EMBD * sizeof(float)); + g->dspark_main_x = ds4_gpu_tensor_alloc((uint64_t)DS4_N_EMBD * sizeof(float)); + g->dspark_verify_hidden = ds4_gpu_tensor_alloc( + (uint64_t)DS4_DSPARK_MAX_BLOCK_SIZE * + DS4_DSPARK_MTP_LAYERS * DS4_N_EMBD * sizeof(float)); + g->dspark_verify_main_x = ds4_gpu_tensor_alloc( + (uint64_t)DS4_DSPARK_MAX_BLOCK_SIZE * DS4_N_EMBD * sizeof(float)); + g->dspark_mean_weights = ds4_gpu_tensor_alloc((uint64_t)DS4_N_HC * sizeof(float)); + for (uint32_t s = 0; s < DS4_DSPARK_MTP_LAYERS; s++) { + g->dspark_kv_cache[s] = metal_graph_alloc_kv_cache_tensor( + managed_kv_cache, + (uint64_t)(DS4_N_SWA + mtp_weights->dspark.block_size) * + DS4_N_HEAD_DIM * sizeof(float)); + } + if (g->dspark_mean_weights) { + state_init_ok = state_init_ok && + metal_tensor_fill_f32(g->dspark_mean_weights, + 1.0f / (float)DS4_N_HC, + DS4_N_HC); + } + g->dspark_n_real = 0; + } g->prefill_tokens = ds4_gpu_tensor_alloc(pc * sizeof(int32_t)); g->batch_cur_hc = ds4_gpu_tensor_alloc(pc * hc_dim * sizeof(float)); @@ -11226,8 +11618,8 @@ static bool metal_graph_alloc_raw_cap( (!enable_mtp || (g->spec_attn_state_kv[il] != NULL && g->spec_attn_state_score[il] != NULL && - g->spec_prefix1_attn_state_kv[il] != NULL && - g->spec_prefix1_attn_state_score[il] != NULL)); + g->spec_prefix_attn_state_kv[il] != NULL && + g->spec_prefix_attn_state_score[il] != NULL)); } if (layer_cache_ok && ratio == 4) { layer_cache_ok = g->layer_index_comp_cache[il] != NULL && @@ -11236,8 +11628,8 @@ static bool metal_graph_alloc_raw_cap( (!enable_mtp || (g->spec_index_state_kv[il] != NULL && g->spec_index_state_score[il] != NULL && - g->spec_prefix1_index_state_kv[il] != NULL && - 
g->spec_prefix1_index_state_score[il] != NULL)); + g->spec_prefix_index_state_kv[il] != NULL && + g->spec_prefix_index_state_score[il] != NULL)); } } @@ -11265,6 +11657,12 @@ static bool metal_graph_alloc_raw_cap( g->mtp_eproj_hc && g->mtp_hnorm_hc && g->mtp_hproj_hc && g->mtp_input_hc && g->mtp_state_hc && g->mtp_next_hc && g->mtp_raw_cache && g->spec_logits)) && + (!enable_dspark || + (g->dspark_main_hidden && g->dspark_main_x && + g->dspark_verify_hidden && g->dspark_verify_main_x && + g->dspark_mean_weights && + g->dspark_kv_cache[0] && g->dspark_kv_cache[1] && + g->dspark_kv_cache[2])) && g->prefill_tokens && g->batch_cur_hc && g->batch_next_hc && g->batch_flat_hc && g->batch_hc_mix && g->batch_hc_split && @@ -11292,7 +11690,8 @@ static bool metal_graph_alloc( ds4_gpu_graph *g, const ds4_weights *weights, const ds4_layer_weights *layer) { - return metal_graph_alloc_raw_cap(g, weights, layer, DS4_N_SWA, DS4_N_SWA, 1, false); + return metal_graph_alloc_raw_cap(g, weights, layer, NULL, + DS4_N_SWA, DS4_N_SWA, 1, false); } static bool metal_graph_install_model_spans( @@ -13163,38 +13562,46 @@ static uint32_t metal_graph_raw_start_for_span( return first_raw_pos % g->raw_cap; } -/* Capture the verifier prefix after the first speculative token. +/* Capture verifier prefixes for partial speculative accepts. * * Exact MTP speculation is only profitable if partial accepts are cheap. The - * target verifier computes two draft tokens together; if only the first token - * is accepted, replaying a one-token verifier throws away most of the gain. - * For compressed-attention layers the mutable frontier is just the small - * compressor state plus append counters, so we save that prefix-1 state while - * the N=2 verifier is already stepping the compressor token by token. + * target verifier computes draft tokens together; if only a prefix is accepted, + * replaying those target tokens throws away much of the gain. 
For compressed- + * attention layers the mutable frontier is just the small compressor state plus + * append counters, so we snapshot each intermediate prefix while the verifier + * is already stepping the compressor token by token. * * Raw SWA rows are not captured here. This graph uses a raw ring larger than * the 128-token logical SWA window, so writing speculative future rows does * not evict visible raw rows. If the raw cache is ever reduced to a strict * 128-row ring, speculative raw rows must become shadow rows and be copied * into the ring only on commit. */ -static bool metal_graph_capture_prefix1_attn_state(ds4_gpu_graph *g, uint32_t il) { - if (!g->spec_capture_prefix1 || !g->spec_prefix1_attn_state_kv[il]) return true; +static bool metal_graph_capture_prefix_attn_state(ds4_gpu_graph *g, uint32_t il, uint32_t prefix_len) { + if (!g || prefix_len == 0 || prefix_len > g->spec_capture_prefix_tokens) return true; + const int slot = ds4_dspark_prefix_slot_for_accept((int)prefix_len, + (int)(g->spec_capture_prefix_tokens + 1u)); + if (slot < 0 || (uint32_t)slot >= g->spec_prefix_slots || !g->spec_prefix_attn_state_kv[il]) return true; const uint64_t bytes = ds4_gpu_tensor_bytes(g->layer_attn_state_kv[il]); - g->spec_prefix1_n_comp[il] = g->layer_n_comp[il]; - return ds4_gpu_tensor_copy(g->spec_prefix1_attn_state_kv[il], 0, - g->layer_attn_state_kv[il], 0, bytes) != 0 && - ds4_gpu_tensor_copy(g->spec_prefix1_attn_state_score[il], 0, - g->layer_attn_state_score[il], 0, bytes) != 0; -} - -static bool metal_graph_capture_prefix1_index_state(ds4_gpu_graph *g, uint32_t il) { - if (!g->spec_capture_prefix1 || !g->spec_prefix1_index_state_kv[il]) return true; + const uint64_t offset = (uint64_t)slot * bytes; + g->spec_prefix_n_comp[slot][il] = g->layer_n_comp[il]; + return ds4_gpu_tensor_copy(g->spec_prefix_attn_state_kv[il], offset, + g->layer_attn_state_kv[il], 0, bytes) != 0 && + ds4_gpu_tensor_copy(g->spec_prefix_attn_state_score[il], offset, + 
g->layer_attn_state_score[il], 0, bytes) != 0; +} + +static bool metal_graph_capture_prefix_index_state(ds4_gpu_graph *g, uint32_t il, uint32_t prefix_len) { + if (!g || prefix_len == 0 || prefix_len > g->spec_capture_prefix_tokens) return true; + const int slot = ds4_dspark_prefix_slot_for_accept((int)prefix_len, + (int)(g->spec_capture_prefix_tokens + 1u)); + if (slot < 0 || (uint32_t)slot >= g->spec_prefix_slots || !g->spec_prefix_index_state_kv[il]) return true; const uint64_t bytes = ds4_gpu_tensor_bytes(g->layer_index_state_kv[il]); - g->spec_prefix1_n_index_comp[il] = g->layer_n_index_comp[il]; - return ds4_gpu_tensor_copy(g->spec_prefix1_index_state_kv[il], 0, - g->layer_index_state_kv[il], 0, bytes) != 0 && - ds4_gpu_tensor_copy(g->spec_prefix1_index_state_score[il], 0, - g->layer_index_state_score[il], 0, bytes) != 0; + const uint64_t offset = (uint64_t)slot * bytes; + g->spec_prefix_n_index_comp[slot][il] = g->layer_n_index_comp[il]; + return ds4_gpu_tensor_copy(g->spec_prefix_index_state_kv[il], offset, + g->layer_index_state_kv[il], 0, bytes) != 0 && + ds4_gpu_tensor_copy(g->spec_prefix_index_state_score[il], offset, + g->layer_index_state_score[il], 0, bytes) != 0; } static uint32_t metal_graph_decode_indexer_sparse_threshold(const ds4_gpu_graph *g) { @@ -16312,6 +16719,79 @@ static bool metal_graph_encode_output_head_mtp( return ok; } +static DS4_MAYBE_UNUSED bool metal_graph_encode_output_head_mtp_batch( + ds4_gpu_graph *g, + const ds4_model *base_model, + const ds4_weights *base_weights, + const ds4_model *mtp_model, + const ds4_mtp_weights *mtp, + uint32_t n_tokens, + uint64_t vocab_dim) { + if (n_tokens == 0 || n_tokens > g->prefill_cap || !g->spec_logits) return false; + + const uint64_t hc_dim = (uint64_t)DS4_N_HC * DS4_N_EMBD; + ds4_gpu_tensor *output_pre = ds4_gpu_tensor_view( + g->batch_hc_mix, 0, (uint64_t)n_tokens * DS4_N_HC * sizeof(float)); + ds4_gpu_tensor *output_weights = ds4_gpu_tensor_view( + g->batch_hc_split, 0, (uint64_t)n_tokens 
* DS4_N_HC * sizeof(float)); + ds4_gpu_tensor *output_embd = ds4_gpu_tensor_view( + g->batch_ffn_cur, 0, (uint64_t)n_tokens * DS4_N_EMBD * sizeof(float)); + ds4_gpu_tensor *output_norm = ds4_gpu_tensor_view( + g->batch_ffn_norm, 0, (uint64_t)n_tokens * DS4_N_EMBD * sizeof(float)); + ds4_gpu_tensor *logits = ds4_gpu_tensor_view( + g->spec_logits, 0, (uint64_t)n_tokens * vocab_dim * sizeof(float)); + bool ok = output_pre && output_weights && output_embd && output_norm && logits; + + if (ok) ok = ds4_gpu_rms_norm_plain_rows_tensor(g->batch_flat_hc, + g->batch_cur_hc, + (uint32_t)hc_dim, + n_tokens, + DS4_RMS_EPS) != 0; + if (ok) ok = metal_graph_matmul_plain_tensor(output_pre, + mtp_model, + mtp->hc_head_fn, + hc_dim, + DS4_N_HC, + g->batch_flat_hc, + n_tokens); + if (ok) ok = ds4_gpu_output_hc_weights_tensor(output_weights, + output_pre, + mtp_model->map, + mtp_model->size, + mtp->hc_head_scale->abs_offset, + mtp->hc_head_base->abs_offset, + DS4_N_HC, + DS4_HC_EPS) != 0; + if (ok) ok = ds4_gpu_hc_weighted_sum_tensor(output_embd, + g->batch_cur_hc, + output_weights, + DS4_N_EMBD, + DS4_N_HC) != 0; + if (ok) ok = ds4_gpu_rms_norm_weight_rows_tensor(output_norm, + output_embd, + mtp_model->map, + mtp_model->size, + mtp->norm->abs_offset, + DS4_N_EMBD, + n_tokens, + DS4_RMS_EPS) != 0; + if (ok) ok = ds4_gpu_matmul_q8_0_tensor(logits, + base_model->map, + base_model->size, + base_weights->output->abs_offset, + DS4_N_EMBD, + vocab_dim, + output_norm, + n_tokens) != 0; + + ds4_gpu_tensor_free(logits); + ds4_gpu_tensor_free(output_norm); + ds4_gpu_tensor_free(output_embd); + ds4_gpu_tensor_free(output_weights); + ds4_gpu_tensor_free(output_pre); + return ok; +} + /* ========================================================================= * Metal Diagnostic Comparisons. 
* ========================================================================= @@ -16941,6 +17421,67 @@ static uint32_t metal_graph_token_split_after_layers(void) { return split_after_layers; } +static bool metal_graph_capture_dspark_main_hidden(ds4_gpu_graph *g, uint32_t il) { + if (!g || !g->dspark_enabled) return true; + if (!g->cur_hc || !g->dspark_main_hidden || !g->dspark_mean_weights) return false; + + for (uint32_t s = 0; s < DS4_DSPARK_MTP_LAYERS; s++) { + if (g->dspark_target_layer_ids[s] != il) continue; + ds4_gpu_tensor *dst = ds4_gpu_tensor_view( + g->dspark_main_hidden, + (uint64_t)s * DS4_N_EMBD * sizeof(float), + (uint64_t)DS4_N_EMBD * sizeof(float)); + const bool ok = dst && + ds4_gpu_hc_weighted_sum_tensor(dst, + g->cur_hc, + g->dspark_mean_weights, + DS4_N_EMBD, + DS4_N_HC) != 0; + ds4_gpu_tensor_free(dst); + return ok; + } + return true; +} + +static bool metal_graph_capture_dspark_batch_main_hidden(ds4_gpu_graph *g, + uint32_t il, + uint32_t n_tokens) { + if (!g || !g->dspark_enabled) return true; + if (!g->batch_cur_hc || !g->dspark_verify_hidden || !g->dspark_mean_weights || + n_tokens == 0 || n_tokens > DS4_DSPARK_MAX_BLOCK_SIZE) { + return false; + } + + const uint64_t hc_dim = (uint64_t)DS4_N_HC * DS4_N_EMBD; + const uint64_t hidden_row_bytes = + (uint64_t)DS4_DSPARK_MTP_LAYERS * DS4_N_EMBD * sizeof(float); + const uint64_t stage_bytes = (uint64_t)DS4_N_EMBD * sizeof(float); + + for (uint32_t s = 0; s < DS4_DSPARK_MTP_LAYERS; s++) { + if (g->dspark_target_layer_ids[s] != il) continue; + for (uint32_t row = 0; row < n_tokens; row++) { + ds4_gpu_tensor *src = ds4_gpu_tensor_view( + g->batch_cur_hc, + (uint64_t)row * hc_dim * sizeof(float), + hc_dim * sizeof(float)); + ds4_gpu_tensor *dst = ds4_gpu_tensor_view( + g->dspark_verify_hidden, + (uint64_t)row * hidden_row_bytes + (uint64_t)s * stage_bytes, + stage_bytes); + const bool ok = src && dst && + ds4_gpu_hc_weighted_sum_tensor(dst, + src, + g->dspark_mean_weights, + DS4_N_EMBD, + DS4_N_HC) != 
0; + ds4_gpu_tensor_free(dst); + ds4_gpu_tensor_free(src); + if (!ok) return false; + } + } + return true; +} + /* Encode a full single-token decode step on Metal. This is the generation * hot path: update caches, run all layers, then produce logits. */ static bool metal_graph_encode_token_raw_swa( @@ -16990,6 +17531,7 @@ static bool metal_graph_encode_token_raw_swa( ds4_gpu_tensor *tmp = g->cur_hc; g->cur_hc = g->after_ffn_hc; g->after_ffn_hc = tmp; + if (ok) ok = metal_graph_capture_dspark_main_hidden(g, il); if (ok && allow_split_flush && split_after_layers != 0 && il + 1u == split_after_layers) { ok = ds4_gpu_flush_commands() != 0; } @@ -17310,6 +17852,7 @@ static bool metal_graph_q_stage_profile_boundary( return ds4_gpu_begin_commands() != 0; } + static bool metal_graph_encode_layer_attention_batch( ds4_gpu_graph *g, const ds4_model *model, @@ -18002,7 +18545,7 @@ static bool metal_graph_encode_layer_attention_batch( } if (ok && emit) g->layer_n_comp[il]++; if (comp_counts) comp_counts[t] = g->layer_n_comp[il]; - if (ok && t == 0) ok = metal_graph_capture_prefix1_attn_state(g, il); + if (ok) ok = metal_graph_capture_prefix_attn_state(g, il, t + 1); ds4_gpu_tensor_free(sc_view); ds4_gpu_tensor_free(kv_view); } @@ -18291,7 +18834,7 @@ static bool metal_graph_encode_layer_attention_batch( } if (ok && emit) g->layer_n_index_comp[il]++; if (index_counts) index_counts[t] = g->layer_n_index_comp[il]; - if (ok && t == 0) ok = metal_graph_capture_prefix1_index_state(g, il); + if (ok) ok = metal_graph_capture_prefix_index_state(g, il, t + 1); ds4_gpu_tensor_free(sc_view); ds4_gpu_tensor_free(kv_view); } @@ -19283,41 +19826,642 @@ static bool metal_graph_encode_layer_batch( return ok; } -static bool metal_graph_eval_token_raw_swa_streaming( - ds4_gpu_graph *g, - const ds4_model *model, - const ds4_weights *weights, - int token, - uint32_t pos, - float *logits) { - if (g->raw_cap == 0) { - fprintf(stderr, "ds4: Metal graph raw KV cache is not allocated\n"); +static bool 
metal_graph_dspark_input_stage( + ds4_gpu_graph *g, + const ds4_model *target_model, + const ds4_weights *target_weights, + const ds4_model *dspark_model, + const ds4_mtp_weights *mtp, + int anchor_token, + uint32_t block_size) { + if (!g || !target_model || !target_weights || !dspark_model || !mtp || + !g->dspark_main_hidden || !g->dspark_main_x || !g->batch_cur_hc || + block_size == 0 || block_size > g->prefill_cap) { return false; } - const bool profile = getenv("DS4_METAL_GRAPH_TOKEN_PROFILE") != NULL; - const bool throttle = graph_power_throttle_enabled(g); - const double t0 = (profile || throttle) ? now_sec() : 0.0; - const uint32_t raw_row = pos % g->raw_cap; - const uint32_t n_raw = metal_graph_raw_span_for_batch(g, pos, 1); + bool ok = ds4_gpu_begin_commands() != 0; + if (ok) ok = ds4_gpu_matmul_q8_0_tensor(g->dspark_main_x, + dspark_model->map, + dspark_model->size, + mtp->main_proj->abs_offset, + 3ull * DS4_N_EMBD, + (uint64_t)DS4_N_EMBD, + g->dspark_main_hidden, + 1) != 0; + if (ok) ok = ds4_gpu_rms_norm_weight_tensor(g->dspark_main_x, + g->dspark_main_x, + dspark_model->map, + dspark_model->size, + mtp->main_norm->abs_offset, + DS4_N_EMBD, + DS4_RMS_EPS) != 0; + if (ds4_gpu_end_commands() == 0) ok = false; + if (!ok) return false; - const bool static_decode_map = metal_graph_stream_decode_static_map_enabled(); - const bool static_map_state_cache = - static_decode_map && metal_graph_stream_decode_static_map_state_cache_enabled(); - const bool batch_static_decode = - static_decode_map && metal_graph_stream_decode_layer_batch_enabled(g); - bool ok = true; - if (static_decode_map) { - if (!static_map_state_cache || !g->streaming_static_decode_map_current) { - ok = metal_graph_stream_map_decode_static_all(model, weights); - if (ok) g->streaming_static_decode_map_current = static_map_state_cache; - } - } else { - g->streaming_static_decode_map_current = false; - ok = metal_graph_stream_map_token(model, weights); - } - if (ok && !static_decode_map && 
DS4_N_LAYER > 0) { - metal_graph_stream_readahead_layer_decode(model, weights, 0); + token_vec draft_ids = {0}; + token_vec_push(&draft_ids, anchor_token); + for (uint32_t i = 1; i < block_size; i++) { + token_vec_push(&draft_ids, (int)mtp->dspark.noise_token_id); + } + + ok = metal_graph_upload_prompt_tokens(g->prefill_tokens, &draft_ids, 0u, block_size); + if (ok) ok = metal_graph_upload_prompt_embeddings_hc(g->batch_cur_hc, + g->prefill_tokens, + target_model, + target_weights, + &draft_ids, + 0u, + block_size); + token_vec_free(&draft_ids); + return ok; +} + +static bool metal_graph_dspark_encode_attention( + ds4_gpu_graph *g, + const ds4_model *dspark_model, + const ds4_layer_weights *layer, + uint32_t stage, + uint32_t start_pos, + uint32_t n_tokens) { + if (!g || !dspark_model || !layer || stage >= DS4_DSPARK_MTP_LAYERS || + n_tokens == 0 || n_tokens > g->prefill_cap || + !g->dspark_kv_cache[stage] || !g->batch_cur_hc || !g->dspark_main_x) { + return false; + } + + const uint64_t hc_dim = (uint64_t)DS4_N_HC * DS4_N_EMBD; + const uint64_t mix_hc = 2ull * DS4_N_HC + (uint64_t)DS4_N_HC * DS4_N_HC; + const uint64_t q_rank = layer->attn_q_a->dim[1]; + const uint64_t q_dim = (uint64_t)DS4_N_HEAD * DS4_N_HEAD_DIM; + const uint32_t n_groups = DS4_N_OUT_GROUP; + const uint32_t group_heads = DS4_N_HEAD / n_groups; + const uint32_t group_dim = DS4_N_HEAD_DIM * group_heads; + const uint32_t rank = DS4_N_LORA_O; + const uint32_t raw_cap = DS4_N_SWA + n_tokens; + uint32_t n_real = g->dspark_n_real; + if (n_real + 1u + n_tokens > raw_cap) n_real = raw_cap - 1u - n_tokens; + + ds4_gpu_tensor *hc_mix_view = ds4_gpu_tensor_view( + g->batch_hc_mix, 0, (uint64_t)n_tokens * mix_hc * sizeof(float)); + ds4_gpu_tensor *hc_split_view = ds4_gpu_tensor_view( + g->batch_hc_split, 0, (uint64_t)n_tokens * mix_hc * sizeof(float)); + ds4_gpu_tensor *attn_cur_view = ds4_gpu_tensor_view( + g->batch_attn_cur, 0, (uint64_t)n_tokens * DS4_N_EMBD * sizeof(float)); + ds4_gpu_tensor 
*after_attn_hc_view = ds4_gpu_tensor_view( + g->batch_after_attn_hc, 0, (uint64_t)n_tokens * hc_dim * sizeof(float)); + bool ok = hc_mix_view && hc_split_view && attn_cur_view && after_attn_hc_view; + + if (ok) ok = ds4_gpu_rms_norm_plain_rows_tensor(g->batch_flat_hc, + g->batch_cur_hc, + (uint32_t)hc_dim, + n_tokens, + DS4_RMS_EPS) != 0; + if (ok) ok = metal_graph_matmul_plain_tensor(hc_mix_view, + dspark_model, + layer->hc_attn_fn, + hc_dim, + mix_hc, + g->batch_flat_hc, + n_tokens); + if (ok) ok = ds4_gpu_hc_split_weighted_sum_tensor(attn_cur_view, + hc_split_view, + hc_mix_view, + g->batch_cur_hc, + dspark_model->map, + dspark_model->size, + layer->hc_attn_scale->abs_offset, + layer->hc_attn_base->abs_offset, + DS4_N_EMBD, + DS4_N_HC, + DS4_N_HC_SINKHORN_ITER, + DS4_HC_EPS) != 0; + if (ok) ok = ds4_gpu_rms_norm_weight_rows_tensor(g->batch_attn_norm, + g->batch_attn_cur, + dspark_model->map, + dspark_model->size, + layer->attn_norm->abs_offset, + DS4_N_EMBD, + n_tokens, + DS4_RMS_EPS) != 0; + + if (ok) ok = ds4_gpu_matmul_q8_0_tensor(g->batch_qr, + dspark_model->map, + dspark_model->size, + layer->attn_q_a->abs_offset, + DS4_N_EMBD, + q_rank, + g->batch_attn_norm, + n_tokens) != 0; + if (ok) ok = ds4_gpu_rms_norm_weight_rows_tensor(g->batch_qr_norm, + g->batch_qr, + dspark_model->map, + dspark_model->size, + layer->attn_q_a_norm->abs_offset, + (uint32_t)q_rank, + n_tokens, + DS4_RMS_EPS) != 0; + if (ok) ok = ds4_gpu_matmul_q8_0_tensor(g->batch_q, + dspark_model->map, + dspark_model->size, + layer->attn_q_b->abs_offset, + q_rank, + q_dim, + g->batch_qr_norm, + n_tokens) != 0; + if (ok) ok = ds4_gpu_head_rms_norm_tensor(g->batch_q, + n_tokens, + DS4_N_HEAD, + DS4_N_HEAD_DIM, + DS4_RMS_EPS) != 0; + if (ok) ok = ds4_gpu_rope_tail_tensor(g->batch_q, + n_tokens, + DS4_N_HEAD, + DS4_N_HEAD_DIM, + DS4_N_ROT, + start_pos + 1u, + 0u, + false, + DS4_ROPE_FREQ_BASE, + 1.0f, + 0.0f, + 1.0f, + DS4_ROPE_YARN_BETA_FAST, + DS4_ROPE_YARN_BETA_SLOW) != 0; + + if (ok) ok = 
ds4_gpu_matmul_q8_0_tensor(g->batch_kv_raw, + dspark_model->map, + dspark_model->size, + layer->attn_kv->abs_offset, + DS4_N_EMBD, + DS4_N_HEAD_DIM, + g->batch_attn_norm, + n_tokens) != 0; + if (ok) ok = ds4_gpu_rms_norm_weight_rows_tensor(g->batch_kv, + g->batch_kv_raw, + dspark_model->map, + dspark_model->size, + layer->attn_kv_a_norm->abs_offset, + DS4_N_HEAD_DIM, + n_tokens, + DS4_RMS_EPS) != 0; + if (ok) ok = ds4_gpu_rope_tail_tensor(g->batch_kv, + n_tokens, + DS4_N_HEAD_KV, + DS4_N_HEAD_DIM, + DS4_N_ROT, + start_pos + 1u, + 0u, + false, + DS4_ROPE_FREQ_BASE, + 1.0f, + 0.0f, + 1.0f, + DS4_ROPE_YARN_BETA_FAST, + DS4_ROPE_YARN_BETA_SLOW) != 0; + if (ok) ok = ds4_gpu_dsv4_fp8_kv_quantize_tensor(g->batch_kv, + n_tokens, + DS4_N_HEAD_DIM, + DS4_N_ROT) != 0; + if (ok) ok = ds4_gpu_store_raw_kv_batch_tensor(g->dspark_kv_cache[stage], + g->batch_kv, + raw_cap, + n_real + 1u, + n_tokens, + DS4_N_HEAD_DIM) != 0; + + if (ok) ok = ds4_gpu_matmul_q8_0_tensor(g->batch_kv_raw, + dspark_model->map, + dspark_model->size, + layer->attn_kv->abs_offset, + DS4_N_EMBD, + DS4_N_HEAD_DIM, + g->dspark_main_x, + 1) != 0; + if (ok) ok = ds4_gpu_rms_norm_weight_rows_tensor(g->batch_kv, + g->batch_kv_raw, + dspark_model->map, + dspark_model->size, + layer->attn_kv_a_norm->abs_offset, + DS4_N_HEAD_DIM, + 1, + DS4_RMS_EPS) != 0; + if (ok) ok = ds4_gpu_rope_tail_tensor(g->batch_kv, + 1, + DS4_N_HEAD_KV, + DS4_N_HEAD_DIM, + DS4_N_ROT, + start_pos, + 0u, + false, + DS4_ROPE_FREQ_BASE, + 1.0f, + 0.0f, + 1.0f, + DS4_ROPE_YARN_BETA_FAST, + DS4_ROPE_YARN_BETA_SLOW) != 0; + if (ok) ok = ds4_gpu_dsv4_fp8_kv_quantize_tensor(g->batch_kv, + 1, + DS4_N_HEAD_DIM, + DS4_N_ROT) != 0; + if (ok) ok = ds4_gpu_store_raw_kv_batch_tensor(g->dspark_kv_cache[stage], + g->batch_kv, + raw_cap, + n_real, + 1, + DS4_N_HEAD_DIM) != 0; + + if (ok) ok = ds4_gpu_attention_decode_raw_batch_heads_noncausal_tensor( + g->batch_heads, + dspark_model->map, + dspark_model->size, + layer->attn_sinks->abs_offset, + g->batch_q, + 
g->dspark_kv_cache[stage], + n_tokens, + n_real + 1u + n_tokens, + raw_cap, + 0u, + DS4_N_HEAD, + DS4_N_HEAD_DIM) != 0; + if (ok) ok = ds4_gpu_rope_tail_tensor(g->batch_heads, + n_tokens, + DS4_N_HEAD, + DS4_N_HEAD_DIM, + DS4_N_ROT, + start_pos + 1u, + 0u, + true, + DS4_ROPE_FREQ_BASE, + 1.0f, + 0.0f, + 1.0f, + DS4_ROPE_YARN_BETA_FAST, + DS4_ROPE_YARN_BETA_SLOW) != 0; + if (ok) ok = ds4_gpu_attention_output_q8_batch_tensor(g->batch_attn_out, + g->batch_attn_low, + g->batch_group_tmp, + g->batch_low_tmp, + dspark_model->map, + dspark_model->size, + layer->attn_output_a->abs_offset, + layer->attn_output_b->abs_offset, + group_dim, + rank, + n_groups, + DS4_N_EMBD, + g->batch_heads, + n_tokens) != 0; + if (ok) ok = ds4_gpu_hc_expand_split_tensor(after_attn_hc_view, + g->batch_attn_out, + g->batch_cur_hc, + hc_split_view, + DS4_N_EMBD, + DS4_N_HC) != 0; + + ds4_gpu_tensor_free(after_attn_hc_view); + ds4_gpu_tensor_free(attn_cur_view); + ds4_gpu_tensor_free(hc_split_view); + ds4_gpu_tensor_free(hc_mix_view); + return ok; +} + +static bool metal_graph_dspark_refresh_main_rows( + ds4_gpu_graph *g, + const ds4_model *dspark_model, + const ds4_mtp_weights *mtp, + ds4_gpu_tensor *main_hidden, + ds4_gpu_tensor *main_x, + uint32_t pos0, + uint32_t row0, + uint32_t n_tokens, + bool keep_last_hidden) { + if (n_tokens == 0) return true; + if (!g || !g->dspark_enabled || !dspark_model || !mtp || !main_hidden || + !main_x || !g->batch_kv_raw || !g->batch_kv || + n_tokens > DS4_DSPARK_MAX_BLOCK_SIZE || + row0 + n_tokens > DS4_N_SWA + mtp->dspark.block_size) { + return false; + } + + bool ok = ds4_gpu_begin_commands() != 0; + if (ok) ok = ds4_gpu_matmul_q8_0_tensor(main_x, + dspark_model->map, + dspark_model->size, + mtp->main_proj->abs_offset, + 3ull * DS4_N_EMBD, + (uint64_t)DS4_N_EMBD, + main_hidden, + n_tokens) != 0; + if (ok) ok = ds4_gpu_rms_norm_weight_rows_tensor(main_x, + main_x, + dspark_model->map, + dspark_model->size, + mtp->main_norm->abs_offset, + DS4_N_EMBD, + 
n_tokens, + DS4_RMS_EPS) != 0; + + for (uint32_t stage = 0; ok && stage < mtp->dspark.n_mtp_layers; stage++) { + const ds4_layer_weights *layer = &mtp->stage[stage]; + ok = ds4_gpu_matmul_q8_0_tensor(g->batch_kv_raw, + dspark_model->map, + dspark_model->size, + layer->attn_kv->abs_offset, + DS4_N_EMBD, + DS4_N_HEAD_DIM, + main_x, + n_tokens) != 0; + if (ok) ok = ds4_gpu_rms_norm_weight_rows_tensor(g->batch_kv, + g->batch_kv_raw, + dspark_model->map, + dspark_model->size, + layer->attn_kv_a_norm->abs_offset, + DS4_N_HEAD_DIM, + n_tokens, + DS4_RMS_EPS) != 0; + if (ok) ok = ds4_gpu_rope_tail_tensor(g->batch_kv, + n_tokens, + DS4_N_HEAD_KV, + DS4_N_HEAD_DIM, + DS4_N_ROT, + pos0, + 0u, + false, + DS4_ROPE_FREQ_BASE, + 1.0f, + 0.0f, + 1.0f, + DS4_ROPE_YARN_BETA_FAST, + DS4_ROPE_YARN_BETA_SLOW) != 0; + if (ok) ok = ds4_gpu_dsv4_fp8_kv_quantize_tensor(g->batch_kv, + n_tokens, + DS4_N_HEAD_DIM, + DS4_N_ROT) != 0; + if (ok) ok = ds4_gpu_store_raw_kv_batch_tensor( + g->dspark_kv_cache[stage], + g->batch_kv, + DS4_N_SWA + mtp->dspark.block_size, + row0, + n_tokens, + DS4_N_HEAD_DIM) != 0; + } + + if (ok && keep_last_hidden && g->dspark_main_hidden) { + const uint64_t stage_bytes = (uint64_t)DS4_N_EMBD * sizeof(float); + const uint64_t hidden_row_bytes = + (uint64_t)DS4_DSPARK_MTP_LAYERS * DS4_N_EMBD * sizeof(float); + const uint64_t src_row = (uint64_t)(n_tokens - 1u) * hidden_row_bytes; + for (uint32_t s = 0; ok && s < DS4_DSPARK_MTP_LAYERS; s++) { + ok = ds4_gpu_tensor_copy(g->dspark_main_hidden, + (uint64_t)s * stage_bytes, + main_hidden, + src_row + (uint64_t)s * stage_bytes, + stage_bytes) != 0; + } + } + + if (ds4_gpu_end_commands() == 0) ok = false; + if (!ok) (void)ds4_gpu_synchronize(); + return ok; +} + +static bool metal_graph_dspark_refresh_verified_rows( + ds4_gpu_graph *g, + const ds4_model *dspark_model, + const ds4_mtp_weights *mtp, + uint32_t row0, + uint32_t pos0, + uint32_t n_tokens) { + return metal_graph_dspark_refresh_main_rows(g, + dspark_model, + mtp, 
+ g ? g->dspark_verify_hidden : NULL, + g ? g->dspark_verify_main_x : NULL, + pos0, + row0, + n_tokens, + true); +} + +static bool metal_graph_dspark_refresh_current_row( + ds4_gpu_graph *g, + const ds4_model *dspark_model, + const ds4_mtp_weights *mtp, + uint32_t row, + uint32_t pos) { + return metal_graph_dspark_refresh_main_rows(g, + dspark_model, + mtp, + g ? g->dspark_main_hidden : NULL, + g ? g->dspark_main_x : NULL, + pos, + row, + 1, + false); +} +static bool metal_graph_encode_output_head_dspark_batch( + ds4_gpu_graph *g, + const ds4_model *target_model, + const ds4_weights *target_weights, + const ds4_model *dspark_model, + const ds4_mtp_weights *mtp, + uint32_t n_tokens) { + if (!g || !target_model || !target_weights || !dspark_model || !mtp || + n_tokens == 0 || n_tokens > g->prefill_cap || !g->spec_logits) { + return false; + } + + const uint64_t hc_dim = (uint64_t)DS4_N_HC * DS4_N_EMBD; + ds4_gpu_tensor *output_pre = ds4_gpu_tensor_view( + g->batch_hc_mix, 0, (uint64_t)n_tokens * DS4_N_HC * sizeof(float)); + ds4_gpu_tensor *output_weights = ds4_gpu_tensor_view( + g->batch_hc_split, 0, (uint64_t)n_tokens * DS4_N_HC * sizeof(float)); + ds4_gpu_tensor *output_embd = ds4_gpu_tensor_view( + g->batch_ffn_cur, 0, (uint64_t)n_tokens * DS4_N_EMBD * sizeof(float)); + ds4_gpu_tensor *output_norm = ds4_gpu_tensor_view( + g->batch_ffn_norm, 0, (uint64_t)n_tokens * DS4_N_EMBD * sizeof(float)); + ds4_gpu_tensor *logits = ds4_gpu_tensor_view( + g->spec_logits, 0, (uint64_t)n_tokens * DS4_N_VOCAB * sizeof(float)); + bool ok = output_pre && output_weights && output_embd && output_norm && logits; + + if (ok) ok = ds4_gpu_rms_norm_plain_rows_tensor(g->batch_flat_hc, + g->batch_cur_hc, + (uint32_t)hc_dim, + n_tokens, + DS4_RMS_EPS) != 0; + if (ok) ok = metal_graph_matmul_plain_tensor(output_pre, + dspark_model, + mtp->hc_head_fn, + hc_dim, + DS4_N_HC, + g->batch_flat_hc, + n_tokens); + if (ok) ok = ds4_gpu_output_hc_weights_tensor(output_weights, + output_pre, + 
dspark_model->map, + dspark_model->size, + mtp->hc_head_scale->abs_offset, + mtp->hc_head_base->abs_offset, + DS4_N_HC, + DS4_HC_EPS) != 0; + if (ok) ok = ds4_gpu_hc_weighted_sum_tensor(output_embd, + g->batch_cur_hc, + output_weights, + DS4_N_EMBD, + DS4_N_HC) != 0; + if (ok) ok = ds4_gpu_rms_norm_weight_rows_tensor(output_norm, + output_embd, + dspark_model->map, + dspark_model->size, + mtp->norm->abs_offset, + DS4_N_EMBD, + n_tokens, + DS4_RMS_EPS) != 0; + if (ok) ok = ds4_gpu_matmul_q8_0_tensor(logits, + target_model->map, + target_model->size, + target_weights->output->abs_offset, + DS4_N_EMBD, + DS4_N_VOCAB, + output_norm, + n_tokens) != 0; + + ds4_gpu_tensor_free(logits); + ds4_gpu_tensor_free(output_norm); + ds4_gpu_tensor_free(output_embd); + ds4_gpu_tensor_free(output_weights); + ds4_gpu_tensor_free(output_pre); + return ok; +} + +static bool metal_graph_eval_dspark_draft_block( + ds4_gpu_graph *g, + const ds4_model *target_model, + const ds4_weights *target_weights, + const ds4_model *dspark_model, + const ds4_mtp_weights *mtp, + int anchor_token, + uint32_t pos, + uint32_t max_tokens, + int *drafts, + int *draft_n, + uint32_t *base_real_out, + float *last_logits) { + if (draft_n) *draft_n = 0; + if (base_real_out) *base_real_out = 0; + if (!g || !target_model || !target_weights || !dspark_model || !mtp || + !drafts || !draft_n || mtp->kind != DS4_MTP_DRAFT_DSPARK) { + return false; + } + + uint32_t block_size = mtp->dspark.block_size; + if (block_size > max_tokens) block_size = max_tokens; + if (block_size > g->prefill_cap) block_size = g->prefill_cap; + if (block_size == 0 || block_size > 16) return true; + if (g->dspark_n_real >= DS4_N_SWA) g->dspark_n_real = 0; + if (base_real_out) *base_real_out = g->dspark_n_real; + + bool ok = metal_graph_dspark_input_stage(g, + target_model, + target_weights, + dspark_model, + mtp, + anchor_token, + block_size); + bool commands_open = false; + if (ok) { + ok = ds4_gpu_begin_commands() != 0; + commands_open = ok; 
+ } + for (uint32_t stage = 0; ok && stage < mtp->dspark.n_mtp_layers; stage++) { + const ds4_layer_weights *layer = &mtp->stage[stage]; + ok = metal_graph_dspark_encode_attention(g, + dspark_model, + layer, + stage, + pos, + block_size); + if (ok) ok = metal_graph_encode_layer_ffn_batch(g, + dspark_model, + layer, + stage, + pos + 1u, + block_size); + if (ok) { + ds4_gpu_tensor *tmp = g->batch_cur_hc; + g->batch_cur_hc = g->batch_next_hc; + g->batch_next_hc = tmp; + } + } + if (ok) ok = metal_graph_encode_output_head_dspark_batch(g, + target_model, + target_weights, + dspark_model, + mtp, + block_size); + if (commands_open && ds4_gpu_end_commands() == 0) ok = false; + if (!ok) { + (void)ds4_gpu_synchronize(); + return false; + } + + const uint64_t row_bytes = (uint64_t)DS4_N_VOCAB * sizeof(float); + float *row_logits = xmalloc((size_t)row_bytes); + for (uint32_t i = 0; ok && i < block_size; i++) { + ok = ds4_gpu_tensor_read(g->spec_logits, + (uint64_t)i * row_bytes, + row_logits, + row_bytes) != 0; + if (!ok) break; + const int prev = i == 0 ? anchor_token : drafts[i - 1u]; + dspark_apply_markov_bias(row_logits, dspark_model, mtp, prev); + drafts[i] = sample_argmax(row_logits, DS4_N_VOCAB); + if (last_logits && i + 1u == block_size) { + memcpy(last_logits, row_logits, (size_t)row_bytes); + } + } + free(row_logits); + if (!ok) return false; + *draft_n = (int)block_size; + return true; +} + +static bool metal_graph_eval_token_raw_swa_streaming( + ds4_gpu_graph *g, + const ds4_model *model, + const ds4_weights *weights, + int token, + uint32_t pos, + float *logits) { + if (g->raw_cap == 0) { + fprintf(stderr, "ds4: Metal graph raw KV cache is not allocated\n"); + return false; + } + + const bool profile = getenv("DS4_METAL_GRAPH_TOKEN_PROFILE") != NULL; + const bool throttle = graph_power_throttle_enabled(g); + const double t0 = (profile || throttle) ? 
now_sec() : 0.0; + const uint32_t raw_row = pos % g->raw_cap; + const uint32_t n_raw = metal_graph_raw_span_for_batch(g, pos, 1); + + const bool static_decode_map = metal_graph_stream_decode_static_map_enabled(); + const bool static_map_state_cache = + static_decode_map && metal_graph_stream_decode_static_map_state_cache_enabled(); + const bool batch_static_decode = + static_decode_map && metal_graph_stream_decode_layer_batch_enabled(g); + bool ok = true; + if (static_decode_map) { + if (!static_map_state_cache || !g->streaming_static_decode_map_current) { + ok = metal_graph_stream_map_decode_static_all(model, weights); + if (ok) g->streaming_static_decode_map_current = static_map_state_cache; + } + } else { + g->streaming_static_decode_map_current = false; + ok = metal_graph_stream_map_token(model, weights); + } + if (ok && !static_decode_map && DS4_N_LAYER > 0) { + metal_graph_stream_readahead_layer_decode(model, weights, 0); } if (ok) ok = ds4_gpu_begin_commands() != 0; if (ok) { @@ -19457,7 +20601,8 @@ static bool metal_graph_eval_token_raw_swa_streaming( return ok; } -/* Execute one Metal decode token and read back logits. */ +/* Execute one Metal decode token and optionally capture the target hidden states + * that DSpark uses as the draft model's cross-token input. */ static bool metal_graph_eval_token_raw_swa( ds4_gpu_graph *g, const ds4_model *model, @@ -19474,7 +20619,8 @@ static bool metal_graph_eval_token_raw_swa( const double t0 = (profile || throttle) ? now_sec() : 0.0; bool ok = ds4_gpu_begin_commands() != 0; - if (ok) ok = metal_graph_encode_token_raw_swa(g, model, weights, token, pos, logits != NULL, true); + if (ok) ok = metal_graph_encode_token_raw_swa(g, model, weights, token, pos, + logits != NULL, true); const double t_encoded = (profile || throttle) ? now_sec() : 0.0; if (ok) ok = ds4_gpu_end_commands() != 0; const double t_done = (profile || throttle) ? 
now_sec() : 0.0; @@ -19502,6 +20648,8 @@ static bool metal_graph_eval_token_raw_swa( return ok; } +/* Execute one Metal decode token and read back logits. */ + static bool metal_graph_streaming_decode_prefill_wide_default( const ds4_weights *weights) { return DS4_MODEL_VARIANT == DS4_VARIANT_FLASH && @@ -20272,6 +21420,7 @@ static bool metal_graph_reset_prefill_state(ds4_gpu_graph *g) { memset(g->layer_n_comp, 0, sizeof(g->layer_n_comp)); memset(g->layer_n_index_comp, 0, sizeof(g->layer_n_index_comp)); g->mtp_n_raw = 0; + g->dspark_n_real = 0; for (uint32_t il = 0; il < DS4_N_LAYER; il++) { const uint32_t ratio = ds4_layer_compress_ratio(il); if (ratio == 0) continue; @@ -21121,7 +22270,7 @@ static bool metal_graph_verify_suffix_tops( const token_vec *prompt, uint32_t start, uint32_t n_tokens, - bool capture_prefix1, + uint32_t capture_prefix_tokens, int *row_tops, float *row_logits) { if (n_tokens == 0 || n_tokens > g->prefill_cap || !g->spec_logits) return false; @@ -21139,8 +22288,8 @@ static bool metal_graph_verify_suffix_tops( n_tokens); if (!ok) return false; - const bool saved_capture = g->spec_capture_prefix1; - g->spec_capture_prefix1 = capture_prefix1 && n_tokens == 2; + const uint32_t saved_capture = g->spec_capture_prefix_tokens; + g->spec_capture_prefix_tokens = capture_prefix_tokens < n_tokens ? 
capture_prefix_tokens : 0; ok = ds4_gpu_begin_commands() != 0; for (uint32_t il = 0; ok && il < DS4_N_LAYER; il++) { @@ -21150,10 +22299,11 @@ static bool metal_graph_verify_suffix_tops( il, start, n_tokens); + if (ok) ok = metal_graph_capture_dspark_batch_main_hidden(g, il, n_tokens); } if (ok) ok = ds4_gpu_end_commands() != 0; else (void)ds4_gpu_synchronize(); - g->spec_capture_prefix1 = saved_capture; + g->spec_capture_prefix_tokens = saved_capture; if (!ok) return false; ok = ds4_gpu_begin_commands() != 0; @@ -21253,8 +22403,8 @@ static bool metal_graph_verify_decode2_exact( ds4_gpu_tensor *saved_cur = g->cur_hc; ds4_gpu_tensor *saved_after = g->after_ffn_hc; - const bool saved_capture = g->spec_capture_prefix1; - g->spec_capture_prefix1 = true; + const uint32_t saved_capture = g->spec_capture_prefix_tokens; + g->spec_capture_prefix_tokens = 1; if (ok) ok = ds4_gpu_begin_commands() != 0; for (uint32_t il = 0; ok && il < DS4_N_LAYER; il++) { const uint32_t pos0 = start; @@ -21273,8 +22423,8 @@ static bool metal_graph_verify_decode2_exact( metal_graph_raw_span_for_batch(g, pos0, 1), token0); if (!ok) break; - ok = metal_graph_capture_prefix1_attn_state(g, il) && - metal_graph_capture_prefix1_index_state(g, il); + ok = metal_graph_capture_prefix_attn_state(g, il, 1) && + metal_graph_capture_prefix_index_state(g, il, 1); if (!ok) break; g->cur_hc = cur1; @@ -21296,7 +22446,7 @@ static bool metal_graph_verify_decode2_exact( } if (ok) ok = ds4_gpu_end_commands() != 0; else (void)ds4_gpu_synchronize(); - g->spec_capture_prefix1 = saved_capture; + g->spec_capture_prefix_tokens = saved_capture; g->cur_hc = saved_cur; g->after_ffn_hc = saved_after; @@ -21335,7 +22485,7 @@ static bool metal_graph_verify_decode2_exact( } g->cur_hc = saved_cur; g->after_ffn_hc = saved_after; - g->spec_capture_prefix1 = saved_capture; + g->spec_capture_prefix_tokens = saved_capture; ds4_gpu_tensor_free(next1); ds4_gpu_tensor_free(next0); @@ -21510,7 +22660,8 @@ static int 
metal_graph_prompt_logits_test( ds4_gpu_graph g; bool ok = metal_graph_alloc_raw_cap(&g, weights, &weights->layer[0], - raw_cap, (uint32_t)ctx_size, (uint32_t)n_test, false); + NULL, raw_cap, (uint32_t)ctx_size, + (uint32_t)n_test, false); if (!ok) { metal_graph_free(&g); fprintf(stderr, "ds4: failed to initialize Metal graph prompt test runtime\n"); @@ -22956,7 +24107,8 @@ static int generate_metal_graph_raw_swa( } ds4_gpu_graph g; bool ok = metal_graph_alloc_raw_cap(&g, weights, &weights->layer[0], - raw_cap, (uint32_t)ctx_size, prefill_cap, false); + NULL, raw_cap, (uint32_t)ctx_size, + prefill_cap, false); if (!ok) { fprintf(stderr, "ds4: failed to allocate GPU graph runtime\n"); return 1; @@ -23268,6 +24420,9 @@ struct ds4_session { float *logits; float *mtp_logits; int mtp_draft_token; + int dspark_draft_tokens[16]; + int dspark_draft_count; + uint32_t dspark_draft_base_real; uint64_t mtp_probe_total; uint64_t mtp_probe_hit; ds4_session_progress_fn progress; @@ -23280,6 +24435,9 @@ struct ds4_session { int ctx_size; bool checkpoint_valid; bool mtp_draft_valid; + bool directional_steering_override; + float directional_steering_attn_scale; + float directional_steering_ffn_scale; }; /* ========================================================================= @@ -23577,6 +24735,69 @@ static bool ds4_session_is_cpu(const ds4_session *s) { return s && s->engine && s->engine->backend == DS4_BACKEND_CPU; } +static void ds4_session_directional_steering_scales(const ds4_session *s, + float *attn, + float *ffn) { + float a = 0.0f; + float f = 0.0f; + if (s && s->engine) { + if (s->directional_steering_override) { + a = s->directional_steering_attn_scale; + f = s->directional_steering_ffn_scale; + } else { + a = s->engine->directional_steering_attn_scale; + f = s->engine->directional_steering_ffn_scale; + } + } + if (attn) *attn = a; + if (ffn) *ffn = f; +} + +static void ds4_session_apply_directional_steering_to_backend(ds4_session *s) { + if (!s) return; +#ifndef 
DS4_NO_GPU + if (!ds4_session_is_cpu(s)) { + float attn = 0.0f; + float ffn = 0.0f; + ds4_session_directional_steering_scales(s, &attn, &ffn); + s->graph.directional_steering_attn_scale = attn; + s->graph.directional_steering_ffn_scale = ffn; + } +#else + (void)s; +#endif +} + +static void ds4_session_set_directional_steering_state(ds4_session *s, + bool override, + float attn, + float ffn) { + if (!s) return; + float old_attn = 0.0f; + float old_ffn = 0.0f; + ds4_session_directional_steering_scales(s, &old_attn, &old_ffn); + + s->directional_steering_override = override; + s->directional_steering_attn_scale = attn; + s->directional_steering_ffn_scale = ffn; + + float new_attn = 0.0f; + float new_ffn = 0.0f; + ds4_session_directional_steering_scales(s, &new_attn, &new_ffn); + if (old_attn != new_attn || old_ffn != new_ffn) { + s->mtp_draft_valid = false; + } + ds4_session_apply_directional_steering_to_backend(s); +} + +void ds4_session_set_directional_steering(ds4_session *s, float attn, float ffn) { + ds4_session_set_directional_steering_state(s, true, attn, ffn); +} + +void ds4_session_use_engine_directional_steering(ds4_session *s) { + ds4_session_set_directional_steering_state(s, false, 0.0f, 0.0f); +} + static uint32_t session_cpu_raw_live_rows(const ds4_session *s) { if (!s || !s->checkpoint_valid) return 0; uint32_t rows = ds4_default_raw_cap((uint32_t)s->ctx_size); @@ -24040,12 +25261,18 @@ bool ds4_engine_has_output_head(ds4_engine *e) { return e && weights_have_output_head(&e->weights); } +ds4_mtp_draft_kind ds4_engine_mtp_draft_kind(ds4_engine *e) { + return (e && e->mtp_ready) ? e->mtp_weights.kind : DS4_MTP_DRAFT_NONE; +} + bool ds4_engine_has_mtp(ds4_engine *e) { return e && e->backend != DS4_BACKEND_CPU && e->distributed.role == DS4_DISTRIBUTED_NONE && - e->mtp_ready; + e->mtp_ready && + ds4_mtp_draft_runtime_supported(e->backend, e->mtp_weights.kind); } + int ds4_engine_mtp_draft_tokens(ds4_engine *e) { return ds4_engine_has_mtp(e) ? 
e->mtp_draft_tokens : 0; } @@ -24126,34 +25353,38 @@ static bool spec_frontier_restore(ds4_spec_frontier *f, ds4_session *s) { return ok; } -/* Commit the prefix-1 state captured by the N=2 speculative verifier. +/* Commit a captured speculative-prefix frontier. * - * The verifier has already advanced every layer through both draft tokens. On - * a one-token accept the append-only compressed caches can keep the second - * speculative row as invisible garbage, but the compressor frontiers and row - * counters must be rewound to the exact state after draft[0]. This is the + * The verifier has already advanced every layer through the speculative block. + * On partial accept, append-only compressed caches can keep later speculative + * rows as invisible garbage, but compressor frontiers and row counters must be + * rewound to the exact state after the last accepted draft token. This is the * cheap partial-accept path: copy a few small per-layer frontiers instead of - * restoring the whole prefix and replaying a one-token target decode. */ -static bool spec_frontier_commit_prefix1(ds4_session *s) { + * restoring the whole prefix and replaying accepted target decodes. 
*/ +static bool spec_frontier_commit_prefix(ds4_session *s, int accepted, int draft_n) { ds4_gpu_graph *g = &s->graph; + const int slot = ds4_dspark_prefix_slot_for_accept(accepted, draft_n); + if (slot < 0 || (uint32_t)slot >= g->spec_prefix_slots) return false; bool ok = ds4_gpu_begin_commands() != 0; for (uint32_t il = 0; ok && il < DS4_N_LAYER; il++) { const uint32_t ratio = ds4_layer_compress_ratio(il); if (ratio == 0) continue; - g->layer_n_comp[il] = g->spec_prefix1_n_comp[il]; + g->layer_n_comp[il] = g->spec_prefix_n_comp[slot][il]; const uint64_t ab = ds4_gpu_tensor_bytes(g->layer_attn_state_kv[il]); + const uint64_t ao = (uint64_t)slot * ab; ok = ds4_gpu_tensor_copy(g->layer_attn_state_kv[il], 0, - g->spec_prefix1_attn_state_kv[il], 0, ab) != 0 && + g->spec_prefix_attn_state_kv[il], ao, ab) != 0 && ds4_gpu_tensor_copy(g->layer_attn_state_score[il], 0, - g->spec_prefix1_attn_state_score[il], 0, ab) != 0; + g->spec_prefix_attn_state_score[il], ao, ab) != 0; if (ok && ratio == 4) { - g->layer_n_index_comp[il] = g->spec_prefix1_n_index_comp[il]; + g->layer_n_index_comp[il] = g->spec_prefix_n_index_comp[slot][il]; const uint64_t ib = ds4_gpu_tensor_bytes(g->layer_index_state_kv[il]); + const uint64_t io = (uint64_t)slot * ib; ok = ds4_gpu_tensor_copy(g->layer_index_state_kv[il], 0, - g->spec_prefix1_index_state_kv[il], 0, ib) != 0 && + g->spec_prefix_index_state_kv[il], io, ib) != 0 && ds4_gpu_tensor_copy(g->layer_index_state_score[il], 0, - g->spec_prefix1_index_state_score[il], 0, ib) != 0; + g->spec_prefix_index_state_score[il], io, ib) != 0; } } if (ok) ok = ds4_gpu_end_commands() != 0; @@ -24986,27 +26217,395 @@ static char *imatrix_trim_block(char *p, char *end) { *end = '\0'; return p; } -#endif -int ds4_engine_collect_imatrix(ds4_engine *e, - const char *dataset_path, - const char *output_path, - int ctx_size, - int max_prompts, - int max_tokens) { -#ifdef DS4_NO_GPU - (void)e; - (void)dataset_path; - (void)output_path; - (void)ctx_size; - 
(void)max_prompts; - (void)max_tokens; - fprintf(stderr, "ds4: imatrix collection requires a graph backend build\n"); - return 1; -#else - if (!e || !dataset_path || !output_path) return 1; - if (e->backend != DS4_BACKEND_METAL || !e->metal_ready) { - fprintf(stderr, "ds4: imatrix collection currently requires --metal\n"); +static bool dspark_target_cache_join_path(char *dst, size_t dst_size, const char *dir, const char *name) { + if (!dst || dst_size == 0 || !dir || !name) return false; + const int n = snprintf(dst, dst_size, "%s/%s", dir, name); + return n > 0 && (size_t)n < dst_size; +} + +static bool dspark_target_cache_output_dir_prepare(const char *path) { + struct stat st; + if (stat(path, &st) == 0) { + if (!S_ISDIR(st.st_mode)) { + fprintf(stderr, "ds4: DSpark target cache output path is not a directory: %s\n", path); + return false; + } + DIR *dir = opendir(path); + if (!dir) { + fprintf(stderr, "ds4: failed to inspect DSpark target cache output dir %s: %s\n", + path, strerror(errno)); + return false; + } + bool empty = true; + struct dirent *ent = NULL; + while ((ent = readdir(dir)) != NULL) { + if (strcmp(ent->d_name, ".") && strcmp(ent->d_name, "..")) { + empty = false; + break; + } + } + closedir(dir); + if (!empty) { + fprintf(stderr, "ds4: DSpark target cache output dir is not empty: %s\n", path); + return false; + } + return true; + } + if (errno != ENOENT) { + fprintf(stderr, "ds4: failed to stat DSpark target cache output dir %s: %s\n", + path, strerror(errno)); + return false; + } + if (mkdir(path, 0777) != 0) { + fprintf(stderr, "ds4: failed to create DSpark target cache output dir %s: %s\n", + path, strerror(errno)); + return false; + } + return true; +} + +static bool dspark_target_cache_file_pos(FILE *fp, uint64_t *out) { + if (!fp || !out) return false; + off_t pos = ftello(fp); + if (pos < 0) return false; + *out = (uint64_t)pos; + return true; +} + +static bool dspark_target_cache_write_all(FILE *fp, const void *ptr, size_t bytes, const 
char *what) { + if (bytes == 0) return true; + if (fwrite(ptr, 1, bytes, fp) != bytes) { + fprintf(stderr, "ds4: failed to write DSpark target cache %s: %s\n", + what ? what : "payload", strerror(errno)); + return false; + } + return true; +} + +static void dspark_target_cache_store_le32(uint8_t *p, uint32_t v) { + p[0] = (uint8_t)(v & 0xffu); + p[1] = (uint8_t)((v >> 8) & 0xffu); + p[2] = (uint8_t)((v >> 16) & 0xffu); + p[3] = (uint8_t)((v >> 24) & 0xffu); +} + +static void dspark_target_cache_store_le64(uint8_t *p, uint64_t v) { + for (uint32_t i = 0; i < 8; i++) p[i] = (uint8_t)((v >> (8u * i)) & 0xffu); +} + +static bool dspark_target_cache_write_index_record(FILE *fp, + uint64_t sample_id, + uint32_t shard_id, + uint32_t seq_len, + uint64_t input_ids_offset, + uint64_t attention_mask_offset, + uint64_t loss_mask_offset, + uint64_t target_hidden_states_offset, + uint64_t target_last_hidden_states_offset) { + uint8_t rec[56]; + dspark_target_cache_store_le64(rec + 0, sample_id); + dspark_target_cache_store_le32(rec + 8, shard_id); + dspark_target_cache_store_le32(rec + 12, seq_len); + dspark_target_cache_store_le64(rec + 16, input_ids_offset); + dspark_target_cache_store_le64(rec + 24, attention_mask_offset); + dspark_target_cache_store_le64(rec + 32, loss_mask_offset); + dspark_target_cache_store_le64(rec + 40, target_hidden_states_offset); + dspark_target_cache_store_le64(rec + 48, target_last_hidden_states_offset); + return dspark_target_cache_write_all(fp, rec, sizeof(rec), "samples.idx record"); +} + +static bool dspark_target_cache_write_json_string(FILE *fp, const char *s) { + if (fputc('"', fp) == EOF) return false; + for (const unsigned char *p = (const unsigned char *)(s ? 
s : ""); *p; p++) { + switch (*p) { + case '\\': + case '"': + if (fprintf(fp, "\\%c", *p) < 0) return false; + break; + case '\n': + if (fputs("\\n", fp) == EOF) return false; + break; + case '\r': + if (fputs("\\r", fp) == EOF) return false; + break; + case '\t': + if (fputs("\\t", fp) == EOF) return false; + break; + default: + if (*p < 0x20) { + if (fprintf(fp, "\\u%04x", (unsigned)*p) < 0) return false; + } else if (fputc((int)*p, fp) == EOF) { + return false; + } + break; + } + } + return fputc('"', fp) != EOF; +} + +static const char *dspark_target_cache_quant_family(const ds4_weights *weights) { + if (!weights || DS4_N_LAYER == 0) return "unknown"; + const ds4_layer_weights *layer = &weights->layer[0]; + if (!layer->ffn_gate_exps || !layer->ffn_up_exps || !layer->ffn_down_exps) return "unknown"; + if (layer->ffn_gate_exps->type == DS4_TENSOR_Q4_K && + layer->ffn_up_exps->type == DS4_TENSOR_Q4_K && + layer->ffn_down_exps->type == DS4_TENSOR_Q4_K) { + return "q4_k_routed_experts"; + } + if (layer->ffn_gate_exps->type == DS4_TENSOR_IQ2_XXS && + layer->ffn_up_exps->type == DS4_TENSOR_IQ2_XXS && + layer->ffn_down_exps->type == DS4_TENSOR_Q2_K) { + return "iq2_xxs_gate_up_q2_k_down_routed_experts"; + } + return "mixed_routed_experts"; +} + +static bool dspark_target_cache_write_tensor_type_counts(FILE *fp, const ds4_model *model) { + uint64_t counts[32] = {0}; + uint64_t unknown = 0; + if (model) { + for (uint64_t i = 0; i < model->n_tensors; i++) { + uint32_t type = model->tensors[i].type; + if (type < (uint32_t)(sizeof(counts) / sizeof(counts[0]))) { + counts[type]++; + } else { + unknown++; + } + } + } + if (fprintf(fp, "{") < 0) return false; + bool first = true; + for (uint32_t type = 0; type < (uint32_t)(sizeof(counts) / sizeof(counts[0])); type++) { + if (!counts[type]) continue; + if (!first && fprintf(fp, ", ") < 0) return false; + first = false; + if (fprintf(fp, "\"%s\": %llu", + tensor_type_name(type), + (unsigned long long)counts[type]) < 0) { + 
return false; + } + } + if (unknown) { + if (!first && fprintf(fp, ", ") < 0) return false; + if (fprintf(fp, "\"unknown\": %llu", (unsigned long long)unknown) < 0) return false; + } + return fprintf(fp, "}") >= 0; +} + +static bool dspark_target_cache_write_manifest(const char *output_dir, + const char *dataset_path, + const char *target_model_name_or_path, + const char *chat_template, + const ds4_model *model, + const ds4_weights *weights, + const ds4_dspark_config *cfg, + uint64_t num_samples, + uint64_t num_tokens) { + char path[PATH_MAX]; + if (!dspark_target_cache_join_path(path, sizeof(path), output_dir, "manifest.json")) { + fprintf(stderr, "ds4: DSpark target cache manifest path is too long\n"); + return false; + } + FILE *fp = fopen(path, "wb"); + if (!fp) { + fprintf(stderr, "ds4: failed to create DSpark target cache manifest %s: %s\n", + path, strerror(errno)); + return false; + } + const char *source_gguf_path = (model && model->path && model->path[0]) ? model->path : DS4_MODEL_SHAPE_NAME; + const char *target_model = target_model_name_or_path; + const char *template_name = (chat_template && chat_template[0]) ? 
+ chat_template : + "ds4_tokenize_rendered_chat"; + bool ok = true; + ok = ok && fprintf(fp, "{\n") >= 0; + ok = ok && fprintf(fp, " \"version\": 2,\n") >= 0; + ok = ok && fprintf(fp, " \"format\": \"deepspec-target-cache\",\n") >= 0; + ok = ok && fprintf(fp, " \"producer\": \"ds4\",\n") >= 0; + ok = ok && fprintf(fp, " \"producer_commit\": ") >= 0; + ok = ok && dspark_target_cache_write_json_string(fp, DS4_GIT_COMMIT); + ok = ok && fprintf(fp, ",\n \"source_dataset_path\": ") >= 0; + ok = ok && dspark_target_cache_write_json_string(fp, dataset_path); + ok = ok && fprintf(fp, ",\n \"source_gguf_path\": ") >= 0; + ok = ok && dspark_target_cache_write_json_string(fp, source_gguf_path); + ok = ok && fprintf(fp, ",\n \"target_model_name_or_path\": ") >= 0; + ok = ok && dspark_target_cache_write_json_string(fp, target_model); + ok = ok && fprintf(fp, ",\n \"model_shape\": ") >= 0; + ok = ok && dspark_target_cache_write_json_string(fp, DS4_MODEL_SHAPE_NAME); + ok = ok && fprintf(fp, ",\n \"quantization_family\": ") >= 0; + ok = ok && dspark_target_cache_write_json_string(fp, dspark_target_cache_quant_family(weights)); + ok = ok && fprintf(fp, ",\n \"num_samples\": %llu,\n", (unsigned long long)num_samples) >= 0; + ok = ok && fprintf(fp, " \"num_tokens\": %llu,\n", (unsigned long long)num_tokens) >= 0; + ok = ok && fprintf(fp, " \"num_shards\": %u,\n", num_samples ? 
1u : 0u) >= 0; + ok = ok && fprintf(fp, " \"target_layer_ids\": [%u, %u, %u],\n", + cfg->target_layer_ids[0], + cfg->target_layer_ids[1], + cfg->target_layer_ids[2]) >= 0; + ok = ok && fprintf(fp, " \"hidden_size\": %u,\n", DS4_N_EMBD) >= 0; + ok = ok && fprintf(fp, " \"target_hidden_size\": %u,\n", DS4_N_EMBD) >= 0; + ok = ok && fprintf(fp, " \"target_hidden_layers\": %u,\n", cfg->n_mtp_layers) >= 0; + ok = ok && fprintf(fp, " \"hidden_dtype\": \"bfloat16\",\n") >= 0; + ok = ok && fprintf(fp, " \"token_dtype\": \"int32\",\n") >= 0; + ok = ok && fprintf(fp, " \"mask_dtype\": \"uint8\",\n") >= 0; + ok = ok && fprintf(fp, " \"index_record_size\": 56,\n") >= 0; + ok = ok && fprintf(fp, " \"input_convention\": {\n") >= 0; + ok = ok && fprintf(fp, " \"tokenization\": \"ds4_tokenize_rendered_chat\",\n") >= 0; + ok = ok && fprintf(fp, " \"chat_template\": ") >= 0; + ok = ok && dspark_target_cache_write_json_string(fp, template_name); + ok = ok && fprintf(fp, ",\n \"sample_split_marker\": \"===== DS4_IMATRIX_PROMPT\",\n") >= 0; + ok = ok && fprintf(fp, " \"loss_mask\": \"1 for every exported prompt token\"\n") >= 0; + ok = ok && fprintf(fp, " },\n") >= 0; + ok = ok && fprintf(fp, " \"hidden_convention\": {\n") >= 0; + ok = ok && fprintf(fp, " \"target_hidden_states\": \"bfloat16 mean over DS4 HC heads after each target layer; row-major [seq_len, target_hidden_layers, hidden_size]\",\n") >= 0; + ok = ok && fprintf(fp, " \"target_last_hidden_states\": \"bfloat16 output-HC projection plus final RMSNorm; row-major [seq_len, hidden_size]\"\n") >= 0; + ok = ok && fprintf(fp, " },\n") >= 0; + ok = ok && fprintf(fp, " \"gguf_tensor_type_counts\": ") >= 0; + ok = ok && dspark_target_cache_write_tensor_type_counts(fp, model); + ok = ok && fprintf(fp, ",\n \"shards\": [") >= 0; + if (num_samples) { + ok = ok && fprintf(fp, "\n {\n \"file_name\": \"shard-00000.bin\",\n \"shard_id\": 0\n }\n ") >= 0; + } + ok = ok && fprintf(fp, "]\n}\n") >= 0; + if (fclose(fp) != 0) ok = false; + if 
(!ok) fprintf(stderr, "ds4: failed to write DSpark target cache manifest %s\n", path); + return ok; +} + +static uint32_t dspark_target_cache_layer_slot(const ds4_dspark_config *cfg, uint32_t layer_id) { + for (uint32_t i = 0; i < cfg->n_mtp_layers && i < 3; i++) { + if (cfg->target_layer_ids[i] == layer_id) return i; + } + return UINT32_MAX; +} + +static void dspark_target_cache_hc_mean_bf16(uint16_t *out, + const float *hc_rows, + uint32_t rows, + uint32_t slot, + uint32_t n_slots) { + const float inv_hc = 1.0f / (float)DS4_N_HC; + const uint64_t hc_dim = (uint64_t)DS4_N_HC * DS4_N_EMBD; + for (uint32_t row = 0; row < rows; row++) { + const float *hc = hc_rows + (uint64_t)row * hc_dim; + uint16_t *dst = out + ((uint64_t)row * n_slots + slot) * DS4_N_EMBD; + for (uint32_t d = 0; d < DS4_N_EMBD; d++) { + float sum = 0.0f; + for (uint32_t h = 0; h < DS4_N_HC; h++) { + sum += hc[(uint64_t)h * DS4_N_EMBD + d]; + } + dst[d] = f32_to_bf16(sum * inv_hc); + } + } +} + +static void dspark_target_cache_last_hidden_bf16(uint16_t *out, + const ds4_model *model, + const ds4_weights *weights, + const float *hc_rows, + uint32_t rows) { + const uint64_t hc_dim = (uint64_t)DS4_N_HC * DS4_N_EMBD; + float *embd = xmalloc((size_t)DS4_N_EMBD * sizeof(embd[0])); + float *norm = xmalloc((size_t)DS4_N_EMBD * sizeof(norm[0])); + const float *norm_weight = tensor_data(model, weights->output_norm); + for (uint32_t row = 0; row < rows; row++) { + const float *hc = hc_rows + (uint64_t)row * hc_dim; + output_hc_head_one(embd, model, weights, hc); + rms_norm_weight(norm, embd, norm_weight, DS4_N_EMBD, DS4_RMS_EPS); + uint16_t *dst = out + (uint64_t)row * DS4_N_EMBD; + for (uint32_t d = 0; d < DS4_N_EMBD; d++) dst[d] = f32_to_bf16(norm[d]); + } + free(norm); + free(embd); +} + +static bool dspark_target_cache_encode_chunk(ds4_gpu_graph *g, + const ds4_model *model, + const ds4_weights *weights, + const ds4_dspark_config *cfg, + const token_vec *prompt, + uint32_t pos0, + uint32_t n_tokens, + 
float *hc_rows, + uint16_t *target_chunk, + uint16_t *last_chunk) { + const uint64_t hc_dim = (uint64_t)DS4_N_HC * DS4_N_EMBD; + bool ok = metal_graph_upload_prompt_tokens(g->prefill_tokens, prompt, pos0, n_tokens); + if (ok) ok = metal_graph_upload_prompt_embeddings_hc(g->batch_cur_hc, + g->prefill_tokens, + model, + weights, + prompt, + pos0, + n_tokens); + for (uint32_t il = 0; ok && il < DS4_N_LAYER; il++) { + ok = ds4_gpu_begin_commands() != 0; + if (ok) { + ok = metal_graph_encode_layer_batch(g, + model, + &weights->layer[il], + il, + pos0, + n_tokens); + } + if (ok) ok = ds4_gpu_end_commands() != 0; + if (!ok) { + fprintf(stderr, "ds4: DSpark target cache layer %u encode failed\n", il); + return false; + } + const uint32_t slot = dspark_target_cache_layer_slot(cfg, il); + if (slot != UINT32_MAX) { + if (ds4_gpu_tensor_read(g->batch_cur_hc, + 0, + hc_rows, + (uint64_t)n_tokens * hc_dim * sizeof(float)) == 0) { + fprintf(stderr, "ds4: failed to read DSpark target layer %u hidden states\n", il); + return false; + } + dspark_target_cache_hc_mean_bf16(target_chunk, + hc_rows, + n_tokens, + slot, + cfg->n_mtp_layers); + } + } + if (ok && ds4_gpu_tensor_read(g->batch_cur_hc, + 0, + hc_rows, + (uint64_t)n_tokens * hc_dim * sizeof(float)) == 0) { + fprintf(stderr, "ds4: failed to read DSpark target final hidden states\n"); + ok = false; + } + if (ok) { + dspark_target_cache_last_hidden_bf16(last_chunk, + model, + weights, + hc_rows, + n_tokens); + } + return ok; +} +#endif + +int ds4_engine_collect_imatrix(ds4_engine *e, + const char *dataset_path, + const char *output_path, + int ctx_size, + int max_prompts, + int max_tokens) { +#ifdef DS4_NO_GPU + (void)e; + (void)dataset_path; + (void)output_path; + (void)ctx_size; + (void)max_prompts; + (void)max_tokens; + fprintf(stderr, "ds4: imatrix collection requires a graph backend build\n"); + return 1; +#else + if (!e || !dataset_path || !output_path) return 1; + if (e->backend != DS4_BACKEND_METAL || !e->metal_ready) { + 
fprintf(stderr, "ds4: imatrix collection currently requires --metal\n"); return 1; } if (ctx_size <= 0) ctx_size = 32768; @@ -25023,7 +26622,8 @@ int ds4_engine_collect_imatrix(ds4_engine *e, ds4_gpu_graph g; bool ok = metal_graph_alloc_raw_cap(&g, weights, &weights->layer[0], - raw_cap, (uint32_t)ctx_size, prefill_cap, false); + NULL, raw_cap, (uint32_t)ctx_size, + prefill_cap, false); if (!ok) { fprintf(stderr, "ds4: failed to allocate imatrix Metal graph runtime\n"); free(dataset); @@ -25140,6 +26740,315 @@ int ds4_engine_collect_imatrix(ds4_engine *e, #endif } +int ds4_engine_collect_dspark_target_cache(ds4_engine *e, + const char *dataset_path, + const char *output_dir, + const char *target_model_name_or_path, + const char *chat_template, + int ctx_size, + int max_prompts, + int max_tokens) { +#ifdef DS4_NO_GPU + (void)e; + (void)dataset_path; + (void)output_dir; + (void)target_model_name_or_path; + (void)chat_template; + (void)ctx_size; + (void)max_prompts; + (void)max_tokens; + fprintf(stderr, "ds4: DSpark target cache export requires a graph backend build\n"); + return 1; +#else + if (!e || !dataset_path || !output_dir) return 1; + if (!target_model_name_or_path || !target_model_name_or_path[0]) { + fprintf(stderr, + "ds4: DSpark target cache export requires --dspark-target-cache-target-model\n"); + return 1; + } + if (e->backend != DS4_BACKEND_METAL || !e->metal_ready) { + fprintf(stderr, "ds4: DSpark target cache export currently requires --metal\n"); + return 1; + } + if (e->ssd_streaming) { + fprintf(stderr, "ds4: DSpark target cache export requires non-streaming Metal weights\n"); + return 1; + } + if (ctx_size <= 0) ctx_size = 32768; + + ds4_dspark_config cfg; + ds4_dspark_config_init_defaults(&cfg); + if (cfg.n_mtp_layers == 0 || cfg.n_mtp_layers > 3) { + fprintf(stderr, "ds4: unsupported DSpark target layer count %u\n", cfg.n_mtp_layers); + return 1; + } + for (uint32_t i = 0; i < cfg.n_mtp_layers; i++) { + if (cfg.target_layer_ids[i] >= 
DS4_N_LAYER) { + fprintf(stderr, + "ds4: DSpark target layer %u is outside the loaded %u-layer model\n", + cfg.target_layer_ids[i], + DS4_N_LAYER); + return 1; + } + for (uint32_t j = i + 1; j < cfg.n_mtp_layers; j++) { + if (cfg.target_layer_ids[i] == cfg.target_layer_ids[j]) { + fprintf(stderr, "ds4: duplicate DSpark target layer %u\n", cfg.target_layer_ids[i]); + return 1; + } + } + } + + char *dataset = NULL; + size_t dataset_len = 0; + if (!imatrix_read_text_file(dataset_path, &dataset, &dataset_len)) return 1; + if (!dspark_target_cache_output_dir_prepare(output_dir)) { + free(dataset); + return 1; + } + + char shard_path[PATH_MAX]; + char index_path[PATH_MAX]; + if (!dspark_target_cache_join_path(shard_path, sizeof(shard_path), output_dir, "shard-00000.bin") || + !dspark_target_cache_join_path(index_path, sizeof(index_path), output_dir, "samples.idx")) { + fprintf(stderr, "ds4: DSpark target cache output path is too long\n"); + free(dataset); + return 1; + } + + FILE *shard = fopen(shard_path, "wb"); + if (!shard) { + fprintf(stderr, "ds4: failed to create DSpark target cache shard %s: %s\n", + shard_path, strerror(errno)); + free(dataset); + return 1; + } + FILE *index = fopen(index_path, "wb"); + if (!index) { + fprintf(stderr, "ds4: failed to create DSpark target cache index %s: %s\n", + index_path, strerror(errno)); + fclose(shard); + free(dataset); + return 1; + } + + const ds4_model *model = &e->model; + const ds4_weights *weights = &e->weights; + const uint32_t prefill_cap = + metal_graph_prefill_cap_for_prompt(ctx_size, e->prefill_chunk); + const uint32_t raw_cap = metal_graph_raw_cap_for_context(ctx_size, prefill_cap); + + ds4_gpu_graph g; + bool ok = metal_graph_alloc_raw_cap(&g, weights, &weights->layer[0], + NULL, raw_cap, (uint32_t)ctx_size, + prefill_cap, false); + if (!ok) { + fprintf(stderr, "ds4: failed to allocate DSpark target cache Metal graph runtime\n"); + fclose(index); + fclose(shard); + free(dataset); + return 1; + } + g.quality = 
e->quality; + g.ssd_streaming = false; + g.ssd_streaming_cold = false; + g.streaming_preload_experts = 0; + g.power_percent = (uint32_t)e->power_percent; + + const uint64_t hc_dim = (uint64_t)DS4_N_HC * DS4_N_EMBD; + float *hc_rows = xmalloc((size_t)prefill_cap * (size_t)hc_dim * sizeof(hc_rows[0])); + uint16_t *target_chunk = xmalloc((size_t)prefill_cap * + (size_t)cfg.n_mtp_layers * + (size_t)DS4_N_EMBD * + sizeof(target_chunk[0])); + uint16_t *last_chunk = xmalloc((size_t)prefill_cap * + (size_t)DS4_N_EMBD * + sizeof(last_chunk[0])); + + fprintf(stderr, + "ds4: exporting DeepSpec DSpark target cache from %s (model=%s, target_layers=[%u,%u,%u], ctx=%d, chunk=%u)\n", + dataset_path, + DS4_MODEL_SHAPE_NAME, + cfg.target_layer_ids[0], + cfg.target_layer_ids[1], + cfg.target_layer_ids[2], + ctx_size, + prefill_cap); + + int prompts_done = 0; + int tokens_done = 0; + char *cursor = dataset; + const char *marker_lit = "===== DS4_IMATRIX_PROMPT"; + while (ok && *cursor) { + if (max_prompts > 0 && prompts_done >= max_prompts) break; + if (max_tokens > 0 && tokens_done >= max_tokens) break; + + char *start = cursor; + char *marker = strstr(cursor, marker_lit); + if (marker) { + char *nl = strchr(marker, '\n'); + if (!nl) break; + start = nl + 1; + } else if (prompts_done != 0) { + break; + } + + char *next = strstr(start, marker_lit); + char *end = next ? 
next : dataset + dataset_len; + char saved = *end; + char *prompt_text = imatrix_trim_block(start, end); + if (prompt_text[0] != '\0') { + token_vec prompt = {0}; + ds4_tokenize_rendered_chat(e, prompt_text, &prompt); + if (prompt.len > ctx_size) prompt.len = ctx_size; + if (max_tokens > 0 && prompt.len > max_tokens - tokens_done) { + prompt.len = max_tokens - tokens_done; + } + if (prompt.len > 0) { + uint16_t *last_full = xmalloc((size_t)prompt.len * + (size_t)DS4_N_EMBD * + sizeof(last_full[0])); + int32_t *ids = xmalloc((size_t)prompt.len * sizeof(ids[0])); + uint8_t *mask = xmalloc((size_t)prompt.len * sizeof(mask[0])); + for (int i = 0; i < prompt.len; i++) { + ids[i] = (int32_t)prompt.v[i]; + mask[i] = 1; + } + + uint64_t input_ids_offset = 0; + uint64_t attention_mask_offset = 0; + uint64_t loss_mask_offset = 0; + uint64_t target_hidden_states_offset = 0; + uint64_t target_last_hidden_states_offset = 0; + ok = dspark_target_cache_file_pos(shard, &input_ids_offset) && + dspark_target_cache_write_all(shard, + ids, + (size_t)prompt.len * sizeof(ids[0]), + "input_ids"); + ok = ok && dspark_target_cache_file_pos(shard, &attention_mask_offset) && + dspark_target_cache_write_all(shard, + mask, + (size_t)prompt.len * sizeof(mask[0]), + "attention_mask"); + ok = ok && dspark_target_cache_file_pos(shard, &loss_mask_offset) && + dspark_target_cache_write_all(shard, + mask, + (size_t)prompt.len * sizeof(mask[0]), + "loss_mask"); + ok = ok && dspark_target_cache_file_pos(shard, &target_hidden_states_offset); + + if (ok && !metal_graph_reset_prefill_state(&g)) { + fprintf(stderr, "ds4: failed to reset DSpark target cache graph state\n"); + ok = false; + } + for (uint32_t pos = 0; ok && pos < (uint32_t)prompt.len;) { + uint32_t chunk = (uint32_t)prompt.len - pos; + if (chunk > prefill_cap) chunk = prefill_cap; + memset(target_chunk, + 0, + (size_t)chunk * (size_t)cfg.n_mtp_layers * + (size_t)DS4_N_EMBD * sizeof(target_chunk[0])); + ok = 
dspark_target_cache_encode_chunk(&g, + model, + weights, + &cfg, + &prompt, + pos, + chunk, + hc_rows, + target_chunk, + last_chunk); + if (ok) { + ok = dspark_target_cache_write_all(shard, + target_chunk, + (size_t)chunk * + (size_t)cfg.n_mtp_layers * + (size_t)DS4_N_EMBD * + sizeof(target_chunk[0]), + "target_hidden_states"); + } + if (ok) { + memcpy(last_full + (uint64_t)pos * DS4_N_EMBD, + last_chunk, + (size_t)chunk * (size_t)DS4_N_EMBD * sizeof(last_chunk[0])); + } + pos += chunk; + } + ok = ok && dspark_target_cache_file_pos(shard, &target_last_hidden_states_offset) && + dspark_target_cache_write_all(shard, + last_full, + (size_t)prompt.len * + (size_t)DS4_N_EMBD * + sizeof(last_full[0]), + "target_last_hidden_states"); + ok = ok && dspark_target_cache_write_index_record(index, + (uint64_t)prompts_done, + 0, + (uint32_t)prompt.len, + input_ids_offset, + attention_mask_offset, + loss_mask_offset, + target_hidden_states_offset, + target_last_hidden_states_offset); + if (ok) { + prompts_done++; + tokens_done += prompt.len; + fprintf(stderr, + "ds4: DSpark target cache prompts=%d tokens=%d\r", + prompts_done, + tokens_done); + fflush(stderr); + } + free(mask); + free(ids); + free(last_full); + } + token_vec_free(&prompt); + } + *end = saved; + if (!next) break; + cursor = next; + } + fputc('\n', stderr); + + if (fflush(shard) != 0 || fsync(fileno(shard)) != 0) { + fprintf(stderr, "ds4: failed to flush DSpark target cache shard %s: %s\n", + shard_path, strerror(errno)); + ok = false; + } + if (fflush(index) != 0 || fsync(fileno(index)) != 0) { + fprintf(stderr, "ds4: failed to flush DSpark target cache index %s: %s\n", + index_path, strerror(errno)); + ok = false; + } + if (fclose(index) != 0) ok = false; + if (fclose(shard) != 0) ok = false; + + if (ok) ok = dspark_target_cache_write_manifest(output_dir, + dataset_path, + target_model_name_or_path, + chat_template, + model, + weights, + &cfg, + (uint64_t)prompts_done, + (uint64_t)tokens_done); + if (ok) { + 
fprintf(stderr, + "ds4: wrote DeepSpec DSpark target cache %s from %d prompts and %d tokens\n", + output_dir, + prompts_done, + tokens_done); + } + + free(last_chunk); + free(target_chunk); + free(hc_rows); + metal_graph_free(&g); + free(dataset); + return ok ? 0 : 1; +#endif +} + int ds4_engine_generate_argmax( ds4_engine *e, const ds4_tokens *prompt, @@ -25690,9 +27599,22 @@ int ds4_engine_open(ds4_engine **out, const ds4_engine_options *opt) { model_open(&e->mtp_model, opt->mtp_path, graph_backend, true); mtp_weights_bind(&e->mtp_weights, &e->mtp_model); e->mtp_ready = true; - fprintf(stderr, "ds4: MTP support model loaded: %s (draft=%d)\n", + if ((e->mtp_weights.kind == DS4_MTP_DRAFT_DSPARK || e->mtp_weights.kind == DS4_MTP_DRAFT_DSPARK_NONSEQ) && + (opt->mtp_draft_tokens <= 0 || opt->mtp_draft_tokens == 1)) { + e->mtp_draft_tokens = (int)e->mtp_weights.dspark.block_size; + } + fprintf(stderr, "ds4: draft model loaded: %s (kind=%s, draft=%d, runtime_mtp=%s)\n", opt->mtp_path, - e->mtp_draft_tokens); + ds4_mtp_draft_kind_name(e->mtp_weights.kind), + e->mtp_draft_tokens, + ds4_engine_has_mtp(e) ? "yes" : "no"); + const ds4_dspark_spec_gate spec_gate = ds4_dspark_speculative_gate(e->mtp_weights.kind, + e->mtp_ready, + e->mtp_draft_tokens); + if (spec_gate == DS4_DSPARK_SPEC_DSPARK_NOT_READY || + spec_gate == DS4_DSPARK_SPEC_DSPARK_NONSEQ_NOT_READY) { + fprintf(stderr, "ds4: %s\n", ds4_dspark_spec_gate_reason(spec_gate)); + } } #ifndef DS4_NO_GPU @@ -25902,7 +27824,7 @@ int ds4_engine_open(ds4_engine **out, const ds4_engine_options *opt) { *out = NULL; return 1; } - if (e->mtp_ready && + if (ds4_engine_has_mtp(e) && !ds4_gpu_set_model_map_range(e->mtp_model.map, e->mtp_model.size, e->mtp_model.tensor_data_pos, @@ -25945,7 +27867,7 @@ int ds4_engine_open(ds4_engine **out, const ds4_engine_options *opt) { free(load_sizes); /* Also apply explicit optional Q8 preload settings to the MTP support * model when loaded. 
*/ - if (e->mtp_ready) { + if (ds4_engine_has_mtp(e)) { (void)ds4_gpu_set_model_fd_for_map(e->mtp_model.fd, e->mtp_model.map); if (!accelerator_cache_model_tensors(e->backend, &e->mtp_model, NULL, NULL, 0)) { @@ -26072,7 +27994,8 @@ int ds4_session_create(ds4_session **out, ds4_engine *e, int ctx_size) { return 1; } if (!metal_graph_alloc_raw_cap(&s->graph, &e->weights, shape_layer, - raw_cap, (uint32_t)ctx_size, s->prefill_cap, e->mtp_ready)) + &e->mtp_weights, raw_cap, (uint32_t)ctx_size, + s->prefill_cap, ds4_engine_has_mtp(e))) { free(s); return 1; @@ -26091,7 +28014,7 @@ int ds4_session_create(ds4_session **out, ds4_engine *e, int ctx_size) { return 1; } s->logits = xmalloc((size_t)DS4_N_VOCAB * sizeof(s->logits[0])); - if (e->mtp_ready) { + if (ds4_engine_has_mtp(e)) { s->mtp_logits = xmalloc((size_t)DS4_N_VOCAB * sizeof(s->mtp_logits[0])); s->mtp_draft_token = -1; } @@ -26716,6 +28639,9 @@ int ds4_session_sync(ds4_session *s, const ds4_tokens *prompt, char *err, size_t } if (ds4_session_is_cpu(s)) { ds4_engine *e = s->engine; + float steering_attn = 0.0f; + float steering_ffn = 0.0f; + ds4_session_directional_steering_scales(s, &steering_attn, &steering_ffn); if (s->checkpoint_valid && prompt->len >= s->checkpoint.len && ds4_tokens_starts_with(prompt, &s->checkpoint)) @@ -26735,8 +28661,8 @@ int ds4_session_sync(ds4_session *s, const ds4_tokens *prompt, char *err, size_t prompt->v[i], (uint32_t)s->checkpoint.len, e->directional_steering_dirs, - e->directional_steering_attn_scale, - e->directional_steering_ffn_scale, + steering_attn, + steering_ffn, &s->cpu_scratch); token_vec_push(&s->checkpoint, prompt->v[i]); if (s->progress) s->progress(s->progress_ud, "prefill_chunk", i + 1, prompt->len); @@ -26752,8 +28678,8 @@ int ds4_session_sync(ds4_session *s, const ds4_tokens *prompt, char *err, size_t &s->cpu_cache, prompt, e->directional_steering_dirs, - e->directional_steering_attn_scale, - e->directional_steering_ffn_scale); + steering_attn, + steering_ffn); 
ds4_tokens_copy(&s->checkpoint, prompt); s->checkpoint_valid = true; s->mtp_draft_valid = false; @@ -27081,6 +29007,9 @@ static int ds4_session_eval_internal(ds4_session *s, int token, bool probe_mtp, } if (ds4_session_is_cpu(s)) { ds4_engine *e = s->engine; + float steering_attn = 0.0f; + float steering_ffn = 0.0f; + ds4_session_directional_steering_scales(s, &steering_attn, &steering_ffn); forward_token_raw_swa_cpu_decode_scratch(s->logits, &e->model, &e->weights, @@ -27088,8 +29017,8 @@ static int ds4_session_eval_internal(ds4_session *s, int token, bool probe_mtp, token, (uint32_t)s->checkpoint.len, e->directional_steering_dirs, - e->directional_steering_attn_scale, - e->directional_steering_ffn_scale, + steering_attn, + steering_ffn, &s->cpu_scratch); token_vec_push(&s->checkpoint, token); s->checkpoint_valid = true; @@ -27107,7 +29036,7 @@ static int ds4_session_eval_internal(ds4_session *s, int token, bool probe_mtp, ds4_engine *e = s->engine; const bool mtp_probe_log = getenv("DS4_MTP_PROBE") != NULL; const bool mtp_should_draft = - probe_mtp && e->mtp_ready && s->mtp_logits && + probe_mtp && ds4_engine_has_mtp(e) && s->mtp_logits && (e->mtp_draft_tokens > 1 || mtp_probe_log); if (probe_mtp && s->mtp_draft_valid) { if (mtp_probe_log) { @@ -27133,20 +29062,44 @@ static int ds4_session_eval_internal(ds4_session *s, int token, bool probe_mtp, } token_vec_push(&s->checkpoint, token); if (mtp_should_draft) { - int mtp_top = -1; - if (metal_graph_eval_mtp_draft(&s->graph, - &e->model, - &e->weights, - &e->mtp_model, - &e->mtp_weights, - token, - (uint32_t)(s->checkpoint.len - 1), - getenv("DS4_MTP_FULL_LOGITS") ? s->mtp_logits : NULL, - &mtp_top)) { - s->mtp_draft_token = mtp_top >= 0 ? 
mtp_top : sample_argmax(s->mtp_logits, DS4_N_VOCAB); - s->mtp_draft_valid = true; - } else if (getenv("DS4_MTP_PROBE")) { - fprintf(stderr, "ds4: mtp probe draft failed\n"); + if (e->mtp_weights.kind == DS4_MTP_DRAFT_DSPARK) { + int draft_n = 0; + uint32_t base_real = 0; + if (metal_graph_eval_dspark_draft_block(&s->graph, + &e->model, + &e->weights, + &e->mtp_model, + &e->mtp_weights, + token, + (uint32_t)(s->checkpoint.len - 1), + (uint32_t)e->mtp_draft_tokens, + s->dspark_draft_tokens, + &draft_n, + &base_real, + getenv("DS4_MTP_FULL_LOGITS") ? s->mtp_logits : NULL)) { + s->dspark_draft_count = draft_n; + s->dspark_draft_base_real = base_real; + s->mtp_draft_token = draft_n > 0 ? s->dspark_draft_tokens[0] : -1; + s->mtp_draft_valid = draft_n > 0; + } else if (getenv("DS4_MTP_PROBE") || getenv("DS4_MTP_SPEC_LOG")) { + fprintf(stderr, "ds4: DSpark draft block failed\n"); + } + } else { + int mtp_top = -1; + if (metal_graph_eval_mtp_draft(&s->graph, + &e->model, + &e->weights, + &e->mtp_model, + &e->mtp_weights, + token, + (uint32_t)(s->checkpoint.len - 1), + getenv("DS4_MTP_FULL_LOGITS") ? s->mtp_logits : NULL, + &mtp_top)) { + s->mtp_draft_token = mtp_top >= 0 ? mtp_top : sample_argmax(s->mtp_logits, DS4_N_VOCAB); + s->mtp_draft_valid = true; + } else if (getenv("DS4_MTP_PROBE")) { + fprintf(stderr, "ds4: mtp probe draft failed\n"); + } } } return 0; @@ -27157,6 +29110,10 @@ int ds4_session_eval(ds4_session *s, int token, char *err, size_t errlen) { return ds4_session_eval_internal(s, token, true, err, errlen); } +int ds4_session_eval_no_mtp(ds4_session *s, int token, char *err, size_t errlen) { + return ds4_session_eval_internal(s, token, false, err, errlen); +} + /* Speculative decode state machine: * 1. commit the normal target token and use its logits to validate draft[0]; * 2. 
let MTP recursively draft a tiny suffix from its own raw-cache frontier; @@ -27204,7 +29161,7 @@ int ds4_session_eval_speculative_argmax(ds4_session *s, int first_token, accepted[n_accept++] = first_token; if (first_token == eos_token || max_tokens == 1 || n_accept >= accepted_cap) return n_accept; - if (!e->mtp_ready || !s->mtp_draft_valid || e->mtp_draft_tokens <= 1) return n_accept; + if (!ds4_engine_has_mtp(e) || !s->mtp_draft_valid || e->mtp_draft_tokens <= 1) return n_accept; int draft_cap = e->mtp_draft_tokens; if (draft_cap > max_tokens - n_accept) draft_cap = max_tokens - n_accept; @@ -27213,6 +29170,225 @@ int ds4_session_eval_speculative_argmax(ds4_session *s, int first_token, if (draft_cap > room - 1) draft_cap = room - 1; if (draft_cap <= 0) return n_accept; + if (e->mtp_weights.kind == DS4_MTP_DRAFT_DSPARK || + e->mtp_weights.kind == DS4_MTP_DRAFT_DSPARK_NONSEQ) { + int drafts[16]; + int draft_n = s->dspark_draft_count; + if (draft_n > draft_cap) draft_n = draft_cap; + if (draft_n <= 0) { + s->mtp_draft_valid = false; + return n_accept; + } + memcpy(drafts, s->dspark_draft_tokens, (size_t)draft_n * sizeof(drafts[0])); + s->mtp_draft_valid = false; + s->dspark_draft_count = 0; + + const bool mtp_timing = getenv("DS4_MTP_TIMING") != NULL; + const double mtp_t0 = mtp_timing ? 
now_sec() : 0.0; +#define DS4_DSPARK_KEEP_ACCEPTED(n_) do { \ + uint32_t keep_ = s->dspark_draft_base_real + 1u + (uint32_t)(n_); \ + if (keep_ > DS4_N_SWA) keep_ = 0; \ + s->graph.dspark_n_real = keep_; \ + } while (0) + if (sample_argmax(s->logits, DS4_N_VOCAB) != drafts[0]) { + DS4_DSPARK_KEEP_ACCEPTED(0); + if (getenv("DS4_MTP_SPEC_LOG")) { + fprintf(stderr, "ds4: dspark spec miss first draft=%d\n", drafts[0]); + } + return n_accept; + } + draft_n = ds4_dspark_draft_len_until_eos(drafts, draft_n, eos_token); + + ds4_spec_frontier frontier; + memset(&frontier, 0, sizeof(frontier)); + int *row_tops = xmalloc((size_t)draft_n * sizeof(row_tops[0])); + float *row_logits = xmalloc((size_t)DS4_N_VOCAB * sizeof(row_logits[0])); + const int start = s->checkpoint.len; + const double snapshot_t0 = mtp_timing ? now_sec() : 0.0; + bool have_frontier = spec_frontier_snapshot(&frontier, s); + bool ok = have_frontier; + const double snapshot_done = mtp_timing ? now_sec() : 0.0; + if (ok) { + for (int i = 0; i < draft_n; i++) token_vec_push(&s->checkpoint, drafts[i]); + const uint32_t capture_prefix_tokens = draft_n > 1 + ? (uint32_t)ds4_dspark_prefix_slot_count(e->mtp_weights.kind, + draft_n, + (int)s->graph.spec_prefix_slots) + : 0; + ok = metal_graph_verify_suffix_tops(&s->graph, + &e->model, + &e->weights, + &s->checkpoint, + (uint32_t)start, + (uint32_t)draft_n, + capture_prefix_tokens, + row_tops, + NULL); + } + if (!ok && getenv("DS4_MTP_SPEC_LOG")) { + fprintf(stderr, "ds4: dspark verifier graph failed draft_n=%d prefix_tokens=%u slots=%u\n", + draft_n, draft_n > 1 ? (uint32_t)ds4_dspark_prefix_slot_count(e->mtp_weights.kind, draft_n, (int)s->graph.spec_prefix_slots) : 0, + s->graph.spec_prefix_slots); + } + const double verify_done = mtp_timing ? 
now_sec() : 0.0; + if (ok) { + int commit_drafts = 1; + for (int i = 1; i < draft_n; i++) { + if (row_tops[i - 1] != drafts[i]) break; + commit_drafts++; + } + if (commit_drafts == draft_n) { + ok = metal_graph_dspark_refresh_verified_rows(&s->graph, + &e->mtp_model, + &e->mtp_weights, + s->dspark_draft_base_real + 1u, + (uint32_t)start, + (uint32_t)draft_n); + if (ok) ok = metal_graph_read_spec_logits_row(&s->graph, + (uint32_t)(draft_n - 1), + row_logits); + if (ok) { + memcpy(s->logits, row_logits, (size_t)DS4_N_VOCAB * sizeof(s->logits[0])); + for (int i = 0; i < draft_n && n_accept < accepted_cap; i++) { + accepted[n_accept++] = drafts[i]; + if (drafts[i] == eos_token) break; + } + s->checkpoint_valid = true; + s->mtp_draft_valid = false; + DS4_DSPARK_KEEP_ACCEPTED(draft_n); + if (mtp_timing) { + fprintf(stderr, + "ds4: dspark timing drafted=%d committed=%d snapshot=%.3f ms verify=%.3f ms total=%.3f ms\n", + draft_n, + draft_n, + (snapshot_done - snapshot_t0) * 1000.0, + (verify_done - snapshot_done) * 1000.0, + (now_sec() - mtp_t0) * 1000.0); + } + spec_frontier_free(&frontier); + free(row_logits); + free(row_tops); + return n_accept; + } + } + if (!ok && getenv("DS4_MTP_SPEC_LOG")) { + fprintf(stderr, "ds4: dspark full refresh/logits failed draft_n=%d\n", draft_n); + } + + const uint32_t capture_prefix_tokens = draft_n > 1 + ? (uint32_t)ds4_dspark_prefix_slot_count(e->mtp_weights.kind, + draft_n, + (int)s->graph.spec_prefix_slots) + : 0; + if (commit_drafts > 0 && (uint32_t)commit_drafts <= capture_prefix_tokens) { + s->checkpoint.len = start; + const double prefix_t0 = mtp_timing ? now_sec() : 0.0; + ok = spec_frontier_commit_prefix(s, commit_drafts, draft_n); + if (ok) ok = metal_graph_dspark_refresh_verified_rows(&s->graph, + &e->mtp_model, + &e->mtp_weights, + s->dspark_draft_base_real + 1u, + (uint32_t)start, + (uint32_t)commit_drafts); + const double prefix_done = mtp_timing ? 
now_sec() : 0.0; + if (ok) ok = metal_graph_read_spec_logits_row(&s->graph, + (uint32_t)(commit_drafts - 1), + row_logits); + if (!ok && getenv("DS4_MTP_SPEC_LOG")) { + fprintf(stderr, "ds4: dspark prefix commit/refresh/logits failed committed=%d draft_n=%d\n", + commit_drafts, draft_n); + } + if (ok) { + memcpy(s->logits, row_logits, (size_t)DS4_N_VOCAB * sizeof(s->logits[0])); + for (int i = 0; i < commit_drafts; i++) token_vec_push(&s->checkpoint, drafts[i]); + for (int i = 0; i < commit_drafts && n_accept < accepted_cap; i++) { + accepted[n_accept++] = drafts[i]; + if (drafts[i] == eos_token) break; + } + s->checkpoint_valid = true; + s->mtp_draft_valid = false; + DS4_DSPARK_KEEP_ACCEPTED(commit_drafts); + if (mtp_timing) { + fprintf(stderr, + "ds4: dspark timing drafted=%d committed=%d snapshot=%.3f ms verify=%.3f ms prefix=%.3f ms total=%.3f ms noreplay=1\n", + draft_n, + commit_drafts, + (snapshot_done - snapshot_t0) * 1000.0, + (verify_done - snapshot_done) * 1000.0, + (prefix_done - prefix_t0) * 1000.0, + (now_sec() - mtp_t0) * 1000.0); + } + spec_frontier_free(&frontier); + free(row_logits); + free(row_tops); + return n_accept; + } + if (!ok) { + s->checkpoint.len = start; + ok = have_frontier && spec_frontier_restore(&frontier, s); + } + } else { + s->checkpoint.len = start; + ok = have_frontier && spec_frontier_restore(&frontier, s); + } + int replayed = 0; + for (; ok && replayed < commit_drafts; replayed++) { + ok = metal_graph_eval_token_raw_swa(&s->graph, + &e->model, + &e->weights, + drafts[replayed], + (uint32_t)(start + replayed), + row_logits); + if (ok) { + token_vec_push(&s->checkpoint, drafts[replayed]); + ok = metal_graph_dspark_refresh_current_row(&s->graph, + &e->mtp_model, + &e->mtp_weights, + s->dspark_draft_base_real + 1u + (uint32_t)replayed, + (uint32_t)(start + replayed)); + } + } + if (!ok && getenv("DS4_MTP_SPEC_LOG")) { + fprintf(stderr, "ds4: dspark replay failed replayed=%d committed=%d draft_n=%d\n", + replayed, commit_drafts, 
draft_n); + } + if (ok) { + memcpy(s->logits, row_logits, (size_t)DS4_N_VOCAB * sizeof(s->logits[0])); + for (int i = 0; i < replayed && n_accept < accepted_cap; i++) { + accepted[n_accept++] = drafts[i]; + if (drafts[i] == eos_token) break; + } + s->checkpoint_valid = true; + s->mtp_draft_valid = false; + DS4_DSPARK_KEEP_ACCEPTED(replayed); + if (mtp_timing) { + fprintf(stderr, + "ds4: dspark timing drafted=%d committed=%d snapshot=%.3f ms verify=%.3f ms replay=%.3f ms total=%.3f ms\n", + draft_n, + replayed, + (snapshot_done - snapshot_t0) * 1000.0, + (verify_done - snapshot_done) * 1000.0, + (now_sec() - verify_done) * 1000.0, + (now_sec() - mtp_t0) * 1000.0); + } + spec_frontier_free(&frontier); + free(row_logits); + free(row_tops); + return n_accept; + } + } + s->checkpoint.len = start; + if (have_frontier) (void)spec_frontier_restore(&frontier, s); + snprintf(err, errlen, "DSpark verifier failed"); + s->checkpoint_valid = false; + DS4_DSPARK_KEEP_ACCEPTED(0); + spec_frontier_free(&frontier); + free(row_logits); + free(row_tops); + return -1; +#undef DS4_DSPARK_KEEP_ACCEPTED + } + int drafts[16]; int draft_n = 1; drafts[0] = s->mtp_draft_token; @@ -27393,7 +29569,7 @@ int ds4_session_eval_speculative_argmax(ds4_session *s, int first_token, if (ok) { s->checkpoint.len = start; - ok = spec_frontier_commit_prefix1(s); + ok = spec_frontier_commit_prefix(s, 1, 2); } if (ok) memcpy(s->logits, row0_logits, (size_t)DS4_N_VOCAB * sizeof(s->logits[0])); if (ok) { @@ -27444,12 +29620,12 @@ int ds4_session_eval_speculative_argmax(ds4_session *s, int first_token, * replay one token on partial accept. DS4_MTP_CAPTURE_PREFIX1 restores * the older no-replay partial path for measurement. */ - const bool capture_prefix1 = - draft_n == 2 && (!strict_mtp || getenv("DS4_MTP_CAPTURE_PREFIX1") != NULL); + const uint32_t capture_prefix_tokens = + (draft_n == 2 && (!strict_mtp || getenv("DS4_MTP_CAPTURE_PREFIX1") != NULL)) ? 
1u : 0u; const bool exact_replay_debug = getenv("DS4_MTP_EXACT_REPLAY") != NULL; const bool snapshot_required = draft_n > 2 || - (draft_n == 2 && (!capture_prefix1 || exact_replay_debug)) || + (draft_n == 2 && (capture_prefix_tokens == 0 || exact_replay_debug)) || getenv("DS4_MTP_FORCE_SNAPSHOT") != NULL; bool have_frontier = false; bool ok = true; @@ -27469,7 +29645,7 @@ int ds4_session_eval_speculative_argmax(ds4_session *s, int first_token, &s->checkpoint, (uint32_t)start, (uint32_t)draft_n, - capture_prefix1, + capture_prefix_tokens, row_tops, NULL); } @@ -27552,10 +29728,10 @@ int ds4_session_eval_speculative_argmax(ds4_session *s, int first_token, } } - if (draft_n == 2 && commit_drafts == 1 && capture_prefix1) { + if (draft_n == 2 && commit_drafts == 1 && capture_prefix_tokens != 0) { s->checkpoint.len = start; const double prefix_t0 = mtp_timing ? now_sec() : 0.0; - ok = spec_frontier_commit_prefix1(s); + ok = spec_frontier_commit_prefix(s, commit_drafts, draft_n); const double prefix_done = mtp_timing ? 
now_sec() : 0.0; if (ok) ok = metal_graph_read_spec_logits_row(&s->graph, 0, row_logits); if (ok) { @@ -27769,6 +29945,7 @@ void ds4_session_invalidate(ds4_session *s) { s->checkpoint_valid = false; s->checkpoint.len = 0; s->mtp_draft_valid = false; + s->dspark_draft_count = 0; } void ds4_session_rewind(ds4_session *s, int pos) { @@ -27776,6 +29953,7 @@ void ds4_session_rewind(ds4_session *s, int pos) { if (pos > s->checkpoint.len) pos = s->checkpoint.len; s->checkpoint.len = pos; s->mtp_draft_valid = false; + s->dspark_draft_count = 0; } int ds4_session_pos(ds4_session *s) { diff --git a/ds4.h b/ds4.h index 9d040c92b..9cbc9ba96 100644 --- a/ds4.h +++ b/ds4.h @@ -56,6 +56,32 @@ typedef struct { #define DS4_DEFAULT_TOP_P 1.0f #define DS4_DEFAULT_MIN_P 0.05f + +typedef enum { + DS4_MTP_DRAFT_NONE = 0, + DS4_MTP_DRAFT_LEGACY, + DS4_MTP_DRAFT_DSPARK, + DS4_MTP_DRAFT_DSPARK_NONSEQ, +} ds4_mtp_draft_kind; + +typedef struct { + uint32_t n_mtp_layers; + uint32_t block_size; + uint32_t noise_token_id; + uint32_t markov_rank; + uint32_t target_layer_ids[3]; +} ds4_dspark_config; + +void ds4_dspark_config_init_defaults(ds4_dspark_config *cfg); +const char *ds4_mtp_draft_kind_name(ds4_mtp_draft_kind kind); +/* Classify draft GGUF layout from presence markers (unit-testable, no model load). 
*/ +ds4_mtp_draft_kind ds4_mtp_draft_kind_guess(bool has_e_proj, bool has_main_proj, bool has_markov_w1); +ds4_mtp_draft_kind ds4_mtp_draft_kind_guess_ex(bool has_e_proj, + bool has_main_proj, + bool has_markov_w1, + bool markov_rank_set, + uint32_t markov_rank); + typedef struct ds4_engine ds4_engine; typedef struct ds4_session ds4_session; @@ -186,6 +212,14 @@ int ds4_engine_collect_imatrix(ds4_engine *e, int ctx_size, int max_prompts, int max_tokens); +int ds4_engine_collect_dspark_target_cache(ds4_engine *e, + const char *dataset_path, + const char *output_dir, + const char *target_model_name_or_path, + const char *chat_template, + int ctx_size, + int max_prompts, + int max_tokens); void ds4_engine_dump_tokens(ds4_engine *e, const ds4_tokens *tokens); int ds4_dump_text_tokenization(const char *model_path, const char *text, FILE *fp); int ds4_engine_head_test(ds4_engine *e, const ds4_tokens *prompt); @@ -223,6 +257,8 @@ int ds4_session_power(ds4_session *s); int ds4_session_set_power(ds4_session *s, int power_percent); bool ds4_session_is_distributed(ds4_session *s); void ds4_session_set_progress(ds4_session *s, ds4_session_progress_fn fn, void *ud); +void ds4_session_set_directional_steering(ds4_session *s, float attn, float ffn); +void ds4_session_use_engine_directional_steering(ds4_session *s); /* UI-only progress. It may report fine-grained progress inside a prefill chunk; * callers must not treat it as a durable KV checkpoint boundary. 
*/ void ds4_session_set_display_progress(ds4_session *s, ds4_session_progress_fn fn, void *ud); @@ -262,6 +298,7 @@ int ds4_session_token_logprob(ds4_session *s, int token, ds4_token_score *out); int ds4_session_copy_logits(ds4_session *s, float *out, int cap); int ds4_session_set_logits(ds4_session *s, const float *logits, int n); int ds4_session_eval(ds4_session *s, int token, char *err, size_t errlen); +int ds4_session_eval_no_mtp(ds4_session *s, int token, char *err, size_t errlen); int ds4_session_eval_speculative_argmax(ds4_session *s, int first_token, int max_tokens, int eos_token, int *accepted, int accepted_cap, @@ -273,7 +310,13 @@ int ds4_session_ctx(ds4_session *s); int ds4_session_prefill_cap(ds4_session *s); int ds4_engine_routed_quant_bits(ds4_engine *e); bool ds4_engine_has_output_head(ds4_engine *e); +/* True when speculative decode has a real proposer and target verifier. */ +bool ds4_mtp_speculative_draft_ready(ds4_mtp_draft_kind kind); +bool ds4_mtp_draft_runtime_supported(ds4_backend backend, + ds4_mtp_draft_kind kind); bool ds4_engine_has_mtp(ds4_engine *e); +ds4_mtp_draft_kind ds4_engine_mtp_draft_kind(ds4_engine *e); + int ds4_engine_mtp_draft_tokens(ds4_engine *e); const ds4_tokens *ds4_session_tokens(ds4_session *s); diff --git a/ds4_cli.c b/ds4_cli.c index 4ad2240e8..61de77021 100644 --- a/ds4_cli.c +++ b/ds4_cli.c @@ -43,6 +43,12 @@ typedef struct { const char *imatrix_output_path; int imatrix_max_prompts; int imatrix_max_tokens; + const char *dspark_target_cache_dataset_path; + const char *dspark_target_cache_output_dir; + const char *dspark_target_cache_target_model; + const char *dspark_target_cache_chat_template; + int dspark_target_cache_max_prompts; + int dspark_target_cache_max_tokens; ds4_think_mode think_mode; bool head_test; bool first_token_test; @@ -1562,6 +1568,18 @@ static cli_config parse_options(int argc, char **argv) { c.gen.imatrix_max_prompts = parse_int(need_arg(&i, argc, argv, arg), arg); } else if (!strcmp(arg, 
"--imatrix-max-tokens")) { c.gen.imatrix_max_tokens = parse_int(need_arg(&i, argc, argv, arg), arg); + } else if (!strcmp(arg, "--dspark-target-cache-dataset")) { + c.gen.dspark_target_cache_dataset_path = need_arg(&i, argc, argv, arg); + } else if (!strcmp(arg, "--dspark-target-cache-out")) { + c.gen.dspark_target_cache_output_dir = need_arg(&i, argc, argv, arg); + } else if (!strcmp(arg, "--dspark-target-cache-target-model")) { + c.gen.dspark_target_cache_target_model = need_arg(&i, argc, argv, arg); + } else if (!strcmp(arg, "--dspark-target-cache-chat-template")) { + c.gen.dspark_target_cache_chat_template = need_arg(&i, argc, argv, arg); + } else if (!strcmp(arg, "--dspark-target-cache-max-prompts")) { + c.gen.dspark_target_cache_max_prompts = parse_int(need_arg(&i, argc, argv, arg), arg); + } else if (!strcmp(arg, "--dspark-target-cache-max-tokens")) { + c.gen.dspark_target_cache_max_tokens = parse_int(need_arg(&i, argc, argv, arg), arg); } else if (!strcmp(arg, "--think")) { c.gen.think_mode = DS4_THINK_HIGH; } else if (!strcmp(arg, "--think-max")) { @@ -1621,6 +1639,24 @@ static cli_config parse_options(int argc, char **argv) { fprintf(stderr, "ds4: --imatrix-dataset requires --imatrix-out\n"); exit(2); } + if (c.gen.dspark_target_cache_output_dir && !c.gen.dspark_target_cache_dataset_path) { + fprintf(stderr, "ds4: --dspark-target-cache-out requires --dspark-target-cache-dataset\n"); + exit(2); + } + if (c.gen.dspark_target_cache_dataset_path && !c.gen.dspark_target_cache_output_dir) { + fprintf(stderr, "ds4: --dspark-target-cache-dataset requires --dspark-target-cache-out\n"); + exit(2); + } + if (c.gen.dspark_target_cache_output_dir && c.gen.prompt) { + fprintf(stderr, "ds4: --dspark-target-cache-out does not use -p/--prompt-file\n"); + exit(2); + } + if (c.gen.dspark_target_cache_output_dir && + (!c.gen.dspark_target_cache_target_model || + !c.gen.dspark_target_cache_target_model[0])) { + fprintf(stderr, "ds4: --dspark-target-cache-out requires 
--dspark-target-cache-target-model\n"); + exit(2); + } if (c.gen.perplexity_file_path && c.gen.prompt) { fprintf(stderr, "ds4: --perplexity-file does not use -p/--prompt-file\n"); exit(2); @@ -1693,6 +1729,15 @@ int main(int argc, char **argv) { cfg.gen.ctx_size, cfg.gen.imatrix_max_prompts, cfg.gen.imatrix_max_tokens); + } else if (cfg.gen.dspark_target_cache_output_dir) { + rc = ds4_engine_collect_dspark_target_cache(engine, + cfg.gen.dspark_target_cache_dataset_path, + cfg.gen.dspark_target_cache_output_dir, + cfg.gen.dspark_target_cache_target_model, + cfg.gen.dspark_target_cache_chat_template, + cfg.gen.ctx_size, + cfg.gen.dspark_target_cache_max_prompts, + cfg.gen.dspark_target_cache_max_tokens); } else if (cfg.gen.perplexity_file_path) { rc = run_perplexity_file(engine, &cfg); } else if (cfg.gen.prompt == NULL) { diff --git a/ds4_cuda.cu b/ds4_cuda.cu index 188b341ad..688507a44 100644 --- a/ds4_cuda.cu +++ b/ds4_cuda.cu @@ -8917,6 +8917,25 @@ extern "C" int ds4_gpu_attention_decode_raw_batch_heads_tensor( n_head, head_dim); } +extern "C" int ds4_gpu_attention_decode_raw_batch_heads_noncausal_tensor( + ds4_gpu_tensor *heads, + const void *model_map, + uint64_t model_size, + uint64_t sinks_offset, + const ds4_gpu_tensor *q, + const ds4_gpu_tensor *raw_kv, + uint32_t n_tokens, + uint32_t n_raw, + uint32_t raw_cap, + uint32_t raw_start, + uint32_t n_head, + uint32_t head_dim) { + (void)heads; (void)model_map; (void)model_size; (void)sinks_offset; + (void)q; (void)raw_kv; (void)n_tokens; (void)n_raw; (void)raw_cap; + (void)raw_start; (void)n_head; (void)head_dim; + return 0; +} + extern "C" int ds4_gpu_attention_decode_mixed_batch_heads_tensor( ds4_gpu_tensor *heads, const void *model_map, diff --git a/ds4_dspark_runtime.c b/ds4_dspark_runtime.c new file mode 100644 index 000000000..9b992e1ab --- /dev/null +++ b/ds4_dspark_runtime.c @@ -0,0 +1,70 @@ +#include "ds4_dspark_runtime.h" + +#include + + +float ds4_dspark_bf16_to_f32(uint16_t h) { + uint32_t bits = 
(uint32_t)h << 16; + float f; + memcpy(&f, &bits, sizeof(f)); + return f; +} +int ds4_dspark_draft_len_until_eos(const int *drafts, int draft_n, int eos_token) { + if (!drafts || draft_n <= 0) return 0; + for (int i = 0; i < draft_n; i++) { + if (drafts[i] == eos_token) return i + 1; + } + return draft_n; +} +int ds4_dspark_prefix_slot_for_accept(int accepted, int draft_n) { + if (accepted <= 0 || draft_n <= 1 || accepted >= draft_n) return -1; + return accepted - 1; +} + +int ds4_dspark_prefix_slot_count(ds4_mtp_draft_kind kind, int block_size, int max_slots) { + if (max_slots <= 0) return 0; + if (kind != DS4_MTP_DRAFT_LEGACY && + kind != DS4_MTP_DRAFT_DSPARK && + kind != DS4_MTP_DRAFT_DSPARK_NONSEQ) { + return 0; + } + int slots = 1; + if (kind == DS4_MTP_DRAFT_DSPARK || kind == DS4_MTP_DRAFT_DSPARK_NONSEQ) { + slots = block_size > 1 ? block_size - 1 : 1; + } + if (slots > max_slots) slots = max_slots; + return slots; +} + + + + + + +ds4_dspark_spec_gate ds4_dspark_speculative_gate(ds4_mtp_draft_kind kind, + bool mtp_ready, + int mtp_draft_tokens) { + if (!mtp_ready || mtp_draft_tokens <= 1) return DS4_DSPARK_SPEC_DISABLED; + if (kind == DS4_MTP_DRAFT_LEGACY) return DS4_DSPARK_SPEC_LEGACY_MTP; + if (kind == DS4_MTP_DRAFT_DSPARK || + kind == DS4_MTP_DRAFT_DSPARK_NONSEQ) return DS4_DSPARK_SPEC_DSPARK_ENABLED; + return DS4_DSPARK_SPEC_DISABLED; +} + +const char *ds4_dspark_spec_gate_reason(ds4_dspark_spec_gate gate) { + switch (gate) { + case DS4_DSPARK_SPEC_LEGACY_MTP: + return "legacy MTP draft path (DSpark block draft not engaged)"; + case DS4_DSPARK_SPEC_DSPARK_ENABLED: + return "DSpark block speculative decode enabled"; + case DS4_DSPARK_SPEC_DSPARK_NOT_READY: + return "DSpark draft graph has not been validated on real DSpark GGUF weights; " + "speculative decode stays off (no fake draft tokens)"; + case DS4_DSPARK_SPEC_DSPARK_NONSEQ_NOT_READY: + return "DSpark nonseq draft head has not been validated on real trained DSpark GGUF weights; " + "speculative 
decode stays off (no fake draft tokens)"; + case DS4_DSPARK_SPEC_DISABLED: + default: + return "speculative draft disabled"; + } +} \ No newline at end of file diff --git a/ds4_dspark_runtime.h b/ds4_dspark_runtime.h new file mode 100644 index 000000000..4b99bb477 --- /dev/null +++ b/ds4_dspark_runtime.h @@ -0,0 +1,34 @@ +#ifndef DS4_DSPARK_RUNTIME_H +#define DS4_DSPARK_RUNTIME_H + +#include +#include + +#include "ds4.h" + + +typedef enum { + DS4_DSPARK_SPEC_DISABLED = 0, + DS4_DSPARK_SPEC_LEGACY_MTP, + DS4_DSPARK_SPEC_DSPARK_ENABLED, + DS4_DSPARK_SPEC_DSPARK_NOT_READY, + DS4_DSPARK_SPEC_DSPARK_NONSEQ_NOT_READY, +} ds4_dspark_spec_gate; + + + +float ds4_dspark_bf16_to_f32(uint16_t h); +int ds4_dspark_draft_len_until_eos(const int *drafts, int draft_n, int eos_token); +int ds4_dspark_prefix_slot_for_accept(int accepted, int draft_n); +int ds4_dspark_prefix_slot_count(ds4_mtp_draft_kind kind, int block_size, int max_slots); + + + + +ds4_dspark_spec_gate ds4_dspark_speculative_gate(ds4_mtp_draft_kind kind, + bool mtp_ready, + int mtp_draft_tokens); + +const char *ds4_dspark_spec_gate_reason(ds4_dspark_spec_gate gate); + +#endif \ No newline at end of file diff --git a/ds4_gpu.h b/ds4_gpu.h index b58aca9bd..6651a2880 100644 --- a/ds4_gpu.h +++ b/ds4_gpu.h @@ -623,6 +623,22 @@ int ds4_gpu_attention_decode_raw_batch_heads_tensor( uint32_t n_head, uint32_t head_dim); +/* Non-causal variant (mask = all-attend): every query attends to every key in + * the gathered window. Used by the DSpark drafter's block attention. 
*/ +int ds4_gpu_attention_decode_raw_batch_heads_noncausal_tensor( + ds4_gpu_tensor *heads, + const void *model_map, + uint64_t model_size, + uint64_t sinks_offset, + const ds4_gpu_tensor *q, + const ds4_gpu_tensor *raw_kv, + uint32_t n_tokens, + uint32_t n_raw, + uint32_t raw_cap, + uint32_t raw_start, + uint32_t n_head, + uint32_t head_dim); + int ds4_gpu_attention_decode_mixed_batch_heads_tensor( ds4_gpu_tensor *heads, const void *model_map, diff --git a/ds4_help.c b/ds4_help.c index d32e088cf..b7ec9f321 100644 --- a/ds4_help.c +++ b/ds4_help.c @@ -170,11 +170,11 @@ static void print_model_runtime(FILE *fp, const help_colors *c, opt(fp, c, "--prefill-chunk N", "Metal graph prefill chunk size. Default: auto (PRO long prompts use 8192; others use 4096)."); if (full) { if (tool != DS4_HELP_BENCH) { - opt(fp, c, "--mtp FILE", "Optional MTP support GGUF used for draft-token probes."); + opt(fp, c, "--mtp FILE", "Optional speculative draft GGUF: legacy MTP or experimental converted DSpark/DeepSpec on Metal."); } if (tool == DS4_HELP_DS4 || tool == DS4_HELP_AGENT || tool == DS4_HELP_SERVER) { - opt(fp, c, "--mtp-draft N", "Maximum autoregressive MTP draft tokens. Default: 1"); - opt(fp, c, "--mtp-margin F", "Verifier confidence margin for fast MTP acceptance. Default: 3"); + opt(fp, c, "--mtp-draft N", "Maximum speculative draft tokens. Legacy default: 1; DSpark uses GGUF block size."); + opt(fp, c, "--mtp-margin F", "Verifier confidence margin for legacy fast MTP acceptance. Default: 3"); } opt(fp, c, "--quality", "Prefer exact kernels where faster approximate paths exist."); opt(fp, c, "--warm-weights", "Touch mapped tensor pages at startup to reduce first-use stalls."); @@ -208,6 +208,7 @@ static void print_steering(FILE *fp, const help_colors *c) { opt(fp, c, "--dir-steering-file FILE", "Load one f32 direction vector per layer."); opt(fp, c, "--dir-steering-ffn F", "Apply steering after FFN outputs. 
Default with file: 1"); opt(fp, c, "--dir-steering-attn F", "Apply steering after attention outputs. Default: 0"); + opt(fp, c, "--dir-steering-policy MODE", "Server policy: final-answer, decoding, always, off. Default: final-answer"); fputc('\n', fp); } @@ -254,6 +255,12 @@ static void print_cli_diagnostics(FILE *fp, const help_colors *c) { opt(fp, c, "--imatrix-out FILE", "Write llama-compatible routed-MoE imatrix .dat."); opt(fp, c, "--imatrix-max-prompts N", "Stop imatrix collection after N prompts."); opt(fp, c, "--imatrix-max-tokens N", "Stop imatrix collection after N prompt tokens."); + opt(fp, c, "--dspark-target-cache-dataset FILE", "Rendered prompt dataset for DeepSpec DSpark target-cache export."); + opt(fp, c, "--dspark-target-cache-out DIR", "Write DeepSpec DSpark target cache manifest/index/shard."); + opt(fp, c, "--dspark-target-cache-target-model HF_OR_PATH", "Required DeepSpec target model name/path stored in the target-cache manifest."); + opt(fp, c, "--dspark-target-cache-chat-template NAME", "DeepSpec chat template name stored in the target-cache manifest."); + opt(fp, c, "--dspark-target-cache-max-prompts N", "Stop target-cache export after N prompts."); + opt(fp, c, "--dspark-target-cache-max-tokens N", "Stop target-cache export after N prompt tokens."); opt(fp, c, "--head-test", "Run the output HC/logits head after the native slice."); opt(fp, c, "--first-token-test", "Run exact CPU whole-model pass for the first prompt token."); opt(fp, c, "--metal-graph-test", "Compare first GPU-resident graph stages with CPU."); diff --git a/ds4_metal.m b/ds4_metal.m index 7e3f8bd5c..c43762e0e 100644 --- a/ds4_metal.m +++ b/ds4_metal.m @@ -17050,6 +17050,13 @@ static void ds4_gpu_fill_raw_decode_batch_mask( } } +static void ds4_gpu_fill_raw_decode_batch_all_mask( + uint16_t *mask, + uint32_t n_tokens, + uint32_t n_raw) { + memset(mask, 0, (size_t)n_tokens * n_raw * sizeof(mask[0])); +} + static void ds4_gpu_fill_mixed_decode_batch_mask( uint16_t *mask, 
uint32_t n_tokens, @@ -18432,6 +18439,7 @@ static int ds4_gpu_encode_flash_attention_decode_raw_batch_heads( uint32_t raw_cap, uint32_t raw_start, uint32_t window, + bool noncausal, uint32_t n_head, uint32_t head_dim) { if (head_dim != 512 || n_head == 0 || n_tokens == 0 || @@ -18528,11 +18536,17 @@ static int ds4_gpu_encode_flash_attention_decode_raw_batch_heads( return 0; } - ds4_gpu_fill_raw_decode_batch_mask((uint16_t *)[mask_buffer contents], - n_tokens, - n_raw, - pos0, - window); + if (noncausal) { + ds4_gpu_fill_raw_decode_batch_all_mask((uint16_t *)[mask_buffer contents], + n_tokens, + n_raw); + } else { + ds4_gpu_fill_raw_decode_batch_mask((uint16_t *)[mask_buffer contents], + n_tokens, + n_raw, + pos0, + window); + } id pad_pipeline = nil; if (has_kvpad) { @@ -18693,6 +18707,7 @@ static int ds4_gpu_encode_flash_attention_decode_mixed_batch_heads( raw_cap, raw_start, window, + false, n_head, head_dim); } @@ -19052,6 +19067,7 @@ int ds4_gpu_attention_decode_raw_batch_heads_tensor( raw_cap, raw_start, window, + false, n_head, head_dim)) { return 0; @@ -19063,6 +19079,66 @@ int ds4_gpu_attention_decode_raw_batch_heads_tensor( return 1; } +int ds4_gpu_attention_decode_raw_batch_heads_noncausal_tensor( + ds4_gpu_tensor *heads, + const void *model_map, + uint64_t model_size, + uint64_t sinks_offset, + const ds4_gpu_tensor *q, + const ds4_gpu_tensor *raw_kv, + uint32_t n_tokens, + uint32_t n_raw, + uint32_t raw_cap, + uint32_t raw_start, + uint32_t n_head, + uint32_t head_dim) { + if (!g_initialized && !ds4_gpu_init()) return 0; + if (!heads || !q || !raw_kv || !model_map || n_tokens == 0 || + n_raw == 0 || raw_cap < n_raw || raw_start >= raw_cap) { + return 0; + } + + @autoreleasepool { + if (sinks_offset > model_size || (uint64_t)n_head * sizeof(float) > model_size - sinks_offset) { + fprintf(stderr, "ds4: Metal attention sinks range is outside the mapped model\n"); + return 0; + } + + uint64_t sinks_inner = 0; + id sinks_buf = 
ds4_gpu_wrap_model_range(model_map, model_size, + sinks_offset, + (uint64_t)n_head * sizeof(float), + &sinks_inner); + if (!sinks_buf) return 0; + + int owned = 0; + id cb = ds4_gpu_command_buffer(&owned); + if (!cb) return 0; + + if (!ds4_gpu_encode_flash_attention_decode_raw_batch_heads(cb, + heads, + sinks_buf, + (NSUInteger)sinks_inner, + q, + raw_kv, + n_tokens, + 0, + n_raw, + raw_cap, + raw_start, + 0, + true, + n_head, + head_dim)) { + return 0; + } + + if (!ds4_gpu_finish_command_buffer(cb, owned, "dspark noncausal batch attention heads")) return 0; + } + + return 1; +} + int ds4_gpu_attention_decode_mixed_batch_heads_tensor( ds4_gpu_tensor *heads, const void *model_map, diff --git a/ds4_server.c b/ds4_server.c index 34a9d5084..8888de22a 100644 --- a/ds4_server.c +++ b/ds4_server.c @@ -44,6 +44,12 @@ static volatile sig_atomic_t g_listen_fd = -1; #define DS4_SERVER_IO_TIMEOUT_SEC 10 #define DS4_SERVER_SEND_STALL_TIMEOUT_MS 2000 +#define DS4_THINKING_STABLE_TEMPERATURE 0.6f +#define DS4_THINKING_STABLE_TOP_K 40 +#define DS4_THINKING_STABLE_TOP_P 0.95f +#define DS4_THINKING_STABLE_MIN_P 0.03f +#define DS4_REPEAT_GUARD_MIN_TOKENS 32 +#define DS4_REPEAT_GUARD_MAX_NGRAM 64 static void stop_signal_handler(int sig) { (void)sig; @@ -496,6 +502,13 @@ typedef enum { API_RESPONSES, } api_style; +typedef enum { + DS4_STEERING_POLICY_ALWAYS, + DS4_STEERING_POLICY_DECODING, + DS4_STEERING_POLICY_FINAL_ANSWER, + DS4_STEERING_POLICY_OFF, +} directional_steering_policy; + static void random_tool_id(char *dst, size_t dstlen, api_style api) { static uint64_t fallback_ctr; unsigned char bytes[16]; @@ -646,6 +659,64 @@ typedef struct { tool_replay_stats tool_replay; } request; +static void stable_tool_id_hash_bytes(uint64_t *h1, uint64_t *h2, + const void *ptr, size_t len) { + const unsigned char *p = ptr; + for (size_t i = 0; i < len; i++) { + *h1 ^= (uint64_t)p[i]; + *h1 *= 1099511628211ULL; + *h2 ^= (uint64_t)p[i] + 0x9e3779b97f4a7c15ULL + (*h2 << 6) + (*h2 >> 2); + } +} + 
+static void stable_tool_id_hash_field(uint64_t *h1, uint64_t *h2, + const char *value) { + if (value && value[0]) stable_tool_id_hash_bytes(h1, h2, value, strlen(value)); + unsigned char sep = 0xff; + stable_tool_id_hash_bytes(h1, h2, &sep, 1); +} + +static void stable_tool_id_hash_u64(uint64_t *h1, uint64_t *h2, + uint64_t value) { + char buf[32]; + snprintf(buf, sizeof(buf), "%llu", (unsigned long long)value); + stable_tool_id_hash_field(h1, h2, buf); +} + +static void stable_tool_id_hash_float(uint64_t *h1, uint64_t *h2, + float value) { + char buf[32]; + snprintf(buf, sizeof(buf), "%.9g", (double)value); + stable_tool_id_hash_field(h1, h2, buf); +} + +static void deterministic_tool_id(char *dst, size_t dstlen, + const request *r, api_style api, + int index, const char *name, + int attempt) { + const char *prefix = api == API_ANTHROPIC ? "toolu_" : "call_"; + uint64_t h1 = 1469598103934665603ULL; + uint64_t h2 = 0x84222325cbf29ce4ULL; + + stable_tool_id_hash_field(&h1, &h2, "ds4-tool-id-v1"); + stable_tool_id_hash_field(&h1, &h2, api == API_ANTHROPIC ? "anthropic" : "openai"); + stable_tool_id_hash_u64(&h1, &h2, r ? r->seed : 0); + stable_tool_id_hash_field(&h1, &h2, r ? r->model : NULL); + stable_tool_id_hash_field(&h1, &h2, r ? r->prompt_text : NULL); + stable_tool_id_hash_u64(&h1, &h2, r ? (uint64_t)r->max_tokens : 0); + stable_tool_id_hash_u64(&h1, &h2, r ? (uint64_t)r->top_k : 0); + stable_tool_id_hash_float(&h1, &h2, r ? r->temperature : 0.0f); + stable_tool_id_hash_float(&h1, &h2, r ? r->top_p : 0.0f); + stable_tool_id_hash_float(&h1, &h2, r ? r->min_p : 0.0f); + stable_tool_id_hash_u64(&h1, &h2, r ? 
(uint64_t)r->think_mode : 0); + stable_tool_id_hash_u64(&h1, &h2, (uint64_t)index); + stable_tool_id_hash_u64(&h1, &h2, (uint64_t)attempt); + stable_tool_id_hash_field(&h1, &h2, name); + + snprintf(dst, dstlen, "%s%016llx%016llx", + prefix, (unsigned long long)h1, (unsigned long long)h2); +} + static void tool_call_free(tool_call *tc) { free(tc->id); free(tc->name); @@ -5106,8 +5177,9 @@ static bool openai_tool_stream_has_id(const openai_tool_stream *ts, return false; } -static const char *openai_tool_stream_id(server *s, openai_tool_stream *ts, - int index) { +static const char *openai_tool_stream_id(server *s, const request *r, + openai_tool_stream *ts, + int index, const char *name) { if (!ts || index < 0) return ""; if (index >= ts->ids_cap) { int old = ts->ids_cap; @@ -5119,10 +5191,17 @@ static const char *openai_tool_stream_id(server *s, openai_tool_stream *ts, } if (!ts->ids[index]) { char id[64]; - for (;;) { - random_tool_id(id, sizeof(id), API_OPENAI); - if (!openai_tool_stream_has_id(ts, id, index) && - !tool_memory_has_id(s, id)) break; + if (r && r->seed) { + for (int attempt = 0;; attempt++) { + deterministic_tool_id(id, sizeof(id), r, API_OPENAI, index, name, attempt); + if (!openai_tool_stream_has_id(ts, id, index)) break; + } + } else { + for (;;) { + random_tool_id(id, sizeof(id), API_OPENAI); + if (!openai_tool_stream_has_id(ts, id, index) && + !tool_memory_has_id(s, id)) break; + } } ts->ids[index] = xstrdup(id); } @@ -5286,6 +5365,19 @@ static size_t dsml_max_tool_start_len(void) { return max; } +static bool dsml_text_ends_with_partial_tool_start(const char *raw, size_t raw_len) { + if (!raw || raw_len == 0) return false; + for (size_t i = 0; i < sizeof(dsml_syntaxes) / sizeof(dsml_syntaxes[0]); i++) { + const char *lit = dsml_syntaxes[i].tool_calls_start; + const size_t lit_len = strlen(lit); + const size_t max = raw_len < lit_len ? 
raw_len : lit_len - 1; + for (size_t n = 2; n <= max; n++) { + if (!memcmp(raw + raw_len - n, lit, n)) return true; + } + } + return false; +} + static bool dsml_find_tool_start(const char *raw, size_t raw_len, size_t *pos_out, const dsml_syntax **syn_out) { @@ -5722,7 +5814,7 @@ static bool openai_tool_start_invoke(int fd, server *s, const request *r, const free(tag); if (!name) return openai_tool_stream_fail(ts); - const char *tool_id = openai_tool_stream_id(s, ts, ts->index); + const char *tool_id = openai_tool_stream_id(s, r, ts, ts->index, name); bool ok = sse_chat_tool_call_start_delta(fd, r, id, ts->index, tool_id, name) && openai_tool_emit_args_fragment(fd, r, id, ts, "{", 1); free(name); @@ -7713,6 +7805,7 @@ static void id_list_push_unique(stop_list *ids, const char *id); struct server { ds4_engine *engine; ds4_session *session; + directional_steering_policy steering_policy; int default_tokens; kv_disk_cache kv; tool_memory tool_mem; @@ -8190,14 +8283,22 @@ static bool tool_calls_contains_id(const tool_calls *calls, const char *id, int return false; } -static void assign_tool_call_ids(server *s, tool_calls *calls, api_style api) { +static void assign_tool_call_ids(server *s, const request *r, + tool_calls *calls, api_style api) { if (!calls) return; for (int i = 0; i < calls->len; i++) { if (calls->v[i].id && calls->v[i].id[0]) continue; char id[64]; - for (;;) { - random_tool_id(id, sizeof(id), api); - if (!tool_calls_contains_id(calls, id, i) && !tool_memory_has_id(s, id)) break; + if (r && r->seed) { + for (int attempt = 0;; attempt++) { + deterministic_tool_id(id, sizeof(id), r, api, i, calls->v[i].name, attempt); + if (!tool_calls_contains_id(calls, id, i)) break; + } + } else { + for (;;) { + random_tool_id(id, sizeof(id), api); + if (!tool_calls_contains_id(calls, id, i) && !tool_memory_has_id(s, id)) break; + } } calls->v[i].id = xstrdup(id); } @@ -9396,6 +9497,20 @@ typedef struct { int tail_len; } thinking_state; +typedef struct { + float 
temperature; + int top_k; + float top_p; + float min_p; +} decode_sampling; + +typedef struct { + int *tokens; + size_t *ends; + int len; + int cap; +} decode_repetition_guard; + static bool thinking_tail_ends_with(const thinking_state *st, const char *s) { int n = (int)strlen(s); return st->tail_len >= n && !memcmp(st->tail + st->tail_len - n, s, (size_t)n); @@ -9414,6 +9529,104 @@ static void thinking_state_feed(thinking_state *st, const char *p, size_t len) { } } +static decode_sampling effective_decode_sampling(const request *r, + dsml_decode_state dsml_state) { + decode_sampling p = { + .temperature = r ? r->temperature : 1.0f, + .top_k = r ? r->top_k : 0, + .top_p = r ? r->top_p : 1.0f, + .min_p = r ? r->min_p : 0.0f, + }; + + if (r && ds4_think_mode_enabled(r->think_mode)) { + if (p.temperature <= 0.0f || p.temperature > DS4_THINKING_STABLE_TEMPERATURE) { + p.temperature = DS4_THINKING_STABLE_TEMPERATURE; + } + if (p.top_k <= 0 || p.top_k > DS4_THINKING_STABLE_TOP_K) { + p.top_k = DS4_THINKING_STABLE_TOP_K; + } + if (p.top_p <= 0.0f || p.top_p > DS4_THINKING_STABLE_TOP_P) { + p.top_p = DS4_THINKING_STABLE_TOP_P; + } + if (p.min_p < DS4_THINKING_STABLE_MIN_P) { + p.min_p = DS4_THINKING_STABLE_MIN_P; + } + } + + if (dsml_decode_state_is_tool(dsml_state) && + !dsml_decode_state_uses_payload_sampling(dsml_state)) + { + p.temperature = 0.0f; + p.top_k = 0; + p.top_p = 1.0f; + p.min_p = 0.0f; + } + + return p; +} + +static void decode_repetition_guard_free(decode_repetition_guard *g) { + if (!g) return; + free(g->tokens); + free(g->ends); + memset(g, 0, sizeof(*g)); +} + +static void decode_repetition_guard_push(decode_repetition_guard *g, + int token, + size_t text_end) { + if (g->len == g->cap) { + int new_cap = g->cap ? 
g->cap * 2 : 128; + g->tokens = xrealloc(g->tokens, (size_t)new_cap * sizeof(g->tokens[0])); + g->ends = xrealloc(g->ends, (size_t)new_cap * sizeof(g->ends[0])); + g->cap = new_cap; + } + g->tokens[g->len] = token; + g->ends[g->len] = text_end; + g->len++; +} + +static int decode_repetition_required_repeats(int width) { + if (width <= 1) return 8; + if (width <= 3) return 6; + return 4; +} + +static bool decode_repetition_guard_observe( + decode_repetition_guard *g, + int token, + size_t text_end, + int *out_width, + int *out_repeats, + size_t *out_trim_len) { + if (!g) return false; + decode_repetition_guard_push(g, token, text_end); + if (g->len < DS4_REPEAT_GUARD_MIN_TOKENS) return false; + + int max_width = g->len / 2; + if (max_width > DS4_REPEAT_GUARD_MAX_NGRAM) max_width = DS4_REPEAT_GUARD_MAX_NGRAM; + for (int width = 1; width <= max_width; width++) { + int repeats = 1; + while ((repeats + 1) * width <= g->len && + memcmp(g->tokens + g->len - width, + g->tokens + g->len - (repeats + 1) * width, + (size_t)width * sizeof(g->tokens[0])) == 0) + { + repeats++; + } + const int required = decode_repetition_required_repeats(width); + if (repeats >= required) { + const int keep = g->len - width * (repeats - 1); + if (out_width) *out_width = width; + if (out_repeats) *out_repeats = repeats; + if (out_trim_len) *out_trim_len = keep > 0 ? 
g->ends[keep - 1] : 0; + return true; + } + } + + return false; +} + static thinking_state thinking_state_from_prompt(const request *r) { thinking_state st = {0}; if (r && r->prompt_text) { @@ -9424,6 +9637,89 @@ static thinking_state thinking_state_from_prompt(const request *r) { return st; } +static const char *directional_steering_policy_name(directional_steering_policy policy) { + switch (policy) { + case DS4_STEERING_POLICY_ALWAYS: return "always"; + case DS4_STEERING_POLICY_DECODING: return "decoding"; + case DS4_STEERING_POLICY_FINAL_ANSWER: return "final-answer"; + case DS4_STEERING_POLICY_OFF: return "off"; + } + return "unknown"; +} + +static bool request_has_tool_result_context(const request *r) { + return r && r->prompt_text && strstr(r->prompt_text, "") != NULL; +} + +static bool directional_steering_final_answer_context(const request *r, + bool responses_live_continuation, + bool anthropic_live_continuation) { + if (!r) return false; + if (r->kind != REQ_CHAT) return true; + if (!r->has_tools) return true; + return responses_live_continuation || + anthropic_live_continuation || + request_has_tool_result_context(r); +} + +static bool text_has_nonspace(const char *p, size_t len) { + if (!p) return false; + for (size_t i = 0; i < len; i++) { + if (!isspace((unsigned char)p[i])) return true; + } + return false; +} + +static bool directional_steering_should_apply( + directional_steering_policy policy, + bool final_answer_context, + bool saw_final_answer_text, + bool thinking_before, + bool thinking_after, + dsml_decode_state dsml_before, + dsml_decode_state dsml_after, + bool partial_tool_start, + const char *piece, + size_t piece_len, + bool *starts_final_answer_out) { + if (starts_final_answer_out) *starts_final_answer_out = false; + if (policy == DS4_STEERING_POLICY_ALWAYS) return true; + if (policy == DS4_STEERING_POLICY_DECODING) return true; + if (policy == DS4_STEERING_POLICY_OFF) return false; + + if (!final_answer_context) return false; + if 
(thinking_before || thinking_after) return false; + if (dsml_decode_state_is_tool(dsml_before) || + dsml_decode_state_is_tool(dsml_after) || + partial_tool_start) + { + return false; + } + + const bool starts = text_has_nonspace(piece, piece_len); + if (starts_final_answer_out) *starts_final_answer_out = starts; + return saw_final_answer_text || starts; +} + +static void server_apply_directional_steering(server *s, bool enable) { + if (!s || !s->session) return; + if (enable) { + ds4_session_use_engine_directional_steering(s->session); + } else { + ds4_session_set_directional_steering(s->session, 0.0f, 0.0f); + } +} + +static void server_apply_prefill_directional_steering(server *s) { + server_apply_directional_steering( + s, s && s->steering_policy == DS4_STEERING_POLICY_ALWAYS); +} + +static void server_apply_decode_directional_steering(server *s) { + server_apply_directional_steering( + s, s && (s->steering_policy == DS4_STEERING_POLICY_ALWAYS || + s->steering_policy == DS4_STEERING_POLICY_DECODING)); +} /* Live recovery for a tool call started inside an unclosed block. * * The model sometimes opens a DSML stanza without closing its thinking first. 
@@ -10192,6 +10488,7 @@ static void generate_job(server *s, job *j) { req_flags); ds4_session_set_progress(s->session, server_progress_cb, &progress); ds4_session_set_display_progress(s->session, server_progress_cb, &progress); + server_apply_prefill_directional_steering(s); int cold_store_len = 0; if (cached == 0 && @@ -10385,6 +10682,15 @@ static void generate_job(server *s, job *j) { getenv("DS4_SERVER_DISABLE_THINK_TOOL_RECOVERY") == NULL; dsml_decode_tracker dsml_tracker; dsml_decode_tracker_init(&dsml_tracker); + decode_repetition_guard repeat_guard = {0}; + const bool dynamic_steering = + s->steering_policy == DS4_STEERING_POLICY_FINAL_ANSWER; + const bool final_answer_context = + directional_steering_final_answer_context(&j->req, + responses_live_continuation, + anthropic_live_continuation); + bool saw_final_answer_text = false; + server_apply_decode_directional_steering(s); while (!g_stop_requested && completion < max_tokens && ds4_session_pos(s->session) < ds4_session_ctx(s->session)) { @@ -10394,20 +10700,13 @@ static void generate_job(server *s, job *j) { if (!(j->req.kind == REQ_CHAT && j->req.has_tools && (saw_tool_start || in_tool_call))) { kv_cache_maybe_store_continued(s); } - float temperature = j->req.temperature; - int top_k = j->req.top_k; - float top_p = j->req.top_p; - float min_p = j->req.min_p; - if (ds4_think_mode_enabled(j->req.think_mode)) { - temperature = DS4_DEFAULT_TEMPERATURE; - top_k = 0; - top_p = DS4_DEFAULT_TOP_P; - min_p = DS4_DEFAULT_MIN_P; - } - if (in_tool_call && !dsml_decode_state_uses_payload_sampling(dsml_state)) { - temperature = 0.0f; - } - int token = ds4_session_sample(s->session, temperature, top_k, top_p, min_p, &rng); + decode_sampling sampling = effective_decode_sampling(&j->req, dsml_state); + int token = ds4_session_sample(s->session, + sampling.temperature, + sampling.top_k, + sampling.top_p, + sampling.min_p, + &rng); if (token == ds4_token_eos(s->engine)) { finish = "stop"; break; @@ -10415,9 +10714,11 @@ 
static void generate_job(server *s, job *j) { int toks[17]; int ntok = 0; - if (temperature <= 0.0f && + bool toks_evaluated = false; + if (sampling.temperature <= 0.0f && ds4_engine_mtp_draft_tokens(s->engine) > 1 && - getenv("DS4_MTP_SPEC_DISABLE") == NULL) + getenv("DS4_MTP_SPEC_DISABLE") == NULL && + !dynamic_steering) { ntok = ds4_session_eval_speculative_argmax(s->session, token, @@ -10431,11 +10732,8 @@ static void generate_job(server *s, job *j) { finish = "error"; break; } + toks_evaluated = true; } else { - if (ds4_session_eval(s->session, token, err, sizeof(err)) != 0) { - finish = "error"; - break; - } toks[0] = token; ntok = 1; } @@ -10451,15 +10749,111 @@ static void generate_job(server *s, job *j) { size_t piece_len = 0; char *piece = ds4_token_text(s->engine, token, &piece_len); + thinking_state next_thinking = thinking; + dsml_decode_tracker next_dsml_tracker = dsml_tracker; + dsml_decode_state next_dsml_state = dsml_state; + bool starts_final_answer = false; + + if (!toks_evaluated) { + if (dynamic_steering) { + const bool thinking_before = thinking.inside; + thinking_state_feed(&next_thinking, piece, piece_len); + bool partial_tool_start = false; + if (j->req.kind == REQ_CHAT && j->req.has_tools) { + const size_t old_len = text.len; + buf_append(&text, piece, piece_len); + dsml_decode_tracker_update(&next_dsml_tracker, + text.ptr, text.len); + next_dsml_state = next_dsml_tracker.decode; + partial_tool_start = + dsml_text_ends_with_partial_tool_start(text.ptr, + text.len); + text.len = old_len; + if (text.ptr) text.ptr[text.len] = '\0'; + } + const bool steer_token = directional_steering_should_apply( + s->steering_policy, + final_answer_context, + saw_final_answer_text, + thinking_before, + next_thinking.inside, + dsml_state, + next_dsml_state, + partial_tool_start, + piece, + piece_len, + &starts_final_answer); + server_apply_directional_steering(s, steer_token); + } + int eval_rc = dynamic_steering ? 
+ ds4_session_eval_no_mtp(s->session, token, err, sizeof(err)) : + ds4_session_eval(s->session, token, err, sizeof(err)); + if (eval_rc != 0) { + finish = "error"; + free(piece); + stop_decode = true; + break; + } + } completion++; trace_piece(s, trace_id, piece, piece_len); buf_append(&text, piece, piece_len); - thinking_state_feed(&thinking, piece, piece_len); - if (j->req.kind == REQ_CHAT && j->req.has_tools) { + if (dynamic_steering) { + thinking = next_thinking; + dsml_tracker = next_dsml_tracker; + if (starts_final_answer) saw_final_answer_text = true; + } else { + thinking_state_feed(&thinking, piece, piece_len); + } + if (!dynamic_steering && + j->req.kind == REQ_CHAT && j->req.has_tools) { dsml_decode_tracker_update(&dsml_tracker, text.ptr, text.len); } + int repeat_width = 0; + int repeat_count = 0; + size_t repeat_trim_len = text.len; + if (decode_repetition_guard_observe(&repeat_guard, + token, + text.len, + &repeat_width, + &repeat_count, + &repeat_trim_len)) { + server_log(DS4_LOG_WARNING, + "ds4-server: %s ctx=%s stopped repetitive decode after %d generated tokens ngram=%d repeats=%d", + j->req.kind == REQ_CHAT ? 
"chat" : "completion", + ctx_span, + completion, + repeat_width, + repeat_count); + trace_event(s, trace_id, + "repetition guard stopped decode: gen=%d ngram=%d repeats=%d", + completion, + repeat_width, + repeat_count); + size_t min_trim_len = 0; + if (j->req.stream) { + min_trim_len = plain_stream_pos; + if (openai_live_chat && openai_live.emit_pos > min_trim_len) { + min_trim_len = openai_live.emit_pos; + } + if (j->req.api == API_ANTHROPIC && anthropic_live.emit_pos > min_trim_len) { + min_trim_len = anthropic_live.emit_pos; + } + } + if (repeat_trim_len < min_trim_len) repeat_trim_len = min_trim_len; + if (repeat_trim_len < text.len) { + text.len = repeat_trim_len; + text.ptr[text.len] = '\0'; + } + ds4_session_invalidate(s->session); + finish = "stop"; + free(piece); + stop_decode = true; + break; + } + size_t stop_pos = 0, stop_len = 0; bool hit_stop = stop_list_find_from(&j->req.stops, text.ptr, stop_scan_from, @@ -10835,7 +11229,7 @@ static void generate_job(server *s, job *j) { if (openai_live_chat) apply_openai_stream_tool_ids(&parsed_calls, &openai_live); if (j->req.api == API_ANTHROPIC && j->req.stream) apply_anthropic_stream_tool_ids(&parsed_calls, &anthropic_live); - assign_tool_call_ids(s, &parsed_calls, j->req.api); + assign_tool_call_ids(s, &j->req, &parsed_calls, j->req.api); tool_memory_remember(s, &parsed_calls); final_finish = "tool_calls"; } else if (j->req.api == API_RESPONSES) { @@ -11034,6 +11428,7 @@ static void generate_job(server *s, job *j) { responses_stream_free(&responses_live); buf_free(&text); ds4_tokens_free(&effective_prompt); + decode_repetition_guard_free(&repeat_guard); } static bool enqueue(server *s, job *j) { @@ -11281,8 +11676,19 @@ static void *client_main(void *arg) { request req; char err[160]; bool ok = false; + bool count_tokens_only = false; const int ctx_size = ds4_session_ctx(s->session); - if (!strcmp(hr.method, "POST") && !strcmp(hr.path, "/v1/messages")) { + if (!strcmp(hr.method, "POST") && !strcmp(hr.path, 
"/v1/messages/count_tokens")) { + /* Anthropic's count_tokens endpoint takes the same request shape as + * /v1/messages but only returns the prompt token total — no inference + * runs, so we short-circuit before the worker queue. Pass a NULL + * server so parse_anthropic_request skips the tool-memory and + * KV-cache lookups it would normally do; both helpers no-op cleanly + * on NULL, leaving shared state untouched for a read-only count. */ + ok = parse_anthropic_request(s->engine, NULL, hr.body, s->default_tokens, + ctx_size, &req, err, sizeof(err)); + if (ok) count_tokens_only = true; + } else if (!strcmp(hr.method, "POST") && !strcmp(hr.path, "/v1/messages")) { ok = parse_anthropic_request(s->engine, s, hr.body, s->default_tokens, ctx_size, &req, err, sizeof(err)); } else if (!strcmp(hr.method, "POST") && !strcmp(hr.path, "/v1/chat/completions")) { @@ -11315,6 +11721,14 @@ static void *client_main(void *arg) { goto done; } + if (count_tokens_only) { + char body[64]; + snprintf(body, sizeof(body), "{\"input_tokens\":%d}", req.prompt.len); + http_response(fd, s->enable_cors, 200, "application/json", body); + request_free(&req); + goto done; + } + set_client_socket_nonblocking(fd); job j; memset(&j, 0, sizeof(j)); @@ -11398,6 +11812,7 @@ typedef struct { const char *kv_disk_dir; uint64_t kv_disk_space_mb; kv_cache_options kv_cache; + directional_steering_policy steering_policy; bool kv_cache_reject_different_quant; bool disable_exact_dsml_tool_replay; int tool_memory_max_ids; @@ -11510,6 +11925,28 @@ static ds4_backend default_server_backend(void) { #endif } +static directional_steering_policy parse_directional_steering_policy_arg( + const char *s, + const char *arg) { + if (!strcmp(s, "always")) return DS4_STEERING_POLICY_ALWAYS; + if (!strcmp(s, "decoding") || !strcmp(s, "decode")) { + return DS4_STEERING_POLICY_DECODING; + } + if (!strcmp(s, "final-answer") || + !strcmp(s, "final") || + !strcmp(s, "tool-safe")) + { + return DS4_STEERING_POLICY_FINAL_ANSWER; + } 
+ if (!strcmp(s, "off") || !strcmp(s, "none")) { + return DS4_STEERING_POLICY_OFF; + } + server_log(DS4_LOG_DEFAULT, "ds4-server: invalid %s value: %s", arg, s); + server_log(DS4_LOG_DEFAULT, + "ds4-server: valid directional steering policies are: final-answer, decoding, always, off"); + exit(2); +} + static server_config parse_options(int argc, char **argv) { server_config c = { .engine = { @@ -11522,6 +11959,7 @@ static server_config parse_options(int argc, char **argv) { .port = 8000, .ctx_size = 32768, .default_tokens = 393216, + .steering_policy = DS4_STEERING_POLICY_FINAL_ANSWER, .tool_memory_max_ids = DS4_TOOL_MEMORY_DEFAULT_MAX_IDS, }; c.kv_cache = kv_cache_default_options(); @@ -11650,6 +12088,9 @@ static server_config parse_options(int argc, char **argv) { } else if (!strcmp(arg, "--dir-steering-attn")) { c.engine.directional_steering_attn = parse_float_arg(need_arg(&i, argc, argv, arg), arg, -100.0f, 100.0f); directional_steering_scale_set = true; + } else if (!strcmp(arg, "--dir-steering-policy")) { + c.steering_policy = + parse_directional_steering_policy_arg(need_arg(&i, argc, argv, arg), arg); } else if (!strcmp(arg, "--warm-weights")) { c.engine.warm_weights = true; } else if (!strcmp(arg, "--metal")) { @@ -11736,6 +12177,7 @@ int main(int argc, char **argv) { memset(&s, 0, sizeof(s)); s.engine = engine; s.session = session; + s.steering_policy = cfg.steering_policy; s.default_tokens = cfg.default_tokens; s.disable_exact_dsml_tool_replay = cfg.disable_exact_dsml_tool_replay; s.tool_mem.max_entries = cfg.tool_memory_max_ids; @@ -11748,6 +12190,11 @@ int main(int argc, char **argv) { server_log(DS4_LOG_DEFAULT, "ds4-server: exact DSML tool replay disabled; tool history uses canonical JSON rendering"); } + if (s.steering_policy != DS4_STEERING_POLICY_ALWAYS) { + server_log(DS4_LOG_DEFAULT, + "ds4-server: directional steering policy=%s", + directional_steering_policy_name(s.steering_policy)); + } pthread_mutex_init(&s.mu, NULL); pthread_cond_init(&s.cv, 
NULL); pthread_cond_init(&s.clients_cv, NULL); @@ -13770,7 +14217,7 @@ static void test_tool_memory_replays_sampled_dsml(void) { server s; memset(&s, 0, sizeof(s)); pthread_mutex_init(&s.tool_mu, NULL); - assign_tool_call_ids(&s, &sampled, API_OPENAI); + assign_tool_call_ids(&s, NULL, &sampled, API_OPENAI); TEST_ASSERT(sampled.v[0].id != NULL); TEST_ASSERT(!strncmp(sampled.v[0].id, "call_", 5)); tool_memory_remember(&s, &sampled); @@ -13813,6 +14260,72 @@ static void test_tool_memory_replays_sampled_dsml(void) { pthread_mutex_destroy(&s.tool_mu); } +static void test_seeded_tool_ids_are_deterministic(void) { + server s; + memset(&s, 0, sizeof(s)); + pthread_mutex_init(&s.tool_mu, NULL); + + request r = {0}; + r.api = API_OPENAI; + r.seed = 42; + r.model = "deepseek-v4-flash"; + r.prompt_text = "prompt A"; + r.max_tokens = 64; + r.top_k = 40; + r.temperature = 0.6f; + r.top_p = 0.95f; + r.min_p = 0.0f; + r.think_mode = DS4_THINK_HIGH; + + tool_calls a = make_swapped_bash_call(); + tool_calls b = make_swapped_bash_call(); + assign_tool_call_ids(&s, &r, &a, API_OPENAI); + assign_tool_call_ids(&s, &r, &b, API_OPENAI); + TEST_ASSERT(a.v[0].id != NULL); + TEST_ASSERT(b.v[0].id != NULL); + TEST_ASSERT(!strcmp(a.v[0].id, b.v[0].id)); + TEST_ASSERT(!strncmp(a.v[0].id, "call_", 5)); + + tool_calls c = make_swapped_bash_call(); + r.prompt_text = "prompt B"; + assign_tool_call_ids(&s, &r, &c, API_OPENAI); + TEST_ASSERT(c.v[0].id != NULL); + TEST_ASSERT(strcmp(a.v[0].id, c.v[0].id)); + + tool_calls d = make_swapped_bash_call(); + r.prompt_text = "prompt A"; + r.api = API_ANTHROPIC; + assign_tool_call_ids(&s, &r, &d, API_ANTHROPIC); + TEST_ASSERT(d.v[0].id != NULL); + TEST_ASSERT(!strncmp(d.v[0].id, "toolu_", 6)); + + r.api = API_OPENAI; + openai_stream st1, st2, st3; + openai_stream_start(&r, &st1); + openai_stream_start(&r, &st2); + const char *sid1 = openai_tool_stream_id(&s, &r, &st1.tool, 0, "bash"); + const char *sid2 = openai_tool_stream_id(&s, &r, &st2.tool, 0, "bash"); + 
TEST_ASSERT(sid1 != NULL); + TEST_ASSERT(sid2 != NULL); + TEST_ASSERT(!strcmp(sid1, sid2)); + TEST_ASSERT(!strcmp(sid1, a.v[0].id)); + + r.seed = 43; + openai_stream_start(&r, &st3); + const char *sid3 = openai_tool_stream_id(&s, &r, &st3.tool, 0, "bash"); + TEST_ASSERT(sid3 != NULL); + TEST_ASSERT(strcmp(sid1, sid3)); + + openai_stream_free(&st1); + openai_stream_free(&st2); + openai_stream_free(&st3); + tool_calls_free(&a); + tool_calls_free(&b); + tool_calls_free(&c); + tool_calls_free(&d); + pthread_mutex_destroy(&s.tool_mu); +} + static void test_anthropic_tool_memory_replays_sampled_dsml(void) { const char *sampled_dsml = "\n\n" DS4_TOOL_CALLS_START "\n" @@ -14331,6 +14844,165 @@ static void test_dsml_decode_state_separates_structure_and_payload(void) { TEST_ASSERT(tracker.decode == DSML_DECODE_OUTSIDE); } +static void test_directional_steering_final_answer_policy_is_tool_safe(void) { + char *argv0[] = {"ds4-server"}; + server_config cfg = parse_options(1, argv0); + TEST_ASSERT(cfg.steering_policy == DS4_STEERING_POLICY_FINAL_ANSWER); + TEST_ASSERT(parse_directional_steering_policy_arg("decoding", "--dir-steering-policy") == + DS4_STEERING_POLICY_DECODING); + TEST_ASSERT(parse_directional_steering_policy_arg("decode", "--dir-steering-policy") == + DS4_STEERING_POLICY_DECODING); + TEST_ASSERT(!strcmp(directional_steering_policy_name(DS4_STEERING_POLICY_DECODING), + "decoding")); + + bool starts = true; + TEST_ASSERT(directional_steering_should_apply( + DS4_STEERING_POLICY_ALWAYS, + false, + false, + true, + false, + DSML_DECODE_STRUCTURAL, + DSML_DECODE_OUTSIDE, + true, + "", + 0, + &starts)); + TEST_ASSERT(starts == false); + + TEST_ASSERT(directional_steering_should_apply( + DS4_STEERING_POLICY_DECODING, + false, + false, + true, + true, + DSML_DECODE_STRUCTURAL, + DSML_DECODE_STRING_BODY, + true, + DS4_TOOL_CALLS_START, + strlen(DS4_TOOL_CALLS_START), + NULL)); + + starts = true; + TEST_ASSERT(!directional_steering_should_apply( + DS4_STEERING_POLICY_OFF, + 
true, + true, + false, + false, + DSML_DECODE_OUTSIDE, + DSML_DECODE_OUTSIDE, + false, + "answer", + strlen("answer"), + &starts)); + TEST_ASSERT(starts == false); + + starts = true; + TEST_ASSERT(!directional_steering_should_apply( + DS4_STEERING_POLICY_FINAL_ANSWER, + false, + false, + false, + false, + DSML_DECODE_OUTSIDE, + DSML_DECODE_OUTSIDE, + false, + "answer", + strlen("answer"), + &starts)); + TEST_ASSERT(starts == false); + + TEST_ASSERT(!directional_steering_should_apply( + DS4_STEERING_POLICY_FINAL_ANSWER, + true, + false, + true, + false, + DSML_DECODE_OUTSIDE, + DSML_DECODE_OUTSIDE, + false, + "", + strlen(""), + NULL)); + + TEST_ASSERT(!directional_steering_should_apply( + DS4_STEERING_POLICY_FINAL_ANSWER, + true, + false, + false, + false, + DSML_DECODE_STRUCTURAL, + DSML_DECODE_STRUCTURAL, + false, + DS4_TOOL_CALLS_START, + strlen(DS4_TOOL_CALLS_START), + NULL)); + + TEST_ASSERT(!directional_steering_should_apply( + DS4_STEERING_POLICY_FINAL_ANSWER, + true, + false, + false, + false, + DSML_DECODE_OUTSIDE, + DSML_DECODE_OUTSIDE, + true, + DS4_TOOL_CALLS_START, + strlen(DS4_TOOL_CALLS_START) - 2, + NULL)); + + starts = false; + TEST_ASSERT(directional_steering_should_apply( + DS4_STEERING_POLICY_FINAL_ANSWER, + true, + false, + false, + false, + DSML_DECODE_OUTSIDE, + DSML_DECODE_OUTSIDE, + false, + "answer", + strlen("answer"), + &starts)); + TEST_ASSERT(starts == true); + + starts = true; + TEST_ASSERT(directional_steering_should_apply( + DS4_STEERING_POLICY_FINAL_ANSWER, + true, + true, + false, + false, + DSML_DECODE_OUTSIDE, + DSML_DECODE_OUTSIDE, + false, + " ", + 1, + &starts)); + TEST_ASSERT(starts == false); + + request r = { + .kind = REQ_CHAT, + .has_tools = true, + .prompt_text = "user asks before any tool result", + }; + TEST_ASSERT(!directional_steering_final_answer_context(&r, false, false)); + TEST_ASSERT(directional_steering_final_answer_context(&r, true, false)); + r.prompt_text = "ok"; + 
TEST_ASSERT(directional_steering_final_answer_context(&r, false, false)); + r.has_tools = false; + r.prompt_text = NULL; + TEST_ASSERT(directional_steering_final_answer_context(&r, false, false)); + + request c = {.kind = REQ_COMPLETION}; + TEST_ASSERT(directional_steering_final_answer_context(&c, false, false)); + TEST_ASSERT(dsml_text_ends_with_partial_tool_start( + DS4_TOOL_CALLS_START, + strlen(DS4_TOOL_CALLS_START) - 2)); + TEST_ASSERT(!dsml_text_ends_with_partial_tool_start("plain", strlen("plain"))); +} + static void test_tool_memory_max_ids_prunes_oldest(void) { const char *a_dsml = "\n\n<|DSML|tool_calls>\n<|DSML|invoke name=\"bash\">\n<|DSML|parameter name=\"command\" string=\"true\">a\n\n"; const char *b_dsml = "\n\n<|DSML|tool_calls>\n<|DSML|invoke name=\"bash\">\n<|DSML|parameter name=\"command\" string=\"true\">b\n\n"; @@ -14459,6 +15131,92 @@ static void test_stop_list_streaming_holds_and_trims_stop_text(void) { free(stops.v); } +static void test_thinking_sampling_uses_stable_profile(void) { + request r; + request_init(&r, REQ_CHAT, 128); + r.think_mode = DS4_THINK_HIGH; + r.temperature = 1.0f; + r.top_k = 0; + r.top_p = 1.0f; + r.min_p = 0.0f; + + decode_sampling p = effective_decode_sampling(&r, DSML_DECODE_OUTSIDE); + TEST_ASSERT(p.temperature == DS4_THINKING_STABLE_TEMPERATURE); + TEST_ASSERT(p.top_k == DS4_THINKING_STABLE_TOP_K); + TEST_ASSERT(p.top_p == DS4_THINKING_STABLE_TOP_P); + TEST_ASSERT(p.min_p == DS4_THINKING_STABLE_MIN_P); + + r.temperature = 0.2f; + r.top_k = 8; + r.top_p = 0.5f; + r.min_p = 0.1f; + p = effective_decode_sampling(&r, DSML_DECODE_OUTSIDE); + TEST_ASSERT(p.temperature == 0.2f); + TEST_ASSERT(p.top_k == 8); + TEST_ASSERT(p.top_p == 0.5f); + TEST_ASSERT(p.min_p == 0.1f); + + p = effective_decode_sampling(&r, DSML_DECODE_STRUCTURAL); + TEST_ASSERT(p.temperature == 0.0f); + TEST_ASSERT(p.top_k == 0); + TEST_ASSERT(p.top_p == 1.0f); + TEST_ASSERT(p.min_p == 0.0f); + + request_free(&r); +} + +static void 
test_repetition_guard_stops_phrase_loop(void) { + decode_repetition_guard g = {0}; + int width = 0; + int repeats = 0; + size_t trim_len = 0; + bool stopped = false; + size_t text_len = 0; + + for (int i = 0; i < 20; i++) { + text_len++; + TEST_ASSERT(!decode_repetition_guard_observe(&g, 1000 + i, text_len, + &width, &repeats, &trim_len)); + } + for (int r = 0; r < 4 && !stopped; r++) { + for (int i = 0; i < 4; i++) { + text_len++; + stopped = decode_repetition_guard_observe(&g, 7 + i, text_len, + &width, &repeats, &trim_len); + if (stopped) break; + } + } + + TEST_ASSERT(stopped); + TEST_ASSERT(width == 4); + TEST_ASSERT(repeats == 4); + TEST_ASSERT(trim_len == 24); + decode_repetition_guard_free(&g); +} + +static void test_repetition_guard_allows_short_repeat(void) { + decode_repetition_guard g = {0}; + int width = 0; + int repeats = 0; + size_t trim_len = 0; + size_t text_len = 0; + + for (int i = 0; i < 24; i++) { + text_len++; + TEST_ASSERT(!decode_repetition_guard_observe(&g, 2000 + i, text_len, + &width, &repeats, &trim_len)); + } + for (int r = 0; r < 3; r++) { + for (int i = 0; i < 4; i++) { + text_len++; + TEST_ASSERT(!decode_repetition_guard_observe(&g, 11 + i, text_len, + &width, &repeats, &trim_len)); + } + } + + decode_repetition_guard_free(&g); +} + static char *test_nested_json_array(int depth) { buf b = {0}; for (int i = 0; i < depth; i++) buf_putc(&b, '['); @@ -15806,6 +16564,7 @@ static void ds4_server_unit_tests_run(void) { test_tool_checkpoint_suffix_is_future_prompt_canonical(); test_tool_checkpoint_minifies_json_parameters(); test_tool_memory_replays_sampled_dsml(); + test_seeded_tool_ids_are_deterministic(); test_anthropic_tool_memory_replays_sampled_dsml(); test_anthropic_live_tail_renders_tool_results_only(); test_anthropic_tool_result_id_validation(); @@ -15818,6 +16577,7 @@ static void ds4_server_unit_tests_run(void) { test_responses_visible_suffix_matches_client_replay(); test_exact_dsml_tool_replay_can_be_disabled(); 
test_dsml_decode_state_separates_structure_and_payload(); + test_directional_steering_final_answer_policy_is_tool_safe(); test_tool_memory_max_ids_prunes_oldest(); test_kv_tool_map_filters_by_dsml_text(); test_kv_tool_map_restores_before_prompt_render(); @@ -15830,6 +16590,9 @@ static void ds4_server_unit_tests_run(void) { test_dsml_prompt_escapes_tool_supplied_text(); test_stop_list_parses_all_sequences(); test_stop_list_streaming_holds_and_trims_stop_text(); + test_thinking_sampling_uses_stable_profile(); + test_repetition_guard_stops_phrase_loop(); + test_repetition_guard_allows_short_repeat(); test_json_skip_has_nesting_limit(); test_json_parser_handles_tool_heavy_requests(); test_json_string_handles_surrogates(); diff --git a/gguf-tools/README.md b/gguf-tools/README.md index f692a86d1..c5f0b406b 100644 --- a/gguf-tools/README.md +++ b/gguf-tools/README.md @@ -5,7 +5,7 @@ V4 Flash GGUF files for `ds4`. The important pieces are: -- `deepseek4-quantize.c`: C HF-safetensors to GGUF quantizer. +- `deepseek4-quantize.c`: C HF-safetensors/GGUF to GGUF quantizer. - `quants.[ch]`: the deliberately small local quantization implementation used by the quantizer. It implements the DS4 output formats we actually ship: `q8_0`, `q4_K`, `q2_K`, and `iq2_xxs`. @@ -13,6 +13,9 @@ The important pieces are: importance with `ds4`. - `quality-testing/`: prompts and scripts used to compare local GGUF variants against official DeepSeek V4 Flash continuations. +- `deepspec/ds4_deepspec.py`: validates DS4 target-cache exports against the + DeepSpec v2 manifest/index/shard contract and emits the DS4-side non-Markov + DeepSpec config scaffold before external training. ## Build @@ -108,6 +111,55 @@ gguf-tools/deepseek4-quantize \ `--compare-tensor` regenerates a single tensor and byte-compares it against the template or `--compare-gguf`. `--threads N` controls routed-expert workers. 
+## Re-quantize From An Existing GGUF + +`--source-gguf` can use an existing GGUF as the weight source instead of a +Hugging Face safetensors directory. This is useful when the source weights have +already been edited in GGUF form, such as CyberNeurova's abliterated Q8_0 +release. The source GGUF must have the same logical tensor names and shapes as +the template. F32, F16, BF16, and Q8_0 source tensors can be copied or +dequantized and re-quantized into the target recipe. + +Example: rebuild an abliterated Q8_0 source with the chat-v2 DS4 imatrix and +write 4096-byte-aligned tensor data: + +```sh +gguf-tools/deepseek4-quantize \ + --source-gguf gguf/cyberneurova-DeepSeek-V4-Flash-abliterated-Q8_0.gguf \ + --template gguf/DeepSeek-V4-Flash-IQ2XXS-w2Q2K-AProjQ8-SExpQ8-OutQ8-chat-v2-imatrix.gguf \ + --out gguf/cyberneurova-DeepSeek-V4-Flash-abliterated-IQ2XXS-w2Q2K-AProjQ8-SExpQ8-OutQ8-chat-v2-imatrix-aligned.gguf \ + --imatrix gguf/DeepSeek-V4-Flash-chat-v2-routed-moe-ds4.dat \ + --alignment 4096 +``` + +The output metadata writes `general.alignment` and preserves imatrix provenance +from the current run while dropping stale imatrix/alignment keys inherited from +the template. + +## Generate A DSpark/DeepSpec Draft GGUF + +Official DeepSeek-V4-Flash DSpark/DeepSpec Markov draft weights are stored in +separate Hugging Face safetensor shards under the `mtp.*` namespace. 
Convert +those shards into a DS4 auxiliary MTP GGUF with `--dspark-only`; the main Flash +template supplies tokenizer metadata, tensor order, and GGUF layout: + +```sh +gguf-tools/deepseek4-quantize \ + --hf gguf/dspark-hf \ + --template gguf/ds4flash.gguf \ + --out gguf/deepseek4.dspark.gguf \ + --dspark-only +``` + +The converter detects the official Markov layout from `mtp.0.main_proj.weight` +plus `mtp.2.markov_head.markov_w1.weight`, stores the rank-256 Markov weights +as F16, emits `deepseek4.dspark.*` metadata, and accepts the model repository +root `config.json` as a fallback when `inference/config.json` is not present. +Nonseq DSpark exports use `markov_rank=0` metadata and omit Markov/confidence +head tensors; the runtime still target-verifies every drafted block before +committing tokens. Use `--dry-run` before writing and +`--self-test-dspark-map` after changing tensor mapping rules. + ## When No Imatrix Is Given `iq2_xxs` requires an importance vector. If `--imatrix` is not provided and diff --git a/gguf-tools/deepseek4-quantize.c b/gguf-tools/deepseek4-quantize.c index 3955b4352..0cb76fe01 100644 --- a/gguf-tools/deepseek4-quantize.c +++ b/gguf-tools/deepseek4-quantize.c @@ -36,6 +36,8 @@ #include #include #include +#include +#include #if defined(_WIN32) #error "deepseek4-quantize.c currently targets POSIX systems" @@ -45,8 +47,16 @@ #define DS4_KV_QUANTIZE_IMATRIX_DATASET "quantize.imatrix.dataset" #define DS4_KV_QUANTIZE_IMATRIX_N_ENTRIES "quantize.imatrix.entries_count" #define DS4_KV_QUANTIZE_IMATRIX_N_CHUNKS "quantize.imatrix.chunks_count" +#define DS4_KV_GENERAL_ALIGNMENT "general.alignment" #define DS4_GGUF_DEFAULT_ALIGNMENT 32 +#define DS4_KV_DSPARK_N_MTP_LAYERS "deepseek4.dspark.n_mtp_layers" +#define DS4_KV_DSPARK_BLOCK_SIZE "deepseek4.dspark.block_size" +#define DS4_KV_DSPARK_NOISE_TOKEN_ID "deepseek4.dspark.noise_token_id" +#define DS4_KV_DSPARK_MARKOV_RANK "deepseek4.dspark.markov_rank" +#define DS4_KV_DSPARK_TARGET_LAYER_ID 
"deepseek4.dspark.target_layer_ids" +#define DS4_DSPARK_TARGET_LAYER_COUNT 3 + typedef enum { GGUF_TYPE_UINT8 = 0, GGUF_TYPE_INT8 = 1, @@ -142,6 +152,24 @@ static char *read_file(const char *path, size_t *len_out) { return buf; } +static char *read_optional_file(const char *path, size_t *len_out) { + FILE *fp = fopen(path, "rb"); + if (!fp) { + if (errno == ENOENT) return NULL; + die_errno("open", path); + } + if (fseeko(fp, 0, SEEK_END) != 0) die_errno("seek", path); + off_t n = ftello(fp); + if (n < 0) die_errno("tell", path); + if (fseeko(fp, 0, SEEK_SET) != 0) die_errno("seek", path); + char *buf = xmalloc((size_t)n + 1); + if (n && fread(buf, 1, (size_t)n, fp) != (size_t)n) die_errno("read", path); + buf[n] = '\0'; + fclose(fp); + if (len_out) *len_out = (size_t)n; + return buf; +} + static uint64_t read_u64_le_fp(FILE *fp, const char *what) { uint8_t b[8]; if (fread(b, 1, sizeof(b), fp) != sizeof(b)) { @@ -874,24 +902,28 @@ typedef enum { EXP_NONE, EXP_W1, EXP_W2, EXP_W3 } expert_part; typedef struct { bool is_expert; + bool is_mtp; int layer; expert_part part; } expert_tensor; -static expert_tensor parse_expert_tensor(const char *name) { - expert_tensor e = {0}; +static bool parse_expert_tensor_as(const char *name, const char *fmt, bool is_mtp, expert_tensor *out) { int layer = -1; char kind[16]; int rest = 0; - if (sscanf(name, "blk.%d.ffn_%15[^_]_exps.weight%n", &layer, kind, &rest) == 2 - && rest == (int)strlen(name)) - { - if (strcmp(kind, "gate") == 0 || strcmp(kind, "down") == 0 || strcmp(kind, "up") == 0) { - e.is_expert = true; - e.layer = layer; - e.part = strcmp(kind, "gate") == 0 ? EXP_W1 : strcmp(kind, "down") == 0 ? EXP_W2 : EXP_W3; - } - } + if (sscanf(name, fmt, &layer, kind, &rest) != 2 || rest != (int)strlen(name)) return false; + if (strcmp(kind, "gate") != 0 && strcmp(kind, "down") != 0 && strcmp(kind, "up") != 0) return false; + out->is_expert = true; + out->is_mtp = is_mtp; + out->layer = layer; + out->part = strcmp(kind, "gate") == 0 ? 
EXP_W1 : strcmp(kind, "down") == 0 ? EXP_W2 : EXP_W3; + return true; +} + +static expert_tensor parse_expert_tensor(const char *name) { + expert_tensor e = {0}; + if (parse_expert_tensor_as(name, "blk.%d.ffn_%15[^_]_exps.weight%n", false, &e)) return e; + if (parse_expert_tensor_as(name, "mtp.%d.ffn_%15[^_]_exps.weight%n", true, &e)) return e; return e; } @@ -905,6 +937,16 @@ static const char *expert_part_name(expert_part p) { return ""; } +static void expert_hf_prefix(char *buf, size_t cap, + const expert_tensor *e, int xid, + const char *wid) { + if (e->is_mtp) { + snprintf(buf, cap, "mtp.%d.ffn.experts.%d.%s", e->layer, xid, wid); + } else { + snprintf(buf, cap, "layers.%d.ffn.experts.%d.%s", e->layer, xid, wid); + } +} + typedef struct { const char *gguf; const char *hf; @@ -950,34 +992,203 @@ static const name_map layer_map[] = { { "ffn_up_shexp.weight", "ffn.shared_experts.w3.weight" }, { "ffn_down_shexp.weight", "ffn.shared_experts.w2.weight" }, { "ffn_gate_inp.weight", "ffn.gate.weight" }, + { "ffn_gate_exps.weight", "ffn.experts.*.w1.weight" }, + { "ffn_up_exps.weight", "ffn.experts.*.w3.weight" }, + { "ffn_down_exps.weight", "ffn.experts.*.w2.weight" }, { "exp_probs_b.bias", "ffn.gate.bias" }, { "ffn_gate_tid2eid.weight", "ffn.gate.tid2eid" }, }; -static char *hf_name_for_regular(const char *gguf_name) { - for (size_t i = 0; i < sizeof(top_map) / sizeof(top_map[0]); i++) { - if (strcmp(gguf_name, top_map[i].gguf) == 0) return xstrdup(top_map[i].hf); - } + +static const name_map dspark_mtp_map[] = { + { "main_proj.weight", "main_proj.weight" }, + { "main_norm.weight", "main_norm.weight" }, + { "norm.weight", "norm.weight" }, + { "markov_head.markov_w1.weight", "markov_head.markov_w1.weight" }, + { "markov_head.markov_w2.weight", "markov_head.markov_w2.weight" }, + { "confidence_head.proj.weight", "confidence_head.proj.weight" }, + { "hc_head_base.weight", "hc_head_base" }, + { "hc_head_fn.weight", "hc_head_fn" }, + { "hc_head_scale.weight", 
"hc_head_scale" }, +}; + +static char *hf_name_for_mapped_layer( + const char *gguf_name, + const char *gguf_prefix, + const char *hf_prefix, + const name_map *extra_map, + size_t extra_map_len) { int layer = -1; - const char *p = gguf_name; - if (sscanf(p, "blk.%d.", &layer) != 1) { - fprintf(stderr, "error: cannot map GGUF tensor to HF tensor: %s\n", gguf_name); - exit(1); - } - const char *rest = strchr(p + 4, '.'); + char scan_fmt[32]; + snprintf(scan_fmt, sizeof(scan_fmt), "%s.%%d.", gguf_prefix); + if (sscanf(gguf_name, scan_fmt, &layer) != 1) return NULL; + + const char *rest = strchr(gguf_name + strlen(gguf_prefix) + 1, '.'); if (!rest) die("bad layer tensor name"); rest++; + + for (size_t i = 0; i < extra_map_len; i++) { + if (strcmp(rest, extra_map[i].gguf) == 0) { + char buf[512]; + snprintf(buf, sizeof(buf), "%s.%d.%s", hf_prefix, layer, extra_map[i].hf); + return xstrdup(buf); + } + } for (size_t i = 0; i < sizeof(layer_map) / sizeof(layer_map[0]); i++) { if (strcmp(rest, layer_map[i].gguf) == 0) { char buf[512]; - snprintf(buf, sizeof(buf), "layers.%d.%s", layer, layer_map[i].hf); + snprintf(buf, sizeof(buf), "%s.%d.%s", hf_prefix, layer, layer_map[i].hf); return xstrdup(buf); } } + return NULL; +} + +static char *hf_name_for_regular(const char *gguf_name) { + for (size_t i = 0; i < sizeof(top_map) / sizeof(top_map[0]); i++) { + if (strcmp(gguf_name, top_map[i].gguf) == 0) return xstrdup(top_map[i].hf); + } + + char *hf_name = hf_name_for_mapped_layer(gguf_name, "blk", "layers", NULL, 0); + if (hf_name) return hf_name; + + hf_name = hf_name_for_mapped_layer(gguf_name, "mtp", "mtp", + dspark_mtp_map, + sizeof(dspark_mtp_map) / sizeof(dspark_mtp_map[0])); + if (hf_name) return hf_name; + fprintf(stderr, "error: cannot map GGUF tensor to HF tensor: %s\n", gguf_name); exit(1); } +static void expect_hf_name(const char *gguf, const char *want) { + char *got = hf_name_for_regular(gguf); + if (strcmp(got, want) != 0) { + fprintf(stderr, "error: map %s -> %s, 
expected %s\n", gguf, got, want); + exit(1); + } + free(got); +} + +typedef struct { + uint32_t block_size; + uint32_t noise_token_id; + uint32_t markov_rank; + uint32_t n_mtp_layers; + uint32_t target_layer_ids[DS4_DSPARK_TARGET_LAYER_COUNT]; +} dspark_metadata; + +typedef enum { + DS4_DSPARK_HF_NONE = 0, + DS4_DSPARK_HF_MARKOV, + DS4_DSPARK_HF_NONSEQ, +} dspark_hf_layout; + +static const char *dspark_hf_layout_name(dspark_hf_layout layout) { + switch (layout) { + case DS4_DSPARK_HF_MARKOV: return "markov"; + case DS4_DSPARK_HF_NONSEQ: return "nonseq"; + case DS4_DSPARK_HF_NONE: + default: return "none"; + } +} + +static bool is_mtp_tensor_name(const char *name) { + return str_starts(name, "mtp."); +} + +static bool is_dspark_special_tensor(const char *name) { + return strstr(name, ".main_proj.weight") != NULL || + strstr(name, ".main_norm.weight") != NULL || + strstr(name, ".attn_norm.weight") != NULL || + strstr(name, ".attn_q_a_norm.weight") != NULL || + strstr(name, ".attn_kv_a_norm.weight") != NULL || + strstr(name, ".ffn_norm.weight") != NULL || + strstr(name, ".markov_head.markov_w1.weight") != NULL || + strstr(name, ".markov_head.markov_w2.weight") != NULL || + strstr(name, ".confidence_head.proj.weight") != NULL; +} + +static bool is_dspark_kv_key(const char *key) { + return strcmp(key, DS4_KV_DSPARK_N_MTP_LAYERS) == 0 || + strcmp(key, DS4_KV_DSPARK_BLOCK_SIZE) == 0 || + strcmp(key, DS4_KV_DSPARK_NOISE_TOKEN_ID) == 0 || + strcmp(key, DS4_KV_DSPARK_MARKOV_RANK) == 0 || + strncmp(key, DS4_KV_DSPARK_TARGET_LAYER_ID, strlen(DS4_KV_DSPARK_TARGET_LAYER_ID)) == 0; +} + +static dspark_hf_layout dspark_hf_layout_guess(bool has_main_proj, + bool has_markov_w1, + bool has_confidence_proj, + bool markov_rank_set, + uint32_t markov_rank) { + if (!has_main_proj) return DS4_DSPARK_HF_NONE; + if (has_markov_w1 && has_confidence_proj) return DS4_DSPARK_HF_MARKOV; + if (!has_markov_w1 && !has_confidence_proj && markov_rank_set && markov_rank == 0) { + return 
DS4_DSPARK_HF_NONSEQ; + } + return DS4_DSPARK_HF_NONE; +} + +static dspark_hf_layout db_dspark_hf_layout(const st_db *db, bool markov_rank_set, uint32_t markov_rank) { + return dspark_hf_layout_guess(db_has(db, "mtp.0.main_proj.weight"), + db_has(db, "mtp.2.markov_head.markov_w1.weight"), + db_has(db, "mtp.2.confidence_head.proj.weight"), + markov_rank_set, + markov_rank); +} + +static dspark_metadata dspark_metadata_defaults(void) { + dspark_metadata m = { + .block_size = 5, + .noise_token_id = 128799, + .markov_rank = 256, + .n_mtp_layers = 3, + .target_layer_ids = {40, 41, 42}, + }; + return m; +} + +static void dspark_metadata_apply_hf_config_path(dspark_metadata *m, const char *cfg_path, bool *markov_rank_set) { + size_t len = 0; + char *jtext = read_optional_file(cfg_path, &len); + if (!jtext) return; + json_doc d = json_parse_text(jtext, len); + int block = json_obj_get(&d, 0, "dspark_block_size"); + int noise = json_obj_get(&d, 0, "dspark_noise_token_id"); + int rank = json_obj_get(&d, 0, "dspark_markov_rank"); + int n_mtp = json_obj_get(&d, 0, "n_mtp_layers"); + int layers = json_obj_get(&d, 0, "dspark_target_layer_ids"); + if (block >= 0) m->block_size = (uint32_t)json_i64(&d, block); + if (noise >= 0) m->noise_token_id = (uint32_t)json_i64(&d, noise); + if (rank >= 0) { + m->markov_rank = (uint32_t)json_i64(&d, rank); + if (markov_rank_set) *markov_rank_set = true; + } + if (n_mtp >= 0) m->n_mtp_layers = (uint32_t)json_i64(&d, n_mtp); + if (layers >= 0 && d.v[layers].type == JT_ARRAY) { + int n = 0; + for (int i = layers + 1; i < d.len && d.v[i].parent == layers && n < DS4_DSPARK_TARGET_LAYER_COUNT;) { + m->target_layer_ids[n++] = (uint32_t)json_i64(&d, i); + i = json_skip(&d, i); + } + } + json_free(&d); + free(jtext); +} + +static dspark_metadata dspark_metadata_from_hf_config(const char *hf_dir, bool *markov_rank_set) { + if (markov_rank_set) *markov_rank_set = false; + dspark_metadata m = dspark_metadata_defaults(); + char *root_cfg_path = 
path_join(hf_dir, "config.json"); + dspark_metadata_apply_hf_config_path(&m, root_cfg_path, markov_rank_set); + free(root_cfg_path); + char *inference_cfg_path = path_join(hf_dir, "inference/config.json"); + dspark_metadata_apply_hf_config_path(&m, inference_cfg_path, markov_rank_set); + free(inference_cfg_path); + return m; +} + typedef struct { char *prefix; ds4q_type type; @@ -1000,14 +1211,27 @@ static bool is_attention_tensor(const char *name) { return strstr(name, ".attn") || strstr(name, "attn_") || strstr(name, ".indexer") || strstr(name, "indexer_"); } +static bool is_norm_tensor(const char *name) { + return strcmp(name, "output_norm.weight") == 0 || + strstr(name, "_norm.weight") != NULL || + strstr(name, ".norm.weight") != NULL; +} + static bool is_shared_expert(const char *name) { return strstr(name, "_shexp.") != NULL; } - static bool is_output_tensor(const char *name) { return str_starts(name, "output."); } +static bool is_loader_plain_f16_tensor(const char *name) { + return strcmp(name, "output_hc_fn.weight") == 0 || + strstr(name, ".hc_attn_fn.weight") != NULL || + strstr(name, ".hc_ffn_fn.weight") != NULL || + strstr(name, ".hc_head_fn.weight") != NULL || + strstr(name, ".ffn_gate_inp.weight") != NULL; +} + typedef struct { char *name; int n_dims; @@ -1018,6 +1242,20 @@ typedef struct { size_t size; } tensor_meta; +typedef struct gguf_file { + char *path; + uint32_t version; + uint64_t n_kv; + uint64_t n_tensors; + uint8_t *kv_raw; + size_t kv_raw_len; + size_t alignment; + int n_experts; + size_t data_offset; + tensor_meta *tensors; + hmap tensor_map; +} gguf_file; + static int tensor_n_dims(const tensor_meta *t) { int n = t->n_dims; while (n > 1 && t->ne[n - 1] == 1) n--; @@ -1041,6 +1279,19 @@ static ds4q_type policy_type(const quant_policy *p, const char *name, const tens tmpl->type != DS4Q_TYPE_BF16 && !ds4q_can_quantize(tmpl->type)) { return tmpl->type; } + if (is_mtp_tensor_name(name) && is_dspark_special_tensor(name)) { + if (strstr(name, 
".confidence_head.proj.weight")) return DS4Q_TYPE_F32; + if (strstr(name, ".main_proj.weight")) return DS4Q_TYPE_Q8_0; + if (strstr(name, ".main_norm.weight") || strstr(name, ".attn_norm.weight") || + strstr(name, ".attn_q_a_norm.weight") || strstr(name, ".attn_kv_a_norm.weight") || + strstr(name, ".ffn_norm.weight")) return DS4Q_TYPE_F32; + if (strstr(name, ".markov_head.markov_w1.weight") || + strstr(name, ".markov_head.markov_w2.weight")) { + return tmpl->type == DS4Q_TYPE_F32 ? DS4Q_TYPE_F32 : DS4Q_TYPE_F16; + } + } + if (is_loader_plain_f16_tensor(name)) return DS4Q_TYPE_F16; + if (is_norm_tensor(name)) return DS4Q_TYPE_F32; if (tensor_n_dims(tmpl) <= 1) return tmpl->type; if (strcmp(name, "token_embd.weight") == 0 && p->embedding != DS4Q_TYPE_COUNT) return p->embedding; if (is_output_tensor(name) && p->output != DS4Q_TYPE_COUNT) return p->output; @@ -1051,6 +1302,148 @@ static ds4q_type policy_type(const quant_policy *p, const char *name, const tens return tmpl->type; } +static void expect_policy_type(const quant_policy *p, const char *name, ds4q_type tmpl_type, ds4q_type want) { + tensor_meta tmpl = { + .name = (char *)name, + .n_dims = 2, + .ne = {4096, 4096, 1, 1}, + .type = tmpl_type, + }; + ds4q_type got = policy_type(p, name, &tmpl); + if (got != want) { + fprintf(stderr, "error: policy %s -> %s, expected %s\n", + name, ds4q_type_name(got), ds4q_type_name(want)); + exit(1); + } +} + +static void self_test_dspark_only_args(void); +static ds4q_type dspark_template_for_name(const char *name, ds4q_type hf_type); + +static void expect_dspark_template_type(const char *name, ds4q_type hf_type, ds4q_type want) { + ds4q_type got = dspark_template_for_name(name, hf_type); + if (got != want) { + fprintf(stderr, "error: DSpark template %s -> %s, expected %s\n", + name, ds4q_type_name(got), ds4q_type_name(want)); + exit(1); + } +} + + +static void self_test_dspark_map(void) { + expect_hf_name("mtp.0.hc_attn_base.weight", "mtp.0.hc_attn_base"); + 
expect_hf_name("mtp.0.main_proj.weight", "mtp.0.main_proj.weight"); + expect_hf_name("mtp.2.markov_head.markov_w1.weight", "mtp.2.markov_head.markov_w1.weight"); + expect_hf_name("mtp.2.confidence_head.proj.weight", "mtp.2.confidence_head.proj.weight"); + expert_tensor routed = parse_expert_tensor("mtp.2.ffn_down_exps.weight"); + if (!routed.is_expert || !routed.is_mtp || routed.layer != 2 || routed.part != EXP_W2) { + die("bad DSpark MTP routed expert parse"); + } + char eprefix[256]; + expert_hf_prefix(eprefix, sizeof(eprefix), &routed, 7, expert_part_name(routed.part)); + if (strcmp(eprefix, "mtp.2.ffn.experts.7.w2") != 0) { + die("bad DSpark MTP expert HF prefix"); + } + quant_policy pol = {0}; + pol.dense = DS4Q_TYPE_Q4_K; + expect_policy_type(&pol, "mtp.0.main_proj.weight", DS4Q_TYPE_BF16, DS4Q_TYPE_Q8_0); + expect_policy_type(&pol, "mtp.2.markov_head.markov_w1.weight", DS4Q_TYPE_BF16, DS4Q_TYPE_F16); + expect_policy_type(&pol, "mtp.2.confidence_head.proj.weight", DS4Q_TYPE_F32, DS4Q_TYPE_F32); + expect_policy_type(&pol, "mtp.2.hc_head_fn.weight", DS4Q_TYPE_BF16, DS4Q_TYPE_F16); + expect_policy_type(&pol, "mtp.0.hc_attn_fn.weight", DS4Q_TYPE_BF16, DS4Q_TYPE_F16); + expect_policy_type(&pol, "mtp.0.ffn_gate_inp.weight", DS4Q_TYPE_BF16, DS4Q_TYPE_F16); + expect_policy_type(&pol, "blk.0.hc_ffn_fn.weight", DS4Q_TYPE_BF16, DS4Q_TYPE_F16); + expect_policy_type(&pol, "output_hc_fn.weight", DS4Q_TYPE_BF16, DS4Q_TYPE_F16); + expect_policy_type(&pol, "blk.0.attn_norm.weight", DS4Q_TYPE_BF16, DS4Q_TYPE_F32); + expect_policy_type(&pol, "blk.0.attn_q_a_norm.weight", DS4Q_TYPE_BF16, DS4Q_TYPE_F32); + expect_policy_type(&pol, "blk.0.attn_kv_a_norm.weight", DS4Q_TYPE_BF16, DS4Q_TYPE_F32); + expect_policy_type(&pol, "blk.0.ffn_norm.weight", DS4Q_TYPE_BF16, DS4Q_TYPE_F32); + pol.dense = DS4Q_TYPE_COUNT; + expect_policy_type(&pol, "mtp.0.main_norm.weight", DS4Q_TYPE_BF16, DS4Q_TYPE_F32); + expect_policy_type(&pol, "mtp.0.attn_norm.weight", DS4Q_TYPE_BF16, DS4Q_TYPE_F32); + 
expect_policy_type(&pol, "mtp.0.ffn_norm.weight", DS4Q_TYPE_BF16, DS4Q_TYPE_F32); + expect_dspark_template_type("mtp.0.hc_attn_base.weight", DS4Q_TYPE_BF16, DS4Q_TYPE_F32); + expect_dspark_template_type("mtp.0.hc_attn_scale.weight", DS4Q_TYPE_BF16, DS4Q_TYPE_F32); + expect_dspark_template_type("mtp.0.attn_sinks.weight", DS4Q_TYPE_BF16, DS4Q_TYPE_F32); + expect_dspark_template_type("mtp.0.attn_norm.weight", DS4Q_TYPE_BF16, DS4Q_TYPE_F32); + expect_dspark_template_type("mtp.0.attn_q_a_norm.weight", DS4Q_TYPE_BF16, DS4Q_TYPE_F32); + expect_dspark_template_type("mtp.0.attn_kv_a_norm.weight", DS4Q_TYPE_BF16, DS4Q_TYPE_F32); + expect_dspark_template_type("mtp.0.hc_ffn_base.weight", DS4Q_TYPE_BF16, DS4Q_TYPE_F32); + expect_dspark_template_type("mtp.0.hc_ffn_scale.weight", DS4Q_TYPE_BF16, DS4Q_TYPE_F32); + expect_dspark_template_type("mtp.0.ffn_norm.weight", DS4Q_TYPE_BF16, DS4Q_TYPE_F32); + expect_dspark_template_type("mtp.0.exp_probs_b.bias", DS4Q_TYPE_BF16, DS4Q_TYPE_F32); + expect_dspark_template_type("mtp.2.main_norm.weight", DS4Q_TYPE_BF16, DS4Q_TYPE_F32); + expect_dspark_template_type("mtp.2.norm.weight", DS4Q_TYPE_BF16, DS4Q_TYPE_F32); + expect_dspark_template_type("mtp.0.ffn_gate_inp.weight", DS4Q_TYPE_BF16, DS4Q_TYPE_F16); + expect_dspark_template_type("mtp.2.hc_head_base.weight", DS4Q_TYPE_BF16, DS4Q_TYPE_F32); + expect_dspark_template_type("mtp.2.hc_head_scale.weight", DS4Q_TYPE_BF16, DS4Q_TYPE_F32); + expect_dspark_template_type("mtp.2.confidence_head.proj.weight", DS4Q_TYPE_BF16, DS4Q_TYPE_F32); + if (dspark_hf_layout_guess(true, true, true, false, 0) != DS4_DSPARK_HF_MARKOV) { + die("official DSpark HF layout not detected"); + } + if (dspark_hf_layout_guess(true, false, false, true, 0) != DS4_DSPARK_HF_NONSEQ) { + die("nonseq DSpark HF layout not detected"); + } + if (dspark_hf_layout_guess(true, false, false, false, 0) != DS4_DSPARK_HF_NONE) { + die("main-proj-only DSpark layout detected without markov_rank=0 metadata"); + } + char tmpdir[] = 
"/tmp/ds4q-config-XXXXXX"; + char *dir = mkdtemp(tmpdir); + if (!dir) die_errno("mkdtemp", tmpdir); + char *cfg_path = path_join(dir, "config.json"); + FILE *cfp = fopen(cfg_path, "wb"); + if (!cfp) die_errno("create config", cfg_path); + fputs("{\"dspark_block_size\":7,\"dspark_noise_token_id\":9,\"dspark_markov_rank\":0," + "\"n_mtp_layers\":3,\"dspark_target_layer_ids\":[5,6,7]}", cfp); + if (fclose(cfp) != 0) die_errno("close config", cfg_path); + bool rank_set = false; + dspark_metadata fm = dspark_metadata_from_hf_config(dir, &rank_set); + if (!rank_set || fm.block_size != 7 || fm.noise_token_id != 9 || fm.markov_rank != 0 || + fm.n_mtp_layers != 3 || fm.target_layer_ids[0] != 5 || fm.target_layer_ids[2] != 7) { + die("bad DSpark root config metadata parse"); + } + unlink(cfg_path); + free(cfg_path); + rmdir(dir); + char tmpdir_inference[] = "/tmp/ds4q-config-merge-XXXXXX"; + char *dir_inference = mkdtemp(tmpdir_inference); + if (!dir_inference) die_errno("mkdtemp", tmpdir_inference); + char *root_cfg_path = path_join(dir_inference, "config.json"); + FILE *root_cfp = fopen(root_cfg_path, "wb"); + if (!root_cfp) die_errno("create root config", root_cfg_path); + fputs("{\"num_nextn_predict_layers\":1}", root_cfp); + if (fclose(root_cfp) != 0) die_errno("close root config", root_cfg_path); + char *inf_dir = path_join(dir_inference, "inference"); + if (mkdir(inf_dir, 0700) != 0) die_errno("mkdir", inf_dir); + char *inf_cfg_path = path_join(inf_dir, "config.json"); + FILE *inf_cfp = fopen(inf_cfg_path, "wb"); + if (!inf_cfp) die_errno("create inference config", inf_cfg_path); + fputs("{\"dspark_block_size\":8,\"dspark_noise_token_id\":11,\"dspark_markov_rank\":0," + "\"n_mtp_layers\":3,\"dspark_target_layer_ids\":[40,41,42]}", inf_cfp); + if (fclose(inf_cfp) != 0) die_errno("close inference config", inf_cfg_path); + rank_set = false; + fm = dspark_metadata_from_hf_config(dir_inference, &rank_set); + if (!rank_set || fm.block_size != 8 || fm.noise_token_id != 11 || 
fm.markov_rank != 0 || + fm.n_mtp_layers != 3 || fm.target_layer_ids[0] != 40 || fm.target_layer_ids[2] != 42) { + die("bad DSpark inference config metadata merge"); + } + unlink(inf_cfg_path); + unlink(root_cfg_path); + rmdir(inf_dir); + rmdir(dir_inference); + free(inf_cfg_path); + free(inf_dir); + free(root_cfg_path); + dspark_metadata dm = dspark_metadata_defaults(); + if (dm.block_size != 5 || dm.noise_token_id != 128799 || dm.markov_rank != 256 || + dm.n_mtp_layers != 3 || dm.target_layer_ids[0] != 40) { + die("bad DSpark metadata defaults"); + } + self_test_dspark_only_args(); + puts("dspark_map: OK"); +} + + static ds4q_type parse_type(const char *raw) { char wanted[64]; size_t n = 0; @@ -1087,6 +1480,21 @@ typedef struct { size_t size; } byte_buf; +static byte_buf read_gguf_tensor_data(const gguf_file *g, const char *path, const char *name); +static byte_buf read_gguf_tensor_data_range(const gguf_file *g, const tensor_meta *t, + uint64_t rel_offset, size_t size); + +typedef enum { + MODEL_SOURCE_HF, + MODEL_SOURCE_GGUF, +} model_source_kind; + +typedef struct { + model_source_kind kind; + st_db *hf; + gguf_file *gguf; +} model_source; + static byte_buf f32_to_type(const float *src, int64_t n, ds4q_type type, int64_t ncols, const float *imat) { if (ncols <= 0 || n % ncols != 0) die("bad ncols for tensor conversion"); byte_buf out = {0}; @@ -1150,22 +1558,100 @@ static size_t tensor_nbytes(ds4q_type type, const int64_t *ne, int n_dims) { return nbytes; } -static void check_reversed_shape(const char *gguf_name, const st_info *info, const tensor_meta *tmpl) { - int nd = tensor_n_dims(tmpl); - if (info->n_dims != nd) { - fprintf(stderr, "error: rank mismatch for %s\n", gguf_name); +static int64_t meta_nelements(const tensor_meta *t) { + int64_t n = 1; + for (int i = 0; i < t->n_dims; i++) n *= t->ne[i]; + return n; +} + +static const tensor_meta *gguf_find_tensor(const gguf_file *g, const char *name) { + int idx = hmap_get(&g->tensor_map, name); + if (idx < 0) 
{ + fprintf(stderr, "error: tensor not found in source GGUF: %s\n", name); exit(1); } - for (int i = 0; i < nd; i++) { - if (tmpl->ne[i] != info->shape[nd - 1 - i]) { - fprintf(stderr, "error: shape mismatch for %s\n", gguf_name); + return &g->tensors[idx]; +} + +static void check_same_gguf_shape(const char *name, const tensor_meta *src, const tensor_meta *tmpl) { + const int snd = tensor_n_dims(src); + const int tnd = tensor_n_dims(tmpl); + if (snd != tnd) { + fprintf(stderr, "error: source/template rank mismatch for %s\n", name); + exit(1); + } + for (int i = 0; i < tnd; i++) { + if (src->ne[i] != tmpl->ne[i]) { + fprintf(stderr, "error: source/template shape mismatch for %s\n", name); exit(1); } } } -static byte_buf generate_regular(st_db *db, const char *gguf_name, const tensor_meta *tmpl, - ds4q_type target, const imatrix_store *imatrix) { +static float *gguf_tensor_to_f32(const byte_buf *src, const tensor_meta *meta, int64_t *n_out) { + const int64_t ncols = meta->ne[0]; + const int64_t n = meta_nelements(meta); + if (ncols <= 0 || n % ncols != 0) die("bad GGUF tensor shape for dequantization"); + const int64_t nrows = n / ncols; + float *out = xmalloc((size_t)n * sizeof(float)); + + if (meta->type == DS4Q_TYPE_F32) { + if (src->size != (size_t)n * sizeof(float)) die("bad GGUF F32 byte size"); + memcpy(out, src->data, src->size); + } else if (meta->type == DS4Q_TYPE_F16) { + if (src->size != (size_t)n * sizeof(uint16_t)) die("bad GGUF F16 byte size"); + for (int64_t i = 0; i < n; i++) { + out[i] = ds4q_f16_to_f32(load_u16_le(src->data + (size_t)i * 2)); + } + } else if (meta->type == DS4Q_TYPE_BF16) { + if (src->size != (size_t)n * sizeof(uint16_t)) die("bad GGUF BF16 byte size"); + for (int64_t i = 0; i < n; i++) { + out[i] = ds4q_bf16_to_f32(load_u16_le(src->data + (size_t)i * 2)); + } + } else if (meta->type == DS4Q_TYPE_Q8_0) { + if (ncols % ds4q_block_size(DS4Q_TYPE_Q8_0) != 0) die("bad Q8_0 column count"); + const size_t row_size = 
ds4q_row_size(DS4Q_TYPE_Q8_0, ncols); + if (src->size != (size_t)nrows * row_size) die("bad GGUF Q8_0 byte size"); + const uint8_t *p = src->data; + for (int64_t r = 0; r < nrows; r++) { + float *row = out + (size_t)r * (size_t)ncols; + for (int64_t b = 0; b < ncols / 32; b++) { + const float d = ds4q_f16_to_f32(load_u16_le(p)); + p += sizeof(uint16_t); + const int8_t *qs = (const int8_t *)p; + for (int j = 0; j < 32; j++) row[(size_t)b * 32u + (size_t)j] = d * (float)qs[j]; + p += 32; + } + } + } else { + fprintf(stderr, "error: cannot dequantize source GGUF tensor type %s\n", ds4q_type_name(meta->type)); + exit(1); + } + if (n_out) *n_out = n; + return out; +} + +static bool reversed_shape_matches(const st_info *info, const tensor_meta *tmpl, int nd) { + if (info->n_dims != nd) return false; + for (int i = 0; i < nd; i++) { + if (tmpl->ne[i] != info->shape[nd - 1 - i]) return false; + } + return true; +} + +static void check_reversed_shape(const char *gguf_name, const st_info *info, const tensor_meta *tmpl) { + if (reversed_shape_matches(info, tmpl, tmpl->n_dims)) return; + if (reversed_shape_matches(info, tmpl, tensor_n_dims(tmpl))) return; + if (info->n_dims != tmpl->n_dims && info->n_dims != tensor_n_dims(tmpl)) { + fprintf(stderr, "error: rank mismatch for %s\n", gguf_name); + exit(1); + } + fprintf(stderr, "error: shape mismatch for %s\n", gguf_name); + exit(1); +} + +static byte_buf generate_regular_hf(st_db *db, const char *gguf_name, const tensor_meta *tmpl, + ds4q_type target, const imatrix_store *imatrix) { char *hf_name = hf_name_for_regular(gguf_name); tensor_entry *te = db_tensor(db, hf_name, NULL); check_reversed_shape(gguf_name, &te->info, tmpl); @@ -1203,6 +1689,30 @@ static byte_buf generate_regular(st_db *db, const char *gguf_name, const tensor_ return b; } +static byte_buf generate_regular_gguf(const gguf_file *src, const char *gguf_name, + const tensor_meta *tmpl, ds4q_type target, + const imatrix_store *imatrix) { + const tensor_meta 
*src_meta = gguf_find_tensor(src, gguf_name); + check_same_gguf_shape(gguf_name, src_meta, tmpl); + if (target == src_meta->type) { + byte_buf b = read_gguf_tensor_data(src, src->path, gguf_name); + if (b.size != tensor_nbytes(target, tmpl->ne, tmpl->n_dims)) die("source copy size mismatch"); + return b; + } + if (target == DS4Q_TYPE_I32) die("cannot convert GGUF source tensor to I32"); + if (!is_quantizable_target(target)) die("unsupported regular target type"); + + byte_buf raw = read_gguf_tensor_data(src, src->path, gguf_name); + int64_t n = 0; + float *f32 = gguf_tensor_to_f32(&raw, src_meta, &n); + free(raw.data); + const char *names[1] = { gguf_name }; + const float *imat = imatrix_find(imatrix, names, 1, tmpl->ne[0], -1, 0); + byte_buf b = f32_to_type(f32, n, target, tmpl->ne[0], imat); + free(f32); + return b; +} + typedef struct { st_db *db; const char *gguf_name; @@ -1223,7 +1733,7 @@ typedef struct { static void generate_one_expert(expert_job *j, int xid) { char prefix[256]; - snprintf(prefix, sizeof(prefix), "layers.%d.ffn.experts.%d.%s", j->expert.layer, xid, j->wid); + expert_hf_prefix(prefix, sizeof(prefix), &j->expert, xid, j->wid); char weight_name[320]; char scale_name[320]; snprintf(weight_name, sizeof(weight_name), "%s.weight", prefix); @@ -1263,9 +1773,9 @@ static void *expert_worker(void *arg) { return NULL; } -static byte_buf generate_expert(st_db *db, const char *gguf_name, const tensor_meta *tmpl, - ds4q_type target, int n_experts, int n_threads, - const imatrix_store *imatrix) { +static byte_buf generate_expert_hf(st_db *db, const char *gguf_name, const tensor_meta *tmpl, + ds4q_type target, int n_experts, int n_threads, + const imatrix_store *imatrix) { expert_tensor e = parse_expert_tensor(gguf_name); if (!e.is_expert) die("not an expert tensor"); if (!is_quantizable_target(target)) die("unsupported expert target type"); @@ -1295,13 +1805,118 @@ static byte_buf generate_expert(st_db *db, const char *gguf_name, const tensor_m return out; 
} -static byte_buf generate_tensor(st_db *db, const char *name, const tensor_meta *tmpl, +typedef struct { + const gguf_file *src; + const char *gguf_name; + const tensor_meta *src_meta; + const tensor_meta *tmpl; + ds4q_type target; + int n_experts; + const imatrix_store *imatrix; + expert_tensor expert; + int64_t ncols; + int64_t nrows; + size_t src_per_expert; + size_t dst_per_expert; + byte_buf *out; + int next; + int done; + pthread_mutex_t lock; +} gguf_expert_job; + +static void generate_one_expert_gguf(gguf_expert_job *j, int xid) { + byte_buf raw = read_gguf_tensor_data_range(j->src, j->src_meta, + (uint64_t)xid * (uint64_t)j->src_per_expert, + j->src_per_expert); + tensor_meta one = *j->src_meta; + one.n_dims = 2; + one.ne[0] = j->ncols; + one.ne[1] = j->nrows; + for (int i = 2; i < DS4Q_MAX_DIMS; i++) one.ne[i] = 1; + one.size = j->src_per_expert; + + int64_t n = 0; + float *f32 = gguf_tensor_to_f32(&raw, &one, &n); + free(raw.data); + const char *names[1] = { j->gguf_name }; + const float *imat = imatrix_find(j->imatrix, names, 1, j->ncols, xid, j->n_experts); + byte_buf q = f32_to_type(f32, n, j->target, j->ncols, imat); + if (q.size != j->dst_per_expert) die("expert quantized size mismatch"); + memcpy(j->out->data + (size_t)xid * j->dst_per_expert, q.data, q.size); + free(q.data); + free(f32); +} + +static void *gguf_expert_worker(void *arg) { + gguf_expert_job *j = arg; + for (;;) { + pthread_mutex_lock(&j->lock); + int xid = j->next++; + pthread_mutex_unlock(&j->lock); + if (xid >= j->n_experts) break; + generate_one_expert_gguf(j, xid); + pthread_mutex_lock(&j->lock); + int done = ++j->done; + if (done % 32 == 0 || done == j->n_experts) { + fprintf(stderr, "generate_expert_tensor_from_gguf: layer %d %d/%d experts\n", + j->expert.layer, done, j->n_experts); + } + pthread_mutex_unlock(&j->lock); + } + return NULL; +} + +static byte_buf generate_expert_gguf(const gguf_file *src, const char *gguf_name, const tensor_meta *tmpl, + ds4q_type target, int 
n_experts, int n_threads, + const imatrix_store *imatrix) { + expert_tensor e = parse_expert_tensor(gguf_name); + if (!e.is_expert) die("not an expert tensor"); + if (!is_quantizable_target(target)) die("unsupported expert target type"); + const tensor_meta *src_meta = gguf_find_tensor(src, gguf_name); + check_same_gguf_shape(gguf_name, src_meta, tmpl); + if (src_meta->n_dims < 3 || src_meta->ne[2] != n_experts) die("source expert tensor shape mismatch"); + const int64_t ncols = tmpl->ne[0]; + const int64_t nrows = tmpl->ne[1]; + const size_t src_per_expert = (size_t)nrows * ds4q_row_size(src_meta->type, ncols); + const size_t dst_per_expert = (size_t)nrows * ds4q_row_size(target, ncols); + if (src_per_expert * (size_t)n_experts != src_meta->size) die("source expert size mismatch"); + + byte_buf out = { .size = dst_per_expert * (size_t)n_experts, .data = xmalloc(dst_per_expert * (size_t)n_experts) }; + ds4q_quantize_init(target); + int worker_count = n_threads > 0 ? n_threads : 8; + if (worker_count < 1) worker_count = 1; + if (worker_count > n_experts) worker_count = n_experts; + fprintf(stderr, "generate_expert_tensor_from_gguf: layer %d using %d worker%s\n", + e.layer, worker_count, worker_count == 1 ? 
"" : "s"); + gguf_expert_job job = { + .src = src, .gguf_name = gguf_name, .src_meta = src_meta, .tmpl = tmpl, .target = target, + .n_experts = n_experts, .imatrix = imatrix, .expert = e, + .ncols = ncols, .nrows = nrows, .src_per_expert = src_per_expert, + .dst_per_expert = dst_per_expert, .out = &out, + }; + pthread_mutex_init(&job.lock, NULL); + pthread_t *threads = xcalloc((size_t)worker_count, sizeof(threads[0])); + for (int i = 1; i < worker_count; i++) pthread_create(&threads[i], NULL, gguf_expert_worker, &job); + gguf_expert_worker(&job); + for (int i = 1; i < worker_count; i++) pthread_join(threads[i], NULL); + pthread_mutex_destroy(&job.lock); + free(threads); + return out; +} + +static byte_buf generate_tensor(model_source *source, const char *name, const tensor_meta *tmpl, ds4q_type target, int n_experts, int n_threads, const imatrix_store *imatrix) { if (parse_expert_tensor(name).is_expert) { - return generate_expert(db, name, tmpl, target, n_experts, n_threads, imatrix); + if (source->kind == MODEL_SOURCE_GGUF) { + return generate_expert_gguf(source->gguf, name, tmpl, target, n_experts, n_threads, imatrix); + } + return generate_expert_hf(source->hf, name, tmpl, target, n_experts, n_threads, imatrix); } - return generate_regular(db, name, tmpl, target, imatrix); + if (source->kind == MODEL_SOURCE_GGUF) { + return generate_regular_gguf(source->gguf, name, tmpl, target, imatrix); + } + return generate_regular_hf(source->hf, name, tmpl, target, imatrix); } /* ===== @@ -1317,19 +1932,6 @@ typedef struct { size_t end; } byte_span; -typedef struct { - char *path; - uint32_t version; - uint64_t n_kv; - uint64_t n_tensors; - uint8_t *kv_raw; - size_t kv_raw_len; - size_t alignment; - int n_experts; - size_t data_offset; - tensor_meta *tensors; - hmap tensor_map; -} gguf_file; typedef struct { tensor_meta *tensors; @@ -1339,6 +1941,8 @@ typedef struct { size_t data_offset; size_t tensor_bytes; size_t alignment; + bool write_dspark; + dspark_metadata dspark; } 
output_context; static size_t gguf_scalar_size(uint32_t type) { @@ -1418,6 +2022,11 @@ static bool is_imatrix_kv_key(const char *key) { return str_starts(key, "quantize.imatrix."); } + +static size_t extra_alignment_kv_size(void) { + return gguf_string_size(DS4_KV_GENERAL_ALIGNMENT) + 4 + 4; +} + static size_t extra_imatrix_kv_size(const imatrix_store *im) { if (!imatrix_enabled(im)) return 0; size_t n = 0; @@ -1433,6 +2042,13 @@ static uint64_t extra_imatrix_kv_count(const imatrix_store *im) { return 2 + (im->dataset ? 1 : 0) + (im->chunks > 0 ? 1 : 0); } +static void write_alignment_kv(FILE *fp, size_t alignment) { + if (alignment > UINT32_MAX) die("GGUF alignment does not fit in uint32"); + write_gguf_string(fp, DS4_KV_GENERAL_ALIGNMENT); + write_u32(fp, GGUF_TYPE_UINT32); + write_u32(fp, (uint32_t)alignment); +} + static void write_imatrix_kvs(FILE *fp, const imatrix_store *im) { if (!imatrix_enabled(im)) return; write_gguf_string(fp, DS4_KV_QUANTIZE_IMATRIX_FILE); @@ -1455,7 +2071,63 @@ static void write_imatrix_kvs(FILE *fp, const imatrix_store *im) { } } -static gguf_file load_gguf_metadata(const char *path) { + +static size_t gguf_kv_scalar_size(uint32_t type) { + return 4 + gguf_scalar_size(type); +} + + +static size_t gguf_kv_u32_size(const char *key) { + return gguf_string_size(key) + gguf_kv_scalar_size(GGUF_TYPE_UINT32); +} + +static uint64_t extra_dspark_kv_count(bool enabled) { + if (!enabled) return 0; + return 4 + DS4_DSPARK_TARGET_LAYER_COUNT; +} + +static size_t extra_dspark_kv_size(bool enabled) { + if (!enabled) return 0; + size_t n = 0; + n += gguf_kv_u32_size(DS4_KV_DSPARK_N_MTP_LAYERS); + n += gguf_kv_u32_size(DS4_KV_DSPARK_BLOCK_SIZE); + n += gguf_kv_u32_size(DS4_KV_DSPARK_NOISE_TOKEN_ID); + n += gguf_kv_u32_size(DS4_KV_DSPARK_MARKOV_RANK); + for (int i = 0; i < DS4_DSPARK_TARGET_LAYER_COUNT; i++) { + char key[64]; + snprintf(key, sizeof(key), "%s.%d", DS4_KV_DSPARK_TARGET_LAYER_ID, i); + n += gguf_kv_u32_size(key); + } + return n; +} + 
+static void write_dspark_kvs(FILE *fp, const dspark_metadata *m) { + write_gguf_string(fp, DS4_KV_DSPARK_N_MTP_LAYERS); + write_u32(fp, GGUF_TYPE_UINT32); + write_u32(fp, m->n_mtp_layers); + + write_gguf_string(fp, DS4_KV_DSPARK_BLOCK_SIZE); + write_u32(fp, GGUF_TYPE_UINT32); + write_u32(fp, m->block_size); + + write_gguf_string(fp, DS4_KV_DSPARK_NOISE_TOKEN_ID); + write_u32(fp, GGUF_TYPE_UINT32); + write_u32(fp, m->noise_token_id); + + write_gguf_string(fp, DS4_KV_DSPARK_MARKOV_RANK); + write_u32(fp, GGUF_TYPE_UINT32); + write_u32(fp, m->markov_rank); + + for (int i = 0; i < DS4_DSPARK_TARGET_LAYER_COUNT; i++) { + char key[64]; + snprintf(key, sizeof(key), "%s.%d", DS4_KV_DSPARK_TARGET_LAYER_ID, i); + write_gguf_string(fp, key); + write_u32(fp, GGUF_TYPE_UINT32); + write_u32(fp, m->target_layer_ids[i]); + } +} + +static gguf_file load_gguf_metadata(const char *path, bool drop_dspark_kvs) { gguf_file g = {0}; g.path = xstrdup(path); FILE *fp = fopen(path, "rb"); @@ -1494,12 +2166,12 @@ static gguf_file load_gguf_metadata(const char *path) { if (rec_end < 0 || rec_end < rec_start) die("GGUF ftell failed"); /* - * Template GGUFs may already carry imatrix provenance from a previous - * quantization. Drop those keys and write the current run's keys later, - * otherwise the output can contain duplicate GGUF metadata with stale - * and new values. + * Template GGUFs may already carry provenance from a previous run. + * Always drop imatrix keys because the current run rewrites them. + * Drop DSpark keys only when this run will rewrite DSpark metadata; + * source/template reuse without DSpark rewriting must preserve them. 
*/ - if (!is_imatrix_kv_key(key)) { + if (!is_imatrix_kv_key(key) && !(drop_dspark_kvs && is_dspark_kv_key(key))) { kv_keep[n_kv_keep++] = (byte_span){ .start = (size_t)(rec_start - kv_start), .end = (size_t)(rec_end - kv_start), @@ -1549,6 +2221,149 @@ static gguf_file load_gguf_metadata(const char *path) { return g; } +static void gguf_replace_tensors_start(gguf_file *g) { + for (uint64_t i = 0; i < g->n_tensors; i++) free(g->tensors[i].name); + free(g->tensors); + g->tensors = NULL; + g->n_tensors = 0; + g->data_offset = 0; + hmap_free(&g->tensor_map); +} + +static void gguf_add_tensor_meta(gguf_file *g, const char *name, int n_dims, const int64_t *ne, ds4q_type type) { + g->tensors = xrealloc(g->tensors, (size_t)(g->n_tensors + 1) * sizeof(g->tensors[0])); + tensor_meta *t = &g->tensors[g->n_tensors++]; + memset(t, 0, sizeof(*t)); + t->name = xstrdup(name); + t->n_dims = n_dims; + for (int i = 0; i < n_dims; i++) t->ne[i] = ne[i]; + t->type = type; + t->size = tensor_nbytes(type, t->ne, t->n_dims); +} + +static ds4q_type template_type_for_hf_dtype(const char *dtype) { + if (strcmp(dtype, "F32") == 0) return DS4Q_TYPE_F32; + if (strcmp(dtype, "BF16") == 0) return DS4Q_TYPE_BF16; + if (strcmp(dtype, "F8_E4M3") == 0) return DS4Q_TYPE_F16; + if (strcmp(dtype, "I8") == 0) return DS4Q_TYPE_Q4_K; + if (strcmp(dtype, "I64") == 0) return DS4Q_TYPE_I32; + fprintf(stderr, "error: unsupported HF dtype for DSpark template: %s\n", dtype); + exit(1); +} + +static bool is_dspark_required_stage_tensor(const char *rest) { + return strcmp(rest, "hc_attn_fn.weight") == 0 || + strcmp(rest, "hc_attn_scale.weight") == 0 || + strcmp(rest, "hc_attn_base.weight") == 0 || + strcmp(rest, "attn_norm.weight") == 0 || + strcmp(rest, "attn_q_a.weight") == 0 || + strcmp(rest, "attn_q_a_norm.weight") == 0 || + strcmp(rest, "attn_q_b.weight") == 0 || + strcmp(rest, "attn_kv.weight") == 0 || + strcmp(rest, "attn_kv_a_norm.weight") == 0 || + strcmp(rest, "attn_sinks.weight") == 0 || + strcmp(rest, 
"attn_output_a.weight") == 0 || + strcmp(rest, "attn_output_b.weight") == 0 || + strcmp(rest, "hc_ffn_fn.weight") == 0 || + strcmp(rest, "hc_ffn_scale.weight") == 0 || + strcmp(rest, "hc_ffn_base.weight") == 0 || + strcmp(rest, "ffn_norm.weight") == 0 || + strcmp(rest, "ffn_gate_inp.weight") == 0 || + strcmp(rest, "exp_probs_b.bias") == 0 || + strcmp(rest, "ffn_gate_shexp.weight") == 0 || + strcmp(rest, "ffn_up_shexp.weight") == 0 || + strcmp(rest, "ffn_down_shexp.weight") == 0; +} + +static bool is_dspark_routed_stage_tensor(const char *rest) { + return strcmp(rest, "ffn_gate_exps.weight") == 0 || + strcmp(rest, "ffn_up_exps.weight") == 0 || + strcmp(rest, "ffn_down_exps.weight") == 0; +} + +static bool is_dspark_loader_f32_tensor(const char *name) { + return strstr(name, ".main_norm.weight") || + (strstr(name, ".norm.weight") && str_starts(name, "mtp.")) || + strstr(name, ".attn_norm.weight") || + strstr(name, ".attn_q_a_norm.weight") || + strstr(name, ".attn_kv_a_norm.weight") || + strstr(name, ".hc_attn_scale.weight") || + strstr(name, ".hc_attn_base.weight") || + strstr(name, ".attn_sinks.weight") || + strstr(name, ".hc_ffn_scale.weight") || + strstr(name, ".hc_ffn_base.weight") || + strstr(name, ".ffn_norm.weight") || + strstr(name, ".exp_probs_b.bias") || + strstr(name, ".hc_head_base.weight") || + strstr(name, ".hc_head_scale.weight") || + strstr(name, ".confidence_head.proj.weight"); +} + +static ds4q_type dspark_template_for_name(const char *name, ds4q_type hf_type) { + if (is_dspark_loader_f32_tensor(name)) return DS4Q_TYPE_F32; + if (strstr(name, ".markov_head.markov_w1.weight") || + strstr(name, ".markov_head.markov_w2.weight")) return DS4Q_TYPE_F16; + if (strstr(name, ".hc_head_fn.weight") || + strstr(name, ".hc_attn_fn.weight") || + strstr(name, ".hc_ffn_fn.weight") || + strstr(name, ".ffn_gate_inp.weight")) return DS4Q_TYPE_F16; + if (is_attention_projection(name) || is_shared_expert(name)) return DS4Q_TYPE_Q8_0; + if 
(parse_expert_tensor(name).is_expert) return DS4Q_TYPE_Q4_K; + return hf_type; +} + +static void gguf_add_regular_from_hf(gguf_file *g, st_db *db, const char *gguf_name) { + char *hf_name = hf_name_for_regular(gguf_name); + tensor_entry *te = db_tensor(db, hf_name, NULL); + int nd = te->info.n_dims; + int64_t ne[DS4Q_MAX_DIMS] = {0}; + for (int i = 0; i < nd; i++) ne[i] = te->info.shape[nd - 1 - i]; + ds4q_type hf_type = template_type_for_hf_dtype(te->info.dtype); + gguf_add_tensor_meta(g, gguf_name, nd, ne, dspark_template_for_name(gguf_name, hf_type)); + free(hf_name); +} + +static void gguf_add_expert_from_hf(gguf_file *g, st_db *db, const char *gguf_name, int n_experts) { + expert_tensor e = parse_expert_tensor(gguf_name); + if (!e.is_expert) die("internal error: expected routed expert tensor"); + char prefix[256]; + expert_hf_prefix(prefix, sizeof(prefix), &e, 0, expert_part_name(e.part)); + char weight_name[320]; + snprintf(weight_name, sizeof(weight_name), "%s.weight", prefix); + tensor_entry *te = db_tensor(db, weight_name, NULL); + if (te->info.n_dims != 2) die("bad DSpark routed expert rank"); + int64_t ne[3] = { te->info.shape[1] * 2, te->info.shape[0], n_experts }; + gguf_add_tensor_meta(g, gguf_name, 3, ne, DS4Q_TYPE_Q4_K); +} + +static void gguf_add_dspark_stage(gguf_file *g, st_db *db, uint32_t stage, int n_experts) { + char name[256]; + for (size_t i = 0; i < sizeof(layer_map) / sizeof(layer_map[0]); i++) { + const char *rest = layer_map[i].gguf; + if (!is_dspark_required_stage_tensor(rest) && !is_dspark_routed_stage_tensor(rest)) continue; + snprintf(name, sizeof(name), "mtp.%u.%s", stage, rest); + if (is_dspark_routed_stage_tensor(rest)) gguf_add_expert_from_hf(g, db, name, n_experts); + else gguf_add_regular_from_hf(g, db, name); + } +} + +static void gguf_use_dspark_mtp_template(gguf_file *g, st_db *db, int n_experts, dspark_hf_layout layout) { + if (layout == DS4_DSPARK_HF_NONE) die("--dspark-only requires DSpark HF tensors"); + 
gguf_replace_tensors_start(g); + gguf_add_regular_from_hf(g, db, "mtp.0.main_proj.weight"); + gguf_add_regular_from_hf(g, db, "mtp.0.main_norm.weight"); + for (uint32_t s = 0; s < DS4_DSPARK_TARGET_LAYER_COUNT; s++) gguf_add_dspark_stage(g, db, s, n_experts); + gguf_add_regular_from_hf(g, db, "mtp.2.norm.weight"); + gguf_add_regular_from_hf(g, db, "mtp.2.hc_head_base.weight"); + gguf_add_regular_from_hf(g, db, "mtp.2.hc_head_fn.weight"); + gguf_add_regular_from_hf(g, db, "mtp.2.hc_head_scale.weight"); + if (layout == DS4_DSPARK_HF_MARKOV) { + gguf_add_regular_from_hf(g, db, "mtp.2.markov_head.markov_w1.weight"); + gguf_add_regular_from_hf(g, db, "mtp.2.markov_head.markov_w2.weight"); + gguf_add_regular_from_hf(g, db, "mtp.2.confidence_head.proj.weight"); + } +} + static byte_buf read_gguf_tensor_data(const gguf_file *g, const char *path, const char *name) { int idx = hmap_get(&g->tensor_map, name); if (idx < 0) { @@ -1565,6 +2380,20 @@ static byte_buf read_gguf_tensor_data(const gguf_file *g, const char *path, cons return b; } +static byte_buf read_gguf_tensor_data_range(const gguf_file *g, const tensor_meta *t, + uint64_t rel_offset, size_t size) { + if (rel_offset > t->size || size > t->size - rel_offset) die("GGUF tensor range out of bounds"); + byte_buf b = { .size = size, .data = xmalloc(size) }; + FILE *fp = fopen(g->path, "rb"); + if (!fp) die_errno("open GGUF", g->path); + if (fseeko(fp, (off_t)(g->data_offset + t->old_offset + rel_offset), SEEK_SET) != 0) { + die_errno("seek GGUF", g->path); + } + if (size && fread(b.data, 1, size, fp) != size) die_errno("read GGUF tensor range", g->path); + fclose(fp); + return b; +} + static uint64_t fnv1a64_bytes(const uint8_t *data, size_t n) { uint64_t h = 1469598103934665603ull; for (size_t i = 0; i < n; i++) { @@ -1574,11 +2403,15 @@ static uint64_t fnv1a64_bytes(const uint8_t *data, size_t n) { return h; } -static output_context build_output_context(const gguf_file *tmpl, const quant_policy *policy, const 
imatrix_store *im) { +static output_context build_output_context(const gguf_file *tmpl, const quant_policy *policy, + const imatrix_store *im, bool write_dspark, + const dspark_metadata *dspark) { output_context out = {0}; out.n_tensors = tmpl->n_tensors; - out.n_kv_extra = extra_imatrix_kv_count(im); + out.n_kv_extra = 1 + extra_imatrix_kv_count(im) + extra_dspark_kv_count(write_dspark); out.alignment = tmpl->alignment; + out.write_dspark = write_dspark; + if (write_dspark && dspark) out.dspark = *dspark; out.tensors = xcalloc((size_t)out.n_tensors, sizeof(out.tensors[0])); size_t tensor_info = 0; size_t off = 0; @@ -1598,7 +2431,9 @@ static output_context build_output_context(const gguf_file *tmpl, const quant_po tensor_info += gguf_string_size(dst->name) + 4 + (size_t)dst->n_dims * 8 + 4 + 8; } out.tensor_bytes = off; - out.meta_size = 4 + 4 + 8 + 8 + tmpl->kv_raw_len + extra_imatrix_kv_size(im) + tensor_info; + out.meta_size = 4 + 4 + 8 + 8 + tmpl->kv_raw_len + + extra_alignment_kv_size() + extra_imatrix_kv_size(im) + + extra_dspark_kv_size(write_dspark) + tensor_info; out.data_offset = ds4q_pad(out.meta_size, tmpl->alignment); return out; } @@ -1612,7 +2447,7 @@ static void write_padding(FILE *fp, size_t n) { } } -static void write_full_gguf(st_db *db, const gguf_file *tmpl, const output_context *out_ctx, +static void write_full_gguf(model_source *source, const gguf_file *tmpl, const output_context *out_ctx, const char *out_path, int n_experts, int n_threads, const imatrix_store *imatrix) { FILE *fp = fopen(out_path, "wb"); @@ -1622,7 +2457,9 @@ static void write_full_gguf(st_db *db, const gguf_file *tmpl, const output_conte write_u64(fp, tmpl->n_tensors); write_u64(fp, tmpl->n_kv + out_ctx->n_kv_extra); if (fwrite(tmpl->kv_raw, 1, tmpl->kv_raw_len, fp) != tmpl->kv_raw_len) die("write GGUF KV failed"); + write_alignment_kv(fp, out_ctx->alignment); write_imatrix_kvs(fp, imatrix); + if (out_ctx->write_dspark) write_dspark_kvs(fp, &out_ctx->dspark); for (uint64_t 
i = 0; i < out_ctx->n_tensors; i++) { const tensor_meta *t = &out_ctx->tensors[i]; write_gguf_string(fp, t->name); @@ -1640,21 +2477,93 @@ static void write_full_gguf(st_db *db, const gguf_file *tmpl, const output_conte const tensor_meta *src = &tmpl->tensors[i]; const tensor_meta *dst = &out_ctx->tensors[i]; fprintf(stderr, "[%4" PRIu64 "/%4" PRIu64 "] %s -> %s\n", i + 1, out_ctx->n_tensors, dst->name, ds4q_type_name(dst->type)); - byte_buf data = generate_tensor(db, dst->name, src, dst->type, n_experts, n_threads, imatrix); + byte_buf data = generate_tensor(source, dst->name, src, dst->type, n_experts, n_threads, imatrix); size_t expected = dst->size; if (data.size != expected) { fprintf(stderr, "error: generated size mismatch for %s: got %zu expected %zu\n", dst->name, data.size, expected); exit(1); } - if (fwrite(data.data, 1, data.size, fp) != data.size) die_errno("write tensor", out_path); - size_t padded = ds4q_pad(data.size, out_ctx->alignment); + if (fwrite(data.data, 1, data.size, fp) != data.size) die("write tensor data failed"); + const size_t padded = ds4q_pad(data.size, out_ctx->alignment); write_padding(fp, padded - data.size); - fprintf(stderr, " generated %.2f MiB\n", (double)data.size / 1048576.0); free(data.data); } fclose(fp); } +static void free_gguf_file(gguf_file *g); + +static uint64_t count_gguf_kv_prefix_in_file(const char *path, const char *prefix) { + FILE *fp = fopen(path, "rb"); + if (!fp) die_errno("open GGUF", path); + char magic[4]; + if (fread(magic, 1, sizeof(magic), fp) != sizeof(magic) || memcmp(magic, "GGUF", 4) != 0) { + die("bad GGUF self-test file"); + } + (void)read_u32_le_fp(fp, "GGUF version"); + (void)read_u64_le_fp(fp, "GGUF tensor count"); + uint64_t n_kv = read_u64_le_fp(fp, "GGUF KV count"); + uint64_t count = 0; + for (uint64_t i = 0; i < n_kv; i++) { + char *key = read_gguf_string_fp(fp); + uint32_t type = read_u32_le_fp(fp, "GGUF KV type"); + if (str_starts(key, prefix)) count++; + skip_gguf_value_fp(fp, type); + 
free(key); + } + fclose(fp); + return count; +} + +static void write_dspark_metadata_template(const char *path, const dspark_metadata *m) { + FILE *fp = fopen(path, "wb"); + if (!fp) die_errno("create GGUF self-test template", path); + if (fwrite("GGUF", 1, 4, fp) != 4) die("write GGUF magic failed"); + write_u32(fp, 3); + write_u64(fp, 0); + write_u64(fp, extra_dspark_kv_count(true)); + write_dspark_kvs(fp, m); + if (fclose(fp) != 0) die_errno("close GGUF self-test template", path); +} + +static void self_test_dspark_kv_rewrite_no_duplicates(void) { + char tmpl_path[] = "/tmp/ds4q-dspark-template-XXXXXX"; + int tmpl_fd = mkstemp(tmpl_path); + if (tmpl_fd < 0) die_errno("mkstemp", tmpl_path); + close(tmpl_fd); + char out_path[] = "/tmp/ds4q-dspark-output-XXXXXX"; + int out_fd = mkstemp(out_path); + if (out_fd < 0) die_errno("mkstemp", out_path); + close(out_fd); + + dspark_metadata old_meta = dspark_metadata_defaults(); + old_meta.markov_rank = 64; + write_dspark_metadata_template(tmpl_path, &old_meta); + + gguf_file preserved = load_gguf_metadata(tmpl_path, false); + if (preserved.n_kv != extra_dspark_kv_count(true)) { + die("DSpark metadata should be preserved when not rewriting it"); + } + free_gguf_file(&preserved); + + gguf_file tmpl = load_gguf_metadata(tmpl_path, true); + if (tmpl.n_kv != 0) die("DSpark metadata should be dropped before rewrite"); + quant_policy policy = {0}; + imatrix_store im = {0}; + dspark_metadata new_meta = dspark_metadata_defaults(); + new_meta.markov_rank = 0; + output_context out = build_output_context(&tmpl, &policy, &im, true, &new_meta); + write_full_gguf(NULL, &tmpl, &out, out_path, 0, 1, &im); + if (count_gguf_kv_prefix_in_file(out_path, "deepseek4.dspark.") != extra_dspark_kv_count(true)) { + die("rewritten DSpark metadata should not contain duplicate keys"); + } + + free(out.tensors); + free_gguf_file(&tmpl); + unlink(tmpl_path); + unlink(out_path); +} + static void print_plan(const gguf_file *tmpl, const output_context 
*out_ctx) { size_t tensor_bytes = 0; size_t changed = 0; @@ -1680,6 +2589,7 @@ static void print_plan(const gguf_file *tmpl, const output_context *out_ctx) { typedef struct { char *hf_dir; + char *source_gguf; char *template_gguf; char *out_gguf; char *compare_gguf; @@ -1688,22 +2598,28 @@ typedef struct { quant_policy policy; int n_experts; int n_threads; + size_t alignment; bool dry_run; bool overwrite; bool imatrix_strict; + bool dspark_only; + bool self_test_dspark_map; } params; static void usage(const char *argv0) { - printf("usage: %s --hf DIR --template MODEL.gguf --out OUT.gguf [options]\n", argv0); - printf("\nDeepSeek V4 Flash/Pro safetensors -> GGUF quantizer in plain C.\n\n"); + printf("usage: %s (--hf DIR | --source-gguf MODEL.gguf) --template MODEL.gguf --out OUT.gguf [options]\n", argv0); + printf("\nDeepSeek V4 Flash/Pro safetensors/GGUF -> GGUF quantizer in plain C.\n\n"); printf("options:\n"); printf(" --hf DIR Hugging Face model directory with model.safetensors.index.json\n"); + printf(" --source-gguf FILE source GGUF to re-quantize from, e.g. 
an abliterated Q8_0 GGUF\n"); printf(" --template FILE existing DS4 GGUF used for metadata, tensor order, shapes\n"); printf(" --out FILE output GGUF path\n"); printf(" --compare-gguf FILE reference GGUF for --compare-tensor, default template\n"); printf(" --compare-tensor NAME regenerate one tensor, byte-compare, and exit\n"); printf(" --overwrite replace --out if it already exists\n"); printf(" --dry-run print output plan without reading HF tensor data\n"); + printf(" --self-test-dspark-map validate DSpark HF map, policy, and metadata defaults\n"); + printf(" --dspark-only replace template tensors with official DSpark MTP tensors\n"); printf(" --imatrix FILE legacy .dat imatrix from ds4 --imatrix-out\n"); printf(" --imatrix-strict fail if a quantized tensor has no matching imatrix vector\n"); printf(" --experts TYPE set routed w1/w2/w3 expert tensors to TYPE\n"); @@ -1717,6 +2633,7 @@ static void usage(const char *argv0) { printf(" --output TYPE output.* tensor type\n"); printf(" --dense TYPE remaining 2D+ non-routed tensor type\n"); printf(" --tensor-type PFX=TYPE exact tensor-name or prefix override; may repeat\n"); + printf(" --alignment N write GGUF tensor-data alignment, default from template\n"); printf(" --n-experts N routed expert count, default template metadata\n"); printf(" --threads N expert worker count, default 8\n"); printf("\nTYPE examples: f16, f32, bf16, q8_0, q4_k, q2_k, iq2_xxs\n"); @@ -1752,6 +2669,8 @@ static params parse_args(int argc, char **argv) { exit(0); } else if (strcmp(arg, "--hf") == 0) { p.hf_dir = need_value(argc, argv, &i, arg); + } else if (strcmp(arg, "--source-gguf") == 0) { + p.source_gguf = need_value(argc, argv, &i, arg); } else if (strcmp(arg, "--template") == 0) { p.template_gguf = need_value(argc, argv, &i, arg); } else if (strcmp(arg, "--out") == 0) { @@ -1762,6 +2681,10 @@ static params parse_args(int argc, char **argv) { p.compare_tensor = need_value(argc, argv, &i, arg); } else if (strcmp(arg, "--overwrite") == 0) { 
p.overwrite = true; + } else if (strcmp(arg, "--self-test-dspark-map") == 0) { + p.self_test_dspark_map = true; + } else if (strcmp(arg, "--dspark-only") == 0) { + p.dspark_only = true; } else if (strcmp(arg, "--dry-run") == 0) { p.dry_run = true; } else if (strcmp(arg, "--imatrix") == 0) { @@ -1796,6 +2719,11 @@ static params parse_args(int argc, char **argv) { *eq = '\0'; p.policy.overrides = xrealloc(p.policy.overrides, (size_t)(p.policy.n_overrides + 1) * sizeof(p.policy.overrides[0])); p.policy.overrides[p.policy.n_overrides++] = (type_override){ xstrdup(spec), parse_type(eq + 1) }; + } else if (strcmp(arg, "--alignment") == 0) { + char *end = NULL; + unsigned long long v = strtoull(need_value(argc, argv, &i, arg), &end, 10); + if (!v || (end && *end)) die("bad --alignment value"); + p.alignment = (size_t)v; } else if (strcmp(arg, "--n-experts") == 0) { p.n_experts = atoi(need_value(argc, argv, &i, arg)); } else if (strcmp(arg, "--threads") == 0) { @@ -1805,7 +2733,8 @@ static params parse_args(int argc, char **argv) { exit(1); } } - if (!p.hf_dir) die("--hf is required"); + if (p.self_test_dspark_map) return p; + if ((p.hf_dir != NULL) == (p.source_gguf != NULL)) die("exactly one of --hf or --source-gguf is required"); if (!p.template_gguf) die("--template is required"); if (!p.dry_run && !p.compare_tensor && !p.out_gguf) die("--out is required unless --dry-run or --compare-tensor is used"); if (p.compare_tensor && !p.compare_gguf) p.compare_gguf = p.template_gguf; @@ -1813,6 +2742,18 @@ static params parse_args(int argc, char **argv) { return p; } +static void self_test_dspark_only_args(void) { + char *argv[] = { + "deepseek4-quantize", + "--self-test-dspark-map", + "--dspark-only", + }; + params p = parse_args((int)(sizeof(argv) / sizeof(argv[0])), argv); + if (!p.self_test_dspark_map || !p.dspark_only) { + die("bad --dspark-only self-test parsing"); + } +} + static void free_gguf_file(gguf_file *g) { free(g->path); free(g->kv_raw); @@ -1822,7 +2763,7 @@ 
static void free_gguf_file(gguf_file *g) { memset(g, 0, sizeof(*g)); } -static void compare_one_tensor(st_db *db, const gguf_file *tmpl, const output_context *out_ctx, +static void compare_one_tensor(model_source *source, const gguf_file *tmpl, const output_context *out_ctx, const params *p, const imatrix_store *imatrix) { int idx = hmap_get(&tmpl->tensor_map, p->compare_tensor); if (idx < 0) { @@ -1831,9 +2772,9 @@ static void compare_one_tensor(st_db *db, const gguf_file *tmpl, const output_co } fprintf(stderr, "regenerating %s as %s\n", p->compare_tensor, ds4q_type_name(out_ctx->tensors[idx].type)); - byte_buf generated = generate_tensor(db, p->compare_tensor, &tmpl->tensors[idx], + byte_buf generated = generate_tensor(source, p->compare_tensor, &tmpl->tensors[idx], out_ctx->tensors[idx].type, p->n_experts, p->n_threads, imatrix); - gguf_file ref = load_gguf_metadata(p->compare_gguf); + gguf_file ref = load_gguf_metadata(p->compare_gguf, false); byte_buf reference = read_gguf_tensor_data(&ref, p->compare_gguf, p->compare_tensor); printf("tensor: %s\n", p->compare_tensor); printf("type: %s\n", ds4q_type_name(out_ctx->tensors[idx].type)); @@ -1866,10 +2807,15 @@ static void compare_one_tensor(st_db *db, const gguf_file *tmpl, const output_co int main(int argc, char **argv) { params p = parse_args(argc, argv); + if (p.self_test_dspark_map) { + self_test_dspark_map(); + self_test_dspark_kv_rewrite_no_duplicates(); + return 0; + } imatrix_store imatrix = {0}; if (p.imatrix_file) imatrix_load(&imatrix, p.imatrix_file, p.imatrix_strict); - gguf_file tmpl = load_gguf_metadata(p.template_gguf); + gguf_file tmpl = load_gguf_metadata(p.template_gguf, false); if (p.n_experts <= 0) { if (tmpl.n_experts > 0) { p.n_experts = tmpl.n_experts; @@ -1881,24 +2827,64 @@ int main(int argc, char **argv) { } else { fprintf(stderr, "using %d routed experts from --n-experts\n", p.n_experts); } - output_context out_ctx = build_output_context(&tmpl, &p.policy, &imatrix); + st_db db = {0}; 
+ gguf_file source_gguf = {0}; + model_source source = {0}; + bool write_dspark = false; + dspark_metadata dspark_meta = dspark_metadata_defaults(); + + if (p.hf_dir) { + bool markov_rank_set = false; + dspark_meta = dspark_metadata_from_hf_config(p.hf_dir, &markov_rank_set); + db_open(&db, p.hf_dir); + dspark_hf_layout dspark_layout = db_dspark_hf_layout(&db, markov_rank_set, dspark_meta.markov_rank); + if (dspark_layout != DS4_DSPARK_HF_NONE) { + write_dspark = true; + fprintf(stderr, "DSpark HF %s layout detected; writing deepseek4.dspark.* metadata\n", + dspark_hf_layout_name(dspark_layout)); + } + if (p.dspark_only) write_dspark = true; + if (write_dspark) { + free_gguf_file(&tmpl); + tmpl = load_gguf_metadata(p.template_gguf, true); + } + if (p.dspark_only) { + gguf_use_dspark_mtp_template(&tmpl, &db, p.n_experts, dspark_layout); + } + source.kind = MODEL_SOURCE_HF; + source.hf = &db; + } + + if (p.alignment) tmpl.alignment = p.alignment; + output_context out_ctx = build_output_context(&tmpl, &p.policy, &imatrix, write_dspark, &dspark_meta); print_plan(&tmpl, &out_ctx); - if (p.dry_run) return 0; + if (p.dry_run) { + if (p.hf_dir) db_close(&db); + imatrix_free(&imatrix); + free_gguf_file(&tmpl); + free(out_ctx.tensors); + return 0; + } - st_db db; - db_open(&db, p.hf_dir); + if (!p.hf_dir) { + source_gguf = load_gguf_metadata(p.source_gguf, false); + source.kind = MODEL_SOURCE_GGUF; + source.gguf = &source_gguf; + } if (p.compare_tensor) { - compare_one_tensor(&db, &tmpl, &out_ctx, &p, &imatrix); - db_close(&db); + compare_one_tensor(&source, &tmpl, &out_ctx, &p, &imatrix); + if (p.hf_dir) db_close(&db); + else free_gguf_file(&source_gguf); imatrix_free(&imatrix); free_gguf_file(&tmpl); free(out_ctx.tensors); return 0; } - write_full_gguf(&db, &tmpl, &out_ctx, p.out_gguf, p.n_experts, p.n_threads, &imatrix); + write_full_gguf(&source, &tmpl, &out_ctx, p.out_gguf, p.n_experts, p.n_threads, &imatrix); fprintf(stderr, "wrote %s\n", p.out_gguf); - db_close(&db); 
+ if (p.hf_dir) db_close(&db); + else free_gguf_file(&source_gguf); imatrix_free(&imatrix); free_gguf_file(&tmpl); free(out_ctx.tensors); diff --git a/gguf-tools/deepspec/ds4_deepspec.py b/gguf-tools/deepspec/ds4_deepspec.py new file mode 100755 index 000000000..b76f85a73 --- /dev/null +++ b/gguf-tools/deepspec/ds4_deepspec.py @@ -0,0 +1,505 @@ +#!/usr/bin/env python3 +"""DS4 helpers for DeepSpec target-cache interoperability.""" + +from __future__ import annotations + +import argparse +import json +import struct +import sys +import tempfile +import textwrap +from pathlib import Path + +INDEX_RECORD_STRUCT = struct.Struct(" None: + if not condition: + raise CacheValidationError(message) + + +def _read_json(path: Path) -> dict: + try: + with path.open("r", encoding="utf-8") as fp: + data = json.load(fp) + except OSError as exc: + raise CacheValidationError(f"cannot read {path}: {exc}") from exc + except json.JSONDecodeError as exc: + raise CacheValidationError(f"invalid JSON in {path}: {exc}") from exc + _require(isinstance(data, dict), f"{path} is not a JSON object") + return data + + +def _required_int(manifest: dict, key: str) -> int: + value = manifest.get(key) + _require(isinstance(value, int) and value >= 0, f"manifest.{key} must be a non-negative integer") + return value + + +def _validate_manifest(manifest: dict, + expected_target_model: str | None, + expected_chat_template: str | None) -> tuple[int, list[int], int, list[dict]]: + _require(manifest.get("version") == TARGET_CACHE_VERSION, + f"manifest.version must be {TARGET_CACHE_VERSION}") + if "format" in manifest: + _require(manifest["format"] == "deepspec-target-cache", + "manifest.format must be deepspec-target-cache") + _require(manifest.get("hidden_dtype") == EXPECTED_HIDDEN_DTYPE, + f"manifest.hidden_dtype must be {EXPECTED_HIDDEN_DTYPE}") + _require(manifest.get("token_dtype") == EXPECTED_TOKEN_DTYPE, + f"manifest.token_dtype must be {EXPECTED_TOKEN_DTYPE}") + _require(manifest.get("mask_dtype") == 
EXPECTED_MASK_DTYPE, + f"manifest.mask_dtype must be {EXPECTED_MASK_DTYPE}") + _require(manifest.get("index_record_size") == INDEX_RECORD_STRUCT.size, + f"manifest.index_record_size must be {INDEX_RECORD_STRUCT.size}") + + hidden_size = _required_int(manifest, "hidden_size") + _require(hidden_size > 0, "manifest.hidden_size must be positive") + num_samples = _required_int(manifest, "num_samples") + num_shards = _required_int(manifest, "num_shards") + + layers = manifest.get("target_layer_ids") + _require(isinstance(layers, list) and len(layers) > 0, + "manifest.target_layer_ids must be a non-empty list") + _require(all(isinstance(layer, int) and layer >= 0 for layer in layers), + "manifest.target_layer_ids must contain non-negative integers") + _require(len(set(layers)) == len(layers), "manifest.target_layer_ids must not contain duplicates") + _require(layers == sorted(layers), "manifest.target_layer_ids must be sorted in capture order") + + target_hidden_layers = manifest.get("target_hidden_layers") + if target_hidden_layers is not None: + _require(target_hidden_layers == len(layers), + "manifest.target_hidden_layers must match target_layer_ids length") + + if expected_target_model is not None: + _require(manifest.get("target_model_name_or_path") == expected_target_model, + "manifest.target_model_name_or_path does not match expected target model") + + if expected_chat_template is not None: + convention = manifest.get("input_convention") + _require(isinstance(convention, dict), "manifest.input_convention must be an object") + _require(convention.get("chat_template") == expected_chat_template, + "manifest.input_convention.chat_template does not match expected template") + + shards = manifest.get("shards") + _require(isinstance(shards, list), "manifest.shards must be a list") + _require(len(shards) == num_shards, "manifest.num_shards must match shards length") + if num_samples > 0: + _require(num_shards > 0, "manifest with samples must contain at least one shard") + 
return hidden_size, layers, num_samples, shards + + +def _load_shard_map(cache_dir: Path, shards: list[dict]) -> dict[int, Path]: + shard_map: dict[int, Path] = {} + for entry in shards: + _require(isinstance(entry, dict), "manifest.shards entries must be objects") + shard_id = entry.get("shard_id") + file_name = entry.get("file_name") + _require(isinstance(shard_id, int) and shard_id >= 0, "shard_id must be a non-negative integer") + _require(isinstance(file_name, str) and file_name, "shard file_name must be a non-empty string") + _require(shard_id not in shard_map, f"duplicate shard_id {shard_id}") + path = cache_dir / file_name + _require(path.is_file(), f"missing shard file {path}") + shard_map[shard_id] = path + return shard_map + + +def _intervals_for_record(seq_len: int, + hidden_size: int, + num_layers: int, + offsets: tuple[int, int, int, int, int]) -> list[tuple[str, int, int]]: + input_ids_offset, attention_mask_offset, loss_mask_offset, target_hidden_offset, target_last_offset = offsets + target_hidden_bytes = seq_len * num_layers * hidden_size * 2 + target_last_bytes = seq_len * hidden_size * 2 + return [ + ("input_ids", input_ids_offset, seq_len * 4), + ("attention_mask", attention_mask_offset, seq_len), + ("loss_mask", loss_mask_offset, seq_len), + ("target_hidden_states", target_hidden_offset, target_hidden_bytes), + ("target_last_hidden_states", target_last_offset, target_last_bytes), + ] + + +def _validate_record(cache_dir: Path, + record_index: int, + record: tuple[int, int, int, int, int, int, int, int], + shard_map: dict[int, Path], + hidden_size: int, + num_layers: int) -> None: + sample_id, shard_id, seq_len, input_ids_offset, attention_mask_offset, loss_mask_offset, target_hidden_offset, target_last_offset = record + _require(sample_id == record_index, + f"record {record_index} sample_id is {sample_id}, expected {record_index}") + _require(seq_len > 0, f"record {record_index} seq_len must be positive") + _require(shard_id in shard_map, 
f"record {record_index} references unknown shard_id {shard_id}") + shard = shard_map[shard_id] + shard_size = shard.stat().st_size + intervals = _intervals_for_record(seq_len, + hidden_size, + num_layers, + (input_ids_offset, + attention_mask_offset, + loss_mask_offset, + target_hidden_offset, + target_last_offset)) + sorted_intervals = sorted(intervals, key=lambda item: item[1]) + for name, offset, size in sorted_intervals: + _require(offset >= 0, f"record {record_index} {name} offset must be non-negative") + _require(size > 0, f"record {record_index} {name} size must be positive") + _require(offset + size <= shard_size, + f"record {record_index} {name} extends beyond shard {shard.relative_to(cache_dir)}") + for (_, prev_offset, prev_size), (name, offset, _) in zip(sorted_intervals, sorted_intervals[1:]): + _require(prev_offset + prev_size <= offset, + f"record {record_index} {name} overlaps previous tensor payload") + + +def validate_target_cache(cache_dir: Path, + expected_target_model: str | None = None, + expected_chat_template: str | None = None) -> dict: + cache_dir = cache_dir.resolve() + _require(cache_dir.is_dir(), f"cache directory does not exist: {cache_dir}") + manifest = _read_json(cache_dir / "manifest.json") + hidden_size, layers, num_samples, shards = _validate_manifest(manifest, + expected_target_model, + expected_chat_template) + shard_map = _load_shard_map(cache_dir, shards) + index_path = cache_dir / "samples.idx" + _require(index_path.is_file(), f"missing index file {index_path}") + index_size = index_path.stat().st_size + _require(index_size == num_samples * INDEX_RECORD_STRUCT.size, + "samples.idx size must equal num_samples * index_record_size") + with index_path.open("rb") as fp: + for record_index in range(num_samples): + raw = fp.read(INDEX_RECORD_STRUCT.size) + _require(len(raw) == INDEX_RECORD_STRUCT.size, + f"short samples.idx record {record_index}") + _validate_record(cache_dir, + record_index, + INDEX_RECORD_STRUCT.unpack(raw), + 
shard_map, + hidden_size, + len(layers)) + return { + "cache_dir": str(cache_dir), + "num_samples": num_samples, + "num_shards": len(shards), + "hidden_size": hidden_size, + "target_layer_ids": layers, + "index_record_size": INDEX_RECORD_STRUCT.size, + } + +def render_nonseq_config(target_cache_path: str | None = None, + target_model_name_or_path: str = DEFAULT_TARGET_MODEL, + chat_template: str = DEFAULT_CHAT_TEMPLATE, + target_layer_ids: list[int] | None = None, + max_train_steps: int | None = None, + global_batch_size: int = 512, + local_batch_size: int = 1) -> str: + """Return a DeepSpec config for a DeepSeek-V4 non-Markov DSpark pilot.""" + if target_layer_ids is None: + target_layer_ids = DEFAULT_TARGET_LAYER_IDS + _require(len(target_layer_ids) > 0, "target_layer_ids must not be empty") + return textwrap.dedent(f"""\ + # Generated by ds4_deepspec.py for DS4 DeepSpec training. + import os + + try: + from deepspec.trainer import DeepSeekV4DSparkTrainer + except ImportError as exc: + raise RuntimeError( + "DS4 DeepSeek-V4 DSpark training needs a DeepSpec checkout/fork " + "that provides DeepSeekV4DSparkTrainer; upstream DeepSpec main " + "currently ships Qwen3/Gemma trainers only." 
+ ) from exc + + BASE_TB_DIR = os.path.expanduser("~/tensorboard") + BASE_CKPT_DIR = os.path.expanduser("~/checkpoints") + + seed = 42 + project_name = "deepspec" + exp_name = "dspark_block5_deepseek_v4_flash_nonseq" + + model = dict( + target_model_name_or_path={target_model_name_or_path!r}, + block_size={DEFAULT_DSPARK_BLOCK_SIZE}, + num_draft_layers={len(target_layer_ids)}, + target_layer_ids={target_layer_ids!r}, + mask_token_id={DEFAULT_MASK_TOKEN_ID}, + num_anchors=512, + markov_rank=0, + markov_head_type="vanilla", + confidence_head_alpha=0.0, + confidence_head_with_markov=False, + ) + + train = dict( + trainer_cls=DeepSeekV4DSparkTrainer, + lr=6.0e-4, + warmup_ratio=0.04, + weight_decay=0.0, + precision="bf16", + local_batch_size={local_batch_size}, + global_batch_size={global_batch_size}, + num_train_epochs=10, + max_train_steps={max_train_steps!r}, + max_grad_norm=1.0, + sharding_strategy="no_shard", + torch_compile=False, + loss_decay_gamma=None, + ce_loss_alpha=1.0, + l1_loss_alpha=0.0, + ) + + logging = dict( + logging_steps=10, + checkpointing_steps=3000, + ) + + data = dict( + target_cache_path={target_cache_path!r}, + chat_template={chat_template!r}, + max_length=4096, + num_workers=4, + ) + + def finalize_cfg(cfg): + logging_cfg = dict(cfg["logging"]) + project = str(cfg["project_name"]) + exp = str(cfg["exp_name"]) + logging_cfg["checkpoint_dir"] = os.path.join(BASE_CKPT_DIR, project, exp) + logging_cfg["tensorboard_dir"] = os.path.join(BASE_TB_DIR, project, exp) + cfg["logging"] = logging_cfg + return cfg + """) + + +def _target_cache_config_defaults(target_cache_path: str, + target_model_name_or_path: str | None, + chat_template: str | None) -> tuple[str, str, list[int]]: + cache_dir = Path(target_cache_path) + manifest = _read_json(cache_dir / "manifest.json") + + manifest_target = manifest.get("target_model_name_or_path") + if target_model_name_or_path is None: + _require(isinstance(manifest_target, str) and manifest_target, + 
"manifest.target_model_name_or_path is required to emit a config without --target-model") + target_model_name_or_path = manifest_target + elif manifest_target is not None: + _require(isinstance(manifest_target, str) and manifest_target, + "manifest.target_model_name_or_path must be a non-empty string when present") + _require(manifest_target == target_model_name_or_path, + "manifest.target_model_name_or_path does not match expected target model") + + convention = manifest.get("input_convention") + manifest_template = None + if convention is not None: + _require(isinstance(convention, dict), "manifest.input_convention must be an object") + manifest_template = convention.get("chat_template") + if manifest_template is not None: + _require(isinstance(manifest_template, str) and manifest_template, + "manifest.input_convention.chat_template must be a non-empty string when present") + if chat_template is None: + _require(isinstance(manifest_template, str) and manifest_template, + "manifest.input_convention.chat_template is required to emit a config without --chat-template") + chat_template = manifest_template + elif manifest_template is not None: + _require(manifest_template == chat_template, + "manifest.input_convention.chat_template does not match expected template") + + _, target_layer_ids, _, _ = _validate_manifest(manifest, None, None) + return target_model_name_or_path, chat_template, target_layer_ids + + +def write_nonseq_config(path: Path, + target_cache_path: str | None = None, + target_model_name_or_path: str | None = None, + chat_template: str | None = None, + max_train_steps: int | None = None, + global_batch_size: int = 512, + local_batch_size: int = 1, + overwrite: bool = False) -> dict: + if path.exists() and not overwrite: + raise CacheValidationError(f"refusing to overwrite existing config: {path}") + _require(target_cache_path is not None and target_cache_path != "", + "--target-cache is required with --emit-nonseq-config") + if max_train_steps is not 
None: + _require(max_train_steps > 0, "--max-train-steps must be positive") + _require(global_batch_size > 0, "--global-batch-size must be positive") + _require(local_batch_size > 0, "--local-batch-size must be positive") + target_model_name_or_path, chat_template, target_layer_ids = _target_cache_config_defaults( + target_cache_path, + target_model_name_or_path, + chat_template) + config = render_nonseq_config(target_cache_path, + target_model_name_or_path, + chat_template, + target_layer_ids, + max_train_steps, + global_batch_size, + local_batch_size) + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(config, encoding="utf-8") + return { + "config": str(path), + "target_model_name_or_path": target_model_name_or_path, + "chat_template": chat_template, + "target_cache_path": target_cache_path, + "markov_rank": 0, + } + + +def _write_self_test_cache(cache_dir: Path, + target_model_name_or_path: str = DEFAULT_TARGET_MODEL, + chat_template: str = DEFAULT_CHAT_TEMPLATE, + include_optional_config: bool = True) -> None: + hidden_size = 4 + layers = [1, 2, 3] + seq_len = 2 + shard = cache_dir / "shard-00000.bin" + index = cache_dir / "samples.idx" + manifest = cache_dir / "manifest.json" + cache_dir.mkdir(parents=True, exist_ok=True) + offsets: list[int] = [] + payloads = [ + struct.pack(" dict: + with tempfile.TemporaryDirectory(prefix="ds4-deepspec-cache-") as tmp: + cache_dir = Path(tmp) / "cache" + config_path = Path(tmp) / "dspark_v4_nonseq.py" + self_test_target_model = "local/self-test-target" + self_test_chat_template = "self_test_template" + _write_self_test_cache(cache_dir, + target_model_name_or_path=self_test_target_model, + chat_template=self_test_chat_template) + cache_result = validate_target_cache(cache_dir, + expected_target_model=self_test_target_model, + expected_chat_template=self_test_chat_template) + config_result = write_nonseq_config(config_path, + target_cache_path=str(cache_dir), + max_train_steps=1) + config_text = 
config_path.read_text(encoding="utf-8") + compile(config_text, str(config_path), "exec") + _require(f"target_model_name_or_path={self_test_target_model!r}" in config_text, + "emitted config must inherit target model from cache manifest") + _require(f"chat_template={self_test_chat_template!r}" in config_text, + "emitted config must inherit chat template from cache manifest") + _require("block_size=5" in config_text, "emitted config must use DeepSeek-V4 DSpark block_size=5") + _require("num_draft_layers=3" in config_text, "emitted config must use the three DSpark MTP layers") + _require("target_layer_ids=[1, 2, 3]" in config_text, + "emitted config must inherit target layers from cache manifest") + optional_cache_dir = Path(tmp) / "optional-cache" + optional_config_path = Path(tmp) / "optional_nonseq.py" + explicit_target_model = "explicit/target" + explicit_chat_template = "explicit_template" + _write_self_test_cache(optional_cache_dir, include_optional_config=False) + optional_config = write_nonseq_config(optional_config_path, + target_cache_path=str(optional_cache_dir), + target_model_name_or_path=explicit_target_model, + chat_template=explicit_chat_template, + max_train_steps=1) + optional_text = optional_config_path.read_text(encoding="utf-8") + compile(optional_text, str(optional_config_path), "exec") + _require(optional_config["target_model_name_or_path"] == explicit_target_model, + "explicit target model must be accepted when optional manifest target is absent") + _require(optional_config["chat_template"] == explicit_chat_template, + "explicit chat template must be accepted when optional manifest template is absent") + _require(f"target_model_name_or_path={explicit_target_model!r}" in optional_text, + "explicit target model must be emitted when optional manifest target is absent") + _require(f"chat_template={explicit_chat_template!r}" in optional_text, + "explicit chat template must be emitted when optional manifest template is absent") + 
cache_result["nonseq_config"] = config_result + return cache_result + + +def _parse_args(argv: list[str]) -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Validate DS4 DeepSpec target-cache artifacts.") + parser.add_argument("cache_dir", nargs="?", help="Directory containing manifest.json, samples.idx, and shard files.") + parser.add_argument("--target-model", help="Expected manifest target_model_name_or_path, or emitted config target model.") + parser.add_argument("--chat-template", help="Expected manifest input_convention.chat_template, or emitted config chat template.") + parser.add_argument("--self-test", action="store_true", help="Run the built-in synthetic cache/config compatibility smoke.") + parser.add_argument("--emit-nonseq-config", metavar="FILE", help="Write a DeepSeek-V4 non-Markov DSpark DeepSpec config.") + parser.add_argument("--target-cache", help="target_cache_path value for --emit-nonseq-config.") + parser.add_argument("--max-train-steps", type=int, help="Optional train.max_train_steps value for the emitted config.") + parser.add_argument("--global-batch-size", type=int, default=512, help="Emitted train.global_batch_size. Default: 512.") + parser.add_argument("--local-batch-size", type=int, default=1, help="Emitted train.local_batch_size. 
Default: 1.") + parser.add_argument("--overwrite", action="store_true", help="Allow --emit-nonseq-config to replace FILE.") + return parser.parse_args(argv) + + +def main(argv: list[str]) -> int: + args = _parse_args(argv) + try: + if args.emit_nonseq_config: + result = write_nonseq_config(Path(args.emit_nonseq_config), + target_cache_path=args.target_cache, + target_model_name_or_path=args.target_model, + chat_template=args.chat_template, + max_train_steps=args.max_train_steps, + global_batch_size=args.global_batch_size, + local_batch_size=args.local_batch_size, + overwrite=args.overwrite) + elif args.self_test: + result = self_test() + else: + _require(args.cache_dir is not None, "cache_dir is required unless --self-test or --emit-nonseq-config is used") + result = validate_target_cache(Path(args.cache_dir), + expected_target_model=args.target_model, + expected_chat_template=args.chat_template) + except CacheValidationError as exc: + print(f"ds4-deepspec: {exc}", file=sys.stderr) + return 1 + json.dump(result, sys.stdout, indent=2, sort_keys=True) + sys.stdout.write("\n") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main(sys.argv[1:])) diff --git a/rocm/ds4_rocm_attention_launch.cuh b/rocm/ds4_rocm_attention_launch.cuh index b9b43d958..0691db2e8 100644 --- a/rocm/ds4_rocm_attention_launch.cuh +++ b/rocm/ds4_rocm_attention_launch.cuh @@ -324,6 +324,25 @@ extern "C" int ds4_gpu_attention_decode_raw_batch_heads_tensor( n_head, head_dim); } +extern "C" int ds4_gpu_attention_decode_raw_batch_heads_noncausal_tensor( + ds4_gpu_tensor *heads, + const void *model_map, + uint64_t model_size, + uint64_t sinks_offset, + const ds4_gpu_tensor *q, + const ds4_gpu_tensor *raw_kv, + uint32_t n_tokens, + uint32_t n_raw, + uint32_t raw_cap, + uint32_t raw_start, + uint32_t n_head, + uint32_t head_dim) { + (void)heads; (void)model_map; (void)model_size; (void)sinks_offset; + (void)q; (void)raw_kv; (void)n_tokens; (void)n_raw; (void)raw_cap; + (void)raw_start; 
(void)n_head; (void)head_dim; + return 0; +} + extern "C" int ds4_gpu_attention_decode_mixed_batch_heads_tensor( ds4_gpu_tensor *heads, const void *model_map, diff --git a/speed-bench/README.md b/speed-bench/README.md index 32075fe18..645e1ebbe 100644 --- a/speed-bench/README.md +++ b/speed-bench/README.md @@ -26,3 +26,239 @@ python3 speed-bench/plot_speed.py speed-bench/m3_max.csv --title "M3 Max t/s" The script uses only the Python standard library. By default it writes a file next to the CSV using the `_ts.svg` suffix, such as `speed-bench/m3_max_ts.svg`. + +For Metal Tensor prefill experiments, treat matmul as the first optimization +surface: profile routed-MoE stages and dense Q8_0 attention projections, then +compare the current standard path, current Tensor auto path, and a default-off +candidate env switch with: + +``` +python3 speed-bench/run_prefill_candidate_gate.py \ + --candidate-label moe-matmul-first \ + --set-env DS4_METAL_EXPERIMENTAL_MOE_MATMUL=1 +``` + +### Metal Tensor helper map + +The Metal Tensor work uses a small set of local tools so speed changes, +logprob drift, and diagnostic attribution stay tied to the same fixtures and +artifact format: + +| Tool | Why it exists | +| --- | --- | +| `run_metal_tensor_bench.sh` | Regenerates the Standard Metal / Quality Metal / Tensor Metal chart for the current branch and keeps timestamped CSV/PNG artifacts under ignored `speed-bench/local-runs/`. Use this for PR performance evidence. | +| `run_quality_drift_gate.py` | Runs the five fixed prompt scenarios against `--quality`, `-mt off`, and `-mt auto`, then writes PR-ready `summary.md` and automation-friendly `summary.json`. Use this as the main logprob drift gate. | +| `run_prefill_candidate_gate.py` | Compares a default-off candidate against current Tensor and Standard speed first, then launches the drift gates only when the candidate is speed-positive enough to justify the cost. Use this before promoting any new prefill route. 
| +| `metal_tensor_presets.py` | Stores named environment profiles for measured default-off candidates so speed, drift, and comparator reruns use the same route settings without copying long env strings. | +| `run_chunked_prefill_drift_gate.py` | Adds resumed-prefill frontier coverage for candidates that depend on nonzero `pos=` route filters, because the five fixed prompts mostly validate cold `pos=0` prefill. | +| `run_mpp_compare_probe.py` and `summarize_mpp_compare.py` | Run and summarize local Tensor-vs-legacy projection comparisons for route attribution. Use them to decide which layer/route caused a drift breach before spending a full five-fixture gate. | +| `summarize_stage_profile.py` | Converts Metal stage-profiler stderr into Markdown/JSON tables so kernel targets are chosen from measured stage time instead of whole-layer timing alone. | +| `index_local_runs.py` | Builds a compact index over ignored local artifacts so candidate runs, drift gates, comparator probes, profiles, and chart runs are easy to find later. | + +These tools intentionally write to ignored local directories by default. The +PR should include selected numbers or Markdown summaries, not the raw local +artifacts themselves. + +The measured default-off profiles can also be selected with `--preset` to avoid +copying long environment strings by hand: + +``` +python3 speed-bench/run_prefill_candidate_gate.py \ + --preset mpp-fast-skip-down26-29-30 \ + --run-drift-gate +``` + +Add `--run-drift-gate` before promoting a candidate. The helper first evaluates +the speed screen; if the candidate fails the prefill or generation floor, it +records the skip reason and does not launch the five-fixture drift gate. When +the speed screen passes, it reuses the five-fixture `--quality` drift gate and +writes JSON plus Markdown summaries beside the benchmark CSVs. By default this +helper writes timestamped output under +`speed-bench/local-runs/-/`, which is ignored by git. 
+The candidate Markdown scorecard marks a candidate as production promotion-safe only when every +measured context beats Tensor prefill by at least `--min-prefill-gain-pct`, +every repeat/context pair clears `--min-repeat-prefill-gain-pct`, the candidate +stays above the generation floor set by `--min-generation-gain-pct`, the drift +gate is green, and Tensor-vs-standard drift stays inside the configured +envelope (`--max-tensor-standard-rms` and +`--max-tensor-standard-top20-abs`). Candidates that use nonzero `pos=` route +filters need additional resumed-prefill coverage, because the existing five +fixtures mostly exercise cold `pos=0` prefill. When `--run-drift-gate` is set +and the speed screen passes, the helper now also runs the chunked frontier drift +gate for that class of candidate. Without that chunked gate artifact, nonzero +`pos=` candidates are marked not promotion-safe. With `--run-drift-gate`, +failed candidates still write artifacts before exiting non-zero; add `--no-fail` +for exploratory sweeps. Use `--reuse --out-dir=<dir>` to regenerate +summaries from saved CSVs, charts, and drift-gate dumps without rerunning +benchmarks. The gate refuses to use stale `ds4-bench` or nested `ds4` binaries +when core sources or `metal/*.metal` are newer than the executable; rebuild +first, or pass `--allow-stale-binary` only when intentionally summarizing old +artifacts. When nested drift gates are present, the candidate scorecard also +shows the Tensor-vs-standard fixtures or frontiers responsible for the worst +drift metrics. The Markdown scorecard also prints per-context repeat deltas, so +noisy median-only wins can be rejected without opening the JSON. Both JSON +reports record a `run_config` block with the command thresholds and resolved +paths used for the run, and the Markdown reports include a quoted replay +command. 
+ +To run only the five-fixture drift gate: + +``` +python3 speed-bench/run_quality_drift_gate.py +``` + +For default-off candidates, the drift gate accepts the same `--preset` names as +the candidate gate: + +``` +python3 speed-bench/run_quality_drift_gate.py \ + --preset mpp-fast-skip-down26-29-30 \ + --max-tensor-standard-rms 0.30 \ + --max-tensor-standard-top20-abs 0.60 +``` + +By default the drift gate writes timestamped output under +`speed-bench/local-runs/<timestamp>-quality-drift-gate/`. Set `--out-dir=...` to +override the destination. Each run writes both `summary.json` for automation and +`summary.md` for a persistent human-readable comparison table, including the +fixture responsible for each worst drift metric. Add +`--max-tensor-standard-rms` and `--max-tensor-standard-top20-abs` when the +standalone drift gate should enforce the production drift envelope. The drift +gate also refuses stale `ds4` binaries unless `--allow-stale-binary` is set. + +To run the resumed-prefill frontier drift gate for candidates that depend on +nonzero `pos=` filters: + +``` +python3 speed-bench/run_chunked_prefill_drift_gate.py \ + --preset mpp-fast-continuation-chunks \ + --max-tensor-default-rms 0.30 \ + --max-tensor-default-top20-abs 0.60 +``` + +This script uses `ds4-bench` to grow `speed-bench/promessi_sposi.txt` through +frontiers `512, 1024, 2048, 4096, 8192` by default, dumps one full-logit JSON +file after each resumed frontier, then compares quality, standard Metal, and +Tensor Metal. When a candidate preset or `--set-env` override is present, it +also captures the no-env Tensor baseline as `default_tensor` and reports +`tensor_vs_default_tensor`; the candidate gate uses that pair for resumed +coverage so candidates are judged against the current Tensor baseline instead +of an absolute chunked Tensor-vs-standard envelope. Output is timestamped under +`speed-bench/local-runs/<timestamp>-<label>-chunked-drift-gate/` and ignored by +git. 
The chunked gate also refuses stale `ds4-bench` binaries unless +`--allow-stale-binary` is set. + +To regenerate the standard/quality/Tensor chart for the current branch: + +``` +OPEN_CHART=0 speed-bench/run_metal_tensor_bench.sh +``` + +By default the script writes timestamped output under +`speed-bench/local-runs/<timestamp>-metal-tensor-bench/`. That folder is ignored +by git so multiple local comparison runs can be kept without pushing the CSVs or +charts. The generated CSV and PNG filenames are also prefixed with the same +datetime run id, so reruns stay distinct even when `OUT_DIR` is reused. The +script refuses stale `ds4-bench` binaries unless `ALLOW_STALE_BINARY=1` is set. +Set `OUT_DIR=...` or `RUN_ID=...` to override the destination. + +To create a compact index of saved local benchmark charts, drift, comparator, +candidate-gate, and profile artifacts: + +``` +RUN_ID=$(date +%Y%m%d-%H%M%S) +OUT_DIR=speed-bench/local-runs/${RUN_ID}-local-run-index +python3 speed-bench/index_local_runs.py \ + --output ${OUT_DIR}/local-run-index.md \ + --json-output ${OUT_DIR}/local-run-index.json +``` + +The indexer only reads existing JSON summaries; it does not run the model. The +output directory is ignored by git, so it can be regenerated after local sweeps +without changing tracked artifacts. The prefill table includes both median and +repeat-level minimum candidate-vs-Tensor prefill deltas, matching the candidate +gate's speed-first promotion screen. It also reports five-fixture drift and +coverage/chunked drift separately, including the coverage pair used, so a +candidate that passes the normal drift gate but fails resumed-prefill coverage +is visible in the top-level table. Timestamped runs from +`run_metal_tensor_bench.sh` are indexed as chart runs with Tensor-vs-standard +prefill and generation ranges plus the PNG path. If the same `OUT_DIR` is +reused with multiple timestamped `RUN_ID` values, each complete CSV triplet is +indexed separately. 
+ +To summarize Metal stage-profile logs from runs with +`DS4_METAL_MOE_STAGE_PROFILE=1`, `DS4_METAL_Q8_PREFILL_PROFILE=1`, +`DS4_METAL_FLASH_ATTN_STAGE_PROFILE=1`, or layer profiling enabled: + +``` +python3 speed-bench/summarize_stage_profile.py \ + speed-bench/local-runs//long_code_audit_profile.stderr +``` + +Use `--output speed-bench/local-runs//stage-profile-summary.md` to keep a +timestamped local summary beside the raw profile log. When present, the report +also includes routed-MoE timing by Tensor mask, dense Q8_0 shape tables, and +FlashAttention shape tables, which helps separate kernel targets from per-layer +totals. Use `--json-output speed-bench/local-runs//stage-profile-summary.json` +when the profile should also be indexed by the local-run indexer. + +To summarize local Tensor-vs-legacy comparator logs from runs with +`DS4_METAL_MPP_COMPARE_ROUTE=...`: + +``` +python3 speed-bench/summarize_mpp_compare.py \ + speed-bench/local-runs//.stderr \ + --output speed-bench/local-runs//mpp-compare-summary.md \ + --json-output speed-bench/local-runs//mpp-compare-summary.json +``` + +This report ranks local projection deltas by max abs and RMS, shows comparator +target breaches, and keeps the largest-delta details needed for deciding whether +a fast prefill route should be narrowed before running the five-fixture drift +gate. + +To run a targeted comparator probe and summarize it in one step: + +``` +python3 speed-bench/run_mpp_compare_probe.py \ + --preset mpp-fast-skip-down26-29-30 \ + --case long_memory_archive \ + --route moe_down +``` + +For dense Q8_0 prefill candidate work, use the same probe with the `q8` route +and a substring filter for the projection shape or module label you want to +inspect: + +``` +python3 speed-bench/run_mpp_compare_probe.py \ + --case short_code_completion \ + --route q8 \ + --q8-filter attn_q_b \ + --compare-max 3 \ + --verbose +``` + +For static-mixed FlashAttention candidate work, use the `flash_attn` route. 
The +probe enables `DS4_METAL_FLASH_ATTN_COMPARE=1` and replays the existing generic +static-mixed path into a reference head-output buffer: + +``` +python3 speed-bench/run_mpp_compare_probe.py \ + --case short_reasoning_plain \ + --route flash_attn \ + --flash-attn-filter static_mixed \ + --compare-max 1 \ + --verbose +``` + +By default this writes logs plus `mpp-compare-summary.md/json` under +`speed-bench/local-runs/--mpp-compare-probe/`. Use +`--all-cases` when a local comparator question needs the same five fixtures as +the logprob drift gate. `--route` is repeatable, and comma or pipe separated +route lists are split into separate probes. The comparator probe is only an +attribution tool; a candidate still needs `run_quality_drift_gate.py` before +promotion. It refuses stale `ds4` binaries unless `--allow-stale-binary` is +set. Add `--continue-after-breach` when the question is whether a route has one +isolated local breach or many; normal probes stop at the first target breach to +keep logs short. 
#!/usr/bin/env python3
"""Plot two or more ds4-bench CSV runs as a speed comparison chart.

The first CSV is the baseline.  The chart has one panel for prefill throughput
and one for generation throughput; a CSV-style table with the same numbers,
plus percentage gains versus the baseline, is printed to stdout.
"""

from __future__ import annotations

import argparse
import csv
from pathlib import Path


REQUIRED_COLUMNS = {
    "ctx_tokens",
    "prefill_tps",
    "gen_tps",
}

# Both lists are cycled when more runs are plotted than there are entries.
SERIES_COLORS = ["#2563eb", "#64748b", "#ea580c", "#16a34a", "#9333ea", "#dc2626"]
SERIES_MARKERS = ["o", "s", "^", "D", "P", "X"]
# Vertical offsets (in points) of the per-point value labels, one per series,
# so labels of different runs do not sit on top of each other.
VALUE_LABEL_OFFSETS = [-16, 8, 22, 36, 50, 64]


def _gain_pct(value: float, baseline: float) -> float:
    """Return the percentage change of *value* over *baseline* (0.0 if baseline is 0)."""
    return ((value / baseline) - 1.0) * 100.0 if baseline else 0.0


def _column_slug(label: str) -> str:
    """Turn a run label into the lowercase, underscore-separated table column suffix."""
    return label.lower().replace(" ", "_")


def read_run(path: Path) -> dict[int, dict[str, float]]:
    """Load one ds4-bench CSV into ``{ctx_tokens: {"prefill_tps", "gen_tps"}}``.

    A later row with the same ``ctx_tokens`` replaces an earlier one.

    Raises:
        SystemExit: if the file is empty, lacks a required column, has no data
            rows, or contains a row whose numeric fields cannot be parsed.  The
            message always names the file (and the line for malformed rows) so
            the failing artifact can be found without a traceback.
    """
    rows: dict[int, dict[str, float]] = {}
    with path.open(newline="") as fp:
        reader = csv.DictReader(fp)
        if reader.fieldnames is None:
            raise SystemExit(f"{path}: empty CSV")
        missing = REQUIRED_COLUMNS - set(reader.fieldnames)
        if missing:
            raise SystemExit(f"{path}: missing columns: {', '.join(sorted(missing))}")

        for row in reader:
            try:
                ctx = int(row["ctx_tokens"])
                prefill_tps = float(row["prefill_tps"])
                gen_tps = float(row["gen_tps"])
            except (TypeError, ValueError) as exc:
                # TypeError covers short rows, where DictReader fills in None.
                raise SystemExit(f"{path}:{reader.line_num}: malformed row: {exc}") from exc
            rows[ctx] = {"prefill_tps": prefill_tps, "gen_tps": gen_tps}
    if not rows:
        raise SystemExit(f"{path}: no data rows")
    return rows


def context_label(ctx: int) -> str:
    """Format a context size in tokens as a short ``k`` tick label.

    Sizes within a few tokens of a whole multiple of 1024 are shown as that
    multiple (``2047`` -> ``"2k"``); everything else keeps one decimal.
    """
    if ctx < 1024:
        return f"{ctx / 1024:g}k"
    rounded_k = round(ctx / 1024)
    if abs(ctx - rounded_k * 1024) <= max(4, ctx * 0.001):
        return f"{rounded_k}k"
    return f"{ctx / 1024:.1f}k"


def annotate_points(ax, xs: list[int], ys: list[float], color: str, dy: float) -> None:
    """Write each y value next to its point, shifted vertically by *dy* points."""
    vertical_anchor = "bottom" if dy >= 0 else "top"
    for x, y in zip(xs, ys):
        ax.annotate(
            f"{y:.1f}",
            (x, y),
            textcoords="offset points",
            xytext=(0, dy),
            ha="center",
            va=vertical_anchor,
            fontsize=8,
            color=color,
            fontweight="medium",
        )


def plot_metric(
    ax,
    xs: list[int],
    labels: list[str],
    series: list[list[float]],
    metric_title: str,
    run_labels: list[str],
    annotate: bool,
) -> None:
    """Draw one metric panel: a line per run, plus gain badges for two-run charts.

    Args:
        ax: Matplotlib axes to draw on.
        xs: X positions, one per shared context size.
        labels: Tick labels matching *xs*.
        series: One list of y values per run, in the same order as *run_labels*.
        metric_title: Panel title.
        run_labels: Legend label for each run.
        annotate: When true, print the numeric value beside every point.
    """
    for i, (values, label) in enumerate(zip(series, run_labels)):
        ax.plot(
            xs,
            values,
            marker=SERIES_MARKERS[i % len(SERIES_MARKERS)],
            markersize=7,
            linewidth=2.4,
            color=SERIES_COLORS[i % len(SERIES_COLORS)],
            label=label,
        )

    is_pair = len(series) == 2
    if is_pair:
        ax.fill_between(xs, series[0], series[1], color=SERIES_COLORS[1], alpha=0.08)

    ax.set_title(metric_title, fontsize=15, fontweight="bold", pad=12)
    ax.set_xlabel("Context Size")
    ax.set_ylabel("Tokens/sec")
    ax.set_xticks(xs, labels)
    ax.grid(True, color="#d1d5db", linewidth=0.9, alpha=0.65)
    ax.set_axisbelow(True)
    ax.margins(x=0.05, y=0.18)

    for spine in ("top", "right"):
        ax.spines[spine].set_visible(False)
    ax.spines["left"].set_color("#9ca3af")
    ax.spines["bottom"].set_color("#9ca3af")

    if is_pair:
        # Badges are placed after margins are set so the y-limits are final.
        ymin, ymax = ax.get_ylim()
        label_y = ymin + (ymax - ymin) * 0.05
        for x, before, after in zip(xs, series[0], series[1]):
            gain = _gain_pct(after, before)
            improved = gain >= 0
            ax.annotate(
                f"{gain:+.0f}%",
                (x, label_y),
                ha="center",
                va="center",
                fontsize=8,
                color="#14532d" if improved else "#991b1b",
                bbox={
                    "boxstyle": "round,pad=0.24",
                    "facecolor": "#ecfdf5" if improved else "#fef2f2",
                    "edgecolor": "#bbf7d0" if improved else "#fecaca",
                    "linewidth": 0.8,
                },
            )

    if annotate:
        for i, values in enumerate(series):
            annotate_points(
                ax,
                xs,
                values,
                SERIES_COLORS[i % len(SERIES_COLORS)],
                VALUE_LABEL_OFFSETS[i % len(VALUE_LABEL_OFFSETS)],
            )


def default_run_labels(paths: list[Path], args: argparse.Namespace) -> list[str]:
    """Pick legend labels: ``--labels`` if given, before/after for two runs, else file stems.

    Raises:
        SystemExit: if ``--labels`` was given with a count different from the runs.
    """
    if args.labels:
        if len(args.labels) != len(paths):
            raise SystemExit("--labels count must match the number of CSV runs")
        return args.labels
    if len(paths) == 2:
        return [args.before_label, args.after_label]
    return [path.stem for path in paths]


def _print_table(
    contexts: list[int],
    run_labels: list[str],
    prefill_series: list[list[float]],
    gen_series: list[list[float]],
) -> None:
    """Print the plotted numbers and gains versus the first run as CSV text."""
    slugs = [_column_slug(label) for label in run_labels]
    header = ["ctx"]
    for slug in slugs:
        header.extend([f"prefill_{slug}", f"gen_{slug}"])
    for slug in slugs[1:]:
        header.extend([f"prefill_gain_{slug}_vs_{slugs[0]}", f"gen_gain_{slug}_vs_{slugs[0]}"])
    print(",".join(header))

    for idx, ctx in enumerate(contexts):
        row = [str(ctx)]
        base_prefill = prefill_series[0][idx]
        base_gen = gen_series[0][idx]
        for prefill, gen in zip(prefill_series, gen_series):
            row.extend([f"{prefill[idx]:.2f}", f"{gen[idx]:.2f}"])
        for prefill, gen in zip(prefill_series[1:], gen_series[1:]):
            row.extend(
                [
                    f"{_gain_pct(prefill[idx], base_prefill):.1f}",
                    f"{_gain_pct(gen[idx], base_gen):.1f}",
                ]
            )
        print(",".join(row))


def build_chart(args: argparse.Namespace) -> None:
    """Render the comparison PNG for ``args.runs`` and print the matching table.

    Only context sizes present in every run are plotted.

    Raises:
        SystemExit: on fewer than two runs, a non-``.png`` output path, invalid
            CSV input, or runs that share no ``ctx_tokens`` value.
    """
    if len(args.runs) < 2:
        raise SystemExit("provide at least two ds4-bench CSV files")
    output = args.output
    # Reject a bad output path before parsing CSVs and rendering the figure.
    if output.suffix.lower() != ".png":
        raise SystemExit(f"{output}: output must be a .png file")

    runs = [read_run(path) for path in args.runs]
    run_labels = default_run_labels(args.runs, args)
    contexts = sorted(set.intersection(*(set(run) for run in runs)))
    if not contexts:
        raise SystemExit("the CSV files have no shared ctx_tokens values")

    # Imported here so the CSV helpers stay importable without matplotlib and
    # the backend is only switched when a chart is actually produced.
    import matplotlib

    matplotlib.use("Agg")
    import matplotlib.pyplot as plt

    x_positions = list(range(len(contexts)))
    labels = [context_label(ctx) for ctx in contexts]
    prefill_series = [[run[ctx]["prefill_tps"] for ctx in contexts] for run in runs]
    gen_series = [[run[ctx]["gen_tps"] for ctx in contexts] for run in runs]

    plt.rcParams.update(
        {
            "figure.facecolor": "#f8fafc",
            "axes.facecolor": "#ffffff",
            "axes.edgecolor": "#cbd5e1",
            "axes.labelcolor": "#111827",
            "xtick.color": "#111827",
            "ytick.color": "#111827",
            "font.family": "DejaVu Sans",
        }
    )

    fig, axes = plt.subplots(1, 2, figsize=(15.5, 7), constrained_layout=True)
    fig.suptitle(args.title, fontsize=22, fontweight="bold", y=1.04)

    show_values = not args.no_values
    plot_metric(
        axes[0],
        x_positions,
        labels,
        prefill_series,
        "Prompt Processing Speed",
        run_labels,
        show_values,
    )
    plot_metric(
        axes[1],
        x_positions,
        labels,
        gen_series,
        "Text Generation Speed",
        run_labels,
        show_values,
    )

    handles, legend_labels = axes[0].get_legend_handles_labels()
    fig.legend(
        handles,
        legend_labels,
        loc="upper center",
        bbox_to_anchor=(0.5, 0.98),
        ncol=min(len(run_labels), 4),
        frameon=True,
        fancybox=True,
        shadow=False,
        facecolor="#ffffff",
        edgecolor="#cbd5e1",
    )

    output.parent.mkdir(parents=True, exist_ok=True)
    fig.savefig(output, dpi=180, bbox_inches="tight", format="png")
    plt.close(fig)

    print(f"Wrote {output}")
    _print_table(contexts, run_labels, prefill_series, gen_series)


def parse_args() -> argparse.Namespace:
    """Parse the command line for the comparison chart tool."""
    parser = argparse.ArgumentParser(
        description="Create a two-panel comparison chart from ds4-bench CSV files."
    )
    parser.add_argument("runs", nargs="+", type=Path, help="ds4-bench CSV files; first is the baseline")
    parser.add_argument(
        "-o",
        "--output",
        type=Path,
        default=Path("/tmp/ds4-bench-compare.png"),
        help="output chart path; must end in .png",
    )
    parser.add_argument("--before-label", default="standard kernel")
    parser.add_argument("--after-label", default="Metal Tensor")
    parser.add_argument("--labels", nargs="+", help="Labels for each CSV run.")
    parser.add_argument("--title", default="ds4-bench Speed Comparison")
    parser.add_argument("--no-values", action="store_true", help="hide per-point value labels")
    return parser.parse_args()


if __name__ == "__main__":
    build_chart(parse_args())
#!/usr/bin/env python3
r"""Compare full-logit dumps produced by ./ds4 --dump-logits.

Example:
  ./ds4 -m q2.gguf --metal -mt off --dump-logits /tmp/q2-off.json \
      --nothink --prompt-file prompt.txt
  ./ds4 -m q2.gguf --metal -mt auto --dump-logits /tmp/q2-mt.json \
      --nothink --prompt-file prompt.txt
  ./ds4 -m q4.gguf --metal -mt off --dump-logits /tmp/q4-off.json \
      --nothink --prompt-file prompt.txt
  python3 speed-bench/compare_logit_drift.py /tmp/q2-off.json \
      /tmp/q2-mt.json /tmp/q4-off.json --labels q2_mt q4_off
"""

from __future__ import annotations

import argparse
import json
import math
from heapq import nlargest
from pathlib import Path
from typing import Any


def load_dump(path: Path) -> dict[str, Any]:
    """Read one ``--dump-logits`` JSON file and normalise its logits.

    ``null`` entries become NaN so they are counted as non-finite later.  The
    returned dict is the parsed JSON with ``logits`` replaced by a list of
    floats and ``_path`` set to the source path.

    Raises:
        SystemExit: if the top-level value is not an object, ``logits`` is
            missing or empty, or ``vocab`` disagrees with the logits length.
    """
    with path.open("r", encoding="utf-8") as fp:
        data = json.load(fp)
    if not isinstance(data, dict):
        raise SystemExit(f"{path}: expected a JSON object at the top level")
    logits_raw = data.get("logits")
    if not isinstance(logits_raw, list) or not logits_raw:
        raise SystemExit(f"{path}: missing non-empty logits array")
    logits = [float("nan") if v is None else float(v) for v in logits_raw]
    vocab = int(data.get("vocab", len(logits)))
    if vocab != len(logits):
        raise SystemExit(f"{path}: vocab={vocab} does not match logits={len(logits)}")
    data["logits"] = logits
    data["_path"] = str(path)
    return data


def dump_label(data: dict[str, Any]) -> str:
    """Build a short ``model:qN:mt=...`` label from a dump's metadata."""
    model = Path(str(data.get("model", data.get("_path", "dump")))).name
    quant = data.get("quant_bits", "?")
    mt = data.get("mt", "?")
    quality = data.get("quality")
    suffix = f":quality={quality}" if isinstance(quality, bool) else ""
    return f"{model}:q{quant}:mt={mt}{suffix}"


def finite_indices(logits: list[float]) -> list[int]:
    """Return the token ids whose logit is neither NaN nor infinite."""
    return [i for i, v in enumerate(logits) if math.isfinite(v)]


def topk(logits: list[float], k: int) -> list[int]:
    """Return the *k* best finite token ids, best first."""
    # Match the C test's tie behavior: higher logit first, lower token id first.
    return nlargest(k, finite_indices(logits), key=lambda i: (logits[i], -i))


def overlap(a: list[int], b: list[int], k: int) -> int:
    """Count the tokens shared by the first *k* entries of both rankings."""
    return len(set(a[:k]) & set(b[:k]))


def rank_delta(ref_top: list[int], cand_top: list[int]) -> int:
    """Return the largest rank shift among tokens present in both rankings."""
    cand_rank = {token: i for i, token in enumerate(cand_top)}
    return max(
        (abs(cand_rank[token] - i) for i, token in enumerate(ref_top) if token in cand_rank),
        default=0,
    )


def top_union_max_abs(
    ref: list[float],
    cand: list[float],
    ref_top: list[int],
    cand_top: list[int],
    k: int,
) -> float:
    """Return the largest |cand - ref| over the union of both top-*k* sets.

    Tokens whose logit is non-finite on either side are ignored.
    """
    ids = set(ref_top[:k]) | set(cand_top[:k])
    return max(
        (
            abs(cand[token] - ref[token])
            for token in ids
            if math.isfinite(ref[token]) and math.isfinite(cand[token])
        ),
        default=0.0,
    )


def compare(ref_dump: dict[str, Any], cand_dump: dict[str, Any], top_k: int) -> dict[str, Any]:
    """Compute drift metrics of a candidate dump against a reference dump.

    Pairs where either logit is non-finite are excluded from ``rms`` and
    ``max_abs`` and counted in ``nonfinite``.  ``rms`` is taken over the pairs
    that were actually compared, so skipped pairs do not dilute it; it is 0.0
    when no pair could be compared.

    Raises:
        SystemExit: if the two dumps have different vocabulary sizes.
    """
    ref = ref_dump["logits"]
    cand = cand_dump["logits"]
    if len(ref) != len(cand):
        raise SystemExit(
            f"vocab mismatch: {ref_dump['_path']} has {len(ref)}, "
            f"{cand_dump['_path']} has {len(cand)}"
        )

    ref_top = topk(ref, top_k)
    cand_top = topk(cand, top_k)
    sumsq = 0.0
    max_abs = 0.0
    nonfinite = 0
    # Five largest deltas seen so far, kept sorted descending; a later token
    # only displaces the current smallest when it is strictly larger.
    largest: list[tuple[float, int, float, float]] = []
    for token, (rv, cv) in enumerate(zip(ref, cand)):
        if not math.isfinite(rv) or not math.isfinite(cv):
            nonfinite += 1
            continue
        delta = cv - rv
        abs_delta = abs(delta)
        sumsq += delta * delta
        max_abs = max(max_abs, abs_delta)
        if len(largest) < 5:
            largest.append((abs_delta, token, rv, cv))
            largest.sort(reverse=True)
        elif abs_delta > largest[-1][0]:
            largest[-1] = (abs_delta, token, rv, cv)
            largest.sort(reverse=True)

    compared = len(ref) - nonfinite
    rms = math.sqrt(sumsq / compared) if compared else 0.0

    return {
        "same_top1": bool(ref_top and cand_top and ref_top[0] == cand_top[0]),
        "ref_top1": ref_top[0] if ref_top else None,
        "cand_top1": cand_top[0] if cand_top else None,
        "top5_overlap": overlap(ref_top, cand_top, min(5, top_k)),
        "top20_overlap": overlap(ref_top, cand_top, min(20, top_k)),
        "top_k": top_k,
        "max_rank_delta": rank_delta(ref_top, cand_top),
        "rms": rms,
        "max_abs": max_abs,
        "top20_max_abs": top_union_max_abs(ref, cand, ref_top, cand_top, min(20, top_k)),
        "nonfinite": nonfinite,
        "largest_deltas": [
            {"token": token, "ref": rv, "cand": cv, "abs": abs_delta}
            for abs_delta, token, rv, cv in largest
        ],
    }


def print_table(rows: list[dict[str, Any]]) -> None:
    """Print one pipe-separated summary line per candidate."""
    headers = [
        "candidate",
        "same_top1",
        "top5",
        "top20",
        "rank",
        "rms",
        "max_abs",
        "top20_abs",
        "nonfinite",
    ]
    print(" | ".join(headers))
    print(" | ".join("-" * len(h) for h in headers))
    for row in rows:
        print(
            " | ".join(
                [
                    row["label"],
                    "yes" if row["same_top1"] else "no",
                    f"{row['top5_overlap']}/5",
                    f"{row['top20_overlap']}/20",
                    str(row["max_rank_delta"]),
                    f"{row['rms']:.6g}",
                    f"{row['max_abs']:.6g}",
                    f"{row['top20_max_abs']:.6g}",
                    str(row["nonfinite"]),
                ]
            )
        )


def main() -> int:
    """Compare the candidate dumps against the reference and report the drift.

    Returns:
        0 on success.  Invalid arguments or inputs exit through ``SystemExit``.
    """
    parser = argparse.ArgumentParser(
        description="Compare ds4 full-logit JSON dumps from --dump-logits."
    )
    parser.add_argument("reference", type=Path)
    parser.add_argument("candidates", nargs="+", type=Path)
    parser.add_argument("--labels", nargs="+", help="Labels for candidate dumps.")
    parser.add_argument("--top-k", type=int, default=20)
    parser.add_argument("--json-output", type=Path)
    args = parser.parse_args()

    # The table reports top-20 metrics, so a smaller k would mislabel them.
    if args.top_k < 20:
        raise SystemExit("--top-k must be at least 20")
    if args.labels and len(args.labels) != len(args.candidates):
        raise SystemExit("--labels count must match candidate count")

    ref = load_dump(args.reference)
    candidates = [load_dump(path) for path in args.candidates]
    labels = args.labels or [dump_label(data) for data in candidates]

    print(f"reference: {dump_label(ref)}")
    print(
        "prompt_tokens: "
        f"{ref.get('prompt_tokens', '?')} ctx: {ref.get('ctx', '?')} "
        f"vocab: {ref.get('vocab', len(ref['logits']))}"
    )
    rows = []
    for label, candidate in zip(labels, candidates):
        if candidate.get("prompt_tokens") != ref.get("prompt_tokens"):
            print(
                f"warning: prompt token mismatch for {label}: "
                f"ref={ref.get('prompt_tokens')} cand={candidate.get('prompt_tokens')}"
            )
        metrics = compare(ref, candidate, args.top_k)
        metrics["label"] = label
        metrics["path"] = candidate["_path"]
        rows.append(metrics)

    print_table(rows)
    for row in rows:
        print(f"\n{row['label']} largest deltas:")
        for delta in row["largest_deltas"]:
            print(
                "  token={token} ref={ref:.9g} cand={cand:.9g} abs={abs:.9g}".format(
                    **delta
                )
            )

    if args.json_output:
        payload = {
            "reference": {"path": ref["_path"], "label": dump_label(ref)},
            "rows": rows,
        }
        with args.json_output.open("w", encoding="utf-8") as fp:
            json.dump(payload, fp, indent=2)
            fp.write("\n")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
def load_json(path: Path) -> Any | None:
    """Parse *path* as UTF-8 JSON.

    Returns:
        The decoded JSON value, or ``None`` when the file cannot be read,
        is not valid UTF-8, or is not valid JSON.
    """
    try:
        return json.loads(path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError,
        # so one stray binary ``*.json`` file cannot abort the whole scan.
        return None
def read_bench_csv(path: Path) -> dict[int, dict[str, float]] | None:
    """Load a bench CSV into ``{ctx_tokens: {"prefill_tps": ..., "gen_tps": ...}}``.

    Returns:
        The per-context throughput mapping, or ``None`` when the file cannot
        be read, lacks a required column, has no data rows, or contains a row
        that does not parse.
    """
    required = {"ctx_tokens", "prefill_tps", "gen_tps"}
    try:
        with path.open(newline="", encoding="utf-8") as fp:
            reader = csv.DictReader(fp)
            if reader.fieldnames is None or not required.issubset(reader.fieldnames):
                return None
            rows: dict[int, dict[str, float]] = {}
            for row in reader:
                rows[int(row["ctx_tokens"])] = {
                    "prefill_tps": float(row["prefill_tps"]),
                    "gen_tps": float(row["gen_tps"]),
                }
            return rows or None
    except (OSError, ValueError, TypeError, csv.Error):
        # TypeError: DictReader fills the missing fields of a short row with
        # None, which int()/float() reject. csv.Error: malformed CSV input.
        return None
collect_candidate(path: Path, root: Path) -> dict[str, Any] | None: + data = load_json(path) + if not isinstance(data, dict) or "candidate_label" not in data: + return None + decision = data.get("promotion_decision") or {} + speed_gate = decision.get("speed_gate") or {} + drift_gate = decision.get("drift_gate") or {} + coverage_gate = decision.get("coverage_gate") or {} + min_prefill = speed_gate.get("min_prefill_gain_pct") + min_gen = speed_gate.get("min_generation_gain_pct") + if min_prefill is None or min_gen is None: + fallback_prefill, fallback_gen = candidate_speed_from_gains(data) + min_prefill = fallback_prefill if min_prefill is None else min_prefill + min_gen = fallback_gen if min_gen is None else min_gen + return { + "path": rel(path, root), + "run": run_label(path, root), + "candidate": data.get("candidate_label"), + "preset": data.get("candidate_preset"), + "env": data.get("candidate_env") or {}, + "promotion_safe": decision.get("promotion_safe"), + "min_prefill_gain_pct": min_prefill, + "min_generation_gain_pct": min_gen, + "min_repeat_prefill_gain_pct": speed_gate.get("min_repeat_prefill_gain_pct"), + "drift_run": drift_gate.get("run"), + "drift_ok": drift_gate.get("ok"), + "coverage_required": coverage_gate.get("required"), + "coverage_run": coverage_gate.get("run"), + "coverage_ok": coverage_gate.get("ok"), + "coverage_pair": coverage_gate.get("pair"), + "coverage_tensor_standard_worst_rms": coverage_gate.get("tensor_vs_standard_worst_rms"), + "coverage_tensor_standard_worst_rms_case": coverage_gate.get("tensor_vs_standard_worst_rms_case"), + "coverage_tensor_standard_worst_top20_abs": coverage_gate.get("tensor_vs_standard_worst_top20_max_abs"), + "coverage_tensor_standard_worst_top20_abs_case": coverage_gate.get("tensor_vs_standard_worst_top20_max_abs_case"), + "tensor_standard_worst_rms": drift_gate.get("tensor_vs_standard_worst_rms"), + "tensor_standard_worst_rms_case": drift_gate.get("tensor_vs_standard_worst_rms_case"), + 
def collect_drift(path: Path, root: Path) -> dict[str, Any] | None:
    """Summarize one drift-gate ``summary.json`` into a flat index row.

    Args:
        path: Summary file to read.
        root: Scan root, used to build the relative path and run label.

    Returns:
        The index row, or ``None`` when the file is not a JSON object that
        carries both ``"pairs"`` and ``"modes"``.
    """
    data = load_json(path)
    if not isinstance(data, dict) or "pairs" not in data or "modes" not in data:
        return None

    def as_dict(value: Any) -> dict[str, Any]:
        # Any summary.json under the root reaches this function, so tolerate
        # null or oddly shaped sections instead of raising AttributeError.
        return value if isinstance(value, dict) else {}

    pairs = as_dict(data.get("pairs"))
    tensor_standard = as_dict(pairs.get("tensor_vs_standard"))
    ts_summary = as_dict(tensor_standard.get("summary"))
    ts_extrema = as_dict(tensor_standard.get("extrema"))
    run_config = as_dict(data.get("run_config"))
    # A "frontiers" list is what distinguishes a chunked run here.
    is_chunked = isinstance(data.get("frontiers"), list)
    return {
        "path": rel(path, root),
        "run": run_label(path, root),
        "kind": "chunked" if is_chunked else "five-fixture",
        "env": data.get("env") or data.get("candidate_env") or {},
        "preset": run_config.get("candidate_preset"),
        "gate_ok": not bool(data.get("gate_failures")),
        "failures": data.get("gate_failures") or [],
        "tensor_standard_top1": ts_summary.get("top1_mismatches"),
        "tensor_standard_greedy": ts_summary.get("greedy_mismatches"),
        "tensor_standard_min_top20": ts_summary.get("min_top20_overlap"),
        "tensor_standard_worst_rms": ts_summary.get("worst_rms"),
        # Two key spellings are accepted: "*_case", then "*_frontier".
        "tensor_standard_worst_rms_case": (
            ts_extrema.get("worst_rms_case") or ts_extrema.get("worst_rms_frontier")
        ),
        "tensor_standard_worst_top20_abs": ts_summary.get("worst_top20_max_abs"),
        "tensor_standard_worst_top20_abs_case": (
            ts_extrema.get("worst_top20_max_abs_case") or
            ts_extrema.get("worst_top20_max_abs_frontier")
        ),
    }
def collect_stage(path: Path, root: Path) -> dict[str, Any] | None:
    """Summarize one ``stage-profile-summary.json`` into a flat index row.

    Only the first summary is used when the file holds a list of summaries.

    Args:
        path: Stage profile summary to read.
        root: Scan root, used to build the relative path and run label.

    Returns:
        The index row, or ``None`` when the file does not contain a JSON
        object with a ``"stages"`` key (directly or as the first list item).
    """
    data = load_json(path)
    summaries = data if isinstance(data, list) else [data]
    if not summaries or not isinstance(summaries[0], dict) or "stages" not in summaries[0]:
        return None
    first = summaries[0]

    def total_ms(entry: Any) -> float:
        # A missing, null or non-numeric total sorts as zero rather than
        # making max() raise TypeError on a None-vs-float comparison.
        value = entry.get("total_ms") if isinstance(entry, dict) else None
        return float(value) if isinstance(value, (int, float)) else 0.0

    def slowest(table: Any) -> tuple[str, dict[str, Any]]:
        # Name and entry of the row with the largest total_ms.
        if not isinstance(table, dict):
            return "n/a", {}
        name, entry = max(
            table.items(),
            key=lambda item: total_ms(item[1]),
            default=("n/a", {}),
        )
        return name, (entry if isinstance(entry, dict) else {})

    top_stage_name, top_stage = slowest(first.get("stages"))
    top_q8_name, top_q8 = slowest(first.get("q8_shapes"))
    top_flash_name, top_flash = slowest(first.get("flash_shapes"))

    # Throughput is reported from the last recorded entry.
    throughput = first.get("throughput")
    last_throughput = throughput[-1] if isinstance(throughput, list) and throughput else {}
    if not isinstance(last_throughput, dict):
        last_throughput = {}

    return {
        "path": rel(path, root),
        "run": run_label(path, root),
        "events": first.get("events"),
        "prefill_tps": last_throughput.get("prefill_tps"),
        "generation_tps": last_throughput.get("generation_tps"),
        "top_stage": top_stage_name,
        "top_stage_ms": top_stage.get("total_ms"),
        "top_q8_shape": top_q8_name,
        "top_q8_ms": top_q8.get("total_ms"),
        "top_flash_shape": top_flash_name,
        "top_flash_ms": top_flash.get("total_ms"),
    }
def collect(root: Path) -> dict[str, list[dict[str, Any]]]:
    """Scan *root* for saved run artifacts and group them by artifact type.

    Bench CSV triples are looked up in each immediate sub-directory of
    *root*; JSON summaries are found recursively and routed by file name.

    Args:
        root: Directory holding the local run artifacts.

    Returns:
        A dict with the keys ``"candidates"``, ``"drift_gates"``,
        ``"mpp_compares"``, ``"stage_profiles"`` and
        ``"metal_tensor_benches"``. Every list is empty when *root* is not an
        existing directory.
    """
    candidates: list[dict[str, Any]] = []
    drifts: list[dict[str, Any]] = []
    compares: list[dict[str, Any]] = []
    stages: list[dict[str, Any]] = []
    metal_benches: list[dict[str, Any]] = []

    # File name -> (collector, destination list). Every summary.json is
    # treated as a drift gate, whichever directory it sits in.
    by_name = {
        "prefill-candidate-summary.json": (collect_candidate, candidates),
        "summary.json": (collect_drift, drifts),
        "mpp-compare-summary.json": (collect_compare, compares),
        "stage-profile-summary.json": (collect_stage, stages),
    }

    # is_dir() rather than exists(): iterdir() raises NotADirectoryError when
    # the root path names a regular file.
    if root.is_dir():
        for run_dir in sorted(path for path in root.iterdir() if path.is_dir()):
            metal_benches.extend(collect_metal_tensor_bench(run_dir, root))
        for path in sorted(root.rglob("*.json")):
            handler = by_name.get(path.name)
            if handler is None:
                continue
            collector, bucket = handler
            item = collector(path, root)
            if item:
                bucket.append(item)

    return {
        "candidates": candidates,
        "drift_gates": drifts,
        "mpp_compares": compares,
        "stage_profiles": stages,
        "metal_tensor_benches": metal_benches,
    }
+ "| --- | ---: |", + f"| Prefill candidates | {len(index['candidates'])} |", + f"| Metal Tensor bench charts | {len(index['metal_tensor_benches'])} |", + f"| Drift gates | {len(index['drift_gates'])} |", + f"| Comparator summaries | {len(index['mpp_compares'])} |", + f"| Stage profiles | {len(index['stage_profiles'])} |", + "", + ] + + if index["candidates"]: + lines.extend( + [ + "## Prefill Candidates By Speed", + "", + "| Run | Candidate | Promotion-safe | 5-fixture OK | Coverage OK | Coverage pair | Min prefill vs Tensor | Min repeat prefill | Min gen vs Tensor | 5-fixture RMS | 5-fixture top20 | Coverage RMS | Coverage top20 |", + "| --- | --- | --- | --- | --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: |", + ] + ) + for item in top_items(index["candidates"], "min_prefill_gain_pct", top): + lines.append( + "| " + f"`{markdown_escape(item['run'])}` | " + f"`{markdown_escape(item['candidate'])}` | " + f"{bool_label(item.get('promotion_safe'))} | " + f"{bool_label(item.get('drift_ok'))} | " + f"{coverage_label(item)} | " + f"`{markdown_escape(item.get('coverage_pair') or 'n/a')}` | " + f"{fmt_pct(item.get('min_prefill_gain_pct'))} | " + f"{fmt_pct(item.get('min_repeat_prefill_gain_pct'))} | " + f"{fmt_pct(item.get('min_generation_gain_pct'))} | " + f"{fmt_num(item.get('tensor_standard_worst_rms'))} | " + f"{fmt_num(item.get('tensor_standard_worst_top20_abs'))} | " + f"{fmt_num(item.get('coverage_tensor_standard_worst_rms'))} | " + f"{fmt_num(item.get('coverage_tensor_standard_worst_top20_abs'))} |" + ) + lines.append("") + + lines.extend( + [ + "## Candidate Promotion Failures", + "", + "| Run | Candidate | Env | First failure |", + "| --- | --- | --- | --- |", + ] + ) + for item in index["candidates"]: + failures = item.get("failures") or [] + if failures: + lines.append( + "| " + f"`{markdown_escape(item['run'])}` | " + f"`{markdown_escape(item['candidate'])}` | " + f"`{markdown_escape(env_label(item.get('env')))}` | " + 
f"{markdown_escape(failures[0])} |" + ) + lines.append("") + + if index["metal_tensor_benches"]: + lines.extend( + [ + "## Metal Tensor Bench Charts", + "", + "| Run | Contexts | Tensor prefill vs Standard | Tensor gen vs Standard | Quality prefill vs Standard | Chart |", + "| --- | ---: | ---: | ---: | ---: | --- |", + ] + ) + for item in sorted(index["metal_tensor_benches"], key=lambda row: row["run"], reverse=True)[:top]: + lines.append( + "| " + f"`{markdown_escape(item['run'])}` | " + f"{len(item.get('contexts') or [])} | " + f"{fmt_pct(item.get('min_tensor_prefill_vs_standard_pct'))}.." + f"{fmt_pct(item.get('max_tensor_prefill_vs_standard_pct'))} | " + f"{fmt_pct(item.get('min_tensor_gen_vs_standard_pct'))}.." + f"{fmt_pct(item.get('max_tensor_gen_vs_standard_pct'))} | " + f"{fmt_pct(item.get('min_quality_prefill_vs_standard_pct'))}.." + f"{fmt_pct(item.get('max_quality_prefill_vs_standard_pct'))} | " + f"`{markdown_escape(item.get('chart') or 'n/a')}` |" + ) + lines.append("") + + if index["drift_gates"]: + lines.extend( + [ + "## Drift Gates", + "", + "| Run | Kind | Gate OK | Env | Top1 | Greedy | Min top20 | Worst RMS | RMS case/frontier | Worst top20 abs | Top20 case/frontier |", + "| --- | --- | --- | --- | ---: | ---: | ---: | ---: | --- | ---: | --- |", + ] + ) + for item in sorted(index["drift_gates"], key=lambda row: row["run"], reverse=True)[:top]: + lines.append( + "| " + f"`{markdown_escape(item['run'])}` | " + f"{markdown_escape(item.get('kind') or 'n/a')} | " + f"{bool_label(item.get('gate_ok'))} | " + f"`{markdown_escape(env_label(item.get('env')))}` | " + f"{fmt_num(item.get('tensor_standard_top1'))} | " + f"{fmt_num(item.get('tensor_standard_greedy'))} | " + f"{fmt_num(item.get('tensor_standard_min_top20'))}/20 | " + f"{fmt_num(item.get('tensor_standard_worst_rms'))} | " + f"{markdown_escape(item.get('tensor_standard_worst_rms_case') or 'n/a')} | " + f"{fmt_num(item.get('tensor_standard_worst_top20_abs'))} | " + 
f"{markdown_escape(item.get('tensor_standard_worst_top20_abs_case') or 'n/a')} |" + ) + lines.append("") + + if index["mpp_compares"]: + lines.extend( + [ + "## Comparator Summaries", + "", + "| Run | Comparisons | Breaches | Worst max abs | Route | Module | Worst RMS |", + "| --- | ---: | ---: | ---: | --- | --- | ---: |", + ] + ) + for item in top_items(index["mpp_compares"], "worst_max_abs", top): + lines.append( + "| " + f"`{markdown_escape(item['run'])}` | " + f"{fmt_num(item.get('count'))} | " + f"{fmt_num(item.get('threshold_breaches'))} | " + f"{fmt_num(item.get('worst_max_abs'))} | " + f"`{markdown_escape(item.get('worst_max_abs_route') or 'n/a')}` | " + f"`{markdown_escape(item.get('worst_max_abs_module') or 'n/a')}` | " + f"{fmt_num(item.get('worst_rms'))} |" + ) + lines.append("") + + if index["stage_profiles"]: + lines.extend( + [ + "## Stage Profiles", + "", + "| Run | Prefill t/s | Top stage | Stage ms | Top Q8 shape | Q8 ms | Top Flash shape | Flash ms |", + "| --- | ---: | --- | ---: | --- | ---: | --- | ---: |", + ] + ) + for item in sorted(index["stage_profiles"], key=lambda row: row["run"], reverse=True)[:top]: + lines.append( + "| " + f"`{markdown_escape(item['run'])}` | " + f"{fmt_num(item.get('prefill_tps'))} | " + f"`{markdown_escape(item.get('top_stage') or 'n/a')}` | " + f"{fmt_num(item.get('top_stage_ms'))} | " + f"`{markdown_escape(item.get('top_q8_shape') or 'n/a')}` | " + f"{fmt_num(item.get('top_q8_ms'))} | " + f"`{markdown_escape(item.get('top_flash_shape') or 'n/a')}` | " + f"{fmt_num(item.get('top_flash_ms'))} |" + ) + lines.append("") + + return "\n".join(lines).rstrip() + "\n" + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--root", type=Path, default=Path("speed-bench/local-runs")) + parser.add_argument("--top", type=int, default=20) + parser.add_argument("--output", type=Path, help="write Markdown index here") + parser.add_argument("--json-output", 
type=Path, help="write JSON index here") + return parser.parse_args() + + +def main() -> int: + args = parse_args() + if args.top < 1: + raise SystemExit("--top must be >= 1") + root = args.root + index = collect(root) + markdown = render_markdown(index, args.top) + if args.output: + args.output.parent.mkdir(parents=True, exist_ok=True) + args.output.write_text(markdown, encoding="utf-8") + print(f"Wrote {args.output}") + else: + print(markdown) + if args.json_output: + args.json_output.parent.mkdir(parents=True, exist_ok=True) + args.json_output.write_text(json.dumps(index, indent=2) + "\n", encoding="utf-8") + print(f"Wrote {args.json_output}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/speed-bench/metal_tensor_prefill_log.md b/speed-bench/metal_tensor_prefill_log.md new file mode 100644 index 000000000..bcfe2afad --- /dev/null +++ b/speed-bench/metal_tensor_prefill_log.md @@ -0,0 +1,4489 @@ +# Metal Tensor Prefill Optimization Log + +Branch: `metal-tensor-prefill-next` + +Date: 2026-05-14 + +This branch keeps the current low-drift Tensor default and uses the five-fixture +quality gate before promoting any prefill optimization. + +## Drift Gate + +Run: + +```sh +python3 speed-bench/run_quality_drift_gate.py \ + --out-dir speed-bench/local-runs/20260514-170519-quality-drift-gate +``` + +Fixtures: + +- `short_italian_fact` +- `short_code_completion` +- `short_reasoning_plain` +- `long_memory_archive` +- `long_code_audit` + +Summary: + +| Pair | top1 mismatches | greedy mismatches | worst RMS | worst top20 abs | +| --- | ---: | ---: | ---: | ---: | +| standard vs quality | 0 | 1 | 0.618172 | 2.24006 | +| tensor vs quality | 0 | 1 | 0.618172 | 2.24006 | +| tensor vs standard | 0 | 0 | 0.239946 | 0.55422 | + +Gate status: OK. + +Latest summary artifact: +`speed-bench/local-runs/20260514-170519-quality-drift-gate/summary.json`. 
+ +The direct equivalence test also passed: + +```sh +./ds4_test --metal-mpp-equivalence +``` + +Result after promoting attention-output low projection to all layers while +keeping the routed-MoE Tensor window at down from layer 12 and gate/up from +layer 15: +`top1_mismatch=0`, `greedy_fail=0`, +`worst_rms=0.239946`, and `worst_top20_max_abs=0.55422`. + +## HC Stable Sigmoid Scope + +VariableFate noted that commit `670411d` routed only the standalone +`kernel_dsv4_hc_split_sinkhorn` through `ds4_hc_sigmoid()` and +`ds4_hc_twice_sigmoid()`, while the fused decode kernels kept inline +`1/(1+exp(-z))` forms. That scope is intentional for now. + +Inspected paths: + +- `ds4_gpu_hc_split_sinkhorn_tensor`: standalone split/sinkhorn path. +- `ds4_gpu_hc_split_weighted_sum_tensor`: fused split plus pre-weighted HC + reduction, used by batched paths. +- `ds4_gpu_hc_split_weighted_sum_norm_tensor`: decode-only HC-pre plus weighted + RMSNorm fusion. This is the hot release decode path and is called for both + attention HC-pre and FFN HC-pre. + +Local A/B patch: + +- Changed the four fused sites in `kernel_dsv4_hc_split_weighted_sum` and + `kernel_dsv4_hc_split_weighted_sum_norm4` to call `ds4_hc_sigmoid()` and + `ds4_hc_twice_sigmoid()`. +- Built with `make ds4 ds4-bench ds4_test`. + +Generation throughput on `promessi_sposi`, `ctx=8192`, `gen_tokens=256`: + +| Variant | gen t/s | +| --- | ---: | +| production inline exp after revert | 33.28 | +| helper exp with `DS4_METAL_HC_STABLE=0`, repeat 1 | 32.32 | +| helper exp with `DS4_METAL_HC_STABLE=0`, repeat 2 | 31.21 | +| helper tanh with default `DS4_METAL_HC_STABLE=1`, repeat 1 | 31.61 | +| helper tanh with default `DS4_METAL_HC_STABLE=1`, repeat 2 | 31.01 | + +Quality result: + +- The helper/tanh fused-kernel patch produced non-finite logits in the + five-fixture drift run. All 15 captured logits dumps reported + `argmax_logit: nan`, so the summary could not be parsed as valid JSON. 
+- `./ds4_test --metal-mpp-equivalence` with helper/tanh failed with + `logits_fail=5` and `top1_mismatch=5`. +- The same helper-call patch with `DS4_METAL_HC_STABLE=0`, which compiles the + helpers back to the historical exp form, passed equivalence with + `top1_mismatch=0`, `greedy_fail=0`, `worst_rms=0.066747`, and + `worst_top20_max_abs=0.191437`. + +Decision: keep `DS4_METAL_HC_STABLE` limited to the standalone split/sinkhorn +path and keep the fused decode kernels on the historical inline exp form. A +separate decode flag is not useful until there is a finite, low-drift +decode-specific stable form with measured throughput. The production code keeps +the fused math unchanged and documents this scope near the helper definitions. + +## Compact Prefill Timing + +Run shape: + +```sh +CTX_MAX=8192 GEN_TOKENS=16 \ + OUT_DIR=speed-bench/local-runs/20260514-160025-default-attn-out-all-compact \ + OPEN_CHART=0 \ + speed-bench/run_metal_tensor_bench.sh +``` + +Current Tensor default (`attn_out=all`, routed-MoE `down=12`, `up=15`, +`gate=15`) vs standard Metal: + +| ctx | standard prefill t/s | tensor prefill t/s | tensor gain | standard gen t/s | tensor gen t/s | +| ---: | ---: | ---: | ---: | ---: | ---: | +| 512 | 265.82 | 358.20 | 34.8% | 38.12 | 38.32 | +| 1024 | 272.46 | 373.83 | 37.2% | 37.99 | 38.07 | +| 2048 | 330.40 | 436.33 | 32.1% | 37.44 | 37.47 | +| 4096 | 341.47 | 421.93 | 23.6% | 34.35 | 34.35 | +| 8192 | 355.11 | 425.63 | 19.9% | 33.53 | 33.38 | + +This keeps the plan focused on prefill. Generation is close to neutral at +shorter contexts in this compact run, with the largest measured drop at 8192 +tokens. + +## Rejected Knobs + +These were evaluated as env-only candidates and not promoted. 
+ +| Candidate | Speed result | Drift result | Decision | +| --- | --- | --- | --- | +| `DS4_METAL_MPP_MOE_GATE_START_LAYER=18` plus `DS4_METAL_MPP_MOE_UP_START_LAYER=18` | One run showed +2.2% to +5.7% over Tensor auto, but an immediate control run favored the old layer-20 default by 8.7% to 17.1%. | Five-fixture gate passed with `tensor_vs_standard` worst RMS `0.139912` and worst top20 abs `0.316128`. | Not promoted because the speed win was not stable. | +| `DS4_METAL_MPP_MOE_GATE_START_LAYER=18` alone with up/down defaulting to 19/19 | Two-repeat median vs 19/19/19 Tensor auto: +0.3% at 512, then -0.3%, -0.3%, -0.7%, and +0.6% from 1024..8192. | Not run. | Reject before drift gate because the speed change is noise-level. | +| `DS4_METAL_MPP_MOE_UP_START_LAYER=18` alone with gate/down defaulting to 19/19 | Two-repeat median vs 19/19/19 Tensor auto: -0.2% at 512, -0.9% at 1024, +0.3% at 2048, -0.1% at 4096, and -0.1% at 8192. | Not run. | Reject before drift gate because the speed change is noise-level. | +| `DS4_METAL_MPP_MOE_GATE_START_LAYER=18` plus `DS4_METAL_MPP_MOE_UP_START_LAYER=18` plus `DS4_METAL_MPP_MOE_DOWN_START_LAYER=20` | Slower than the promoted Tensor auto default by 0.1% to 3.6% in two-repeat median timing. | Not run. | Reject before drift gate. | +| `DS4_METAL_MPP_MOE_GATE_START_LAYER=18` plus `DS4_METAL_MPP_MOE_UP_START_LAYER=18` with down defaulting to 19 | Two-repeat median vs 19/19/19 Tensor auto: +0.1% at 512, then -0.7%, -1.9%, -3.0%, and -1.3% from 1024..8192. Generation was within -0.9%..+0.6%. | Not run. | Reject before drift gate because it is slower at most measured contexts. | +| `DS4_METAL_MPP_MOE_GATE_START_LAYER=18` plus `DS4_METAL_MPP_MOE_UP_START_LAYER=18` with down defaulting to 12 | Two-repeat median vs down-12 Tensor auto: -2.2% at 512, -2.8% at 1024, -2.7% at 2048, -0.1% at 4096, and +1.5% at 8192. Generation was within -0.7%..+1.5%. | Not run. | Reject before drift gate because it is slower at most measured contexts. 
| +| `DS4_METAL_MPP_MOE_GATE_START_LAYER=12` with down/up unchanged at 12/15 after the attention-output all-layer promotion | Two-repeat median vs current Tensor auto: -0.1% at 512, -0.4% at 1024, -0.7% at 2048, -2.7% at 4096, and -1.4% at 8192. Generation was within -1.1%..+0.6%. | Not run. | Reject before drift gate because moving only gate earlier is slower at every compact prefill point. | +| `DS4_METAL_MPP_MOE_GATE_START_LAYER=14` plus `DS4_METAL_MPP_MOE_UP_START_LAYER=14` with down defaulting to 12 | Two-repeat median vs down-12 Tensor auto: +2.7% at 512, +2.9% at 1024, +2.2% at 2048, +1.1% at 4096, but -0.8% at 8192. Generation was -3.2% at 8192. | Not run. | Reject before drift gate because it regresses the long-context point and generation more than the layer-15 window. | +| `DS4_METAL_MPP_MOE_GATE_START_LAYER=13` plus `DS4_METAL_MPP_MOE_UP_START_LAYER=13` with down defaulting to 12 | Two-repeat median vs current Tensor auto: -1.5% at 512, -4.0% at 1024, -2.0% at 2048, +0.9% at 4096, and +1.4% at 8192. Generation was within -2.2%..+0.2%. Artifact: `speed-bench/local-runs/20260514-172507-moe-gate-up13-down12/prefill-candidate-summary.json`. | Not run. | Reject before drift gate because it trades away short and mid-context prefill for only small long-context gains. | +| `DS4_METAL_MPP_MOE_GATE_START_LAYER=14` with down defaulting to 12 and up defaulting to 15 | Two-repeat median vs current Tensor auto: -2.2% at 512, -1.7% at 1024, -0.4% at 2048, +1.0% at 4096, and +2.1% at 8192. Generation was down by 0.4%..1.9%. | Not run. | Reject before drift gate because it is a tradeoff, not a clear prefill win. | +| `DS4_METAL_MPP_MOE_UP_START_LAYER=14` with down defaulting to 12 and gate defaulting to 15 | Two-repeat median vs current Tensor auto: -3.4% at 512, -6.4% at 1024, -4.9% at 2048, -6.2% at 4096, and -5.1% at 8192. | Not run. | Reject before drift gate because it is consistently slower. 
| +| `DS4_METAL_MPP_MOE_TILE_N=64` | Slower than default by 3.3% to 15.6%. | Not run. | Reject before drift gate. | +| `DS4_METAL_MOE_SUM6_DISABLE=1` | Two-repeat median vs current Tensor auto: -1.6% at 512, -1.8% at 1024, -1.4% at 2048, -0.1% at 4096, and +0.6% at 8192. Generation was within -0.5%..+0.4%. | Not run. | Reject before drift gate because disabling the fused six-expert sum is slower or noise-level at every compact point. | +| `DS4_METAL_MPP_MOE_DOWN_START_LAYER=9` with gate/up unchanged at 19 | Two-repeat median vs down-12 Tensor auto: +0.3% at 512, +0.1% at 1024, -1.4% at 2048, -0.4% at 4096, and -0.5% at 8192. Generation was within -0.7%..+0.5%. | Not run. | Reject before drift gate because it is slower at most measured contexts. | +| `DS4_METAL_MPP_MOE_DOWN_START_LAYER=10` with gate/up unchanged at 19 | Two-repeat median vs 19/19/19 Tensor auto: +0.8% at 512, flat at 1024, +0.8% at 2048, +2.6% at 4096, and +2.8% at 8192. Generation was within -1.7%..+1.4%. | Five-fixture gate and `./ds4_test --metal-mpp-equivalence` passed, but `tensor_vs_standard` drift rose to worst RMS `0.314905` and worst top20 abs `0.780825`. | Not promoted because layer 12 kept useful speed with lower drift. | +| `DS4_METAL_MPP_MOE_DOWN_START_LAYER=10` with gate/up defaulting to 15 and attention-output Tensor all-layer default | Two-repeat median vs current Tensor auto: -0.1% at 512, -0.5% at 1024, -1.6% at 2048, -2.9% at 4096, and -0.8% at 8192. Generation was within -0.3%..+0.5%. | Not run. | Reject before drift gate because it is slower at every compact prefill point after the attention-output promotion. | +| `DS4_METAL_MPP_MOE_DOWN_START_LAYER=11` with gate/up unchanged at 19 | Two-repeat median vs 19/19/19 Tensor auto: +1.7% at 512, +1.7% at 1024, +3.5% at 2048, +1.7% at 4096, and +1.2% at 8192. Generation was within -1.4%..-0.3%. | Five-fixture gate passed, but `tensor_vs_standard` drift rose to worst RMS `0.314275` and worst top20 abs `0.725578`. 
| Not promoted because layer 12 had a better drift balance. | +| `DS4_METAL_MPP_MOE_DOWN_START_LAYER=11` with gate/up defaulting to 15 | Two-repeat median vs current Tensor auto: +0.3% at 512, -0.1% at 1024, +0.2% at 2048, +0.5% at 4096, and -2.8% at 8192. Generation was within -1.3%..+0.2%. | Not run. | Reject before drift gate because the new gate/up window removes most of the earlier speed upside and the long-context point regresses. | +| `DS4_METAL_MPP_MOE_DOWN_START_LAYER=18` with gate/up/down defaulting to 19/19/19 | Two-repeat median vs 19/19/19 Tensor auto: -2.1% at 512, -3.1% at 1024, -3.3% at 2048, -0.7% at 4096, and +1.7% at 8192. Generation was within -1.2%..+0.4%. | Not run. | Reject before drift gate because it is slower at most measured contexts. | +| `DS4_METAL_MPP_MOE_DISABLE=1` after the attention-output all-layer promotion | Two-repeat median vs current Tensor auto: -23.6% at 512, -25.0% at 1024, -22.0% at 2048, -18.0% at 4096, and -15.4% at 8192. Generation was within -1.2%..+2.4%. | Not run. | Reject before drift gate because disabling the conservative routed-MoE Tensor window removes the dominant current prefill win. | +| Local patch: route-specific routed-MoE tile env plus `DS4_METAL_MPP_MOE_DOWN_TILE_N=64` | Compact two-repeat median vs current Tensor auto: -3.3% at 512, -4.3% at 1024, -3.1% at 2048, -0.4% at 4096, and +1.7% at 8192. A one-repeat long sweep was still slightly slower from 8192..65536: -0.4%, -0.2%, -0.3%, and -0.2%. | Not run. | Reverted before drift gate because the route-specific tile knob did not produce a clear prefill win and would add another non-promotable switch. | +| `DS4_METAL_MPP_ATTN_OUT_DISABLE=1` after the attention-output all-layer promotion | Two-repeat median vs current Tensor auto: -4.6% at 512, -5.3% at 1024, -5.6% at 2048, -5.0% at 4096, and -5.1% at 8192. Generation was within -1.1%..+0.8%. | Not run. 
| Reject before drift gate because disabling the default all-layer attention-output Tensor route removes a clear prefill win. | +| `DS4_METAL_MPP_F16_DISABLE=1` after the attention-output all-layer promotion | Two-repeat median vs current Tensor auto: -1.1% at 512, -1.8% at 1024, -3.1% at 2048, -2.2% at 4096, and -2.5% at 8192. Generation was within -1.4%..+0.4%. | Not run. | Reject before drift gate because disabling the default F16 compressor route is slower at every compact prefill point. | +| `DS4_METAL_MPP_F16_PAIR=1` after the attention-output all-layer promotion | Two-repeat median vs current Tensor auto: -0.7% at 512, -1.1% at 1024, -0.5% at 2048, -1.8% at 4096, and -1.2% at 8192. Generation was within -1.3%..+1.1%. Artifact: `speed-bench/local-runs/20260514-171939-f16-pair-current/prefill-candidate-summary.json`. | Not run. | Reject before drift gate because it is slower at every compact prefill point. | +| `DS4_METAL_MPP_F16_WIDE=1` | Diagnostic-only wider 512/1024-column compressor Tensor route. | Existing long-code full-model equivalence check fails with wide F16 Tensor (`rms ~= 0.569`, `top20_max_abs ~= 1.48`). | Keep default-off; do not spend more prefill timing effort until the drift issue has a new mitigation. | +| `DS4_METAL_MPP_DIRECT_RHS=0` plus `DS4_METAL_MPP_F16_DIRECT_RHS=1` to isolate staged-RHS attention-output low projection | Two-repeat median vs current Tensor auto: -7.1% at 512, -4.9% at 1024, -4.5% at 2048, -3.4% at 4096, and +0.1% at 8192. Generation was within -0.6%..+0.2%. | Not run. | Reject before drift gate because it is slower at most measured contexts. Keep the direct-RHS attention-output default. | +| `DS4_METAL_MPP_ATTN_OUT_TILE_N=32` | Slower than default by 1.1% to 16.4%. | Not run. | Keep default tile 64. | +| `DS4_METAL_MPP_ATTN_OUT_FILTER=layer=31..42` | Two-repeat median vs 32..42 Tensor auto: flat at 512, then slower by 0.3% to 1.4% from 1024..8192. | Not run. | Reject before drift gate; keep attention-output at 32..42. 
| +| Local patch: split dense Q8_0 prefill full 32-token tiles from the non-32-token tail (`DS4_METAL_Q8_PREFILL_SPLIT_TAIL=1` prototype) | On `long_code_audit` at `ctx=3836`, two-repeat median vs current Tensor auto was +0.3% prefill and +0.6% generation. | Not run. | Reverted before drift gate because the speed change is noise-level and does not justify keeping another Q8_0 switch. | +| Local patch: dense Q8_0 cooperative Tensor direct-RHS prefill prototype scoped to `attn_q_b` | Two-repeat median vs current Tensor auto was mixed: +2.8% at 512, -1.3% at 1024, -2.2% at 2048, +2.3% at 4096, and +5.1% at 8192. Generation moved -2.5%..+0.8%. | Not run. | Reverted before drift gate because mid-context prefill and generation regressed. | +| Local patch: dense Q8_0 cooperative Tensor direct-RHS prefill prototype scoped to `attn_out`/`attn_output_b` | Two-repeat median vs current Tensor auto was +4.6% at 512, +4.4% at 1024, +6.0% at 2048, +5.2% at 4096, and +3.5% at 8192. A conservative `attn_out@layer=32..42` window was only +0.6%..+0.9% and dropped generation up to 2.2%. | All-layer `attn_out` failed the five-fixture gate: `long_memory_archive` top-1 changed and greedy differed at step 0; `tensor_vs_standard` worst RMS `0.531143` and worst top20 abs `1.17201`. | Reverted despite speed because it violates the no-new-top1/no-new-greedy rule, and the late-only safe-shape hypothesis was noise-level. | +| Local patch: paired shared-expert Q8_0 prefill matmul for `shared_gate` plus `shared_up` | Two-repeat median vs current Tensor auto: -4.8% at 512, -3.3% at 1024, -3.0% at 2048, -0.4% at 4096, and +1.4% at 8192. Generation was within -1.3%..+0.3%. Artifact: `speed-bench/local-runs/20260514-173418-shared-q8-pair-prefill/prefill-candidate-summary.json`. | Not run. | Reverted before drift gate because it slows short and mid-context prefill for only a small long-context gain. 
| +| `DS4_METAL_MPP_MOE_PAIR_GATE_UP=1` with gate/up/down defaulting to 19/19/19 | Two-repeat median vs 19/19/19 Tensor auto: -6.2% at 512, -3.4% at 1024, -2.7% at 2048, -2.5% at 4096, and -2.1% at 8192. Generation was within -0.2%..+1.2%. | Not run. | Reject before drift gate because the paired dispatch is consistently slower. | +| `DS4_METAL_MPP_MOE_PAIR_GATE_UP=1` after the attention-output all-layer promotion and gate/up/down defaults of 15/15/12 | Two-repeat median vs current Tensor auto: -4.0% at 512, -4.4% at 1024, -4.5% at 2048, -2.4% at 4096, and -2.5% at 8192. Generation was within -2.4%..+0.2%. | Not run. | Reject before drift gate; the paired dispatch remains slower on the wider current gate/up Tensor window. | +| Local patch: standard-Metal paired routed-MoE gate/up prefill matmul for early non-Tensor gate/up layers | Two-repeat median vs current Tensor auto: -3.8% at 512, -2.3% at 1024, -0.8% at 2048, +0.6% at 4096, and +1.3% at 8192. Generation was within -1.1%..+1.0%. Artifact: `speed-bench/local-runs/20260514-230653-experimental-moe-pair-gate-up/prefill-candidate-summary.json`. | Not run. | Reverted before drift gate. Reusing the activation tile while preserving the legacy simdgroup-MMA math did not beat separate gate/up dispatch at short and mid contexts, so it is not worth keeping as another default-off mode. | +| `DS4_METAL_MPP_MOE_FAST_LAYOUT=0` after the attention-output all-layer promotion and gate/up/down defaults of 15/15/12 | Two-repeat median vs current Tensor auto: -3.6% at 512, -3.4% at 1024, -2.3% at 2048, -1.5% at 4096, and -3.2% at 8192. Generation was within -0.5%..+0.2%. | Not run. | Reject before drift gate; the staged layout is slower than the first-PR fast layout on the current conservative window. | +| Local patch: wider non-vector FlashAttention prefill key block (`NCPSG=128` instead of 64) | One-repeat screen vs current Tensor auto: -13.1% at 512, -4.9% at 1024, -2.8% at 2048, +0.9% at 4096, and +2.7% at 8192. 
Generation was within -0.8%..+0.4%. Artifact: `speed-bench/local-runs/20260514-231641-flash-attn-ncpsg128/prefill-candidate-summary.json`. | Not run. | Reverted before drift gate. The larger attention key block only helps long contexts slightly and regresses the short/mid contexts that dominate the compact promotion gate. | +| `DS4_METAL_EXPERIMENTAL_MOE_MATMUL=1` plus `DS4_METAL_EXPERIMENTAL_MOE_MATMUL_START_LAYER=18` | Two-repeat median vs current Tensor auto: +0.1% at 512, -0.1% at 1024, -0.6% at 2048, -1.8% at 4096, and -1.2% at 8192. | Not run. | Reject before drift gate because it is not faster than the current 19/19/19 default. | +| `DS4_METAL_EXPERIMENTAL_MOE_MATMUL=1` plus `DS4_METAL_EXPERIMENTAL_MOE_MATMUL_START_LAYER=19` | Two-repeat median vs current Tensor auto: -0.9% at 512, -1.9% at 1024, -1.6% at 2048, -2.7% at 4096, and -1.8% at 8192. Generation was within -0.3%..+0.7%. | Not run. | Reject before drift gate because it is consistently slower than the current 19/19/19 default. | +| `DS4_METAL_EXPERIMENTAL_MOE_MATMUL=1` plus `DS4_METAL_EXPERIMENTAL_MOE_MATMUL_START_LAYER=10` | Two-repeat median vs current Tensor auto: +7.5% at 512, +8.4% at 1024, +6.0% at 2048, +3.8% at 4096, +4.8% at 8192. Generation was -2.8%, -1.0%, +1.3%, +1.1%, +0.7%. | Failed the five-fixture gate: `long_memory_archive` top-1 changed and greedy differed at step 0; `tensor_vs_standard` also had one top-1 and one greedy mismatch. | Reject despite the speed because it violates the no-new-top1/no-new-greedy rule. | +| `DS4_METAL_EXPERIMENTAL_MOE_MATMUL=1` plus `DS4_METAL_EXPERIMENTAL_MOE_MATMUL_START_LAYER=12` | Two-repeat median vs current Tensor auto: +12.2% at 512, +8.5% at 1024, +8.3% at 2048, +3.2% at 4096, +1.1% at 8192. Generation was +3.4%, -0.2%, +1.5%, -4.6%, -3.6%. | Full `./ds4_test --metal-mpp-equivalence` passed with no top-1 or greedy mismatch, but drift rose to worst RMS `0.300474` and worst top20 abs `1.00957`. 
| Reject before the full quality gate: long-context speed is weak and drift is much worse than the current conservative default. | +| `DS4_METAL_EXPERIMENTAL_MOE_MATMUL=1` plus `DS4_METAL_EXPERIMENTAL_MOE_MATMUL_START_LAYER=15` | Two-repeat median vs current Tensor auto: +2.3% at 512, +2.0% at 1024, +1.5% at 2048, +2.6% at 4096, +2.0% at 8192. Generation was -2.7%, +0.0%, -1.8%, +1.1%, +1.4%. | Full `./ds4_test --metal-mpp-equivalence` passed with no top-1 or greedy mismatch, but drift rose to worst RMS `0.229322` and worst top20 abs `0.511806`. | Reject before the full quality gate: speed is marginal and drift is still worse than default. | +| `DS4_METAL_EXPERIMENTAL_MOE_MATMUL=1` plus `DS4_METAL_EXPERIMENTAL_MOE_MATMUL_START_LAYER=17` | Two-repeat median vs current Tensor auto: +2.2% at 512, +0.5% at 1024, +0.8% at 2048, +1.2% at 4096, +0.7% at 8192. Generation was within -1.7%..+0.5%. | Full `./ds4_test --metal-mpp-equivalence` passed with no top-1 or greedy mismatch, but drift rose to worst RMS `0.190587` and worst top20 abs `0.560192`. | Reject before the full quality gate: speed is within noise and drift is worse than default. | +| `DS4_METAL_EXPERIMENTAL_MOE_MATMUL=1` plus `DS4_METAL_MATH_SAFE=1` | Not timed. | `./ds4_test --metal-mpp-equivalence` failed: `long_memory_archive` changed top-1 and greedy at step 0; summary `top1_mismatch=1`, `greedy_fail=4`, worst RMS `0.58437`, and worst top20 abs `2.17881`. | Reject as a drift-reduction diagnostic. Strict Metal math makes the all-layer experimental route worse rather than explaining away the Tensor-vs-standard movement. | +| `DS4_METAL_EXPERIMENTAL_MOE_MATMUL=1` plus route-specific gate start `8`, up start `15`, down start `12` | Two-repeat median vs current Tensor auto: +2.1% at 512, +2.6% at 1024, +1.5% at 2048, +1.8% at 4096, and +1.4% at 8192. Generation was within -0.6%..+0.4%. 
| Failed the five-fixture gate: `long_memory_archive` top-1 changed and greedy differed at step 0; `tensor_vs_standard` had one top-1 and one greedy mismatch. | Reject despite the clean timing profile because it violates the no-new-top1/no-new-greedy rule. | +| `DS4_METAL_MPP_FAST=1` plus `DS4_METAL_MPP_MOE_DOWN_START_LAYER=12` | Two-repeat median vs current Tensor auto: +13.3% at 512, +12.6% at 1024, +10.9% at 2048, +6.4% at 4096, and +6.1% at 8192. Generation had one -3.1% point at 2048 and was otherwise within -1.3%..-0.3%. Artifact: `speed-bench/local-runs/20260514-181839-mpp-fast-gate-up0-down12/`. | Failed the five-fixture gate: `tensor_vs_standard` had one greedy mismatch on `long_code_audit` (`diff@11`), with worst RMS `0.554059` and worst top20 abs `1.40659`. | Reject despite speed because it introduces a new greedy continuation change. | +| `DS4_METAL_MPP_FAST=1` plus `DS4_METAL_MPP_MOE_UP_START_LAYER=15` plus `DS4_METAL_MPP_MOE_DOWN_START_LAYER=12` plus `DS4_METAL_MPP_MOE_DOWN_FILTER=layer=0-25,layer=27-28,layer=31-42` | Two-repeat median vs current Tensor auto: +2.0% at 512, then -1.9%, -2.1%, -2.6%, and -1.5% from 1024..8192. Generation was within -1.6%..+1.4%. Artifact: `speed-bench/local-runs/20260514-222322-mpp-fast-gate0-up15-down12-skip-down26-29-30/prefill-candidate-summary.json`. | Not run. | Reject before drift gate. Combining the fast all-layer gate route with conservative up/down windows and the known down-layer skips gives up too much compact prefill; the skipped down layers do not recover a useful speed/drift middle ground. | +| `DS4_METAL_EXPERIMENTAL_MOE_MATMUL=1` plus route-specific gate start `0`, up start `15`, down start `12`, and `DS4_METAL_MOE_MID_F32=1` | Two-repeat median vs current Tensor auto: +4.5% at 512, +4.1% at 1024, +0.9% at 2048, -1.3% at 4096, and +0.4% at 8192. Generation was within -1.4%..-0.1%. | Not run. 
| Reject before drift gate because the F32 intermediate removes most of the useful route-specific prefill win and regresses the 4096-token point. | +| `DS4_METAL_EXPERIMENTAL_MOE_MATMUL=1` plus route-specific up start `0`, gate start `15`, down start `12` | Two-repeat median vs current Tensor auto: +6.6% at 512, +6.3% at 1024, +4.5% at 2048, +3.3% at 4096, and +2.9% at 8192. Generation was within -1.4%..+0.5%. | Failed the five-fixture gate: `long_memory_archive` top-1 changed and greedy differed at step 0; `tensor_vs_standard` had one top-1 and one greedy mismatch. | Reject despite speed because it violates the no-new-top1/no-new-greedy rule. | +| `DS4_METAL_EXPERIMENTAL_MOE_MATMUL=1` plus route-specific down start `0`, gate/up start `15` | Two-repeat median vs current Tensor auto: +4.1% at 512, +4.2% at 1024, +3.5% at 2048, +2.3% at 4096, and +2.2% at 8192. Generation was within -1.7%..+0.1%. | Failed the five-fixture gate: `long_memory_archive` top-1 changed and greedy differed at step 0; `tensor_vs_standard` had one top-1 and one greedy mismatch. | Reject despite speed because it violates the no-new-top1/no-new-greedy rule. | +| `DS4_METAL_MPP_MOE_{GATE,UP,DOWN}_START_LAYER=0` with filters adding layers 0..3 to the current default windows | Two-repeat median vs current Tensor auto: +4.4% at 512, +3.7% at 1024, +0.7% at 2048, +2.4% at 4096, and +2.0% at 8192. Generation was mostly neutral except -1.9% at 2048. Artifact: `speed-bench/local-runs/20260514-185845-mpp-gud0-3-default/`. | Failed the five-fixture gate: `tensor_vs_standard` had one greedy mismatch on `long_code_audit` (`diff@10`), with worst RMS `0.495637` and worst top20 abs `1.78119`. | Reject despite the modest speed gain because it introduces a new greedy continuation change. 
| +| `DS4_METAL_MPP_MOE_GATE_START_LAYER=0` plus `DS4_METAL_MPP_MOE_GATE_FILTER=layer=0-3,layer=15-42`, with up/down at 15/12 | Two-repeat median vs current Tensor auto: -2.2% at 512, -2.3% at 1024, -3.5% at 2048, -1.9% at 4096, and +0.6% at 8192. Generation was within -1.2%..-0.1%. Artifact: `speed-bench/local-runs/20260514-184842-mpp-gate0-3-up15-down12/`. | Not run. | Reject before drift gate because adding only gate layers 0..3 is slower through the compact range. | +| `DS4_METAL_MPP_MOE_UP_START_LAYER=0` plus `DS4_METAL_MPP_MOE_UP_FILTER=layer=0-3,layer=15-42`, with gate/down at 15/12 | Two-repeat median vs current Tensor auto: +0.9% at 512, +0.3% at 1024, -0.4% at 2048, -2.2% at 4096, and -2.2% at 8192. Generation was within -2.1%..-0.1%. Artifact: `speed-bench/local-runs/20260514-185210-mpp-up0-3-gate15-down12/`. | Not run. | Reject before drift gate because adding only up layers 0..3 is slower at the larger compact contexts and hurts generation. | +| `DS4_METAL_MPP_MOE_GATE_START_LAYER=0` plus `DS4_METAL_MPP_MOE_UP_START_LAYER=0`, each filtered to `layer=0-3,layer=15-42`, with down defaulting to 12 | Two-repeat median vs current Tensor auto was positive: +1.7% at 512, +2.0% at 1024, +2.4% at 2048, +2.3% at 4096, and +2.6% at 8192. Generation was nearly flat, -0.4%..-0.1%. Artifact: `speed-bench/local-runs/20260515-065835-mpp-gateup0-3-down12/prefill-candidate-summary.md`. | Not run; `run_prefill_candidate_gate.py --run-drift-gate` skipped the drift gate because the repeat-level speed floor failed, with repeat prefill deltas `[-0.5%, +3.9%]` at 512 and observed min repeat prefill `-0.5%`. | Reject before drift gate. Median speed was encouraging, but the gain is not repeat-stable enough for promotion, and the speed-first guard correctly avoided a five-fixture drift run. 
| +| `DS4_METAL_MPP_MOE_GATE_START_LAYER=0` plus `DS4_METAL_MPP_MOE_UP_START_LAYER=0`, each filtered to `layer=0-5,layer=15-42`, with down defaulting to 12 | Two-repeat median vs current Tensor auto: +3.6% at 512, +3.0% at 1024, +1.1% at 2048, -1.2% at 4096, and +1.7% at 8192. Generation was within -1.5%..-0.1%. Artifact: `speed-bench/local-runs/20260515-070235-mpp-gateup0-5-down12/prefill-candidate-summary.md`. | Not run. | Reject before drift gate because it fails the compact speed screen at 4096 tokens and has repeat-level prefill down to -1.7%. | +| `DS4_METAL_MPP_MOE_DOWN_START_LAYER=0` plus `DS4_METAL_MPP_MOE_DOWN_FILTER=layer=0-3,layer=12-42`, with gate/up at 15/15 | Two-repeat median vs current Tensor auto: +1.5% at 512, +1.7% at 1024, -0.3% at 2048, -1.1% at 4096, and -1.3% at 8192. Generation was within -3.3%..-0.1%. Artifact: `speed-bench/local-runs/20260514-185528-mpp-down0-3-gate15-up15/`. | Not run. | Reject before drift gate because adding only down layers 0..3 regresses the larger compact contexts and generation. | +| `DS4_METAL_MPP_MOE_GATE_START_LAYER=2` plus `DS4_METAL_MPP_MOE_UP_START_LAYER=15` plus `DS4_METAL_MPP_MOE_DOWN_START_LAYER=12` | Two-repeat median vs current Tensor auto: +5.1% at 512, +4.2% at 1024, +3.9% at 2048, +2.5% at 4096, and +1.2% at 8192. Generation was within -1.5%..+0.4%. Artifact: `speed-bench/local-runs/20260514-184135-mpp-gate2-up15-down12/`. | Five-fixture gate passed, but `tensor_vs_standard` drift rose to worst RMS `0.640912` and worst top20 abs `1.11909`. | Reject because gate0/up15/down12 is faster at most points and has lower worst RMS. | +| `DS4_METAL_MPP_MOE_GATE_START_LAYER=4` plus `DS4_METAL_MPP_MOE_UP_START_LAYER=15` plus `DS4_METAL_MPP_MOE_DOWN_START_LAYER=12` | Two-repeat median vs current Tensor auto: +0.1% at 512, -1.0% at 1024, -0.5% at 2048, +1.9% at 4096, and +3.1% at 8192. Generation was within -2.0%..-0.4%. Artifact: `speed-bench/local-runs/20260514-183734-mpp-gate4-up15-down12/`. | Not run. 
| Reject before drift gate because it trades short/mid-context prefill and generation for only long-context gains. | +| `DS4_METAL_MPP_MOE_GATE_START_LAYER=8` plus `DS4_METAL_MPP_MOE_UP_START_LAYER=15` plus `DS4_METAL_MPP_MOE_DOWN_START_LAYER=12` | Two-repeat median vs current Tensor auto: +2.2% at 512, +2.8% at 1024, +1.9% at 2048, +1.9% at 4096, and +1.6% at 8192. Generation was within -0.8%..-0.1%. Artifact: `speed-bench/local-runs/20260514-182931-mpp-gate8-up15-down12/`. | Failed the five-fixture gate: `long_memory_archive` top-1 changed and greedy differed at step 0; `tensor_vs_standard` also had one top-1 and one greedy mismatch. | Reject because the modest speed gain is not worth the top-1 regression. | +| `DS4_METAL_MPP_FAST=1` plus `DS4_METAL_MPP_MOE_DOWN_FILTER=layer=0-25,layer=27-28,layer=32-42` | Comparator-guided follow-up after the skip-26/29/30 candidate; this also excludes `moe_down` layer 31. Two-repeat median vs current Tensor auto: +15.0% at 512, +10.9% at 1024, +8.9% at 2048, +6.0% at 4096, and +3.4% at 8192. Generation regressed by -6.1%, -3.4%, -3.5%, -3.3%, and -3.0%. Artifact: `speed-bench/local-runs/20260514-214603-mpp-fast-skip-down26-29-31/prefill-candidate-summary.md`. | Five-fixture gate failed the strict Tensor-vs-standard envelope: no top-1 or greedy mismatch, but worst RMS `0.643831` on `long_memory_archive` and worst top20 abs `1.10919` on `long_code_audit`. | Reject. Skipping layer 31 removes the remaining local `moe_down` comparator breach but does not materially reduce full-model drift, fails the generation floor at 512 tokens, and gives up too much 8192-token prefill compared with the skip-26/29/30 candidate. | +| `DS4_METAL_MPP_FAST=1` plus `DS4_METAL_MPP_MOE_DOWN_FILTER=layer=0-25,layer=27-28` | Hybrid follow-up that keeps fast all-layer gate/up Tensor but stops Tensor `moe_down` after the comparator-clean early range. 
Two-repeat median vs current Tensor auto: +8.5% at 512, +6.1% at 1024, +4.6% at 2048, +5.4% at 4096, and +5.9% at 8192. Generation was within -1.0%..+0.6%. Artifact: `speed-bench/local-runs/20260515-023038-mpp-fast-gate-up0-down-clean-early/prefill-candidate-summary.md`. | Five-fixture gate failed the strict Tensor-vs-standard envelope: no top-1 or greedy mismatch, but worst RMS `0.643635` on `long_memory_archive` and worst top20 abs `1.11349` on `long_code_audit`. | Reject. Removing late `moe_down` Tensor does not fix the route-wide drift, and it is slower than the skip-26/29/30 default-off candidate. | + +## Promoted Candidates + +| Candidate | Speed result | Drift result | Decision | +| --- | --- | --- | --- | +| `DS4_METAL_MPP_ATTN_OUT_FILTER=all` | Two-repeat median vs current Tensor auto: +3.1% at 512, +3.3% at 1024, +3.6% at 2048, +2.2% at 4096, and +2.1% at 8192. Generation was within -1.1%..+0.3%. | Five-fixture gate passed and `./ds4_test --metal-mpp-equivalence` passed. `tensor_vs_quality`: top1 mismatches `0`, greedy mismatches `1`, worst RMS `0.618172`, worst top20 abs `2.24006`. `tensor_vs_standard`: top1 mismatches `0`, greedy mismatches `0`, worst RMS `0.239946`, worst top20 abs `0.55422`, matching the current default envelope. | Promoted: attention-output low projection now defaults to all layers; `late_safe` remains available for the old 32..42 window. | +| `DS4_METAL_MPP_MOE_GATE_START_LAYER=19` plus `DS4_METAL_MPP_MOE_UP_START_LAYER=19` plus `DS4_METAL_MPP_MOE_DOWN_START_LAYER=21` | Two-repeat median vs current Tensor auto: +0.6% at 512, +0.8% at 1024, +2.3% at 2048, +2.0% at 4096, +1.6% at 8192. Generation was within -1.4%..+0.5%. | Five-fixture gate passed, first as env candidate and again as the env-free default after promotion. `tensor_vs_quality`: top1 mismatches `0`, greedy mismatches `1` matching standard-vs-quality, worst RMS `0.618172`, worst top20 abs `2.24006`. 
`tensor_vs_standard`: top1 mismatches `0`, greedy mismatches `0`, worst RMS `0.176030`, worst top20 abs `0.360397`. | Promoted, then superseded by the lower-drift 19/19/20 window and the faster 19/19/19 window. | +| `DS4_METAL_MPP_MOE_GATE_START_LAYER=19` plus `DS4_METAL_MPP_MOE_UP_START_LAYER=19` plus `DS4_METAL_MPP_MOE_DOWN_START_LAYER=20` | Two-repeat median vs 19/19/21 Tensor auto: +0.3% at 512, +1.2% at 1024, +0.9% at 2048, +0.4% at 4096, +0.2% at 8192. Generation was within -0.9%..+1.0%. | Five-fixture gate passed, first as env candidate and again as the env-free default after promotion. `tensor_vs_quality`: top1 mismatches `0`, greedy mismatches `1` matching standard-vs-quality, worst RMS `0.618172`, worst top20 abs `2.24006`. `tensor_vs_standard`: top1 mismatches `0`, greedy mismatches `0`, worst RMS `0.066747`, worst top20 abs `0.191437`. | Promoted, then superseded by the slightly faster 19/19/19 window. | +| `DS4_METAL_MPP_MOE_DOWN_START_LAYER=19` with gate/up unchanged at 19 | Two-repeat median vs 19/19/20 Tensor auto: +0.9% at 512, +1.2% at 1024, +1.1% at 2048, +0.4% at 4096, +0.9% at 8192. Generation was within -1.0%..+1.4%. | Five-fixture env-candidate gate passed. `tensor_vs_quality`: top1 mismatches `0`, greedy mismatches `1` matching standard-vs-quality, worst RMS `0.618172`, worst top20 abs `2.24006`. `tensor_vs_standard`: top1 mismatches `0`, greedy mismatches `0`, worst RMS `0.136143`, worst top20 abs `0.315292`. | Promoted as the next routed-MoE default window: gate/up/down from layer 19. | +| `DS4_METAL_MPP_MOE_DOWN_START_LAYER=12` with gate/up unchanged at 19 | Two-repeat median vs 19/19/19 Tensor auto: +2.1% at 512, +0.8% at 1024, +2.0% at 2048, +1.1% at 4096, and +1.5% at 8192. Env-free compact timing after promotion showed Tensor prefill +26.7%, +28.8%, +21.9%, +18.7%, and +15.7% vs standard Metal from 512..8192. | Five-fixture env-candidate gate and env-free default gate passed. 
`tensor_vs_quality`: top1 mismatches `0`, greedy mismatches `1` matching standard-vs-quality, worst RMS `0.618172`, worst top20 abs `2.24006`. `tensor_vs_standard`: top1 mismatches `0`, greedy mismatches `0`, worst RMS `0.229474`, worst top20 abs `0.601166`. `./ds4_test --metal-mpp-equivalence` also passed with the same worst RMS/top20 abs. | Promoted, then superseded by the layer-15 gate/up window. | +| `DS4_METAL_MPP_MOE_GATE_START_LAYER=15` plus `DS4_METAL_MPP_MOE_UP_START_LAYER=15` with down defaulting to 12 | Two-repeat median vs down-12 Tensor auto: +2.2% at 512, +1.5% at 1024, +0.3% at 2048, +0.2% at 4096, and +0.6% at 8192. Env-free compact timing after promotion shows Tensor prefill +32.3%, +31.7%, +24.7%, +19.8%, and +17.0% vs standard Metal from 512..8192. | Five-fixture env-candidate gate and env-free default gate passed. `tensor_vs_quality`: top1 mismatches `0`, greedy mismatches `1` matching standard-vs-quality, worst RMS `0.618172`, worst top20 abs `2.24006`. `tensor_vs_standard`: top1 mismatches `0`, greedy mismatches `0`, worst RMS `0.239946`, worst top20 abs `0.55422`. `./ds4_test --metal-mpp-equivalence` also passed with the same worst RMS/top20 abs. | Promoted as the current routed-MoE default window: down from layer 12, gate/up from layer 15. | + +## Default-Off Candidates + +| Candidate | Speed result | Drift result | Decision | +| --- | --- | --- | --- | +| `DS4_METAL_MPP_FAST=1` | Post-attention-output-promotion two-repeat median vs current Tensor auto: +18.1% at 512, +18.3% at 1024, +12.3% at 2048, +7.4% at 4096, and +7.1% at 8192. Generation was neutral, within -0.1%..+0.7%. | Five-fixture gate passed and `./ds4_test --metal-mpp-equivalence` passed. `tensor_vs_quality`: top1 mismatches `0`, greedy mismatches `1`, worst RMS `0.618172`, worst top20 abs `2.24006`. `tensor_vs_standard`: top1 mismatches `0`, greedy mismatches `0`, worst RMS `0.669241`, worst top20 abs `1.30664`. | Keep default-off as the strongest speed/eval candidate. 
It widens routed-MoE Tensor to layer 0, but the Tensor-vs-standard drift is much larger than the conservative default. | +| `DS4_METAL_MPP_FAST=1` plus `DS4_METAL_MPP_MOE_DOWN_FILTER=layer=0-25,layer=27-42` | Two-repeat median vs current Tensor auto: +15.8% at 512, +14.6% at 1024, +9.4% at 2048, +9.0% at 4096, and +9.6% at 8192. Generation was within -0.8%..+0.0%. Artifact: `speed-bench/local-runs/20260514-180751-mpp-fast-skip-down26/`. | Five-fixture gate passed. `tensor_vs_quality`: top1 mismatches `0`, greedy mismatches `1`, worst RMS `0.618172`, worst top20 abs `2.24006`. `tensor_vs_standard`: top1 mismatches `0`, greedy mismatches `0`, worst RMS `0.645033`, worst top20 abs `1.28496`. | Keep default-off. Skipping the local comparator outlier layer 26 trims the fast-route drift slightly but remains far above the conservative default drift envelope. | +| `DS4_METAL_MPP_FAST=1` plus `DS4_METAL_MPP_MOE_DOWN_FILTER=layer=0-25,layer=27-28,layer=31-42` | Two-repeat median vs current Tensor auto: +19.3% at 512, +19.5% at 1024, +7.8% at 2048, +6.1% at 4096, and +6.0% at 8192. Generation was mixed but acceptable for a prefill-first candidate: +1.7%, +0.5%, -3.5%, -2.5%, and +1.8%. Artifact: `speed-bench/local-runs/20260514-212340-mpp-fast-skip-down26-29-30/prefill-candidate-summary.json`. | Five-fixture gate passed. `tensor_vs_quality`: top1 mismatches `0`, greedy mismatches `1`, worst RMS `0.618172`, worst top20 abs `2.24006`. `tensor_vs_standard`: top1 mismatches `0`, greedy mismatches `0`, worst RMS `0.643810`, worst top20 abs `1.13945`. `./ds4_test --metal-mpp-equivalence` also passed with the same Tensor summary. | Keep default-off as the best current eval candidate. Comparator-guided exclusions remove the large `moe_down` local outliers at layers 26, 29, and 30, reducing top20 Tensor-vs-standard drift versus the layer-26-only skip while keeping a larger compact prefill win. 
| +| `DS4_METAL_MPP_FAST=1` plus `DS4_METAL_MPP_MOE_DOWN_FILTER=layer=0-25,layer=27-28,layer=31-42` plus `DS4_METAL_MOE_MID_F32=1` | Two-repeat median vs current Tensor auto: +12.0% at 512, +11.5% at 1024, +6.7% at 2048, +4.9% at 4096, and +6.1% at 8192. Generation was flatter than the F16-mid skip candidate: -0.2%, -1.4%, -1.1%, -0.8%, and -0.7%. Artifact: `speed-bench/local-runs/20260514-222853-mpp-fast-skip-down26-29-30-mid-f32/prefill-candidate-summary.json`. | Five-fixture gate passed. `tensor_vs_quality`: top1 mismatches `0`, greedy mismatches `1`, worst RMS `0.618172`, worst top20 abs `2.24006`. `tensor_vs_standard`: top1 mismatches `0`, greedy mismatches `0`, worst RMS `0.643810`, worst top20 abs `1.13945`. `./ds4_test --metal-mpp-equivalence` also passed with the same Tensor summary. | Keep default-off as the best balanced eval candidate when generation steadiness matters. It gives up some short-context prefill versus the F16-mid skip candidate but keeps long-context prefill similar and avoids the larger generation timing swings. | +| `DS4_METAL_MPP_FAST=1` plus `DS4_METAL_MPP_MOE_DOWN_FILTER=layer=0-23,layer=25,layer=27-42` | Two-repeat median vs current Tensor auto: +18.4% at 512, +18.0% at 1024, +12.4% at 2048, +10.1% at 4096, and +8.1% at 8192. Generation was within -1.5%..-0.1%. Artifact: `speed-bench/local-runs/20260514-181319-mpp-fast-skip-down24-26/`. | Five-fixture gate passed. `tensor_vs_quality`: top1 mismatches `0`, greedy mismatches `1`, worst RMS `0.618172`, worst top20 abs `2.24006`. `tensor_vs_standard`: top1 mismatches `0`, greedy mismatches `0`, worst RMS `0.645334`, worst top20 abs `1.44783`. | Keep default-off, but prefer the layer-26-only skip if using this diagnostic because it has lower top20 drift. | +| `DS4_METAL_MPP_FAST=1` plus `DS4_METAL_MPP_MOE_UP_START_LAYER=15` plus `DS4_METAL_MPP_MOE_DOWN_START_LAYER=12` | Two-repeat median vs current Tensor auto: +6.1% at 512, +5.0% at 1024, +4.0% at 2048, +2.7% at 4096, and +2.8% at 8192. 
Generation was within -1.0%..+0.2%. Artifact: `speed-bench/local-runs/20260514-182359-mpp-fast-gate0-up15-down12/`. | Five-fixture gate passed. `tensor_vs_quality`: top1 mismatches `0`, greedy mismatches `1`, worst RMS `0.618172`, worst top20 abs `2.24006`. `tensor_vs_standard`: top1 mismatches `0`, greedy mismatches `0`, worst RMS `0.529461`, worst top20 abs `1.05153`. | Keep default-off. It is the cleanest new route-split gate result, but the Tensor-vs-standard drift is still materially larger than the current default for only a modest speed gain. | +| `DS4_METAL_EXPERIMENTAL_MOE_MATMUL=1` | Two-repeat median vs current Tensor auto: +15.9% at 512, +19.7% at 1024, +12.5% at 2048, +6.8% at 4096, +11.7% at 8192. Generation was -4.9%, -1.5%, -3.5%, -0.9%, -1.7%. | Five-fixture gate passed. `tensor_vs_quality` stayed inside the current standard-vs-quality envelope with top1 mismatches `0`, greedy mismatches `1`, worst RMS `0.618172`, and worst top20 abs `2.24006`. `tensor_vs_standard` had no top1 or greedy mismatch, but drift increased to worst RMS `0.669241` and worst top20 abs `1.30664`. | Keep default-off until an eval confirms the larger Tensor-vs-standard logit movement is acceptable. This is the best prefill candidate so far, but not yet promoted over the lower-drift conservative default. | +| `DS4_METAL_EXPERIMENTAL_MOE_MATMUL=1` plus `DS4_METAL_MOE_MID_F32=1` | Two-repeat median vs current Tensor auto: +10.8% at 512, +11.8% at 1024, +6.0% at 2048, +4.0% at 4096, and +6.0% at 8192. Generation was neutral, within -0.5%..+0.3%. | Five-fixture gate passed and `./ds4_test --metal-mpp-equivalence` passed. `tensor_vs_quality`: top1 mismatches `0`, greedy mismatches `1`, worst RMS `0.618172`, worst top20 abs `2.24006`. `tensor_vs_standard`: top1 mismatches `0`, greedy mismatches `0`, worst RMS `0.669241`, worst top20 abs `1.30664`. | Keep default-off. 
The F32 MoE intermediate improves generation timing versus the all-layer experimental route, but it does not reduce the larger Tensor-vs-standard drift and gives up part of the prefill win. | +| `DS4_METAL_EXPERIMENTAL_MOE_MATMUL=1` plus route-specific gate start `0`, up start `15`, down start `12` | Two-repeat median vs current Tensor auto: +2.0% at 512, +4.6% at 1024, +6.1% at 2048, +7.3% at 4096, and +4.6% at 8192. Generation was near flat through 4096 and -4.4% at 8192. | Five-fixture gate passed. `tensor_vs_quality` stayed inside the current standard-vs-quality envelope with top1 mismatches `0`, greedy mismatches `1`, worst RMS `0.618172`, and worst top20 abs `2.24006`. `tensor_vs_standard` had no top1 or greedy mismatch, but drift rose to worst RMS `0.529461` and worst top20 abs `1.05153`. | Keep default-off. It is the best route-specific speed candidate that still passes the gate, but it is not promoted because Tensor-vs-standard drift is materially larger than the current conservative default and the 8192 generation point regressed in timing. | +| `DS4_METAL_EXPERIMENTAL_MOE_MATMUL=1` plus route-specific gate start `0`, up start `15`, down start `12`, after the attention-output all-layer promotion | Two-repeat median vs current Tensor auto: +5.6% at 512, +5.3% at 1024, +4.3% at 2048, +1.6% at 4096, and +0.3% at 8192. Generation was within -0.6%..+0.8%. | Not rerun after the attention-output promotion because the same route already passed the five-fixture gate before promotion and the speed profile is not strong enough to promote. | Keep default-off. The current default absorbed most of the long-context prefill benefit, leaving this as a short-context diagnostic rather than a production default. | +| `DS4_METAL_EXPERIMENTAL_MOE_MATMUL=1` plus `DS4_METAL_MPP_MOE_FAST_LAYOUT=0` | Two-repeat median vs current Tensor auto: +8.4% at 512, +12.3% at 1024, +0.4% at 2048, +1.2% at 4096, and +4.3% at 8192. 
Generation was -4.2% at 1024, -3.2% at 2048, -4.4% at 4096, and near flat at 512/8192. | Five-fixture gate passed, but `tensor_vs_standard` was unchanged from the faster experimental layout: top1 mismatches `0`, greedy mismatches `0`, worst RMS `0.669241`, and worst top20 abs `1.30664`. | Reject as the preferred experimental layout because it gives up speed without reducing the larger Tensor-vs-standard movement. | + +## Profile Signal + +`speed-bench/run_prefill_candidate_gate.py` now has named `--preset` values for +the measured default-off profiles, including `mpp-fast`, +`mpp-fast-skip-down26-29-30`, +`mpp-fast-skip-down26-29-30-mid-f32`, and +`experimental-moe-matmul`. Explicit `--set-env` values still override the preset. +This keeps future speed/drift reruns tied to the same five-fixture gate while +removing long env strings from the critical path. + +The preset table is shared through `speed-bench/metal_tensor_presets.py`, and +`speed-bench/run_quality_drift_gate.py` now accepts the same `--preset` option +for standalone five-fixture logprob checks. A preset drift run stores artifacts +under `speed-bench/local-runs/<timestamp>-<preset>-quality-drift-gate/` by +default. This makes the drift-only rerun for the current best candidate a single command: + +```sh +python3 speed-bench/run_quality_drift_gate.py \ + --preset mpp-fast-skip-down26-29-30 \ + --max-tensor-standard-rms 0.30 \ + --max-tensor-standard-top20-abs 0.60 +``` + +`speed-bench/summarize_mpp_compare.py` now parses `DS4_METAL_MPP_COMPARE_*` +logs into Markdown and JSON.
The existing best-candidate comparator log was +regenerated as: + +- `speed-bench/local-runs/20260515-014911-mpp-compare-summary/mpp-compare-summary.md` +- `speed-bench/local-runs/20260515-014911-mpp-compare-summary/mpp-compare-summary.json` + +The summary preserves the key local attribution: the first comparator target +breach in that run is `moe_down` at layer 31 with max abs `0.00341797` and RMS +`2.5071e-06`; the next-largest local deltas are well below the comparator max +abs target. This supports keeping the skip-26/29/30 candidate default-off rather +than promoting or widening it without an eval. + +A follow-up `--all-cases --route moe_down` comparator probe on the same +skip-26/29/30 preset confirmed that layer 31 is the only remaining local +`moe_down` target breach in the five fixtures, and it appears only in the two +long prompts: + +- `speed-bench/local-runs/20260515-020415-mpp-fast-skip-down26-29-30-mpp-compare-probe/mpp-compare-summary.md` + +Excluding layer 31 as well (`layer=0-25,layer=27-28,layer=32-42`) was then +rerun through the five-fixture drift gate. It still failed the strict +Tensor-vs-standard envelope with worst RMS `0.643831` and worst top20 abs +`1.10919`, while the speed scorecard failed the generation floor at 512 tokens. +That means the remaining full-model movement is not fixed by skipping the one +remaining local down-layer breach. + +`speed-bench/run_mpp_compare_probe.py` now wraps this comparator workflow: + +```sh +python3 speed-bench/run_mpp_compare_probe.py \ + --preset mpp-fast-skip-down26-29-30 \ + --case long_memory_archive \ + --route moe_down +``` + +It uses the same preset table, writes raw logs and `mpp-compare-summary.md/json` +under ignored `speed-bench/local-runs/`, and supports `--all-cases` for the +same five fixtures used by `run_quality_drift_gate.py`. 
`--route` is repeatable +and accepts comma or pipe separated lists, but each route is run separately +because the underlying comparator accepts one route at a time. This should be +used only for local attribution before the logprob gate, not as a promotion +signal. + +`speed-bench/run_prefill_candidate_gate.py --run-drift-gate` now enforces the +speed-first workflow: it evaluates the compact prefill/generation speed screen +before launching the five-fixture drift gate, and records a skip reason instead +of spending a drift run on candidates that already fail the speed floor. This +keeps local optimization sweeps aligned with the promotion rule: speed screen +first, drift gate only for speed-positive candidates. + +Best default-off skip-26/29/30 profile: + +```sh +env DS4_METAL_MPP_FAST=1 \ + DS4_METAL_MPP_MOE_DOWN_FILTER=layer=0-25,layer=27-28,layer=31-42 \ + DS4_METAL_GRAPH_TOKEN_PROFILE=1 \ + DS4_METAL_LAYER_STAGE_PROFILE=1 \ + DS4_METAL_MOE_STAGE_PROFILE=1 \ + DS4_METAL_ATTN_OUT_STAGE_PROFILE=1 \ + DS4_METAL_Q8_PREFILL_PROFILE=1 \ + DS4_METAL_Q8_PREFILL_PROFILE_FILTER=attn_ \ + ./ds4 --metal -mt auto \ + --prompt-file tests/test-vectors/prompts/long_code_audit.txt \ + -c 8192 -n 1 --system "" --nothink --temp 0 +``` + +Output: + +`speed-bench/local-runs/20260514-214926-mpp-fast-skip26-29-30-profile/long_code_audit_profile.stderr` + +This diagnostic run reported `prefill: 397.46 t/s`. With stage-level flushes +enabled, use these numbers for attribution rather than throughput comparison. + +Important medians at `tokens=3844`, excluding layer 0 first-use overhead: + +- Dense attention Q8_0: `attn_q_a=2.947 ms`, `attn_kv=1.621 ms`, + `attn_q_b=21.102 ms`, and `attn_out=21.683 ms`. +- Routed-MoE Tensor layers (`mpp=1/1/1`, 39 layers): gate `16.386 ms`, up + `16.558 ms`, down `15.795 ms`. +- Skipped-down layers (`mpp=1/1/0`, layers 26/29/30): gate `16.623 ms`, up + `16.480 ms`, legacy down `37.776 ms`. 
+- Layer-stage medians: attention `43.248 ms`, attention output projection + `43.636 ms`, routed MoE `51.724 ms`, shared gate/up `11.070 ms`, and shared + down `7.975 ms`. + +This makes dense attention `attn_q_b` and `attn_output_b` the next meaningful +kernel target after the route-window work. Further down-layer exclusions reduce +local comparator outliers but start to give up too much generation and +long-context prefill speed. + +## Long-Context Candidate Validation + +The current strongest passing default-off speed candidate was also measured in +a one-repeat full sweep with 128 generated tokens: + +```sh +python3 speed-bench/run_prefill_candidate_gate.py \ + --candidate-label mpp-fast-skip-down26-29-30-long128 \ + --ctx-max 65536 \ + --gen-tokens 128 \ + --repeat 1 \ + --set-env DS4_METAL_MPP_FAST=1 \ + --set-env DS4_METAL_MPP_MOE_DOWN_FILTER=layer=0-25,layer=27-28,layer=31-42 +``` + +Artifact: +`speed-bench/local-runs/20260514-212917-mpp-fast-skip-down26-29-30-long128/prefill-candidate-summary.json`. + +| ctx | candidate prefill vs current Tensor | candidate gen vs current Tensor | +| ---: | ---: | ---: | +| 512 | +15.1% | -0.1% | +| 1024 | +15.3% | -0.5% | +| 2048 | +11.4% | -0.2% | +| 4096 | +8.3% | +1.0% | +| 8192 | +8.7% | -0.4% | +| 16384 | +7.2% | -0.2% | +| 32768 | +6.1% | -0.4% | +| 65536 | +5.8% | -0.3% | + +Decision remains default-off: the full sweep confirms a real prefill win across +the long range, and the five-fixture gate is clean, but Tensor-vs-standard drift +is still materially larger than the conservative default. This is the best eval +candidate if we decide to test whether the larger Tensor-vs-standard movement +is acceptable in task-level quality. 
+ +The balanced F32-mid variant was measured in the same long sweep shape: + +```sh +python3 speed-bench/run_prefill_candidate_gate.py \ + --candidate-label mpp-fast-skip-down26-29-30-mid-f32-long128 \ + --ctx-max 65536 \ + --gen-tokens 128 \ + --repeat 1 \ + --set-env DS4_METAL_MPP_FAST=1 \ + --set-env DS4_METAL_MPP_MOE_DOWN_FILTER=layer=0-25,layer=27-28,layer=31-42 \ + --set-env DS4_METAL_MOE_MID_F32=1 +``` + +Artifact: +`speed-bench/local-runs/20260514-223632-mpp-fast-skip-down26-29-30-mid-f32-long128/prefill-candidate-summary.json`. + +| ctx | candidate prefill vs current Tensor | candidate gen vs current Tensor | +| ---: | ---: | ---: | +| 512 | +15.9% | -1.1% | +| 1024 | +11.1% | -1.5% | +| 2048 | +6.7% | -1.5% | +| 4096 | +7.2% | -0.8% | +| 8192 | +5.1% | -0.9% | +| 16384 | +5.0% | -0.3% | +| 32768 | +2.6% | -1.5% | +| 65536 | +2.4% | -2.7% | + +Decision remains default-off and secondary to the faster F16-mid skip candidate +for pure prefill. The balanced variant still gives a real prefill win across +the full range and passed the five-fixture gate plus +`./ds4_test --metal-mpp-equivalence`, but gives up the strongest long-context +prefill gains and has a -2.7% generation point at 65536. Use it only when the +flatter compact generation profile is more important than maximum prefill. + +The earlier layer-26-only skip candidate was measured in the same shape: + +```sh +python3 speed-bench/run_prefill_candidate_gate.py \ + --candidate-label mpp-fast-skip-down26-long128 \ + --ctx-max 65536 \ + --gen-tokens 128 \ + --repeat 1 \ + --set-env DS4_METAL_MPP_FAST=1 \ + --set-env DS4_METAL_MPP_MOE_DOWN_FILTER=layer=0-25,layer=27-42 +``` + +Artifact: +`speed-bench/local-runs/20260514-190526-mpp-fast-skip-down26-long128/prefill-candidate-summary.json`. 
+ +| ctx | candidate prefill vs current Tensor | candidate gen vs current Tensor | +| ---: | ---: | ---: | +| 512 | +18.3% | +0.2% | +| 1024 | +12.4% | -1.1% | +| 2048 | +6.2% | -2.0% | +| 4096 | +6.3% | -0.6% | +| 8192 | +5.6% | -0.7% | +| 16384 | +5.7% | -0.1% | +| 32768 | +4.7% | -0.4% | +| 65536 | +6.9% | -0.0% | + +Decision remains default-off: the full sweep confirms a real prefill win across +the long range, but the five-fixture gate still shows much larger +Tensor-vs-standard drift than the conservative default. The newer +skip-26/29/30 candidate above keeps a stronger long-context prefill profile at +most measured contexts and lower top-20 Tensor-vs-standard drift, so prefer that +one for any task-level eval. + +The smaller `gate0/up15/down12` passing candidate was also measured in the same +long sweep shape: + +```sh +python3 speed-bench/run_prefill_candidate_gate.py \ + --candidate-label mpp-fast-gate0-up15-down12-long128 \ + --ctx-max 65536 \ + --gen-tokens 128 \ + --repeat 1 \ + --set-env DS4_METAL_MPP_FAST=1 \ + --set-env DS4_METAL_MPP_MOE_UP_START_LAYER=15 \ + --set-env DS4_METAL_MPP_MOE_DOWN_START_LAYER=12 +``` + +Artifact: +`speed-bench/local-runs/20260514-191816-mpp-fast-gate0-up15-down12-long128/prefill-candidate-summary.json`. + +| ctx | candidate prefill vs current Tensor | candidate gen vs current Tensor | +| ---: | ---: | ---: | +| 512 | +4.4% | -0.8% | +| 1024 | -0.3% | -4.2% | +| 2048 | +1.1% | -1.0% | +| 4096 | +1.3% | -0.1% | +| 8192 | +1.6% | -1.4% | +| 16384 | +0.6% | -0.9% | +| 32768 | +0.3% | -0.4% | +| 65536 | -3.9% | -8.0% | + +Decision: reject for long-context promotion. The compact gate passed, but the +full sweep shows it is noise-level for prefill and regresses generation at the +largest context. 
+ +Representative profile: + +```sh +env DS4_METAL_GRAPH_TOKEN_PROFILE=1 \ + DS4_METAL_LAYER_STAGE_PROFILE=1 \ + DS4_METAL_MOE_STAGE_PROFILE=1 \ + DS4_METAL_ATTN_OUT_STAGE_PROFILE=1 \ + DS4_METAL_Q8_PREFILL_PROFILE=1 \ + DS4_METAL_Q8_PREFILL_PROFILE_FILTER=attn_ \ + ./ds4 --metal -mt auto \ + --prompt-file tests/test-vectors/prompts/long_code_audit.txt \ + -c 8192 -n 1 --system "" --nothink --temp 0 +``` + +Output: + +`speed-bench/local-runs/20260514-161802-current-default-attn-all-profile/long_code_audit_profile.log` + +Current default diagnostic result: `prefill: 414.91 t/s`. This run enables +stage-level flushes for attribution; use the compact timing chart above as the +primary speed comparison. + +Important stage timings at `tokens=3844`: + +- Layers 0..11 use legacy routed-MoE projections (`mpp=0/0/0`): median gate + `33.420 ms`, up `34.368 ms`, down `33.380 ms`. +- Layers 12..14 use Tensor down only (`mpp=0/0/1`): median gate `33.334 ms`, + up `33.355 ms`, down `13.748 ms`. +- Layers 15..42 use Tensor gate/up/down (`mpp=1/1/1`): median gate + `14.343 ms`, up `14.372 ms`, down `13.822 ms`. +- Dense attention Q8_0 medians are `attn_q_a=2.523 ms`, + `attn_kv=1.415 ms`, `attn_q_b=18.507 ms`, and `attn_out=18.821 ms`. +- The attention output projection stage remains about `38.017 ms/layer`; + with all-layer attention-output Tensor enabled, the low projection is + `19.153 ms` and the output projection is `18.906 ms`. + +Shared-expert dense Q8_0 profile: + +`speed-bench/local-runs/20260514-173017-shared-q8-profile/long_code_audit.stderr` + +- On `long_code_audit`, `tok=3844`, median `shared_gate` was `4.701 ms`, + `shared_up` was `4.691 ms`, and `shared_down` was `4.702 ms`. +- The median combined shared-expert dense Q8_0 time was `14.284 ms/layer`. +- A paired `shared_gate`/`shared_up` prefill prototype was tested and reverted; + it was slower through 4096 tokens and only slightly faster at 8192. 
+ +The routed-MoE stage profiler now prints layer, token/pair counts, expert +count, gate/down quant types, `mm_id` vs `mm_id_pair_mpp` path, active Tensor +route mask, tile widths, and intermediate precision. Use +`DS4_METAL_MOE_STAGE_PROFILE_FILTER=<filter>` to limit printed rows while +preserving stage flushes for timing correctness. + +Long-shape routed-MoE profile on `long_code_audit`, `tok=3844`, +`pairs=23064`, `experts=6`, `gate=iq2_xxs`, `down=q2_k`: + +- Layers before the current conservative Tensor window are still the largest + remaining routed-MoE opportunity, but the latest one-layer route-window tests + did not produce a clean prefill win. + +This confirms the highest-value routed-MoE target is still the pre-window +specialized `mm_id` path, not the generic dense Q8_0 wrapper. The dense +attention targets remain `attn_q_b in=1024 out=32768` and the second attention +output projection `attn_output_b`. + +Comparator check on the all-layer experimental routed-MoE Tensor path: + +```sh +env DS4_METAL_EXPERIMENTAL_MOE_MATMUL=1 \ + DS4_METAL_MPP_COMPARE_ROUTE=all \ + DS4_METAL_MPP_COMPARE_MAX=12 \ + DS4_METAL_MPP_COMPARE_VERBOSE=1 \ + ./ds4 --metal -mt auto \ + --prompt-file tests/test-vectors/prompts/long_code_audit.txt \ + -c 8192 -n 1 --system "" --nothink --temp 0 +``` + +The first 12 local projection comparisons, covering `moe_gate`, `moe_up`, and +`moe_down` in layers 0..3, stayed far inside the local comparator target. The +largest observed max abs was about `3.8e-5`, and RMS was about `1e-7` or lower. +That points to accumulated full-model movement from enabling more Tensor +layers, not an obvious single routed-MoE projection breach. + +A wider comparator run on `long_memory_archive` with +`DS4_METAL_MPP_COMPARE_MAX=200` did find the first local breach in `moe_down` +layer 26: max abs `0.00109863`, RMS `1.12718e-06` +(`speed-bench/local-runs/20260514-174248-experimental-moe-compare/`).
Earlier +gate/up rows were around `1e-5` to `1e-4`, so the next routed-MoE experiment +should keep the down route scoped and treat wider down windows as drift risk. + +The same long fixture with the passing `gate0/up15/down12` split and +`DS4_METAL_MPP_COMPARE_ROUTE=moe_gate` did not show a single bad gate layer: +all gate local max abs values stayed around `1e-5` to `6e-5` +(`speed-bench/local-runs/20260514-184759-gate0-route-compare/`). This points +to accumulated model movement from widening the gate route, not one obvious +gate-layer exclusion candidate. + +Comparator follow-up on the current best skip-26/29/30 candidate, run once per route (`moe_gate|moe_up` below denotes the two separate route values, not a shell pipe): + +```sh +env DS4_METAL_MPP_FAST=1 \ + DS4_METAL_MPP_MOE_DOWN_FILTER=layer=0-25,layer=27-28,layer=31-42 \ + DS4_METAL_MPP_COMPARE_MAX=100 \ + DS4_METAL_MPP_COMPARE_ROUTE=moe_gate|moe_up \ + ./ds4 --metal -mt auto \ + --prompt-file tests/test-vectors/prompts/long_memory_archive.txt \ + -c 16384 -n 1 --system "" --nothink --temp 0 +``` + +Artifacts: + +- `speed-bench/local-runs/20260514-225400-mpp-fast-skip26-29-30-gate-comparator-max100/` +- `speed-bench/local-runs/20260514-225400-mpp-fast-skip26-29-30-up-comparator-max100/` + +Neither `moe_gate` nor `moe_up` reported a local comparator breach over the +available comparisons. This makes another gate/up layer-exclusion pass +unlikely to improve the speed/drift tradeoff; the known actionable local +outliers were the `moe_down` layers already excluded by the skip-26/29/30 +candidate. + +`DS4_METAL_EXPERIMENTAL_MOE_MATMUL=1` with gate/up from layer 0 and down from +layer 12 was benchmarked as +`speed-bench/local-runs/20260514-174353-experimental-gate-up0-down12/`. It was +not a clean speed candidate versus the current Tensor default: prefill changed +by `-6.0%`, `-6.7%`, `-5.6%`, `-5.3%`, and `+2.1%` for contexts 512..8192, +while generation changed by `-11.0%`, `-8.2%`, `-6.3%`, `-4.4%`, and `-1.1%`. +This was rejected before running the drift gate.
+ +For the next matmul kernel iteration, enable filtered Q8_0 prefill-level timing +with: + +```sh +env DS4_METAL_Q8_PREFILL_PROFILE=1 \ + DS4_METAL_Q8_PREFILL_PROFILE_FILTER=attn_q_b \ + ./ds4 --metal -mt auto \ + --prompt-file tests/test-vectors/prompts/long_code_audit.txt \ + -c 8192 -n 1 --system "" --nothink --temp 0 +``` + +This keeps the legacy Q8_0 dispatch but flushes timed prefill batches so each +logged row names the module/layer context, input/output dimensions, token batch, +and elapsed time. Use those rows to pick the first default-off Metal 4 +cooperative/tensor Q8_0 matmul target. + +Smoke result on `short_code_completion`, `FILTER=moe_gate`: no rows. That is +expected because routed-MoE gate/up/down use the specialized routed-MoE kernels, +not the generic dense Q8_0 prefill wrapper. + +Smoke result on `short_code_completion`, `FILTER=attn_q_b`: rows were emitted +for layers 0..42 with shape `in=1024 out=32768 tok=27`. Layer 0 included +first-use overhead at `1.298 ms`; later layers were about `0.33-0.41 ms` each. +This confirms the profile hook works for dense attention Q8_0 projections. + +Long-shape smoke result on `long_code_audit`, `FILTER=attn_q_b`, `tok=3844`: +layer 0 reported `27.695 ms`; most layers reported about `18.0-19.2 ms`, with +late layers 40..42 at about `20.0-20.6 ms`. This makes +`attn_q_b in=1024 out=32768` the first dense Q8_0 prototype shape to target +after routed-MoE profiling. + +Broader long-shape attention profile on `long_code_audit`, `FILTER=attn_`, +`tok=3844`: + +- `attn_q_a in=4096 out=1024`: about `2.45-2.8 ms/layer` after layer-0 + first-use overhead. +- `attn_kv in=4096 out=512`: about `1.35-1.48 ms/layer`. +- `attn_q_b in=1024 out=32768`: about `18.0-18.9 ms/layer`. +- `attn_out in=8192 out=4096`: about `18.0-19.3 ms/layer`. + +In this profile `attn_out` names the second/output projection +(`attn_output_b`) that still goes through the generic dense Q8_0 wrapper. 
The +attention-output low projection (`attn_output_a`) already has a separate +guarded Tensor route and comparator. Dense Q8_0 work should therefore focus on +`attn_q_b` and `attn_output_b`, not on the already-specialized low projection. + +## Matmul-First Direction + +The current legacy dense Q8_0 prefill kernel already uses +`simdgroup_multiply_accumulate`, so the next meaningful optimization is not just +to rewrite it with the same primitive. The next target is a default-off +quantized prefill matmul family that uses Metal 4 cooperative/tensor matrix +primitives where they help, while preserving the legacy dequantization and +reduction behavior closely enough to pass the quality gate. + +This should be treated as a new kernel family, not a revival of the removed +dense Q8_0 Tensor route. The removed route was drift-prone in full-model +comparison; a replacement needs its own dispatch switch, route comparator, and +five-fixture gate evidence before it can be promoted. + +Metal 4 and the Neural Accelerator direction should be split into two tracks: + +- Near-term: keep DS4 on custom Metal compute shaders over GGUF buffers, and use + cooperative/tensor matmul primitives inside quantized prefill matmul kernels. + This is the path that can directly improve current prefill without changing + model loading or graph ownership. +- Longer-term: evaluate Metal 4 machine-learning passes/Core ML packages only if + we can package stable repeated subgraphs without losing DS4's quantized + mmap-backed layout, routed-MoE control, and drift gate. That is not a drop-in + acceleration path for the current kernels. + +Priority order: + +1. Early routed-MoE gate/up/down specialized matmuls before the current safe + Tensor window. Use the existing routed-MoE stage profiler and comparator for + these routes; they do not pass through the generic dense Q8_0 wrapper. +2. Attention Q/output dense Q8_0 projections. 
Use + `DS4_METAL_Q8_PREFILL_PROFILE=1` with a context filter such as `attn_q_b` to + choose the first prototype shape. +3. Wider route windows only after the new kernel proves low drift in the + five-fixture quality gate. + +Promotion rule: keep a change only if it improves compact prefill timing and +passes the gate with no new top-1 or Tensor-vs-standard greedy regression. + +Prototype checklist: + +1. Use `DS4_METAL_EXPERIMENTAL_MOE_MATMUL=1` as the first default-off + experimental quantized prefill matmul dispatch. It moves only the routed-MoE + Metal 4 cooperative/tensor matmul window and does not use the removed + dense Q8_0 Tensor controls. +2. First target one high-impact routed-MoE projection shape and compare it with + `DS4_METAL_MPP_COMPARE_ROUTE=moe_gate|moe_up|moe_down`. +3. Run compact prefill timing twice with an adjacent `-mt off` control to avoid + promoting thermal/noise wins. Use: + + ```sh + python3 speed-bench/run_prefill_candidate_gate.py \ + --candidate-label moe-matmul-first \ + --set-env DS4_METAL_EXPERIMENTAL_MOE_MATMUL=1 + ``` + +4. Add `--run-drift-gate` before promotion. The helper calls + `speed-bench/run_quality_drift_gate.py`; promotion requires no top-1 + mismatch, no Tensor-vs-standard greedy mismatch, and no regression beyond the + current standard-vs-quality envelope. + +## Stage Profile Summarizer + +Added `speed-bench/summarize_stage_profile.py` to convert Metal layer, routed +MoE, attention-output, and Q8 prefill profile logs into a ranked Markdown/JSON +summary. It is a local analysis helper only; summaries should be written under +`speed-bench/local-runs/`. 
+ +Current snapshot: + +- `speed-bench/local-runs/20260514-231404-stage-profile-summary/stage-profile-summary.md` +- `speed-bench/local-runs/20260514-231404-stage-profile-summary/stage-profile-summary.json` + +The current conservative profile on `long_code_audit` ranks parsed stages as +`ffn.routed_moe=2790.479 ms`, `attn.attention=1760.972 ms`, +`attn.output_proj=1638.645 ms`, and `attn.q_path=1165.267 ms`. +Nested profile lines overlap, so these are ranking signals rather than +exclusive wall-time shares. After the routed-MoE route-window and dense-Q8 +prototype boundaries below, the remaining non-repeated performance target is +the compressed/prefill attention kernel itself. The first simple shape test, +widening non-vector FlashAttention from 64 to 128 key rows per group, was +rejected before drift gating because it regressed compact short and mid +contexts. + +## FlashAttention Stage Profiler + +Artifact root: + +- `speed-bench/local-runs/20260514-232644-flash-attn-stage-profile/` + +Patch added a default-off `DS4_METAL_FLASH_ATTN_STAGE_PROFILE=1` profiler for +raw and static-mixed prefill FlashAttention helpers. The profiler splits GPU +batches at stage boundaries and updates the wrapper-owned command buffer, so it +does not affect normal execution when the env var is unset. 
+ +Smoke command: + +```sh +DS4_METAL_FLASH_ATTN_STAGE_PROFILE=1 ./ds4-bench \ + --prompt-file speed-bench/promessi_sposi.txt \ + --ctx-start 512 \ + --ctx-max 512 \ + --step-mul 2 \ + --gen-tokens 1 \ + -mt auto \ + --csv speed-bench/local-runs/20260514-232644-flash-attn-stage-profile/smoke.csv +``` + +Summarized profile: + +| Stage | total ms | events | avg ms | +| --- | ---: | ---: | ---: | +| `flash_attn.static_mixed_nonvec.attention` | 78.117 | 41 | 1.905 | +| `flash_attn.static_mixed_nonvec.copy_raw` | 8.332 | 41 | 0.203 | +| `flash_attn.static_mixed_nonvec.copy_comp` | 7.821 | 41 | 0.191 | +| `flash_attn.static_mixed_nonvec.block_map` | 7.209 | 41 | 0.176 | +| `flash_attn.raw_nonvec.attention` | 4.516 | 2 | 2.258 | +| `flash_attn.static_mixed_nonvec.mask_fill` | 4.489 | 41 | 0.109 | +| `flash_attn.static_mixed_nonvec.pad` | 4.124 | 20 | 0.206 | + +Shape split: + +| FlashAttention shape | total ms | events | avg ms | +| --- | ---: | ---: | ---: | +| `static_mixed_nonvec tokens=512 comp=128 keys=640 heads=64 dim=512 window=128 ratio=4` | 56.452 | 105 | 0.538 | +| `static_mixed_nonvec tokens=512 comp=4 keys=516 heads=64 dim=512 window=128 ratio=128` | 53.640 | 120 | 0.447 | +| `raw_nonvec tokens=512 comp=0 keys=512 heads=64 dim=512 window=128 ratio=0` | 5.825 | 8 | 0.728 | + +Conclusion: after routed-MoE and attention-output work, the prefill attention +kernel itself is the next high-signal target. Copy, mask, block-map, and pad +costs are visible but secondary in this smoke; a real optimization attempt +should focus on the non-vector static-mixed attention kernel and keep the +five-fixture drift gate as the promotion check. 
+ +## Rejected FlashAttention Tile Variants + +Artifact roots: + +- `speed-bench/local-runs/20260514-233823-flash-attn-c32-real/` +- `speed-bench/local-runs/20260514-234143-flash-attn-q16-real/` + +Two real non-vector prefill FlashAttention specializations were tested after +the stage profiler pointed at `static_mixed_nonvec.attention`: + +- `C=32`, `Q=8`, `NSG=4`; +- `Q=16`, `C=64`, `NSG=8`. + +Both used matching attention, pad, and block-map tile sizes in the tested local +patch. Earlier host-only screens for `C=32` and `Q=16` were discarded because +the exported attention kernel is template-specialized for `Q=8,C=64`; changing +only host pad/block constants is not a valid candidate. + +Compact two-repeat medians versus current Tensor auto: + +| Candidate | 512 | 1024 | 2048 | 4096 | 8192 | Generation impact | +| --- | ---: | ---: | ---: | ---: | ---: | --- | +| real `C=32` | -9.5% | -5.0% | -5.4% | -3.1% | +0.5% | -1.5% to flat | +| real `Q=16` | -8.7% | +0.8% | +0.3% | -0.2% | -0.3% | -1.7% to -0.1% | + +Decision: revert/no production knob and no drift gate. The corrected +specializations did not meet the speed bar, so the next attention attempt needs +a real kernel design change rather than changing only the query/key tile +geometry. + +## Routed-MoE Prototype Boundary + +Current routed-MoE prefill already has these measured Metal 4 variants: + +- default conservative Tensor window: down from layer 12, gate/up from layer 15; +- `DS4_METAL_MPP_FAST=1`: all-layer routed-MoE Tensor; +- route-specific windows and filters for gate/up/down; +- `DS4_METAL_MPP_MOE_TILE_N=64`; +- `DS4_METAL_MPP_MOE_FAST_LAYOUT=0`; +- `DS4_METAL_MPP_MOE_PAIR_GATE_UP=1`; +- a local standard-Metal paired gate/up kernel that kept the legacy simdgroup + reduction shape but reused the activation tile; +- `DS4_METAL_MOE_MID_F32=1`. 
+ +The useful default-off frontier is now the skip-26/29/30 family: + +- fastest prefill: `DS4_METAL_MPP_FAST=1` plus + `DS4_METAL_MPP_MOE_DOWN_FILTER=layer=0-25,layer=27-28,layer=31-42`; +- balanced generation: same env plus `DS4_METAL_MOE_MID_F32=1`. + +Both pass the five-fixture gate and `./ds4_test --metal-mpp-equivalence`, but +they remain default-off because Tensor-vs-standard drift is materially larger +than the conservative default. Additional gate/up exclusion scans on the +fastest skip candidate did not find local comparator breaches, and excluding +more down layers, such as layer 31, gave up too much generation and long-context +prefill speed. A later hybrid that disabled all late `moe_down` Tensor while +keeping fast gate/up Tensor still failed the strict Tensor-vs-standard envelope, +which reinforces that the remaining movement is route-wide rather than a single +late down-layer issue. + +Conclusion: env-only routed-MoE tuning is exhausted for this branch. The next +routed-MoE optimization should be a real kernel design change, not another +route-window combination. A useful design target would preserve the current +fast-layout speed while reducing accumulated full-model movement from the +all-layer gate/up/down window, with the route comparator and five-fixture gate +as hard promotion checks. + +## Early Routed-MoE Kernel Contract + +Inspection target: + +- `metal/moe.metal`: `kernel_mul_mm_id`, `kernel_mul_mm_id_mpp_fast_layout`, + and `kernel_mul_mm_id_pair_mpp`. +- `ds4_metal.m`: `ds4_gpu_routed_mm_pipeline`, + `ds4_gpu_encode_mul_mm_id_map`, and the routed batch MoE dispatch around + `ds4_gpu_encode_mul_mm_id_mapped_tile`. 
+ +Current dispatch already does the right high-level batching: + +- one expert-major route map is built per layer and reused for gate, up, and + down; +- gate and up share the same `gate_mm_args` and activation source, but the + measured paired gate/up kernels were slower than two separate matmuls; +- the stage profile shows the `map` stage is not the target; early-window + gate/up/down matmul time is. + +Arithmetic/layout constraints for the next real kernel: + +- The legacy `kernel_mul_mm_id` path uses a 64-row by 32-token tile, legacy + threadgroup layout, `simdgroup_load`, and eight + `simdgroup_multiply_accumulate` accumulators. This is the reference behavior + for low-drift output order. +- The current fast-layout path changes the threadgroup tensor layout and uses + Metal 4 cooperative tensors. It is fast, but widening it into early layers + causes route-wide Tensor-vs-standard drift; local per-projection comparator + deltas alone are not enough to prove promotion safety. +- A replacement should first preserve the legacy output layout and writeback + order, then remove overhead around loads, barriers, or pointer/index setup. + Starting from cooperative tensor math is acceptable only if the local + comparator stays tight and the five-fixture gate remains green. + +Prototype acceptance order: + +1. Build and route the candidate behind a default-off env var. +2. Run a local comparator probe for the touched route (`moe_gate`, `moe_up`, or + `moe_down`) with enough comparisons to cover early and late layers. +3. Run `run_prefill_candidate_gate.py` without drift first. The candidate must + clear both the median and repeat-level compact prefill floors. +4. Only then run the five-fixture drift gate. Promotion still requires no new + top-1 mismatch, no Tensor-vs-standard greedy mismatch, and Tensor-vs-standard + worst RMS/top20 abs inside the configured envelope. + +This rules out another small route-window probe as the next step. 
The next code +candidate should be a new routed-MoE matmul variant with an explicit comparator +route and speed-gate artifact. + +## Rejected Q8_0 N64 Dense Tile + +Artifact roots: + +- `speed-bench/local-runs/20260514-215521-q8-n64-attn-q-b/` +- `speed-bench/local-runs/20260514-215814-q8-n64-attn-out/` + +Patch tested: an experimental `kernel_mul_mm_q8_0_f32_n64` with 64 token +columns and eight simdgroups, guarded by `DS4_METAL_Q8_PREFILL_N64=1` plus an +optional route filter. The kernel preserved the legacy Q8_0 dequantization and +per-element accumulation order, but widened the token tile from 32 to 64. + +Compact timing versus the current Tensor baseline was not a clean win: + +| Candidate | 512 | 1024 | 2048 | 4096 | 8192 | Generation impact | +| --- | ---: | ---: | ---: | ---: | ---: | --- | +| `attn_q_b` N64 | -4.4% | -1.6% | -0.9% | +0.2% | +0.9% | -2.0% to +0.7% | +| `attn_out` N64 | -4.8% | -2.2% | -0.3% | +0.1% | +0.8% | -0.7% to +0.6% | + +Decision: revert/no production knob. The wider tile helped an isolated profile +stage in places, but whole-model compact prefill regressed short contexts and +only improved long contexts by less than 1%. This was rejected before running +the drift gate because the performance bar was not met. + +## Dense Q8_0 Prototype Boundary + +The current generic dense Q8_0 prefill dispatch is back on the legacy +`kernel_mul_mm_q8_0_f32` path: 64 output rows by 32 token columns, four +SIMD-group MMA slices for the output rows, and two SIMD-group MMA slices for +the token columns. It already uses `simdgroup_multiply_accumulate` and preserves +the legacy dequantization/reduction order. 
+ +Rejected or reverted dense Q8_0 directions now cover the obvious low-risk +scheduling variants: + +- splitting full 32-token tiles from the tail was noise-level + (`+0.3%` prefill on the targeted long fixture); +- widening the token tile to 64 (`kernel_mul_mm_q8_0_f32_n64`) was not a + whole-model win; +- cooperative/direct-RHS Tensor prototypes for `attn_q_b` and `attn_output_b` + either regressed mid-context/generation or failed the five-fixture gate. + +Conclusion: do not add another dense Q8_0 switch without a genuinely new kernel +design. The next Q8_0 attempt should be a separate default-off kernel family +with its own comparator and five-fixture gate, not a small variant of the +current legacy wrapper. + +## Cleaned Baseline Drift Gate + +Artifact root: + +- `speed-bench/local-runs/20260514-221837-quality-drift-gate/` + +Command: + +```sh +python3 speed-bench/run_quality_drift_gate.py +``` + +Result: gate OK after removing the rejected N64 source patch. + +| Pair | top1 mismatches | greedy mismatches | min top20 | worst rms | worst top20 abs | +| --- | ---: | ---: | ---: | ---: | ---: | +| standard vs quality | 0 | 1 | 18/20 | 0.618172 | 2.24006 | +| tensor vs quality | 0 | 1 | 18/20 | 0.618172 | 2.24006 | +| tensor vs standard | 0 | 0 | 19/20 | 0.239946 | 0.55422 | + +Conclusion: the current conservative Tensor default remains drift-controlled +relative to standard Metal. The one greedy mismatch is already present in +standard Metal versus `--quality`; Tensor does not add a greedy mismatch against +standard in the five-fixture gate. + +The same saved five-fixture dumps were later regenerated with the production +Tensor-vs-standard envelope enabled: + +```sh +python3 speed-bench/run_quality_drift_gate.py \ + --reuse \ + --out-dir speed-bench/local-runs/20260514-221837-quality-drift-gate \ + --max-tensor-standard-rms 0.30 \ + --max-tensor-standard-top20-abs 0.60 +``` + +Result: gate OK. 
Tensor-vs-standard remained at zero top-1 mismatches, zero +greedy mismatches, min top20 overlap `19/20`, worst RMS `0.239946`, and worst +top20 max abs `0.55422`, so the current conservative default is inside the +strict promotion envelope. + +## Rejected FlashAttention Static Mask Cache + +Artifact root: + +- `speed-bench/local-runs/20260514-235636-flash-attn-mask-cache/` + +Command: + +```sh +python3 speed-bench/run_prefill_candidate_gate.py \ + --candidate-label flash-attn-mask-cache \ + --set-env DS4_METAL_FLASH_ATTN_MASK_CACHE=1 \ + --repeat 2 \ + --ctx-max 8192 \ + --gen-tokens 16 +``` + +Patch tested: a default-off cache for static mixed FlashAttention prefill masks +and block maps, limited to the non-vector static mixed path. + +Median timing versus the current Tensor baseline: + +| ctx | candidate vs Tensor prefill | candidate vs Tensor generation | +| ---: | ---: | ---: | +| 512 | -3.9% | -1.3% | +| 1024 | -4.3% | -0.2% | +| 2048 | -2.4% | -0.3% | +| 4096 | -0.2% | -0.4% | +| 8192 | +1.2% | -0.0% | + +Decision: revert/no production knob. The cache removes repeated mask/block-map +work in the stage profiler, but whole-model compact prefill regresses short and +mid contexts and only improves the 8192-token point by 1.2%. This was rejected +before running the drift gate because the performance bar was not met. + +## Rejected FlashAttention CPU Block Map + +Artifact root: + +- `speed-bench/local-runs/20260515-000658-flash-attn-cpu-block-map/` + +Command: + +```sh +python3 speed-bench/run_prefill_candidate_gate.py \ + --candidate-label flash-attn-cpu-block-map \ + --set-env DS4_METAL_FLASH_ATTN_CPU_BLOCK_MAP=1 \ + --repeat 2 \ + --ctx-max 8192 \ + --gen-tokens 16 +``` + +Patch tested: a default-off analytic CPU block-map fill for static mixed +non-vector FlashAttention prefill. The candidate used per-call transient block +buffers to avoid CPU writes racing later GPU reads in the shared command +buffer. 
+ +`DS4_METAL_FLASH_ATTN_CPU_BLOCK_MAP=1 ./ds4_test --metal-mpp-equivalence` +passed with the same summary as the current default: +`top1_mismatch=0`, `greedy_fail=0`, `worst_rms=0.239946`, +`worst_top20_max_abs=0.55422`. + +Median timing versus the current Tensor baseline: + +| ctx | candidate vs Tensor prefill | candidate vs Tensor generation | +| ---: | ---: | ---: | +| 512 | +2.3% | -0.1% | +| 1024 | -0.9% | -3.1% | +| 2048 | -3.1% | -2.7% | +| 4096 | +0.5% | +0.2% | +| 8192 | -0.3% | +0.0% | + +Decision: revert/no production knob. Avoiding the GPU block-map dispatch is not +a stable whole-model win once the extra CPU work and transient buffer allocation +are included. + +## Rejected FlashAttention NSG4 Geometry + +Artifact root: + +- `speed-bench/local-runs/20260515-001146-flash-attn-nsg4/` + +Command: + +```sh +python3 speed-bench/run_prefill_candidate_gate.py \ + --candidate-label flash-attn-nsg4 \ + --set-env DS4_METAL_FLASH_ATTN_NSG4=1 \ + --repeat 2 \ + --ctx-max 8192 \ + --gen-tokens 16 +``` + +Patch tested: a host-only default-off switch that kept the existing non-vector +static mixed FlashAttention `Q=8,C=64` specialization but changed the runtime +simdgroup count from `NSG=8` to `NSG=4`, making each simdgroup handle two query +rows. + +Median timing versus the current Tensor baseline: + +| ctx | candidate vs Tensor prefill | candidate vs Tensor generation | +| ---: | ---: | ---: | +| 512 | -10.4% | -2.0% | +| 1024 | -6.8% | -1.0% | +| 2048 | -6.8% | -1.1% | +| 4096 | -4.2% | -0.9% | +| 8192 | -0.3% | -0.8% | + +Decision: revert/no production knob. The lower simdgroup count consistently +regresses compact prefill and slightly hurts generation, so the default `NSG=8` +remains the right geometry for the current static mixed path. 
+ +## Q/KV RMS Fusion Boundary + +Artifact root: + +- `speed-bench/local-runs/20260515-001750-disable-qkv-norm-fusion/` + +Command: + +```sh +python3 speed-bench/run_prefill_candidate_gate.py \ + --candidate-label disable-qkv-norm-fusion \ + --set-env DS4_METAL_DISABLE_QKV_NORM_FUSION=1 \ + --repeat 2 \ + --ctx-max 8192 \ + --gen-tokens 16 +``` + +Patch tested: no code change; this uses the existing reference-path switch to +disable the default fused Q/KV RMSNorm path in prefill. + +Median timing versus the current Tensor baseline: + +| ctx | disabled fusion vs Tensor prefill | disabled fusion vs Tensor generation | +| ---: | ---: | ---: | +| 512 | -5.1% | -2.5% | +| 1024 | -6.1% | -1.8% | +| 2048 | -4.2% | -2.0% | +| 4096 | -1.7% | -0.8% | +| 8192 | +1.4% | -1.3% | + +Decision: keep the Q/KV RMSNorm fusion enabled by default. Disabling it is a +short/mid-context regression and hurts generation at every compact point. + +## Compressor Pair Projection Scope + +No benchmark run. + +`DS4_METAL_DISABLE_COMPRESSOR_PAIR_PROJ` and +`DS4_METAL_COMPRESSOR_PAIR_NR4` were inspected as possible compressor +projection boundaries. Both are decode-scoped in the current graph path: + +- `DS4_METAL_DISABLE_COMPRESSOR_PAIR_PROJ` selects the reference pair of F16 + matvecs instead of `ds4_gpu_matmul_f16_pair_tensor()` while updating + compressed KV/indexer state for the current decode token. +- `DS4_METAL_COMPRESSOR_PAIR_NR4` only changes the paired F16 Tensor matvec + dispatch when `n_tok == 1`. + +Decision: skip them for prefill optimization. They may be useful for a focused +decode throughput A/B later, but they do not address compact prefill time. 
+ +## Rejected FlashAttention Q4 Geometry + +Artifact root: + +- `speed-bench/local-runs/20260515-002819-flash-attn-q4/` + +Command: + +```sh +python3 speed-bench/run_prefill_candidate_gate.py \ + --candidate-label flash-attn-q4 \ + --set-env DS4_METAL_FLASH_ATTN_Q4=1 \ + --repeat 2 \ + --ctx-max 8192 \ + --gen-tokens 16 +``` + +Patch tested: a default-off non-vector static-mixed FlashAttention +specialization with `Q=4,C=64,NSG=4`, compared with the current +`Q=8,C=64,NSG=8` default. + +Median timing versus the current Tensor baseline: + +| ctx | candidate vs Tensor prefill | candidate vs Tensor generation | +| ---: | ---: | ---: | +| 512 | -11.3% | -1.0% | +| 1024 | -2.7% | -0.5% | +| 2048 | -0.7% | +0.3% | +| 4096 | +0.7% | -0.2% | +| 8192 | +0.9% | -2.4% | + +Decision: revert/no production knob and no drift gate. Smaller query tiles +hurt short-context compact prefill and only give sub-1% long-context gains, +with a generation regression at 8192. + +## RMSNorm Rsqrt Boundary + +Artifact root: + +- `speed-bench/local-runs/20260515-003403-norm-rsqrt/` + +Command: + +```sh +python3 speed-bench/run_prefill_candidate_gate.py \ + --candidate-label norm-rsqrt \ + --set-env DS4_METAL_NORM_RSQRT_DISABLE=0 \ + --repeat 2 \ + --ctx-max 8192 \ + --gen-tokens 16 +``` + +Patch tested: no code change; this disables the current drift-stabilizing +RMSNorm unification macro and restores hardware `rsqrt()` in +`kernel_rms_norm_f32`. + +Median timing versus the current Tensor baseline: + +| ctx | `rsqrt()` vs Tensor prefill | `rsqrt()` vs Tensor generation | +| ---: | ---: | ---: | +| 512 | -1.8% | +0.2% | +| 1024 | -3.7% | -0.4% | +| 2048 | -2.7% | -0.5% | +| 4096 | -2.5% | -0.6% | +| 8192 | -0.9% | -0.9% | + +Decision: keep `DS4_METAL_NORM_RSQRT_DISABLE` enabled by default. Restoring +hardware `rsqrt()` is slower at every compact prefill point and would also +remove a deliberate drift-control patch, so no drift gate was run. 
+ +## Prefill Chunk Size Boundary + +Artifact root: + +- `speed-bench/local-runs/20260515-003739-prefill-chunk-full/` + +Command: + +```sh +python3 speed-bench/run_prefill_candidate_gate.py \ + --candidate-label prefill-chunk-full \ + --set-env DS4_METAL_PREFILL_CHUNK=0 \ + --repeat 2 \ + --ctx-max 8192 \ + --gen-tokens 16 +``` + +Patch tested: no code change; this uses the existing `DS4_METAL_PREFILL_CHUNK=0` +override to prefill each prompt as one full chunk instead of using the default +4096-token cap for long prompts. + +Median timing versus the current Tensor baseline: + +| ctx | full chunk vs Tensor prefill | full chunk vs Tensor generation | +| ---: | ---: | ---: | +| 512 | -7.3% | -0.1% | +| 1024 | -1.2% | -0.2% | +| 2048 | -1.8% | -1.1% | +| 4096 | -3.3% | -2.0% | +| 8192 | -1.0% | -0.4% | + +Decision: keep the default 4096-token long-prompt prefill cap. Full-prompt +prefill was slower at every compact point, so no drift gate was run. + +The smaller `DS4_METAL_PREFILL_CHUNK=2048` cap was also screened later: + +- `speed-bench/local-runs/20260515-051759-prefill-chunk-2048-screen/prefill-candidate-summary.md` + +One-repeat timing versus the current Tensor baseline: + +| ctx | 2048 chunk vs Tensor prefill | 2048 chunk vs Tensor generation | +| ---: | ---: | ---: | +| 512 | +0.1% | -1.0% | +| 1024 | -1.4% | -0.9% | +| 2048 | +0.7% | -0.1% | +| 4096 | +1.6% | -1.0% | +| 8192 | -7.0% | -4.5% | + +Decision: reject before drift. Smaller chunks give a small 2048/4096 bump in +this noisy single-repeat screen but regress the 8192 point badly and increase +dispatch/setup pressure. Keep the default 4096-token cap for compact and +long-context prefill timing. 
+ +The larger `DS4_METAL_PREFILL_CHUNK=8192` cap was screened later with the +current strict two-repeat candidate gate: + +- `speed-bench/local-runs/20260515-170138-prefill-chunk-8192-screen/prefill-candidate-summary.md` + +Two-repeat median timing versus the current Tensor baseline: + +| ctx | 8192 chunk vs Tensor prefill | 8192 chunk vs Tensor generation | +| ---: | ---: | ---: | +| 512 | -8.2% | -0.4% | +| 1024 | -3.6% | +1.7% | +| 2048 | -1.7% | -0.7% | +| 4096 | -0.5% | -1.2% | +| 8192 | +1.4% | -0.8% | + +Decision: reject before drift. The median line only helps at 8192 tokens, and +the repeat-level prefill floor was much worse (`-12.1%`). This closes the +obvious chunk-size boundary: `2048`, full-prompt, and `8192` chunks all lose to +the default 4096-token cap under the compact speed screen. + +Refreshed local run index after this artifact: + +- `speed-bench/local-runs/20260515-170446-local-run-index/local-run-index.md` + +## Rejected RoPE exp2/log2 Arithmetic + +Artifact root: + +- `speed-bench/local-runs/20260515-004221-rope-exp2-log2/` + +Command: + +```sh +python3 speed-bench/run_prefill_candidate_gate.py \ + --candidate-label rope-exp2-log2 \ + --set-env DS4_METAL_ROPE_EXP2_LOG2=1 \ + --repeat 2 \ + --ctx-max 8192 \ + --gen-tokens 16 +``` + +Patch tested: no code change; this uses the existing diagnostic macro that +computes RoPE frequency powers as `exp2(log2())` instead of `pow()`. + +Median timing versus the current Tensor baseline: + +| ctx | exp2/log2 vs Tensor prefill | exp2/log2 vs Tensor generation | +| ---: | ---: | ---: | +| 512 | -0.8% | -0.4% | +| 1024 | -0.5% | -0.5% | +| 2048 | -1.2% | -0.8% | +| 4096 | -1.9% | -0.3% | +| 8192 | -1.5% | -1.2% | + +Decision: keep the default `pow()` RoPE path. The `exp2(log2())` variant is +slower at every compact prefill point and also slightly hurts generation, so no +drift gate was run. 
+ +## KV Raw F32 Precision Boundary + +Artifact root: + +- `speed-bench/local-runs/20260515-004510-kv-raw-f32/` + +Command: + +```sh +python3 speed-bench/run_prefill_candidate_gate.py \ + --candidate-label kv-raw-f32 \ + --set-env DS4_METAL_KV_RAW_F32=1 \ + --repeat 2 \ + --ctx-max 8192 \ + --gen-tokens 16 +``` + +Patch tested: no code change; this uses the existing diagnostic macro that +keeps raw KV cache values in F32 instead of matching the half-typed +FlashAttention KV buffer precision. + +Median timing versus the current Tensor baseline: + +| ctx | F32 raw KV vs Tensor prefill | F32 raw KV vs Tensor generation | +| ---: | ---: | ---: | +| 512 | +0.2% | +0.5% | +| 1024 | -0.0% | -0.6% | +| 2048 | +1.1% | +0.1% | +| 4096 | +0.2% | -0.5% | +| 8192 | -0.2% | -0.4% | + +Decision: keep F32 raw KV default-off. The compact speed result is noise-level +and mixed, while the macro intentionally changes a precision boundary between +the raw indexer view and the FlashAttention half KV view. No drift gate was run. + +## Routed-MoE Gate/Up Disable Boundary + +Artifact root: + +- `speed-bench/local-runs/20260515-005052-moe-gate-up-disable/` + +Command: + +```sh +python3 speed-bench/run_prefill_candidate_gate.py \ + --candidate-label moe-gate-up-disable \ + --set-env DS4_METAL_MPP_MOE_GATE_DISABLE=1 \ + --set-env DS4_METAL_MPP_MOE_UP_DISABLE=1 \ + --repeat 2 \ + --ctx-max 8192 \ + --gen-tokens 16 +``` + +Patch tested: no code change; this disables only the current routed-MoE gate +and up Tensor routes while leaving the promoted down route enabled. + +Median timing versus the current Tensor baseline: + +| ctx | disabled gate/up vs Tensor prefill | disabled gate/up vs Tensor generation | +| ---: | ---: | ---: | +| 512 | -19.5% | -0.6% | +| 1024 | -21.4% | -0.0% | +| 2048 | -18.5% | +0.1% | +| 4096 | -13.9% | -0.1% | +| 8192 | -9.7% | -0.1% | + +Decision: keep the current gate/up Tensor window enabled. 
Disabling those +routes removes a large part of the compact prefill win, so no drift gate was +run. + +## Routed-MoE Down Disable Boundary + +Artifact root: + +- `speed-bench/local-runs/20260515-005523-moe-down-disable/` + +Command: + +```sh +python3 speed-bench/run_prefill_candidate_gate.py \ + --candidate-label moe-down-disable \ + --set-env DS4_METAL_MPP_MOE_DOWN_DISABLE=1 \ + --repeat 2 \ + --ctx-max 8192 \ + --gen-tokens 16 +``` + +Patch tested: no code change; this disables only the current routed-MoE down +Tensor route while keeping the promoted gate/up routes enabled. + +Median timing versus the current Tensor baseline: + +| ctx | disabled down vs Tensor prefill | disabled down vs Tensor generation | +| ---: | ---: | ---: | +| 512 | -10.1% | -0.4% | +| 1024 | -12.5% | -1.1% | +| 2048 | -10.0% | -0.1% | +| 4096 | -7.3% | +0.5% | +| 8192 | -5.8% | +0.4% | + +Decision: keep the current down Tensor window enabled. Disabling the down route +also removes a clear compact prefill win, so no drift gate was run. + +## GPU Embedding Threshold Boundary + +Artifact root: + +- `speed-bench/local-runs/20260515-010001-gpu-embed-min2048/` + +Command: + +```sh +python3 speed-bench/run_prefill_candidate_gate.py \ + --candidate-label gpu-embed-min2048 \ + --set-env DS4_METAL_GPU_BATCH_EMBED_MIN=2048 \ + --repeat 2 \ + --ctx-max 8192 \ + --gen-tokens 16 +``` + +Patch tested: no code change; this raises the batched prompt embedding GPU +crossover from 512 tokens to 2048 tokens, forcing the 512- and 1024-token +compact points through the CPU embedding upload path. + +Median timing versus the current Tensor baseline: + +| ctx | threshold 2048 vs Tensor prefill | threshold 2048 vs Tensor generation | +| ---: | ---: | ---: | +| 512 | -0.7% | +0.4% | +| 1024 | -1.3% | +0.4% | +| 2048 | -1.7% | -1.0% | +| 4096 | -4.0% | -1.0% | +| 8192 | -1.0% | -0.5% | + +Decision: keep the default 512-token GPU embedding crossover. 
Raising the +threshold did not help the short contexts and regressed the whole compact +sweep, so no drift gate was run. + +## Boundary Sweep Conclusion + +The current env-only and low-risk patch search has covered the production +prefill routes that are still relevant on this branch: + +- routed-MoE Tensor defaults are independently justified: disabling gate/up or + down regresses compact prefill by 5.8% to 21.4%; +- attention-output Tensor low projection is justified and its known tile/direct + RHS alternatives have been rejected; +- F16 compressor Tensor default is justified, while pair/wide variants are + either slower or drift-prone; +- dense Q8_0 and FlashAttention tile/setup variants have been rejected unless a + genuinely new kernel design is introduced; +- precision/math boundaries (`rsqrt`, RoPE `exp2/log2`, F32 raw KV) do not + provide useful prefill speed and are not promotion candidates; +- prefill scheduling/setup boundaries (`DS4_METAL_PREFILL_CHUNK=0`, + `DS4_METAL_GPU_BATCH_EMBED_MIN=2048`) are slower than the current defaults. + +Remaining untested switches are not good prefill optimization candidates: + +- `DS4_METAL_NO_PREFILL_KERNEL_WARMUP`, `DS4_METAL_NO_MODEL_WARMUP`, + `DS4_METAL_NO_RESIDENCY`, and + `DS4_METAL_DISABLE_HOT_PIPELINE_STATICS` change startup/warmup behavior, not + steady-state prefill kernel throughput. +- `DS4_METAL_DISABLE_COMPRESSOR_STORE_ONE`, + `DS4_METAL_DISABLE_COMPRESSOR_PAIR_PROJ`, + `DS4_METAL_COMPRESSOR_PAIR_NR4`, `DS4_METAL_INDEXED_ATTN_RB4`, + `DS4_METAL_DECODE_INDEXER_*`, and the fused decode `DS4_METAL_DISABLE_*` + switches are decode-scoped for this compact prefill gate. +- `DS4_METAL_TENSOR_MATMUL_DISABLE=1`, `DS4_METAL_TENSOR_DISABLE=1`, and + `DS4_METAL_MPP_DISABLE=1` are global negative controls that collapse the + current promoted Tensor routes back toward the standard Metal baseline; the + route-specific disable checks above provide more actionable evidence. 
+ +Next useful optimization work should therefore be code-design work rather than +another env sweep: + +1. a new routed-MoE matmul design that preserves the fast all-layer profile + while reducing Tensor-vs-standard drift; +2. a genuinely new dense Q8_0 prefill kernel family for `attn_q_b` or + `attn_output_b`, with its own comparator and five-fixture gate; +3. a real static-mixed FlashAttention kernel redesign rather than changing + only query/key tile sizes or setup kernels. + +Promotion rule remains unchanged: keep a change only if compact prefill timing +improves and the five-fixture gate shows no new top-1 mismatch and no new +Tensor-vs-standard greedy continuation mismatch. + +## Routed-MoE Kernel Design Triage + +Code inspection of the current routed-MoE prefill path confirms there is not an +obvious one-line drift fix left in the existing Tensor route. The host selector +uses the fast MPP layout by default for routed-MoE unless `N=64` tiles or +`DS4_METAL_MPP_MOE_FAST_LAYOUT=0` are requested. Both the generic MPP variant +and the fast layout variant ultimately accumulate through Metal 4 +`matmul2d::run(...)`; the non-MPP reference in the same template keeps the +legacy `simdgroup_multiply_accumulate` loop and is what the route comparator +replays for local checks. + +That matches the measurements: disabling fast layout, widening to 64-token +tiles, pairing gate/up, and forcing F32 mid storage either regressed speed or +did not reduce the full-model Tensor-vs-standard drift. Comparator scans found +actionable local `moe_down` outliers at the already-skipped layers, while +gate/up did not show a single large local breach. The remaining movement is +therefore accumulated route-wide arithmetic movement from the cooperative Tensor +matmul, not a small dispatch or precision-boundary bug. + +Next routed-MoE work should be a new default-off kernel family with a comparator +from day one. 
The remaining useful direction is a reference-order simdgroup +kernel that preserves the legacy reduction shape but improves expert-major +staging and writeback around the prefill map. + +The later skip-26/29/30 and clean-early hybrid probes already tested the +selective `moe_down` idea: local comparator exclusions reduced the largest +projection outliers, but the full five-fixture Tensor-vs-standard envelope still +failed. Treat further route-filtering as exhausted unless a new kernel changes +the local arithmetic or output layout first. + +Do not promote another route-window change unless it improves compact prefill +and passes the five-fixture gate with no new top-1 mismatch and no new +Tensor-vs-standard greedy continuation mismatch. + +## Drift Gate Artifact Update + +`speed-bench/run_quality_drift_gate.py` now writes `summary.md` beside +`summary.json`. The Markdown report contains the same five-scenario tables for +`standard_vs_quality`, `tensor_vs_quality`, and `tensor_vs_standard`, plus the +aggregate gate status. This keeps the promotion evidence persistent and +human-readable under the ignored `speed-bench/local-runs/` artifact tree. + +Validation used the existing current-default drift dumps with `--reuse`: + +```sh +python3 speed-bench/run_quality_drift_gate.py \ + --reuse \ + --out-dir speed-bench/local-runs/20260514-221837-quality-drift-gate +``` + +The regenerated Markdown report is: + +- `speed-bench/local-runs/20260514-221837-quality-drift-gate/summary.md` + +Gate result stayed `OK`: Tensor-vs-standard had zero top-1 mismatches, zero +greedy mismatches, min top20 overlap `19/20`, worst RMS `0.239946`, and worst +top20 max abs `0.55422`. + +`speed-bench/run_prefill_candidate_gate.py` now also writes +`prefill-candidate-summary.md` beside `prefill-candidate-summary.json`. The +candidate Markdown report combines the median compact speed table with the +five-scenario drift-gate status when `--run-drift-gate` is used and the speed +screen passes. 
If the speed screen fails or the drift gate is otherwise not +run, the report says so explicitly to avoid promoting speed-only candidate +artifacts. + +The candidate scorecard also computes a conservative promotion decision: + +- every measured compact context must beat the Tensor baseline by at least + `--min-prefill-gain-pct` (default `0.0`); +- every repeat/context pair must clear `--min-repeat-prefill-gain-pct` + (default `0.0`), and the Markdown report now prints the per-context repeat + deltas so median-only wins are easy to audit; +- the five-scenario drift gate must be present and green; +- Tensor-vs-standard drift must stay inside the configured production envelope: + `--max-tensor-standard-rms=0.30` and + `--max-tensor-standard-top20-abs=0.60` by default; +- failed speed screens skip the nested drift gate and still write + JSON/Markdown artifacts; failed drift gates also write artifacts before + returning non-zero. Pass `--no-fail` for exploratory sweeps that should keep + going after a rejected candidate. + +Writer validation used the existing `gpu-embed-min2048` candidate summary +without rerunning benchmarks: + +- `speed-bench/local-runs/20260515-010001-gpu-embed-min2048/prefill-candidate-summary.md` + +Passing `--reuse` with `--out-dir` pointed at an existing run directory now +regenerates candidate scorecards from +saved CSVs/charts and passes `--reuse` through to nested drift-gate dumps.
This +was validated on the default-off fast routed-MoE skip candidate without +rerunning benchmarks or model captures: + +```sh +python3 speed-bench/run_prefill_candidate_gate.py \ + --reuse \ + --out-dir speed-bench/local-runs/20260514-212340-mpp-fast-skip-down26-29-30 \ + --candidate-label mpp-fast-skip-down26-29-30 \ + --set-env DS4_METAL_MPP_FAST=1 \ + --set-env DS4_METAL_MPP_MOE_DOWN_FILTER=layer=0-25,layer=27-28,layer=31-42 \ + --repeat 2 \ + --ctx-max 8192 \ + --gen-tokens 16 \ + --run-drift-gate \ + --no-fail +``` + +The regenerated scorecard correctly reports that the candidate is not +production promotion-safe under the default drift envelope even though it is a +useful default-off eval candidate: it passes top-1/greedy gates and has minimum +compact prefill gain `+6.0%`, but Tensor-vs-standard worst RMS `0.64381` and +worst top20 abs `1.13945` exceed the production envelope. + +The standalone `run_quality_drift_gate.py` also accepts the same optional drift +envelope flags. The candidate gate passes them through to the nested drift gate, +so the nested `quality-drift-gate/summary.md` now reports `Gate: FAIL` for +production-envelope breaches while still preserving the raw five-scenario +tables. + +## Stage Profile Shape Tables + +`speed-bench/summarize_stage_profile.py` now keeps per-shape totals for dense +Q8_0 profile lines, matching the existing FlashAttention shape tables. This +makes the dense matmul targets explicit in persistent local reports instead of +requiring manual parsing of stderr. 
+ +Validation regenerated a summary from the existing current-default profile log +without rerunning benchmarks: + +```sh +python3 speed-bench/summarize_stage_profile.py \ + speed-bench/local-runs/20260514-161802-current-default-attn-all-profile/long_code_audit_profile.log \ + --output speed-bench/local-runs/20260515-012815-stage-profile-summary/stage-profile-summary.md \ + --json speed-bench/local-runs/20260515-012815-stage-profile-summary/stage-profile-summary.json +``` + +The generated Q8 shape table ranks `attn_out in=8192 out=4096 tok=3844` at +`808.055 ms` total and `attn_q_b in=1024 out=32768 tok=3844` at `805.319 ms` +total, followed by `attn_q_a` and `attn_kv`. These ignored local artifacts are +kept under: + +- `speed-bench/local-runs/20260515-012815-stage-profile-summary/stage-profile-summary.md` +- `speed-bench/local-runs/20260515-012815-stage-profile-summary/stage-profile-summary.json` + +## Candidate Generation Floor + +`speed-bench/run_prefill_candidate_gate.py` now treats generation throughput as +a secondary promotion condition instead of an informational-only column. The +scorecard still prioritizes prefill, but a candidate is not production-safe if +any measured context falls below `--min-generation-gain-pct` versus the current +Tensor baseline. The default floor is `-5.0%`, which allows small generation +noise for prefill-first work while rejecting larger regressions before eval. 
+ +Negative-control validation reused the saved long-context CSVs for +`mpp-fast-gate0-up15-down12-long128` without rerunning benchmarks: + +```sh +python3 speed-bench/run_prefill_candidate_gate.py \ + --reuse \ + --out-dir speed-bench/local-runs/20260514-191816-mpp-fast-gate0-up15-down12-long128 \ + --candidate-label mpp-fast-gate0-up15-down12-long128 \ + --set-env DS4_METAL_MPP_FAST=1 \ + --set-env DS4_METAL_MPP_MOE_UP_START_LAYER=15 \ + --set-env DS4_METAL_MPP_MOE_DOWN_START_LAYER=12 \ + --repeat 1 \ + --ctx-max 65536 \ + --gen-tokens 128 \ + --no-fail +``` + +The regenerated scorecard fails promotion for both the prefill floor +(`min=-3.9%`) and the generation floor (`min=-8.0%`, required `-5.0%`), and +also notes that the drift gate was not run: + +- `speed-bench/local-runs/20260514-191816-mpp-fast-gate0-up15-down12-long128/prefill-candidate-summary.md` + +The candidate gate also now records repeat-level prefill gains and requires +every repeat/context pair to clear `--min-repeat-prefill-gain-pct` before +marking a candidate promotion-safe. The default is `0.0%`, matching the median +prefill floor but avoiding hidden one-repeat regressions in noisy two-repeat +screens. Repeat-level generation is reported as a diagnostic, while the +promotion floor for generation remains median-based because short generation +timing is noisier than prefill timing. + +## Drift Worst-Fixture Attribution + +`speed-bench/run_quality_drift_gate.py` now writes an `extrema` block for each +pair and adds a "Worst fixture" table to `summary.md`. Drift-envelope failures +also name the fixture that caused the breach. 
+ +Validation regenerated the existing fast skip-26/29/30 drift summary with +`--reuse`, without rerunning logits or logprobs captures: + +```sh +python3 speed-bench/run_quality_drift_gate.py \ + --reuse \ + --out-dir speed-bench/local-runs/20260514-212340-mpp-fast-skip-down26-29-30/quality-drift-gate \ + --max-tensor-standard-rms 0.30 \ + --max-tensor-standard-top20-abs 0.60 \ + --set-env DS4_METAL_MPP_FAST=1 \ + --set-env DS4_METAL_MPP_MOE_DOWN_FILTER=layer=0-25,layer=27-28,layer=31-42 \ + --no-fail +``` + +For `tensor_vs_standard`, the envelope failures are now attributed to +`long_memory_archive` for worst RMS (`0.64381`) and `long_code_audit` for worst +top20 abs (`1.13945`). The parent prefill candidate scorecard was regenerated +from saved CSVs and now carries those fixture names in its promotion failures +and its compact drift-target table: + +- `speed-bench/local-runs/20260514-212340-mpp-fast-skip-down26-29-30/quality-drift-gate/summary.md` +- `speed-bench/local-runs/20260514-212340-mpp-fast-skip-down26-29-30/prefill-candidate-summary.md` + +Both `run_quality_drift_gate.py` and `run_prefill_candidate_gate.py` now write a +`run_config` JSON block, and their Markdown reports show a compact Run Config +table. This preserves the thresholds, context range, repeat count, reuse mode, +resolved tool paths, and command arguments needed to reproduce a saved baseline +or candidate gate. The Markdown reports also include a quoted replay command so +the same gate can be copied directly into a shell. 
+ +## Persistent Local Artifacts + +`speed-bench/run_metal_tensor_bench.sh` now defaults to a timestamped ignored +output directory: + +```sh +OPEN_CHART=0 speed-bench/run_metal_tensor_bench.sh +``` + +The current branch chart was regenerated and kept locally at: + +- `speed-bench/local-runs/20260514-220230-metal-tensor-bench/ds4_bench_standard_quality_tensor_128.png` +- `speed-bench/local-runs/20260515-021428-metal-tensor-bench/ds4_bench_standard_quality_tensor_128.png` + +`speed-bench/index_local_runs.py` builds a persistent Markdown/JSON index across +saved local run summaries without rerunning benchmarks or drift captures: + +```sh +RUN_ID=$(date +%Y%m%d-%H%M%S) +OUT_DIR=speed-bench/local-runs/${RUN_ID}-local-run-index +python3 speed-bench/index_local_runs.py \ + --output ${OUT_DIR}/local-run-index.md \ + --json-output ${OUT_DIR}/local-run-index.json +``` + +Validation artifact: + +- `speed-bench/local-runs/20260515-015819-local-run-index/local-run-index.md` + +Refreshed local index after the comparator follow-up: + +- `speed-bench/local-runs/20260515-021401-local-run-index/local-run-index.md` + +Refreshed local index after the full current-branch chart regeneration: + +- `speed-bench/local-runs/20260515-022807-local-run-index/local-run-index.md` + +Refreshed local index after the gate/up-fast, down-clean-early hybrid rejection: + +- `speed-bench/local-runs/20260515-023724-local-run-index/local-run-index.md` + +Refreshed local index after the dense Q8_0 comparator smoke: + +- `speed-bench/local-runs/20260515-024233-local-run-index/local-run-index.md` + +Refreshed local index after wiring Q8 into the comparator probe wrapper: + +- `speed-bench/local-runs/20260515-024511-local-run-index/local-run-index.md` + +Refreshed local index after adding `q8_filter` to the comparator probe run +config: + +- `speed-bench/local-runs/20260515-024648-local-run-index/local-run-index.md` + +Refreshed local index after the `attn_out` dense Q8_0 comparator smoke: + +- 
`speed-bench/local-runs/20260515-024755-local-run-index/local-run-index.md` + +Refreshed local index after the long-shape dense Q8_0 comparator baselines: + +- `speed-bench/local-runs/20260515-025020-local-run-index/local-run-index.md` + +## Comparator Continue-On-Breach Probe + +The local comparator can now keep scanning after a target breach: + +```sh +python3 speed-bench/run_mpp_compare_probe.py \ + --preset mpp-fast-skip-down26-29-30 \ + --case long_memory_archive \ + --route moe_down \ + --continue-after-breach \ + --compare-max 80 \ + --top 12 +``` + +Validation artifact: + +- `speed-bench/local-runs/20260515-021315-mpp-fast-skip-down26-29-30-mpp-compare-probe/mpp-compare-summary.md` + +This confirms the rejected skip-26/29/30 candidate is not only a single +layer-31 local-delta issue. With continue-on-breach enabled, `moe_down` +breaches repeated across layers 31-40 and 42 on `long_memory_archive`; worst +local max abs was `0.0205078` at layer 42. This keeps the candidate rejected +and makes further down-projection expansion unattractive without a different +accuracy strategy. + +## Dense Q8_0 Comparator Hook + +Added a default-off dense Q8_0 comparator hook for future kernel prototypes: + +```sh +DS4_METAL_Q8_COMPARE=1 \ +DS4_METAL_Q8_COMPARE_FILTER=attn_q_b \ +DS4_METAL_MPP_COMPARE_MAX=3 \ +DS4_METAL_MPP_COMPARE_VERBOSE=1 \ +./ds4 --metal -mt auto \ + --prompt-file tests/test-vectors/prompts/short_code_completion.txt \ + -c 4096 -n 1 --system "" --nothink --temp 0 +``` + +Validation artifact: + +- `speed-bench/local-runs/20260515-024144-q8-compare-smoke/mpp-compare-summary.md` + +The smoke run compared the current legacy Q8_0 prefill output against a legacy +reference for the first three `attn_q_b` layers and reported zero delta for all +three `32768x27x1024` comparisons. This does not change production behavior or +promote a new kernel; it gives the next dense Q8_0 prototype a local +ref-vs-candidate check before the five-fixture logprob gate. 
+ +`speed-bench/run_mpp_compare_probe.py` now supports the same hook directly: + +```sh +python3 speed-bench/run_mpp_compare_probe.py \ + --case short_code_completion \ + --route q8 \ + --q8-filter attn_q_b \ + --compare-max 3 \ + --verbose \ + --top 10 +``` + +Validation artifact: + +- `speed-bench/local-runs/20260515-024453-manual-mpp-compare-probe/mpp-compare-summary.md` +- `speed-bench/local-runs/20260515-024637-manual-mpp-compare-probe/mpp-compare-summary.md` + +The wrapper set `DS4_METAL_Q8_COMPARE=1` and +`DS4_METAL_Q8_COMPARE_FILTER=attn_q_b`, then produced the same zero-delta +three-layer `attn_q_b` summary. Future Q8 kernel candidates can use this +wrapper instead of hand-written env commands before the five-fixture gate. The +newer artifact also records `q8_filter=attn_q_b` explicitly in `run_config`. + +The second dense Q8_0 hotspot was smoke-checked through the same wrapper: + +```sh +python3 speed-bench/run_mpp_compare_probe.py \ + --case short_code_completion \ + --route q8 \ + --q8-filter attn_out \ + --compare-max 3 \ + --verbose \ + --top 10 +``` + +Validation artifact: + +- `speed-bench/local-runs/20260515-024740-manual-mpp-compare-probe/mpp-compare-summary.md` + +This produced three zero-delta `attn_out` comparisons with shape +`4096x27x8192`. Dense Q8_0 prototypes for both current hotspots now have a +one-command local comparator smoke before compact timing and the five-fixture +logprob gate. + +Long-shape comparator baselines were also captured on `long_code_audit` with +`--compare-max 50 --verbose`, covering all 43 layers for each hotspot: + +- `speed-bench/local-runs/20260515-024918-manual-mpp-compare-probe/mpp-compare-summary.md` + (`attn_q_b`, 43 comparisons, shape `32768x3844x1024`, zero delta) +- `speed-bench/local-runs/20260515-024956-manual-mpp-compare-probe/mpp-compare-summary.md` + (`attn_out`, 43 comparisons, shape `4096x3844x8192`, zero delta) + +These are reference artifacts for the next dense Q8_0 kernel attempt. 
A useful +prototype should improve compact prefill timing, keep these local comparisons +inside target, then pass the five-fixture logprob gate before promotion. + +## Current Default Baseline Refresh + +Regenerated the full current-branch standard/quality/Tensor chart with +timestamped local artifacts: + +```sh +OPEN_CHART=0 speed-bench/run_metal_tensor_bench.sh +``` + +Artifact root: + +- `speed-bench/local-runs/20260515-025303-metal-tensor-bench/` + +Chart: + +- `speed-bench/local-runs/20260515-025303-metal-tensor-bench/20260515-025303_gen128_ds4_bench_standard_quality_tensor.png` + +The Tensor default remains a clear prefill win over standard Metal on the full +512..65536 context sweep: + +| ctx | Tensor prefill vs standard | Tensor generation vs standard | +| ---: | ---: | ---: | +| 512 | +31.3% | -0.9% | +| 1024 | +31.4% | -1.2% | +| 2048 | +26.5% | -0.7% | +| 4096 | +22.1% | -0.5% | +| 8192 | +19.9% | -0.8% | +| 16384 | +19.8% | -0.5% | +| 32768 | +16.6% | -0.6% | +| 65536 | +15.4% | -1.1% | + +Also reran the strict five-fixture drift gate against the current source: + +```sh +python3 speed-bench/run_quality_drift_gate.py \ + --max-tensor-standard-rms 0.30 \ + --max-tensor-standard-top20-abs 0.60 +``` + +Artifact root: + +- `speed-bench/local-runs/20260515-030753-quality-drift-gate/` + +Result: `Gate: OK`. + +Tensor-vs-standard stayed inside the conservative drift envelope: + +| Metric | Value | +| --- | ---: | +| top1 mismatches | 0 | +| greedy mismatches | 0 | +| min top20 overlap | 19/20 | +| worst RMS | 0.239946 | +| worst top20 max abs | 0.55422 | + +This is the current production baseline for the next prefill attempt: any new +default candidate should improve compact/full-sweep prefill while preserving a +green five-fixture gate and staying inside the `0.30` RMS / `0.60` top20 +Tensor-vs-standard envelope. 
+ +## Current Stage Profile Refresh + +Ran a fresh current-branch profile on `long_code_audit` with routed-MoE, dense +Q8_0, FlashAttention, and layer profiling enabled: + +```sh +env DS4_METAL_MOE_STAGE_PROFILE=1 \ + DS4_METAL_Q8_PREFILL_PROFILE=1 \ + DS4_METAL_Q8_PREFILL_PROFILE_FILTER=attn_ \ + DS4_METAL_FLASH_ATTN_STAGE_PROFILE=1 \ + DS4_METAL_LAYER_PROFILE=1 \ + ./ds4 --metal -mt auto \ + --prompt-file tests/test-vectors/prompts/long_code_audit.txt \ + -c 16384 -n 1 --system "" --nothink --temp 0 +``` + +Artifact root: + +- `speed-bench/local-runs/20260515-031301-current-stage-profile/` + +Summary: + +- `speed-bench/local-runs/20260515-031301-current-stage-profile/stage-profile-summary.md` + +The refreshed profile produced `420.69` prefill t/s and parsed `5001.333 ms` +of profiled stage time. The top stage families are still routed-MoE matmuls and +the two large dense Q8_0 attention projections: + +| Stage | total ms | events | avg ms | +| --- | ---: | ---: | ---: | +| `moe_stage.gate` | 906.862 | 43 | 21.090 | +| `moe_stage.up` | 906.022 | 43 | 21.070 | +| `moe_stage.down` | 834.385 | 43 | 19.404 | +| `q8.attn_out` | 806.859 | 43 | 18.764 | +| `q8.attn_q_b` | 795.933 | 43 | 18.510 | +| `flash_attn.static_mixed_nonvec.attention` | 310.296 | 20 | 15.515 | + +`speed-bench/summarize_stage_profile.py` now also reports routed-MoE timing by +Tensor mask. On this run: + +| MoE mpp mask | top stages | total ms | +| --- | --- | ---: | +| `0/0/0` | `up`=410.4, `gate`=409.9, `down`=408.7 | 1266.616 | +| `1/1/1` | `gate`=397.5, `up`=395.3, `down`=385.3 | 1252.849 | +| `0/0/1` | `up`=100.4, `gate`=99.5, `down`=40.3 | 248.163 | + +This makes the next prefill target concrete: a new routed-MoE kernel should +focus on the early legacy `0/0/0` window first. 
Simply switching those layers +to the existing cooperative-Tensor path has already been rejected by drift +gates, so the useful work is a reference-compatible MoE matmul design that +keeps the low-drift arithmetic behavior while reducing the early-window cost. +Dense Q8_0 `attn_out` and `attn_q_b` remain the next largest targets, but their +small tile/direct-RHS variants have already been rejected. + +Legacy `kernel_mul_mm_id` inspection notes: + +- the early `0/0/0` path already uses the same simdgroup MMA shape as the + standard Metal reference; +- each expert-major tile produces a logical `64 x 32` result, but the 32 + columns map back through `hids` to token/expert slots rather than to a + contiguous dense destination; +- the current threadgroup writeback is therefore doing a real scatter + transpose, not just an avoidable staging copy; +- a useful reference-compatible kernel is more likely to improve expert-major + staging or produce a token-major/down-sum layout directly than to replace the + final scatter with a dense-style `simdgroup_store`. + +That rules out the simplest "direct store" tweak. The next kernel prototype +should either change the work map/output layout deliberately or focus on +computing the routed down projection closer to the token-major summed output, +with a comparator before any timing gate. + +## FlashAttention Vector-Path Boundary + +The current static-mixed prefill router keeps the vector FlashAttention helper +only for `n_tokens < 20`; larger prefill batches use the non-vector helper. This +is not an arbitrary threshold. 
The vector helper launches `n_tokens * n_head * +nwg` workgroups and stores one partial `head_dim` result plus softmax state per +query/head/workgroup before a reduce pass: + +```c +tmp_bytes = nrows * head_dim * nwg * sizeof(float) + + nrows * (2 * nwg) * sizeof(float); +``` + +With the current DS4 shape (`n_head=64`, `head_dim=512`, `nwg=32`), forcing the +existing vector path for normal prefill would require the following temporary +buffer sizes: + +| tokens | vector tmp | +| ---: | ---: | +| 16 | 64.2 MiB | +| 20 | 80.3 MiB | +| 64 | 257.0 MiB | +| 128 | 514.0 MiB | +| 256 | 1028.0 MiB | +| 512 | 2056.0 MiB | +| 1024 | 4112.0 MiB | +| 2048 | 8224.0 MiB | +| 4096 | 16448.0 MiB | +| 8192 | 32896.0 MiB | + +Conclusion: reject a simple force-vector prefill patch before timing or drift. +The memory footprint is already about 2.0 GiB at 512 tokens and about 32.1 GiB +at 8192 tokens. Future FlashAttention prefill work needs a streaming or +reduced-temporary design; reusing the decode-style vector helper is not a +production candidate for normal prefill. + +## Rejected M5 SIMD-Group Barrier Elision Probe + +Checked the `swival-ds4-m5/simdgroup_matrix` idea of dropping the three +`simdgroup_barrier(mem_none)` calls inside the existing dense and routed-MoE +`simdgroup_multiply_accumulate` loops behind an M5 function constant. This +keeps the same MMA arithmetic, so it was a plausible low-drift prefill +candidate, but the timing was not favorable. + +The local patch was tested and then reverted. The run used the candidate gate +in inverted form: `tensor` was the patched default-on M5 path, and +`disable-m5-sgmatrix-control` set `DS4_METAL_DISABLE_M5_SIMDGROUP_MATRIX=1`. 
+ +Artifact: + +- `speed-bench/local-runs/20260515-032257-disable-m5-sgmatrix-control/prefill-candidate-summary.md` + +Disabled control vs patched default: + +| ctx | disabled-control prefill vs patched | disabled-control generation vs patched | +| ---: | ---: | ---: | +| 512 | -2.0% | +0.1% | +| 1024 | +5.3% | +0.2% | +| 2048 | +3.2% | +0.1% | +| 4096 | +3.4% | -0.5% | +| 8192 | +0.6% | -0.6% | + +Conclusion: reject and do not port this Swival M5 barrier-elision patch. It +regresses the compact prefill median at most measured contexts, so a drift gate +is unnecessary. + +## Q8_0 MPP Bug Triage: Block Size + +Closed the first diagnostic from the older `m5-neural-accelerator` Phase 5 +notes before revisiting any generic Q8_0 MPP kernel. The concern was that +Metal might pad: + +```metal +struct block_q8_0 { + half d; + int8_t qs[32]; +}; +``` + +to something other than the host-side 34-byte row stride. A local runtime +Metal compile/run with `static_assert(sizeof(block_q8_0) == 34)` passed and +returned `34`. + +Artifact: + +- `speed-bench/local-runs/20260515-033017-q8-block-size-check/result.txt` + +Conclusion: the old generic Q8_0 MPP bug is not explained by `block_q8_0` +padding. If that kernel is revisited, the next diagnostics should focus on +K-loop accumulation semantics and q8 dequant precision/layout, using the dense +Q8 comparator hook before any full-model timing. + +## Q8_0 MPP Bug Triage: Static-K Accumulation + +Ran a local runtime Metal harness for the next Phase 5 hypothesis: whether +`mpp::tensor_ops::matmul2d` accumulates into the same cooperative tensor across +a manual static-`TILEK` K-loop. 
+ +Artifact: + +- `speed-bench/local-runs/20260515-033248-mpp-kloop-accum-check/result.txt` + +The harness compares three half x half -> float kernels on the same +`M=64, N=32, K=128` tile: + +- `k_full`: one dynamic-K `matmul2d` call; +- `k_loop`: four default-mode `TILEK=32` `matmul2d.run()` calls into the + same zeroed cooperative tensor; +- `k_loop_mac`: the same static K-loop but with + `matmul2d_descriptor::mode::multiply_accumulate`, matching this branch's + existing Tensor kernels. + +Result: + +| Comparison | max abs | rms | +| --- | ---: | ---: | +| `kloop_vs_full` | 0.240234 | 0.101835 | +| `kloop_mac_vs_full` | 0 | 0 | +| `full_vs_host_f32` | 0 | 0 | +| `kloop_vs_host_f32` | 0.240234 | 0.101835 | +| `kloop_vs_host_last32` | 0 | 0 | +| `kloop_mac_vs_host_f32` | 0 | 0 | + +Conclusion: default-mode static-`TILEK` `matmul2d.run()` calls overwrite with +the last K block rather than accumulating across the loop. The +`multiply_accumulate` descriptor mode accumulates correctly and matches both +dynamic-K `matmul2d` and the host fp32 reference for this shape. This branch's +existing Tensor kernels already use `multiply_accumulate`, so they are not +exposed to this specific failure. If the older generic Q8_0 MPP prototype is +revisited, verify it uses `multiply_accumulate` plus explicit cooperative-tensor +zeroing before moving on to dequant precision/layout diagnostics. + +## Q8_0 MPP Bug Triage: Dequantized Tile Correctness + +Ran a standalone q8_0 -> threadgroup-half -> `matmul2d` harness using the +corrected `multiply_accumulate` descriptor. The kernel uses the same q8_0 block +layout (`sizeof(block_q8_0) == 34`), dequantizes each 32-K weight block into a +`TN x TILEK` threadgroup half tile, then accumulates a `64 x 32 x 128` half x +half -> float matmul. The host reference mirrors DS4's legacy prefill math: +activations are half-rounded, q8 weights are dequantized in float and rounded +to half before fp32 accumulation. 
+ +Artifact: + +- `speed-bench/local-runs/20260515-033841-q8-mpp-correctness-check/result.txt` + +Result: + +| Comparison | max abs | rms | +| --- | ---: | ---: | +| `q8_mpp_vs_host_half_reference` | 0 | 0 | + +Conclusion: the corrected static-K q8_0 MPP tile is numerically sound in a +standalone harness. This does not promote a production Q8_0 Tensor route, but +it narrows the old failure down to implementation details rather than a +fundamental `block_q8_0` layout or `matmul2d` accumulation issue. The next +production experiment, if any, should be a default-off single instantiation of +the existing generic `kernel_mul_mm_mpp` for q8_0, gated through the dense Q8 +comparator before any whole-model timing or drift gate. + +## Rejected Q8_0 Generic MPP Matmul Route + +Tried the proposed default-off single-instantiation generic Q8_0 MPP route +locally, then removed the production hook/template because timing was not +competitive with the current Tensor default. + +Correctness/comparator artifacts: + +- `speed-bench/local-runs/20260515-034306-manual-mpp-compare-probe/mpp-compare-summary.md` +- `speed-bench/local-runs/20260515-034322-manual-mpp-compare-probe/mpp-compare-summary.md` +- `speed-bench/local-runs/20260515-034336-manual-mpp-compare-probe/mpp-compare-summary.md` +- `speed-bench/local-runs/20260515-034411-manual-mpp-compare-probe/mpp-compare-summary.md` + +The long `attn_q_b` probe compared all 43 layers with no breaches; worst max +abs was `3.57628e-06` and worst RMS was `7.3025e-08`. The long `attn_out` +probe also compared all 43 layers with no breaches; worst max abs was +`0.000335693` and worst RMS was `3.16847e-06`. 
+ +Timing artifacts: + +- `speed-bench/local-runs/20260515-040005-experimental-q8-attn-q-b/prefill-candidate-summary.md` +- `speed-bench/local-runs/20260515-040427-experimental-q8-attn-out/prefill-candidate-summary.md` + +Median speed vs current Tensor default: + +| Candidate | 512 | 1024 | 2048 | 4096 | 8192 | Generation range | +| --- | ---: | ---: | ---: | ---: | ---: | ---: | +| `attn_q_b` Q8_0 MPP | -8.4% | -5.8% | -1.6% | -0.7% | -0.0% | -0.4%..-0.1% | +| `attn_out` Q8_0 MPP | -6.2% | -7.6% | -3.7% | -1.0% | +0.3% | -0.8%..+0.4% | + +Conclusion: reject before the five-fixture drift gate. The corrected MPP tile is +locally accurate, but the whole-kernel path regresses compact prefill where it +matters most and only reaches noise-level parity at 8192 tokens. Keeping a +default-off Q8_0 Tensor route would add surface area without a usable speed +tradeoff. + +Post-cleanup validation: + +- `make ds4 ds4-bench` +- `python3 -m py_compile speed-bench/*.py` +- `git diff --check` +- `python3 speed-bench/run_quality_drift_gate.py --max-tensor-standard-rms 0.30 --max-tensor-standard-top20-abs 0.60` + +Fresh drift artifact: + +- `speed-bench/local-runs/20260515-041151-quality-drift-gate/summary.md` +- `speed-bench/local-runs/20260515-041450-local-run-index/local-run-index.md` + +Post-cleanup Tensor-vs-standard drift: + +| Metric | Result | +| --- | ---: | +| top-1 mismatches | 0 | +| greedy mismatches | 0 | +| min top20 overlap | 19/20 | +| worst RMS | 0.239946 | +| worst top20 max abs | 0.55422 | + +Gate result: OK. + +## Rejected Legacy Routed-MoE Gate/Up Pair Kernel + +Tried a default-off legacy `simdgroup_multiply_accumulate` pair kernel for the +early routed-MoE gate/up projections. The design preserved the reference +reduction shape for each projection while reusing the same activation tile for +gate and up. It was intended to target the early `0/0/0` window without taking +the drift-prone cooperative-Tensor route. 
+ +Comparator artifact: + +- `speed-bench/local-runs/20260515-042045-manual-mpp-compare-probe/mpp-compare-summary.md` + +The long `long_code_audit` comparator run covered `40` gate and `40` up +comparisons with no target breaches. Worst max abs was `8.39233e-05` and worst +RMS was `2.10939e-06`. + +Timing artifact: + +- `speed-bench/local-runs/20260515-042136-experimental-moe-legacy-pair-gate-up/prefill-candidate-summary.md` +- `speed-bench/local-runs/20260515-042900-local-run-index/local-run-index.md` + +Median speed vs current Tensor default: + +| 512 | 1024 | 2048 | 4096 | 8192 | Generation range | +| ---: | ---: | ---: | ---: | ---: | ---: | +| +0.5% | -4.5% | -4.6% | -0.4% | -0.9% | -2.1%..+0.4% | + +Conclusion: reject before the five-fixture drift gate and remove the +experimental kernel/hook. The pair kernel was locally close to the reference, +but register pressure and the second accumulated output likely outweighed the +saved activation staging; it regressed the compact mid-contexts and generation +instead of improving prefill. + +## Rechecked MoE Sum6 Boundary + +Rechecked the existing `DS4_METAL_MOE_SUM6_DISABLE=1` control after the current +Tensor default changes, because the routed-MoE sum stage remains a possible +direct-down-sum target. + +Artifact: + +- `speed-bench/local-runs/20260515-043038-disable-moe-sum6-control/prefill-candidate-summary.md` + +Median speed vs current Tensor default: + +| 512 | 1024 | 2048 | 4096 | 8192 | Generation range | +| ---: | ---: | ---: | ---: | ---: | ---: | +| +0.9% | +5.5% | +4.0% | -0.3% | -0.7% | -1.0%..+0.1% | + +This differs from the older boundary sweep enough to test a thresholded +candidate. A local patch added `DS4_METAL_MOE_SUM6_MIN_TOKENS=4096`, keeping +the fused `sum6` kernel for larger batches and using the generic add chain +below the threshold. 
+ +Threshold artifact: + +- `speed-bench/local-runs/20260515-043605-moe-sum6-min4096/prefill-candidate-summary.md` +- `speed-bench/local-runs/20260515-044100-local-run-index/local-run-index.md` + +Threshold result vs current Tensor default: + +| 512 | 1024 | 2048 | 4096 | 8192 | Generation range | +| ---: | ---: | ---: | ---: | ---: | ---: | +| -1.1% | -2.0% | +0.5% | +0.0% | -0.5% | -0.4%..+0.0% | + +Conclusion: reject and remove the threshold knob before the five-fixture drift +gate. The all-disabled control shows the sum stage is noisy enough to revisit, +but the obvious token-threshold policy does not produce a clean compact prefill +win. A future direct-down-sum kernel still needs to beat the current fused +`sum6` baseline, not the slower generic fallback. + +## Rejected Prefill Direct Down-Sum Probe + +Tried a local default-off probe that reused the existing six-expert direct +down-sum kernel for batched prefill (`DS4_METAL_MOE_PREFILL_DIRECT_DOWN_SUM=1`) +instead of writing per-expert down outputs and running the separate `sum6` +kernel. The probe also forced the MoE mid buffer back to F32 because the +existing direct-sum kernels read F32 activations. + +Short screen artifact: + +- `speed-bench/local-runs/20260515-044921-moe-prefill-direct-down-sum/prefill-candidate-summary.md` + +One-repeat screen vs current Tensor default: + +| 512 | 1024 | 2048 | Generation range | +| ---: | ---: | ---: | ---: | +| -19.7% | -20.1% | -29.6% | -0.9%..+1.4% | + +Conclusion: reject before the five-fixture drift gate and remove the temporary +hook. Saving the down scratch write plus sum dispatch does not compensate for +giving up the grouped prefill matmul; a production direct-down-sum design would +need to keep batched matmul throughput while accumulating directly into the +token-major output. 
+ +## Rejected Dense Q8_0 F16-RHS Prepack Probe + +Tried a local default-off dense Q8_0 prefill probe that prepacked the RHS +activation matrix to half once, then ran a legacy simdgroup-MMA Q8_0 matmul +variant that read half RHS values. This preserved the same effective MMA input +precision as the current kernel, which casts F32 activations to half inside +each threadgroup, but added one F32-to-F16 prepack dispatch and a scratch RHS +buffer. + +Short screen artifacts: + +- `speed-bench/local-runs/20260515-045423-q8-f16-rhs-attn-q-b/prefill-candidate-summary.md` +- `speed-bench/local-runs/20260515-045455-q8-f16-rhs-attn-out/prefill-candidate-summary.md` + +One-repeat screen vs current Tensor default: + +| Candidate | 512 | 1024 | 2048 | Generation range | +| --- | ---: | ---: | ---: | ---: | +| `attn_q_b` F16 RHS | -3.2% | -0.0% | +0.2% | +0.0%..+0.7% | +| `attn_out` F16 RHS | -5.6% | -6.6% | -5.3% | -0.4%..+0.2% | + +Conclusion: reject before the five-fixture drift gate and remove the temporary +kernel/hook. The prepack dispatch does not amortize at compact contexts, and +the only positive point is noise-level on `attn_q_b` at 2048 tokens. + +## Rejected FlashAttention GPU Mask Fill + +Tried a local default-off static-mixed FlashAttention mask-fill kernel +(`DS4_METAL_FLASH_ATTN_GPU_MASK_FILL=1`). The goal was to replace the CPU write +of the full transient half mask with a GPU analytic fill while leaving the +existing pad, block-map, and attention kernels unchanged. + +Short screen artifact: + +- `speed-bench/local-runs/20260515-045825-flash-attn-gpu-mask-fill/prefill-candidate-summary.md` + +One-repeat screen vs current Tensor default: + +| 512 | 1024 | 2048 | Generation range | +| ---: | ---: | ---: | ---: | +| -1.6% | -0.1% | -0.5% | -0.4%..+1.2% | + +Conclusion: reject before the five-fixture drift gate and remove the temporary +kernel/hook. 
Moving mask fill to a separate GPU dispatch did not beat the CPU +fill path at compact contexts; the FlashAttention setup work still needs a more +integrated redesign if it is worth targeting. + +## Rejected Routed-MoE Down-0 Window + +Rechecked one remaining env-only routed-MoE window after the current Tensor +cleanup: start only the down projection's Tensor route at layer 0 while leaving gate/up on the +conservative default window (`DS4_METAL_MPP_MOE_DOWN_START_LAYER=0`). A short +screen looked plausible, so the candidate was run through the full two-repeat +candidate gate and five-fixture drift gate. + +Artifacts: + +- short screen: + `speed-bench/local-runs/20260515-050301-moe-down0-gate15-up15-screen/prefill-candidate-summary.md` +- full gate: + `speed-bench/local-runs/20260515-050334-moe-down0-gate15-up15/prefill-candidate-summary.md` + +Median speed vs current Tensor default: + +| 512 | 1024 | 2048 | 4096 | 8192 | Generation range | +| ---: | ---: | ---: | ---: | ---: | ---: | +| +5.6% | +6.0% | +0.0% | +2.0% | +1.2% | -2.6%..-0.0% | + +Promotion decision: reject. The repeat-level speed floor failed at 2048 and +8192 (`min repeat=-4.0%`), and the five-fixture drift gate failed: +`long_memory_archive` changed top-1 and greedy step 0, Tensor-vs-standard worst +RMS rose to `0.550345`, and worst top20 abs rose to `1.38147`. This confirms +that simply extending the current Tensor down route into the early layers is +not a production path; early routed-MoE needs a reference-compatible kernel +design, not another window expansion. + +An adjacent short screen with `DS4_METAL_MPP_MOE_DOWN_START_LAYER=4` also +failed before drift: + +- `speed-bench/local-runs/20260515-051113-moe-down4-gate15-up15-screen/prefill-candidate-summary.md` + +That run was +3.5% at 512 and +3.2% at 1024, but -0.3% at 2048 with a -5.3% +generation point. Excluding layers 0..3 therefore does not recover a clean +early-down production candidate.
+ +The drift-mitigation variant +`DS4_METAL_MPP_MOE_DOWN_START_LAYER=0 DS4_METAL_MOE_MID_F32=1` also failed the +short speed screen before drift: + +- `speed-bench/local-runs/20260515-051250-moe-down0-mid-f32-screen/prefill-candidate-summary.md` + +It measured +4.1% at 512 and +3.3% at 1024, but -0.4% at 2048. Preserving the +F32 routed intermediate is therefore not a usable way to make the down-0 window +production-safe. + +## Rejected Mul-MM-ID Writeback Index Probe + +Tried a local default-off function-constant probe that changed the generic +`kernel_mul_mm_id` writeback column assignment from `sgitg` to `tiitg/32`, +matching the separate fast-layout kernel's writeback loop while preserving the +same matmul arithmetic and result layout. + +Short screen artifact: + +- `speed-bench/local-runs/20260515-051517-mul-mm-id-writeback-tiidx-screen/prefill-candidate-summary.md` + +One-repeat screen vs current Tensor default: + +| 512 | 1024 | 2048 | Generation range | +| ---: | ---: | ---: | ---: | +| -5.6% | +0.1% | -0.5% | -0.4%..+3.7% | + +Conclusion: reject before drift and remove the temporary hook. This writeback +mapping is arithmetic-neutral but not a prefill win; the generic routed-MoE +kernel still needs a real staging or output-layout change rather than a +thread-index assignment tweak. + +## Rejected Legacy Gate/Up Pair Probe + +Tried a local default-off `DS4_METAL_MOE_PAIR_GATE_UP_LEGACY=1` probe that +computed routed-MoE gate and up in one legacy simdgroup-MMA kernel for early +non-MPP layers. The goal was to preserve the standard Metal reduction order +while reusing the shared expert map and activation tile. + +Comparator spot checks on `long_memory_archive` matched the existing legacy +matmuls for the first large layer-0 projections: + +- `moe_gate`: `max_abs=0`, `rms=0`; +- `moe_up`: `max_abs=0`, `rms=0`. 
+ +Speed-screen artifact: + +- `speed-bench/local-runs/20260515-072058-moe-pair-gate-up-legacy-v2/prefill-candidate-summary.md` + +Two-repeat compact screen vs current Tensor default: + +| 512 | 1024 | 2048 | 4096 | 8192 | Generation range | +| ---: | ---: | ---: | ---: | ---: | ---: | +| -0.9% | +0.2% | +1.5% | +2.5% | +1.9% | -1.2%..+0.3% | + +Repeat-level prefill still dipped negative (min repeat was `-1.3%`), and +the 512-token median was already negative. Conclusion: +reject before the five-fixture drift gate and remove the temporary kernel. The +pairing idea is locally equivalent but not repeat-stable enough to carry as a +default-off production candidate. + +## Current Default Chart Refresh, Timestamped Local Artifact + +Regenerated the current branch standard/quality/Tensor chart with the updated +`speed-bench/run_metal_tensor_bench.sh` defaults. The script now writes +timestamped artifacts under ignored `speed-bench/local-runs/` instead of +`/tmp`, so multiple comparison runs can be kept locally without pushing them. + +Command: + +```sh +OPEN_CHART=0 speed-bench/run_metal_tensor_bench.sh +``` + +Artifact root: + +- `speed-bench/local-runs/20260515-052156-metal-tensor-bench/` + +Chart: + +- `speed-bench/local-runs/20260515-052156-metal-tensor-bench/20260515-052156_gen128_ds4_bench_standard_quality_tensor.png` + +Tensor default remains a broad prefill win over standard Metal with only a +small generation tax: + +| ctx | Tensor prefill vs standard | Tensor generation vs standard | +| ---: | ---: | ---: | +| 512 | +30.2% | -0.5% | +| 1024 | +31.4% | -1.3% | +| 2048 | +26.3% | -1.0% | +| 4096 | +22.1% | -0.9% | +| 8192 | +20.1% | -0.7% | +| 16384 | +19.4% | -0.8% | +| 32768 | +17.7% | -0.6% | +| 65536 | +15.1% | -0.6% | + +## Compact Current Stage Profile + +Reran the current Tensor default stage profile on `long_code_audit` at +`-c 8192` after the earlier oversized-prompt attempt failed.
This uses the +same 3844-token prompt as the 16k profile while keeping the context closer to +the middle of the benchmark sweep. + +Command: + +```sh +env DS4_METAL_MOE_STAGE_PROFILE=1 \ + DS4_METAL_Q8_PREFILL_PROFILE=1 \ + DS4_METAL_Q8_PREFILL_PROFILE_FILTER=attn_ \ + DS4_METAL_FLASH_ATTN_STAGE_PROFILE=1 \ + DS4_METAL_LAYER_PROFILE=1 \ + ./ds4 --metal -mt auto \ + --prompt-file tests/test-vectors/prompts/long_code_audit.txt \ + -c 8192 -n 1 --system "" --nothink --temp 0 +``` + +Artifacts: + +- `speed-bench/local-runs/20260515-053713-current-ctx8192-stage-profile/run.log` +- `speed-bench/local-runs/20260515-053713-current-ctx8192-stage-profile/stage-profile-summary.md` +- `speed-bench/local-runs/20260515-053713-current-ctx8192-stage-profile/stage-profile-summary.json` + +Result: `420.33` prefill t/s, `603` parsed profile events, and +`5011.795 ms` parsed stage time. The compact profile matches the earlier 16k +profile: routed-MoE gate/up/down and the two large dense Q8_0 attention +projections remain the dominant prefill cost. + +| Stage | total ms | events | avg ms | +| --- | ---: | ---: | ---: | +| `moe_stage.gate` | 909.794 | 43 | 21.158 | +| `moe_stage.up` | 909.728 | 43 | 21.156 | +| `moe_stage.down` | 834.073 | 43 | 19.397 | +| `q8.attn_out` | 803.923 | 43 | 18.696 | +| `q8.attn_q_b` | 797.692 | 43 | 18.551 | +| `flash_attn.static_mixed_nonvec.attention` | 310.597 | 20 | 15.530 | + +MoE timing by Tensor mask: + +| MoE mpp mask | top stages | total ms | +| --- | --- | ---: | +| `0/0/0` | `up`=412.5, `gate`=409.3, `down`=409.1 | 1268.948 | +| `1/1/1` | `gate`=400.4, `up`=397.5, `down`=383.9 | 1256.632 | +| `0/0/1` | `gate`=100.0, `up`=99.7, `down`=41.0 | 248.767 | + +Conclusion: the next production candidate should not be another route-window +or tile-size sweep. Those have been exhausted and either fail speed stability +or the five-fixture drift gate. 
The remaining plausible prefill work is a +reference-compatible routed-MoE or dense Q8_0 kernel redesign that keeps the +current low-drift arithmetic envelope while reducing staging/writeback cost. + +## Bench-Prompt Current Stage Profile + +Reran the stage profiler on the same `speed-bench/promessi_sposi.txt` prompt +used by the chart and candidate gate, walking the 512..8192 frontiers in one +Tensor run. This checks that the hotspot ranking from the smaller fixture also +holds on the actual speed-gate workload. + +Command: + +```sh +env DS4_METAL_MOE_STAGE_PROFILE=1 \ + DS4_METAL_Q8_PREFILL_PROFILE=1 \ + DS4_METAL_Q8_PREFILL_PROFILE_FILTER=attn_ \ + DS4_METAL_FLASH_ATTN_STAGE_PROFILE=1 \ + DS4_METAL_LAYER_PROFILE=1 \ + ./ds4-bench -mt auto \ + --prompt-file speed-bench/promessi_sposi.txt \ + --ctx-start 512 --ctx-max 8192 --gen-tokens 1 +``` + +Artifacts: + +- `speed-bench/local-runs/20260515-073001-current-promessi-stage-profile/bench.csv` +- `speed-bench/local-runs/20260515-073001-current-promessi-stage-profile/stage-profile-summary.md` +- `speed-bench/local-runs/20260515-073001-current-promessi-stage-profile/stage-profile-summary.json` + +Parsed profile result: `3071` events and `11745.870 ms` parsed stage time. 
+The profile confirms the same target order as the previous current-default +profile: + +| Stage | total ms | share | +| --- | ---: | ---: | +| `moe_stage.up` | 2519.278 | 21.4% | +| `moe_stage.gate` | 2511.646 | 21.4% | +| `moe_stage.down` | 2279.191 | 19.4% | +| `q8.attn_out` | 1790.328 | 15.2% | +| `q8.attn_q_b` | 1723.122 | 14.7% | +| `flash_attn.static_mixed_nonvec.attention` | 77.665 | 0.7% | + +MoE by Tensor mask: + +| MoE mpp mask | top stages | total ms | +| --- | --- | ---: | +| `0/0/0` | `up`=1151.6, `gate`=1146.8, `down`=1120.8 | 3521.858 | +| `1/1/1` | `up`=1090.0, `gate`=1086.5, `down`=1049.6 | 3454.142 | +| `0/0/1` | `gate`=278.4, `up`=277.7, `down`=108.7 | 689.084 | + +Decision: keep FlashAttention work deprioritized for prefill on this branch. +The next production candidate still needs to attack routed-MoE or dense Q8_0 +matmul. Within routed-MoE, the early `0/0/0` window remains the best target, +but the rejected legacy gate/up pair shows that simply combining two reference +matmuls is not enough; the next kernel must reduce staging/writeback cost +without changing the low-drift arithmetic envelope. 
+ +## Continuation-Chunk Routed-MoE Probe + +Tried a position-filtered routed-MoE policy that keeps the current conservative +default window at `pos=0`, but uses the fast all-layer routed-MoE profile on +later prefill chunks: + +```sh +DS4_METAL_MPP_FAST=1 +DS4_METAL_MPP_MOE_GATE_FILTER=layer=15-42,pos=512,pos=1024,pos=2048,pos=4096 +DS4_METAL_MPP_MOE_UP_FILTER=layer=15-42,pos=512,pos=1024,pos=2048,pos=4096 +DS4_METAL_MPP_MOE_DOWN_FILTER=layer=12-42,pos=512,pos=1024,pos=2048,pos=4096 +``` + +Artifact: + +- `speed-bench/local-runs/20260515-073209-mpp-fast-continuation-chunks/prefill-candidate-summary.md` + +Two-repeat compact screen vs current Tensor default: + +| 512 | 1024 | 2048 | 4096 | 8192 | Generation range | +| ---: | ---: | ---: | ---: | ---: | ---: | +| +4.2% | +24.0% | +13.3% | +13.6% | +8.3% | -0.7%..+0.8% | + +Repeat-level prefill was positive at every measured point; min repeat prefill +was `+1.5%`. The usual five-fixture drift gate also stayed green with the same +Tensor-vs-standard summary as the current default: top1 mismatches `0`, greedy +mismatches `0`, worst RMS `0.239946`, and worst top20 abs `0.55422`. + +Important caveat: this is not production-safe on the current evidence. The +five fixtures mostly exercise `pos=0`, while this candidate's new behavior is +the nonzero-position continuation chunks. `run_prefill_candidate_gate.py` now +marks nonzero `pos=` candidates as not promotion-safe until a chunked or +long-prompt drift check covers that route. Keep this as a promising +default-off direction, not an auto-policy change. + +## Dense Q8_0 Comparator Hook Refresh + +The earlier dense Q8_0 comparator notes were stale relative to the current +code: the README documented `DS4_METAL_Q8_COMPARE=1`, but the active Q8 path +only had profiling (`DS4_METAL_Q8_PREFILL_PROFILE=1`). 
Restored the default-off +compare hook in `ds4_gpu_matmul_q8_0_tensor()` and wired +`run_mpp_compare_probe.py --route q8 --q8-filter ...` so future dense +Q8_0 kernel attempts can be checked locally before the five-fixture drift gate. + +Smoke command: + +```sh +python3 speed-bench/run_mpp_compare_probe.py \ + --case short_code_completion \ + --route q8 \ + --q8-filter attn_q_b \ + --compare-max 3 \ + --verbose \ + --top 10 +``` + +Artifact: + +- `speed-bench/local-runs/20260515-054611-manual-mpp-compare-probe/mpp-compare-summary.md` + +Result: `3` parsed `q8` comparisons for `attn_q_b`, no target breaches, +and zero delta against the current legacy candidate/reference path: + +| Route | Module | Shape | Max abs | RMS | +| --- | --- | --- | ---: | ---: | +| `q8` | `layer=0 pos=0 attn_q_b` | `32768x27x1024` | 0 | 0 | +| `q8` | `layer=1 pos=0 attn_q_b` | `32768x27x1024` | 0 | 0 | +| `q8` | `layer=2 pos=0 attn_q_b` | `32768x27x1024` | 0 | 0 | + +## Rejected Dense Q8_0 Tok64 MPP Probe + +Tried a local default-off Q8_0 Metal Tensor tile that swapped the previous +generic MPP shape from `64x32` output-row/token tiles to `32x64`, aiming to +reuse q8 dequantized rows across a wider token tile. The temporary hook used: + +```sh +DS4_METAL_Q8_MPP_TOK64=1 +DS4_METAL_Q8_MPP_TOK64_FILTER= +``` + +Comparator smoke artifacts: + +- `speed-bench/local-runs/20260515-055108-manual-mpp-compare-probe/mpp-compare-summary.md` +- `speed-bench/local-runs/20260515-055201-manual-mpp-compare-probe/mpp-compare-summary.md` + +The local comparator was clean before timing. For `attn_q_b`, the first three +layers had worst max abs `1.13249e-06` and worst RMS `2.32904e-08`. For +`attn_out`, the first three layers had worst max abs `2.95639e-05` and worst +RMS `2.98521e-06`. 
+ +Short timing artifacts: + +- `speed-bench/local-runs/20260515-055126-q8-mpp-tok64-attn-q-b-screen/prefill-candidate-summary.md` +- `speed-bench/local-runs/20260515-055212-q8-mpp-tok64-attn-out-screen/prefill-candidate-summary.md` + +One-repeat timing versus the current Tensor default: + +| Candidate | 512 | 1024 | 2048 | Generation range | +| --- | ---: | ---: | ---: | ---: | +| `attn_q_b` tok64 MPP | -5.1% | +0.2% | +0.0% | -0.7%..-0.1% | +| `attn_out` tok64 MPP | -5.9% | -8.1% | -5.8% | -0.1%..+2.7% | + +Conclusion: reject before the five-fixture drift gate and remove the temporary +kernel/hook. The wider token tile was locally accurate, but it did not improve +compact prefill; `attn_q_b` only reached noise-level parity after a short-context +regression, and `attn_out` regressed all measured compact contexts. + +## Rejected Dense Q8_0 64x64 MPP Probe + +Tried the other plausible MPP tile shape in the same family: `64x64` +output-row/token tiles. This kept the output-row width of the earlier generic +MPP route while doubling token width, with a temporary default-off hook: + +```sh +DS4_METAL_Q8_MPP_64X64=1 +DS4_METAL_Q8_MPP_64X64_FILTER= +``` + +Comparator smoke artifacts: + +- `speed-bench/local-runs/20260515-055459-manual-mpp-compare-probe/mpp-compare-summary.md` +- `speed-bench/local-runs/20260515-055719-manual-mpp-compare-probe/mpp-compare-summary.md` + +The first three `attn_q_b` layers were clean with worst max abs +`1.13249e-06` and RMS `2.32904e-08`. The first three `attn_out` layers were +also clean with worst max abs `2.95639e-05` and RMS `2.98521e-06`. 
+ +Timing artifacts: + +- `speed-bench/local-runs/20260515-055512-q8-mpp-64x64-attn-q-b-screen/prefill-candidate-summary.md` +- `speed-bench/local-runs/20260515-055548-q8-mpp-64x64-attn-q-b-long-screen/prefill-candidate-summary.md` +- `speed-bench/local-runs/20260515-055730-q8-mpp-64x64-attn-out-screen/prefill-candidate-summary.md` + +One-repeat timing versus the current Tensor default: + +| Candidate | 512 | 1024 | 2048 | 4096 | 8192 | Generation range | +| --- | ---: | ---: | ---: | ---: | ---: | ---: | +| `attn_q_b` 64x64 short | -4.0% | +0.7% | +0.3% | n/a | n/a | +0.4%..+4.0% | +| `attn_q_b` 64x64 long | +5.9% | +7.0% | -3.5% | -1.2% | +0.7% | -6.2%..+0.5% | +| `attn_out` 64x64 short | -1.6% | -0.3% | -1.0% | n/a | n/a | +0.5%..+0.8% | + +Conclusion: reject before the five-fixture drift gate and remove the temporary +kernel/hook. The candidate was locally accurate, but not speed-stable: it +regressed compact `attn_out`, regressed `attn_q_b` at 512 in the short screen, +and the longer `attn_q_b` screen showed mid-context prefill regressions plus +generation-floor breaches. + +## Rejected FlashAttention Fast CPU Mask Fill + +Tried a local CPU-side prefill mask fill rewrite behind +`DS4_METAL_FLASH_ATTN_FAST_CPU_MASK_FILL=1`. The patch kept the same mask +values but replaced per-element causal/window branches with row fill plus +contiguous zero spans for visible raw and compressed keys. + +Short timing artifact: + +- `speed-bench/local-runs/20260515-060204-flash-attn-fast-cpu-mask-fill-screen/prefill-candidate-summary.md` + +One-repeat timing versus the current Tensor default: + +| 512 | 1024 | 2048 | Generation range | +| ---: | ---: | ---: | ---: | +| -0.6% | -0.1% | -0.2% | -0.3%..+0.0% | + +Conclusion: reject before drift and remove the temporary hook. The rewrite was +math-identical, but the existing branchy fill is already efficient enough at +compact contexts; the row-fill/memset variant added overhead instead of saving +prefill time. 
+ +## Rejected M5 Private Scratch Buffers + +Ported the `swival-ds4-m5/m5` private scratch-buffer idea as a local opt-in +candidate (`DS4_METAL_PRIVATE_SCRATCH=1`), keeping CPU-written masks and +attention-output group-id tables in shared storage. The change only affected +GPU-only scratch allocation storage mode, so arithmetic and drift risk were low, +but timing was not favorable. + +Short timing artifact: + +- `speed-bench/local-runs/20260515-060603-private-scratch-screen/prefill-candidate-summary.md` + +One-repeat timing versus the current Tensor default: + +| 512 | 1024 | 2048 | Generation range | +| ---: | ---: | ---: | ---: | +| -0.2% | -0.1% | -2.0% | -5.2%..-0.5% | + +Conclusion: reject before the five-fixture drift gate and remove the temporary +hook. Private scratch storage did not improve compact prefill and introduced a +generation-floor miss at 1024 tokens. + +## Rejected MoE Clamped-Activation Writeback + +Screened the existing diagnostic `DS4_METAL_MOE_WRITE_CLAMPED_ACT=1` switch +after the compact stage profile showed `moe_stage.activation_weight` around one +percent of parsed prefill time. The normal release path already avoids writing +the clamped gate/up intermediates because no later inference stage consumes +them; this switch restores those writes only for intermediate-tensor +diagnostics. + +Short timing artifact: + +- `speed-bench/local-runs/20260515-061018-moe-write-clamped-act-screen/prefill-candidate-summary.md` + +One-repeat timing versus the current Tensor default: + +| 512 | 1024 | 2048 | Generation range | +| ---: | ---: | ---: | ---: | +| -0.1% | -0.5% | -0.5% | -1.1%..+0.8% | + +Conclusion: reject before the five-fixture drift gate. The switch is useful for +diagnostics, but it is not a production optimization and confirms that the +default no-writeback activation path is already the right choice. 
+ +## Current Default Drift Gate Refresh + +Reran the five-fixture quality drift gate after the local comparator/script +changes and the rejected activation-writeback screen. No rejected speed probe +was enabled for this run. + +Command: + +```sh +python3 speed-bench/run_quality_drift_gate.py \ + --no-fail \ + --out-dir speed-bench/local-runs/20260515-061111-current-default-quality-drift-gate +``` + +Artifacts: + +- `speed-bench/local-runs/20260515-061111-current-default-quality-drift-gate/summary.md` +- `speed-bench/local-runs/20260515-061111-current-default-quality-drift-gate/summary.json` + +Gate result: `OK`. + +| Pair | Top1 mismatches | Greedy mismatches | Min top20 | Worst RMS | Worst top20 abs | +| --- | ---: | ---: | ---: | ---: | ---: | +| `standard_vs_quality` | 0 | 1 | 18/20 | 0.618172 | 2.24006 | +| `tensor_vs_quality` | 0 | 1 | 18/20 | 0.618172 | 2.24006 | +| `tensor_vs_standard` | 0 | 0 | 19/20 | 0.239946 | 0.55422 | + +Conclusion: current default Tensor remains inside the strict Tensor-vs-standard +envelope (`0.30` RMS, `0.60` top20 abs) after the recent non-production +diagnostic and bench-script changes. + +## Remaining Prefill-Audit Notes + +Re-audited the current code and env surface after the rejected activation +writeback screen to avoid repeating low-value probes. + +Dense Q8_0: + +- The active prefill path is still `kernel_mul_mm_q8_0_f32`, a hand-written + simdgroup-MMA kernel with a hard-coded `64x32` output-row/token tile. +- The four simdgroups are mapped over two 32-row halves and two 16-token halves, + so changing the output-row tile is not a host-only knob; it requires a new + simdgroup layout and a new kernel family. +- Already rejected Q8_0 scheduling/prototype axes include split-tail, token-64 + widening, generic MPP, direct-RHS Tensor, F16 RHS prepack, tok64 MPP, and + `64x64` MPP. 
+ +FlashAttention: + +- Static-mixed non-vector attention remains a secondary hotspot, but the + low-risk setup/geometry probes have already been rejected: mask cache, CPU + block map, NSG4, real `C=32`, real `Q=16`, GPU mask fill, and fast CPU mask + fill. +- The remaining work is inside the attention kernel body, not another + mask/setup toggle. + +Env surface: + +- `DS4_METAL_DISABLE_ROUTER_SELECT_FUSION` is decode-only for this branch's + router fast path (`n_tokens == 1`), so it is not a prefill gate candidate. +- Startup/residency/hot-pipeline switches still affect warmup behavior rather + than steady-state prefill throughput. + +Conclusion: there is no obvious untested env-only or one-line prefill candidate +left. The next optimization pass should start as a new default-off kernel +family, with the dense Q8_0 comparator and the five-fixture drift gate as the +first acceptance checks. + +## Rejected Dense Q8_0 Row-Pair Probe + +Tried a local default-off dense Q8_0 kernel family that computed two adjacent +`64x32` output-row/token tiles in one threadgroup and shared the staged RHS tile +between them. The goal was to reduce RHS staging and dispatch overhead while +keeping each `64x32` tile's dequantization and simdgroup-MMA accumulation order +aligned with `kernel_mul_mm_q8_0_f32`. + +Temporary hook: + +```sh +DS4_METAL_Q8_ROWPAIR=1 +DS4_METAL_Q8_ROWPAIR_FILTER= +``` + +Comparator smoke artifacts: + +- `speed-bench/local-runs/20260515-062046-manual-mpp-compare-probe/mpp-compare-summary.md` +- `speed-bench/local-runs/20260515-062103-manual-mpp-compare-probe/mpp-compare-summary.md` + +The first three `attn_q_b` and `attn_out` layers were exact against the legacy +Q8_0 path: worst max abs `0`, RMS `0`. 
+ +Short timing artifacts: + +- `speed-bench/local-runs/20260515-062116-q8-rowpair-attn-q-b-screen/prefill-candidate-summary.md` +- `speed-bench/local-runs/20260515-062148-q8-rowpair-attn-out-screen/prefill-candidate-summary.md` + +One-repeat timing versus the current Tensor default: + +| Candidate | 512 | 1024 | 2048 | Generation range | +| --- | ---: | ---: | ---: | ---: | +| `attn_q_b` row-pair | +0.3% | -0.8% | -4.1% | -2.4%..-0.5% | +| `attn_out` row-pair | -5.7% | -7.1% | -6.5% | -1.3%..-0.2% | + +Conclusion: reject before the five-fixture drift gate and remove the temporary +kernel/hook. Sharing the RHS tile did not compensate for the extra accumulator +pressure and larger threadgroup footprint; it made `attn_out` consistently +slower and only gave a noise-level 512-token point on `attn_q_b`. + +## Small-Batch Dense Boundary Audit + +Checked the dense `mul_mv_ext` path before starting another prefill candidate. +Both Q8_0 and F16 Tensor dense wrappers route through `mul_mv_ext` only when +`n_tok <= 8` and the input dimension is divisible by 128. The compact prefill +gate starts at 512 tokens, and the Q8_0 profiling/comparator hooks are +deliberately scoped to `n_tok > 8`, so this helper is outside the measured +steady-state prefill route. + +The F16 pair Tensor path also rejects `n_tok <= 8` for its batched pair-MPP +candidate and falls back to the single-output dense helper. The previously +audited FlashAttention vector helper has the same shape issue in the opposite +direction: it is kept below 20 tokens because forcing it into normal prefill +would allocate multi-GiB temporary buffers. + +Conclusion: do not run a compact prefill timing gate for the small-batch dense +boundary. It may matter for prompt tails, speculative/MTP-style microbatches, or +decode-adjacent work, but it is not a promotion candidate for the current +512-token-and-up prefill benchmark. 
+ +## FlashAttention Static-Mixed Kernel Triage + +Inspected the static-mixed non-vector prefill path after the routed-MoE and +dense Q8_0 frontier checks. The current path materializes a half mask on the +CPU, optionally copies a compressed mask into it, scans that mask with +`kernel_flash_attn_ext_blk`, then runs the generic +`kernel_flash_attn_ext_f16_dk512_dv512` non-vector attention kernel with +`has_mask=true`, `has_sinks=true`, `has_bias=false`, `has_scap=false`, +`nqptg=8`, `ncpsg=64`, and `nsg=8` for the DS4 512-wide heads. + +Previously rejected FlashAttention probes already cover the simple knobs: + +- `NCPSG=128`, real `C=32`, real `Q=16`, and `NSG=4` did not produce a compact + whole-model prefill win; +- CPU/GPU mask-fill rewrites, mask caching, and CPU block-map generation either + regressed speed or were noise-level; +- forcing the vector helper into normal prefill is not viable because its + temporary buffer scales to multi-GiB at ordinary prefill sizes. + +The remaining plausible attention target is therefore not another host toggle. +It is a new static-mixed-specific non-vector kernel that computes the raw +causal/window visibility and compressed-row visibility from `(q, k, ratio, +window)` inside the kernel, avoiding the materialized mask and block-map path +for the common unmasked static-mixed prefill case. This should be default-off +at first and must compare against the existing generic masked path before any +whole-model timing. Because it changes masking implementation rather than the +intended math, acceptance should require: + +- local head-output comparator against the existing generic FlashAttention path + on static-mixed fixtures; +- compact prefill timing versus current Tensor default; +- the five-fixture drift gate before promotion. + +Conclusion: do not start another small FlashAttention flag screen. 
The next +attention optimization should be a separate static-mixed kernel family with +explicit local output comparison and the usual five-fixture drift gate. + +## FlashAttention Comparator Hook + +Added the local output comparator needed before implementing the +static-mixed-specific attention kernel family. The hook is default-off and does +not change normal inference: + +```sh +DS4_METAL_FLASH_ATTN_COMPARE=1 +DS4_METAL_MPP_COMPARE_ROUTE=flash_attn +DS4_METAL_FLASH_ATTN_COMPARE_FILTER= +``` + +When enabled, the current candidate head output is snapshotted and the existing +generic static-mixed FlashAttention path is replayed into a reference buffer on +the same command buffer. The result is registered through the same comparator +summary path used by routed-MoE, attention-output, and dense Q8_0 probes. The +graph now sets compare context around the static-mixed prefill attention call, +so reports include the layer and `pos0` context. + +`speed-bench/run_mpp_compare_probe.py` also accepts `--route flash_attn` and +`--flash-attn-filter ...`, which enables the hook and writes the usual +`mpp-compare-summary.md/json` artifacts under `speed-bench/local-runs/`. + +Smoke command: + +```sh +python3 speed-bench/run_mpp_compare_probe.py \ + --case short_code_completion \ + --route flash_attn \ + --flash-attn-filter static_mixed \ + --compare-max 1 \ + --gen-tokens 1 \ + --verbose +``` + +Artifact: + +- `speed-bench/local-runs/20260515-063525-manual-mpp-compare-probe/mpp-compare-summary.md` + +Result: one `flash_attn` comparison on layer 2, shape `512x64x27`, with max abs +`0`, RMS `0`, and no nonfinite values. + +This is scaffolding only: the current default still runs the generic +static-mixed path. No speed or drift gate was run for this change because it is +inactive unless the diagnostic env is set. 
+ +## Rejected FlashAttention Analytic Static Mask Probe + +Tried a default-off analytic static-mixed mask path that skipped the +materialized mask and block-map for unmasked static-mixed prefill. Local +comparator checks first exposed a mixed raw/compressed boundary bug, then passed +after forcing the crossing block through per-element masking: + +- `speed-bench/local-runs/20260515-064033-manual-mpp-compare-probe/mpp-compare-summary.md` +- `speed-bench/local-runs/20260515-064229-manual-mpp-compare-probe/mpp-compare-summary.md` + +The short speed screen failed before the drift gate: + +- `speed-bench/local-runs/20260515-064253-flash-attn-static-mask-screen/prefill-candidate-summary.md` + +One-repeat timing versus the current Tensor default: + +| Context | Prefill delta | Generation delta | +| --- | ---: | ---: | +| 512 | -11.9% | +1.0% | +| 1024 | -5.5% | +0.2% | +| 2048 | -5.1% | +2.3% | + +Conclusion: reject and remove the production hook. The local comparator +scaffold remains useful, but this analytic-mask variant is slower on the +prefill target, so no five-fixture drift gate was run. + +## Post-Cleanup Frontier Check + +Re-smoked the FlashAttention comparator after removing the rejected analytic +static-mask hook: + +```sh +python3 speed-bench/run_mpp_compare_probe.py \ + --case short_code_completion \ + --route flash_attn \ + --flash-attn-filter static_mixed \ + --compare-max 1 \ + --gen-tokens 1 \ + --verbose +``` + +Artifact: + +- `speed-bench/local-runs/20260515-065041-manual-mpp-compare-probe/mpp-compare-summary.md` + +Result: one static-mixed prefill comparison on layer 2, shape `512x64x27`, +max abs `0`, RMS `0`, no nonfinite values. The comparator scaffold is still +valid for future FlashAttention kernel work. 
+ +Also wrote a timestamped local-run index: + +- `speed-bench/local-runs/20260515-065056-local-run-index/local-run-index.md` +- `speed-bench/local-runs/20260515-065625-local-run-index/local-run-index.md` + +The candidate gate now enforces the speed-first workflow before nested drift +runs. Verification used the saved rejected `f16-pair-current` run with +`--reuse --run-drift-gate --no-fail`; it reused existing CSVs, did not run the +model, skipped the drift gate, and wrote the skip reason into the ignored local +summary: + +- `speed-bench/local-runs/20260514-171939-f16-pair-current/prefill-candidate-summary.md` + +The Markdown scorecard repeat table was validated by regenerating the saved +`mpp-gateup0-3-down12` candidate with `--reuse`. The report now shows the exact +repeat-level cause for skipping drift: at 512 tokens, repeat prefill deltas were +`-0.5%` and `+3.9%` even though the median was `+1.7%`. + +- `speed-bench/local-runs/20260515-065835-mpp-gateup0-3-down12/prefill-candidate-summary.md` + +The local-run index now mirrors that stricter screen by showing both median and +repeat-level minimum prefill deltas. This keeps median-positive but +repeat-unstable candidates visible as rejected in the top-level artifact index, +instead of requiring a separate JSON lookup. + +- `speed-bench/local-runs/20260515-070910-local-run-index/local-run-index.md` + +Important caveat from that index: older host-only FlashAttention tile screens, +such as `flash-attn-ncpsg32`, can still appear near the top by speed. Do not +revive those directly. The later real specializations with matching host and +Metal template geometry were tested in `Rejected FlashAttention Tile Variants` +and did not meet the compact prefill speed bar. + +Current frontier remains the early routed-MoE `0/0/0` window. The existing MPP +fast-layout gate/up/down route is fast but fails the strict Tensor-vs-standard +drift envelope when expanded into early layers. 
A useful next kernel must +therefore preserve the standard simdgroup-MMA arithmetic closely while reducing +the early-window gate/up/down cost; another route-window scan or stale +FlashAttention geometry flag is unlikely to be productive. + +## Continuation-Chunk Drift Gate + +Added a resumed-prefill drift gate for candidates that only route nonzero +`pos=` chunks: + +```sh +python3 speed-bench/run_chunked_prefill_drift_gate.py \ + --preset mpp-fast-continuation-chunks \ + --max-tensor-standard-rms 0.30 \ + --max-tensor-standard-top20-abs 0.60 \ + --no-fail +``` + +Artifacts: + +- `speed-bench/local-runs/20260515-074852-mpp-fast-continuation-chunks-chunked-drift-gate/summary.md` +- `speed-bench/local-runs/20260515-075200-local-run-index/local-run-index.md` + +The candidate still has no top-1 mismatch at resumed frontiers, but it fails +the strict Tensor-vs-standard drift envelope: + +| Frontier | Same top1 | Top20 | RMS | Top20 abs | +| ---: | --- | ---: | ---: | ---: | +| 512 | yes | 19/20 | 0.202659 | 0.579939 | +| 1024 | yes | 19/20 | 0.707456 | 1.95875 | +| 2048 | yes | 18/20 | 0.451973 | 1.25351 | +| 4096 | yes | 18/20 | 0.382888 | 1.08998 | +| 8192 | yes | 19/20 | 0.409673 | 0.654034 | + +Conclusion: reject `mpp-fast-continuation-chunks` for production promotion. +The speed gain is real, but the newly covered resumed chunks drift too far from +standard Metal. Keep the new gate for future nonzero-`pos` candidates. + +Follow-up tooling change: `run_prefill_candidate_gate.py --run-drift-gate` now +detects nonzero `pos=` route filters and runs this chunked frontier gate after +the speed screen passes. The promotion scorecard treats missing or failing +chunked coverage as a blocker for that class of candidate, so future +continuation-prefill experiments cannot pass on the five-fixture gate alone. 
+ +Regenerated the original `mpp-fast-continuation-chunks` candidate scorecard +with the integrated nested chunked gate: + +- `speed-bench/local-runs/20260515-073209-mpp-fast-continuation-chunks/prefill-candidate-summary.md` +- `speed-bench/local-runs/20260515-073209-mpp-fast-continuation-chunks/chunked-drift-gate/summary.md` +- `speed-bench/local-runs/20260515-081337-local-run-index/local-run-index.md` +- `speed-bench/local-runs/20260515-081533-local-run-index/local-run-index.md` + +The promotion decision now reports the actual blocker directly: the candidate +passes the speed screen and the five-fixture drift gate, but fails chunked +Tensor-vs-standard drift at frontier `1024` with worst RMS `0.707456` and worst +top20 abs `1.95875`. The local-run index now separates five-fixture drift from +coverage drift, so this candidate appears as `5-fixture OK=yes` but +`Coverage OK=no` instead of looking drift-clean in the speed table. + +Follow-up baseline check: the current default Tensor path itself does not meet +the strict absolute chunked Tensor-vs-standard envelope on resumed frontiers, +so coverage for candidate env overrides now uses candidate Tensor versus the +current no-env Tensor baseline instead of candidate Tensor versus standard +Metal. The standalone chunked gate still reports all pairs, but when env +overrides are present it also captures `default_tensor` and reports +`tensor_vs_default_tensor`. + +Artifacts: + +- `speed-bench/local-runs/20260515-081710-current-default-chunked-drift-gate/summary.md` +- `speed-bench/local-runs/20260515-073209-mpp-fast-continuation-chunks/chunked-drift-gate/summary.md` + +Current default chunked Tensor-vs-standard had no top-1 mismatches, but reached +worst RMS `0.667784` and worst top20 abs `1.47467` at resumed frontier `1024`. 
+After switching coverage to candidate-vs-default-Tensor, the +`mpp-fast-continuation-chunks` candidate still fails: `tensor_vs_default_tensor` +worst RMS is `0.512339` at frontier `2048`, and worst top20 abs is `1.41916` +at frontier `1024`. + +The local-run index now also picks up persistent chart-only runs from +`run_metal_tensor_bench.sh`, so the saved current-branch charts are visible +beside candidate gates, drift gates, comparator probes, and stage profiles. +For the latest chart run, +`20260515-052156-metal-tensor-bench`, Tensor prefill was `+15.1%..+31.4%` +versus standard Metal across the eight measured frontiers, while generation was +`-1.3%..-0.5%`. + +## Experimental Routed-MoE Matmul Recheck + +Rechecked the experimental routed-MoE matmul window on the current candidate +gate because the older notes had an under-verified start-layer 15 result. All three +runs used `--run-drift-gate --no-fail`, so drift would only run after the +speed screen passed. + +Artifacts: + +- `speed-bench/local-runs/20260515-080102-experimental-moe-matmul-start15-current/prefill-candidate-summary.md` +- `speed-bench/local-runs/20260515-080356-experimental-moe-matmul-start14-current/prefill-candidate-summary.md` +- `speed-bench/local-runs/20260515-080749-experimental-moe-matmul-gateup14-down12-current/prefill-candidate-summary.md` +- `speed-bench/local-runs/20260515-080658-local-run-index/local-run-index.md` +- `speed-bench/local-runs/20260515-081042-local-run-index/local-run-index.md` + +Two-repeat median speed versus current Tensor default: + +| Candidate | 512 | 1024 | 2048 | 4096 | 8192 | Min repeat prefill | +| --- | ---: | ---: | ---: | ---: | ---: | ---: | +| `DS4_METAL_EXPERIMENTAL_MOE_MATMUL=1`, start layer `15` | -0.6% | -0.0% | +0.2% | +2.5% | +3.0% | -3.2% | +| `DS4_METAL_EXPERIMENTAL_MOE_MATMUL=1`, start layer `14` | -0.6% | -0.5% | -0.7% | -0.8% | -0.2% | -2.1% | +| `DS4_METAL_EXPERIMENTAL_MOE_MATMUL=1`, gate/up start layer `14`, down start layer `12` | -1.1% | -1.9% | 
-2.2% | -3.3% | -0.1% | -3.9% | + +Conclusion: reject all three before the five-fixture drift gate. Start layer 15 is +only useful at larger contexts and is not repeat-stable; start layer 14 is +slower at every compact prefill point; preserving the current down-from-12 +window while moving gate/up to 14 is slower still. The current conservative +routed-MoE default remains the baseline. + +## Current Prefill Frontier Audit + +Regenerated the persistent current-branch standard/quality/Tensor chart with +`speed-bench/run_metal_tensor_bench.sh` after moving chart artifacts out of +`/tmp` and into ignored local storage. + +Artifacts: + +- `speed-bench/local-runs/20260515-083543-metal-tensor-bench/20260515-083543_gen128_ds4_bench_standard_quality_tensor.png` +- `speed-bench/local-runs/20260515-083543-metal-tensor-bench/20260515-083543_gen128_ds4_bench_standard_metal.csv` +- `speed-bench/local-runs/20260515-083543-metal-tensor-bench/20260515-083543_gen128_ds4_bench_quality.csv` +- `speed-bench/local-runs/20260515-083543-metal-tensor-bench/20260515-083543_gen128_ds4_bench_tensor_metal.csv` +- `speed-bench/local-runs/20260515-084949-local-run-index/local-run-index.md` + +Latest chart result versus standard Metal: + +| Context | Tensor prefill gain | Tensor generation gain | +| ---: | ---: | ---: | +| 512 | +35.6% | +0.1% | +| 1024 | +42.4% | +0.6% | +| 2048 | +34.6% | +0.4% | +| 4096 | +30.0% | +0.2% | +| 8192 | +23.5% | -0.3% | +| 16384 | +18.9% | -0.1% | +| 32768 | +18.8% | -0.3% | +| 65536 | +15.7% | -0.3% | + +The local-run index now sees four persistent Metal Tensor chart runs and keeps +them beside candidate gates, drift gates, comparator probes, and stage +profiles. 
+ +Re-audited the current MoE dispatch path before starting another kernel probe: + +- `ds4_gpu_routed_moe_batch_tensor()` already builds one expert-major route map + and reuses it for gate, up, and down; +- the map stage is not the measured bottleneck in the routed-MoE stage + profiles; +- the final `kernel_mul_mm_id` writeback is a real scatter through `hids`, not + a dense store that can be replaced safely with a one-line `simdgroup_store`; +- already-rejected probes cover paired gate/up, `tiidx` writeback, direct + down-sum, N64/tok64/row-pair dense Q8, F16 RHS, FlashAttention setup knobs, + and route-window expansion. + +Conclusion: the current default remains the production baseline because it has +the best confirmed low-drift envelope from the five-fixture gate. The next +prefill optimization should not be another env-only screen. It should be a +default-off kernel-family prototype, with routed MoE as the highest-value target +and dense Q8 as the secondary target: + +1. Preserve the legacy simdgroup-MMA arithmetic/writeback order first. +2. Reduce real staging/writeback cost instead of just widening the existing + cooperative-Tensor window. +3. Prove local comparator tightness on the touched route before speed gating. +4. Run `run_prefill_candidate_gate.py` speed-only first, then the five-fixture + drift gate only after the speed floor passes. + +## Rejected Routed-MoE Up-SwiGLU Fusion + +Tried a bounded default-off routed-MoE prefill prototype that fused the legacy +`moe_up` grouped matmul with the SwiGLU/route-weight write into the `mid` +buffer. The idea was to keep the legacy simdgroup-MMA arithmetic for the up +projection while avoiding the up scratch write/read and separate activation +dispatch. 
+ +Initial speed artifact: + +- `speed-bench/local-runs/20260515-085820-moe-prefill-up-swiglu/prefill-candidate-summary.md` + +The speed-only part was promising versus the then-current Tensor baseline: + +| Context | Candidate prefill vs Tensor | Candidate generation vs Tensor | +| ---: | ---: | ---: | +| 512 | +6.7% | -0.1% | +| 1024 | +37.7% | +0.5% | +| 2048 | +23.7% | +0.4% | +| 4096 | +14.3% | +0.0% | +| 8192 | +12.6% | +0.1% | + +The first drift scorecard for that artifact was invalid because the helper had +rebuilt `ds4-bench` for the speed path but the drift gate used a stale `ds4` +binary. After rebuilding `ds4`/`ds4_test`, `./ds4_test --metal-mpp-equivalence` +with `DS4_METAL_MOE_PREFILL_UP_SWIGLU=1` failed hard on the long fixtures: + +| Fixture | Same top1 | Top20 | RMS | Top20 abs | Greedy | +| --- | --- | ---: | ---: | ---: | --- | +| `long_memory_archive` | no | 12/20 | 1.80489 | 6.19391 | diff@0 | +| `long_code_audit` | no | 11/20 | 1.95671 | 4.80762 | diff@0 | + +Setting `DS4_METAL_MOE_MID_F32=1` did not change the failure shape, so this is +not just the F16 mid storage path. The fused kernel/prototype was removed rather +than kept as another broken env mode. + +Tooling fix from this miss: + +- `run_quality_drift_gate.py` now refuses to run against a stale `ds4` binary + when core sources or `metal/*.metal` are newer than the binary. +- `run_prefill_candidate_gate.py` now does the same for `ds4-bench` and passes + the guard through to nested quality drift gates. +- `run_chunked_prefill_drift_gate.py` now applies the same stale-`ds4-bench` + guard for standalone resumed-frontier coverage runs. +- `run_metal_tensor_bench.sh` now applies the same stale-`ds4-bench` guard for + persistent standard/quality/Tensor chart regeneration. +- `run_mpp_compare_probe.py` now applies the same stale-`ds4` guard for local + comparator probes. +- `--allow-stale-binary` exists only for intentional old-artifact summaries. 
+ +Fresh restored-baseline artifacts: + +- `speed-bench/local-runs/20260515-091751-current-default-quality-drift-gate/summary.md` + +The fresh no-env five-fixture gate is back to the known-good default envelope: +Tensor-vs-standard has top1 mismatches `0`, greedy mismatches `0`, min top20 +`19/20`, worst RMS `0.239946`, and worst top20 abs `0.55422`. + +## Rejected Narrow Gate/Up Route Windows + +Screened the narrower routed-MoE gate/up Tensor window that was still adjacent +to the rejected `0-3` and `0-5` sweeps: + +```sh +python3 speed-bench/run_prefill_candidate_gate.py \ + --candidate-label mpp-gateup0-1-down12 \ + --set-env DS4_METAL_MPP_MOE_GATE_START_LAYER=0 \ + --set-env DS4_METAL_MPP_MOE_GATE_FILTER=layer=0-1,layer=15-42 \ + --set-env DS4_METAL_MPP_MOE_UP_START_LAYER=0 \ + --set-env DS4_METAL_MPP_MOE_UP_FILTER=layer=0-1,layer=15-42 \ + --no-fail +``` + +Artifact: + +- `speed-bench/local-runs/20260515-093425-mpp-gateup0-1-down12/prefill-candidate-summary.md` + +Two-repeat median speed versus the current Tensor default: + +| Context | Candidate prefill vs Tensor | Candidate generation vs Tensor | +| ---: | ---: | ---: | +| 512 | -0.4% | -0.6% | +| 1024 | -0.2% | -0.4% | +| 2048 | -0.7% | -0.2% | +| 4096 | +0.6% | -0.3% | +| 8192 | +2.2% | -0.1% | + +The repeat-level floor also failed with min repeat prefill `-3.6%`. Reject +before drift gate: a two-layer early gate/up expansion only helps larger compact +contexts and still regresses the short/mid contexts. 
+ +Then screened the remaining `0-2` gap: + +```sh +python3 speed-bench/run_prefill_candidate_gate.py \ + --candidate-label mpp-gateup0-2-down12 \ + --set-env DS4_METAL_MPP_MOE_GATE_START_LAYER=0 \ + --set-env DS4_METAL_MPP_MOE_GATE_FILTER=layer=0-2,layer=15-42 \ + --set-env DS4_METAL_MPP_MOE_UP_START_LAYER=0 \ + --set-env DS4_METAL_MPP_MOE_UP_FILTER=layer=0-2,layer=15-42 \ + --no-fail +``` + +Artifact: + +- `speed-bench/local-runs/20260515-093802-mpp-gateup0-2-down12/prefill-candidate-summary.md` + +Two-repeat median speed versus the current Tensor default: + +| Context | Candidate prefill vs Tensor | Candidate generation vs Tensor | +| ---: | ---: | ---: | +| 512 | +2.2% | -0.0% | +| 1024 | +3.1% | +2.3% | +| 2048 | +2.0% | +0.4% | +| 4096 | +0.0% | -0.2% | +| 8192 | -0.7% | -0.1% | + +The repeat-level floor failed with min repeat prefill `-2.0%`. Reject before +drift gate: it improves the short/mid contexts but gives back the 8192 point and +is not repeat-stable at 4096 or 8192. This closes the narrow route-window gap +between the failed `0-1`, repeat-unstable `0-3`, and slower `0-5` screens; route +window expansion remains exhausted. + +## Rejected Routed-MoE X-F16 Prepack Probe + +Tried a local default-off prototype, `DS4_METAL_MOE_PREFILL_X_F16=1`, that +prepacked the routed-MoE input activation to half once per layer and fed the +existing F16-RHS routed matmul variants for gate/up. The goal was to avoid +restaging the same F32 input as half separately in both gate and up matmuls +without changing the default path. 
+ +Artifact: + +- `speed-bench/local-runs/20260515-094520-moe-prefill-x-f16/prefill-candidate-summary.md` + +Two-repeat median speed versus the current Tensor default: + +| Context | Candidate prefill vs Tensor | Candidate generation vs Tensor | +| ---: | ---: | ---: | +| 512 | -2.9% | +0.1% | +| 1024 | +0.2% | -0.4% | +| 2048 | +0.2% | +0.1% | +| 4096 | +0.5% | -0.2% | +| 8192 | +2.5% | -0.9% | + +The repeat-level floor failed with min repeat prefill `-8.0%`, so the +five-fixture drift gate was not run. The copy/prepack cost is too high at short +contexts and too noisy through the compact gate. The prototype code was removed +rather than kept as another non-promotable environment mode. + +Fresh restored-baseline check after removing the prototype: + +- `speed-bench/local-runs/20260515-095024-current-default-quality-drift-gate/summary.md` + +The no-env five-fixture gate passed. Tensor-vs-standard had top1 mismatches +`0`, greedy mismatches `0`, min top20 `19/20`, worst RMS `0.239946`, and worst +top20 abs `0.55422`, matching the known current-default envelope. + +## Current-Default Residual `moe_down` Comparator + +Ran a current-default local comparator on the `long_memory_archive` fixture to +attribute the remaining conservative Tensor-vs-standard movement before trying +another kernel candidate: + +```sh +python3 speed-bench/run_mpp_compare_probe.py \ + --route moe_gate,moe_up,moe_down \ + --case long_memory_archive \ + --compare-max 120 \ + --continue-after-breach \ + --verbose +``` + +Artifact: + +- `speed-bench/local-runs/20260515-095750-manual-mpp-compare-probe/mpp-compare-summary.md` + +The current default still has clean local `moe_gate` and `moe_up` comparisons +under the `max_abs <= 0.001` target. All target breaches came from `moe_down`, +mostly in late layers. 
The worst local delta was `layer=42` with max abs +`0.0166016` and RMS `8.91692e-06`; the other breaches were layers `26`, `29`, +`30`, `31`, `32`, `33`, `34`, `35`, `36`, `37`, `38`, `39`, and `40`. + +Repeated the same current-default comparator on `long_code_audit`, the fixture +responsible for the current default's worst Tensor-vs-standard RMS in the +five-fixture gate: + +- `speed-bench/local-runs/20260515-100424-manual-mpp-compare-probe/mpp-compare-summary.md` + +The result matched `long_memory_archive`: 87 comparisons, the same 14 local +`moe_down` breaches, no `moe_gate`/`moe_up` target breach, and the same worst +layer-42 max abs `0.0166016` with RMS `8.37744e-06`. + +Tried a local default-off implementation probe, +`DS4_METAL_MPP_MOE_DOWN_FAST_LAYOUT=0`, that disabled the first-PR fast MPP +layout only for `moe_down` while leaving gate/up on the current fast layout. +This was meant to test whether the late `moe_down` residual drift came from the +fast-layout staging/writeback instead of the cooperative Tensor matmul itself. + +Artifact: + +- `speed-bench/local-runs/20260515-100727-manual-mpp-compare-probe/mpp-compare-summary.md` + +The comparator result was unchanged from the current default on +`long_code_audit`: 31 `moe_down` comparisons, the same 14 target breaches, and +the same worst layer-42 max abs `0.0166016` with RMS `8.37744e-06`. Reject and +remove the hook before speed/drift gates. The remaining `moe_down` movement is +not fixed by swapping the MPP fast layout for the generic MPP layout; it needs a +new arithmetic path, not a layout selector. + +That suggested the only simple drift mitigation left for the promoted default +would be narrowing `moe_down` to the locally clean early range. 
Screened that +candidate without the drift gate: + +```sh +python3 speed-bench/run_prefill_candidate_gate.py \ + --out-dir speed-bench/local-runs/20260515-095930-current-down12-25 \ + --candidate-label current-down12-25 \ + --set-env DS4_METAL_MPP_MOE_DOWN_FILTER=layer=12-25 \ + --no-fail +``` + +Artifact: + +- `speed-bench/local-runs/20260515-095930-current-down12-25/prefill-candidate-summary.md` + +Two-repeat median speed versus the current Tensor default: + +| Context | Candidate prefill vs Tensor | Candidate generation vs Tensor | +| ---: | ---: | ---: | +| 512 | -4.9% | -0.0% | +| 1024 | -3.8% | +0.4% | +| 2048 | -2.6% | +1.5% | +| 4096 | -1.5% | +0.8% | +| 8192 | -3.1% | -1.1% | + +The repeat-level floor also failed with min repeat prefill `-6.5%`. Reject +before drift gate: the current conservative default's residual local +`moe_down` movement is real, but disabling the late down Tensor layers gives up +too much prefill throughput. Do not spend more route-filter time on cleaning +current-default `moe_down` drift unless a new down kernel preserves the speed of +the late Tensor route. 
+ +Refreshed local run index after these artifacts: + +- `speed-bench/local-runs/20260515-100856-local-run-index/local-run-index.md` + +## Rejected Strict `mpp-fast` Route Window Recheck + +Reran the earlier `mpp-fast` gate/up/down route-window candidate against the +current branch after the later drift and cleanup work, using the strict +repeat-floor candidate gate: + +```sh +python3 speed-bench/run_prefill_candidate_gate.py \ + --out-dir speed-bench/local-runs/20260515-101058-mpp-fast-gate0-up15-down12-current-strict \ + --candidate-label mpp-fast-gate0-up15-down12-current-strict \ + --set-env DS4_METAL_MPP_FAST=1 \ + --set-env DS4_METAL_MPP_MOE_UP_START_LAYER=15 \ + --set-env DS4_METAL_MPP_MOE_DOWN_START_LAYER=12 \ + --run-drift-gate \ + --no-fail +``` + +Artifact: + +- `speed-bench/local-runs/20260515-101058-mpp-fast-gate0-up15-down12-current-strict/prefill-candidate-summary.md` + +Two-repeat median speed versus the current Tensor default: + +| Context | Candidate prefill vs Tensor | Candidate generation vs Tensor | +| ---: | ---: | ---: | +| 512 | +3.6% | -0.3% | +| 1024 | +1.8% | -0.2% | +| 2048 | +2.5% | -0.1% | +| 4096 | +3.7% | -0.4% | +| 8192 | +4.4% | +0.3% | + +Reject before drift gate. The median profile is useful, but the repeat-level +prefill floor failed with min repeat `-0.1%` at 1024 tokens, so it is not +promotion-stable under the strict gate. This keeps the current conservative +default as the baseline and leaves future work focused on a new routed-MoE +arithmetic path rather than more environment-only route-window tuning. 
+ +Refreshed local run index after this artifact: + +- `speed-bench/local-runs/20260515-101358-local-run-index/local-run-index.md` + +## Rejected Current-Default Gate/Up Layer-16 Contraction + +Closed the one remaining small route-window gap around the current conservative +default by moving only gate/up from layer 15 to layer 16 while leaving down at +layer 12: + +```sh +python3 speed-bench/run_prefill_candidate_gate.py \ + --out-dir speed-bench/local-runs/20260515-101837-mpp-gateup16-down12-current-strict \ + --candidate-label mpp-gateup16-down12-current-strict \ + --set-env DS4_METAL_MPP_MOE_GATE_START_LAYER=16 \ + --set-env DS4_METAL_MPP_MOE_UP_START_LAYER=16 \ + --set-env DS4_METAL_MPP_MOE_DOWN_START_LAYER=12 \ + --run-drift-gate \ + --no-fail +``` + +Artifact: + +- `speed-bench/local-runs/20260515-101837-mpp-gateup16-down12-current-strict/prefill-candidate-summary.md` + +Two-repeat median speed versus the current Tensor default: + +| Context | Candidate prefill vs Tensor | Candidate generation vs Tensor | +| ---: | ---: | ---: | +| 512 | -2.6% | -0.2% | +| 1024 | -1.9% | -0.8% | +| 2048 | -1.7% | +0.1% | +| 4096 | -0.5% | -0.5% | +| 8192 | +1.0% | -0.4% | + +Reject before drift gate. The contraction fails both the median prefill floor +and repeat-level floor, with min median prefill `-2.6%` and min repeat prefill +`-4.7%`. This confirms the current layer-15 gate/up window is still the better +production baseline; the next useful improvement remains a new default-off +routed-MoE arithmetic path rather than shifting the conservative route window. + +Refreshed local run index after this artifact: + +- `speed-bench/local-runs/20260515-102142-local-run-index/local-run-index.md` + +## Rejected MoE `sum6` Vec4 Probe + +Tried a local default-off probe, `DS4_METAL_MOE_SUM6_VEC4=1`, that replaced the +six-expert post-down summation kernel with a `float4` vectorized load/add/store +variant when `out_dim`, offsets, and strides were 16-byte aligned. 
This kept the +same expert summation order and did not change the grouped down matmul. + +Artifact: + +- `speed-bench/local-runs/20260515-102448-moe-sum6-vec4/prefill-candidate-summary.md` + +Two-repeat median speed versus the current Tensor default: + +| Context | Candidate prefill vs Tensor | Candidate generation vs Tensor | +| ---: | ---: | ---: | +| 512 | -2.2% | +0.1% | +| 1024 | -1.5% | -0.1% | +| 2048 | -2.0% | -0.2% | +| 4096 | -1.1% | -0.0% | +| 8192 | +1.6% | +0.1% | + +Reject before drift gate. The median prefill floor failed with min `-2.2%`, +and the repeat-level floor failed with min repeat `-5.3%`. The temporary +kernel and environment hook were removed after the screen. The existing scalar +`sum6` kernel remains the baseline; optimizing the sum stage alone is not a +useful compact prefill path unless a future design also changes the down/sum +dataflow without losing expert-major matmul throughput. + +Refreshed local run index after this artifact: + +- `speed-bench/local-runs/20260515-102819-local-run-index/local-run-index.md` + +## Rejected Strict MoE `sum6` Disable Recheck + +Reran the older `DS4_METAL_MOE_SUM6_DISABLE=1` control through the current +strict two-repeat candidate gate. The earlier one-off control had shown a +small-context median gain, so this recheck tests whether that survives the +repeat-floor rule used for promotion. + +Artifact: + +- `speed-bench/local-runs/20260515-103032-disable-moe-sum6-current-strict/prefill-candidate-summary.md` + +Two-repeat median speed versus the current Tensor default: + +| Context | Candidate prefill vs Tensor | Candidate generation vs Tensor | +| ---: | ---: | ---: | +| 512 | -1.6% | +0.2% | +| 1024 | -2.0% | -0.3% | +| 2048 | -1.8% | -0.1% | +| 4096 | -2.0% | -1.0% | +| 8192 | +0.3% | +0.1% | + +Reject before drift gate. The median prefill floor failed with min `-2.0%`, +and the repeat-level floor failed with min repeat `-5.3%`. 
Together with the +rejected vec4 probe, this closes the current `sum6` stage as a standalone +prefill optimization target. A future down/sum direction needs a different +dataflow, not another replacement for the final summation kernel. + +Refreshed local run index after this artifact: + +- `speed-bench/local-runs/20260515-103339-local-run-index/local-run-index.md` + +## Current FlashAttention Stage Profile Refresh + +Reran the isolated static-mixed FlashAttention stage profiler on the current +branch after the routed-MoE and `sum6` cleanup work. This was a profile-only +baseline, not a production candidate. + +Command: + +```sh +env DS4_METAL_FLASH_ATTN_STAGE_PROFILE=1 \ + DS4_METAL_FLASH_ATTN_STAGE_PROFILE_FILTER=static_mixed \ + ./ds4-bench -mt auto \ + --prompt-file speed-bench/promessi_sposi.txt \ + --ctx-start 2048 --ctx-max 2048 --gen-tokens 1 \ + --csv speed-bench/local-runs/20260515-103653-current-flash-attn-stage-profile-2048/bench.csv +``` + +Artifacts: + +- `speed-bench/local-runs/20260515-103653-current-flash-attn-stage-profile-2048/bench.csv` +- `speed-bench/local-runs/20260515-103653-current-flash-attn-stage-profile-2048/stage-profile-summary.md` +- `speed-bench/local-runs/20260515-103653-current-flash-attn-stage-profile-2048/stage-profile-summary.json` + +The measured 2048-token throughput was `471.50` prefill t/s and `35.92` +generation t/s. 
Parsed FlashAttention profile time was `506.613 ms` across +`225` events: + +| Stage | total ms | events | share | +| --- | ---: | ---: | ---: | +| `flash_attn.static_mixed_nonvec.attention` | 425.729 | 41 | 84.0% | +| `flash_attn.static_mixed_nonvec.mask_fill` | 46.790 | 41 | 9.2% | +| `flash_attn.static_mixed_nonvec.block_map` | 10.250 | 41 | 2.0% | +| `flash_attn.static_mixed_nonvec.copy_raw` | 9.164 | 41 | 1.8% | +| `flash_attn.static_mixed_nonvec.copy_comp` | 8.179 | 41 | 1.6% | +| `flash_attn.static_mixed_nonvec.pad` | 6.501 | 20 | 1.3% | + +Shape split: + +| Shape | total ms | events | +| --- | ---: | ---: | +| `tokens=2048 comp=512 keys=2560 ratio=4` | 316.188 | 105 | +| `tokens=2048 comp=16 keys=2064 ratio=128` | 190.425 | 120 | + +Conclusion: the current branch still matches the earlier FlashAttention triage. +The isolated attention kernel body dominates the FlashAttention slice, while +the full current `promessi_sposi` stage profile shows that slice is only a +secondary whole-model prefill target (`0.7%` parsed stage share for +`flash_attn.static_mixed_nonvec.attention`). Keep FlashAttention deprioritized +unless the next pass is a true static-mixed-specific kernel family with local +head-output comparison; do not repeat the already rejected setup/mask/tile +knobs. + +Refreshed local run index after this artifact: + +- `speed-bench/local-runs/20260515-103729-local-run-index/local-run-index.md` + +## Rejected Current-Default F32-Mid `moe_down` Comparator Check + +Ran a current-default `moe_down` local comparator with +`DS4_METAL_MOE_MID_F32=1` on `long_code_audit` to check whether the residual +late-layer `moe_down` movement came from the F16 routed-MoE intermediate rather +than the Tensor matmul route. 
+ +Command: + +```sh +python3 speed-bench/run_mpp_compare_probe.py \ + --out-dir speed-bench/local-runs/20260515-103935-current-mid-f32-moe-down-compare \ + --route moe_down \ + --case long_code_audit \ + --compare-max 120 \ + --continue-after-breach \ + --verbose \ + --set-env DS4_METAL_MOE_MID_F32=1 +``` + +Artifact: + +- `speed-bench/local-runs/20260515-103935-current-mid-f32-moe-down-compare/mpp-compare-summary.md` + +Result: unchanged from the no-env current-default comparator. The probe parsed +`31` `moe_down` comparisons and found the same `14` target breaches. Worst +delta remained layer 42 with max abs `0.0166016` and RMS `8.37744e-06`. + +Conclusion: reject before speed or five-fixture drift gates. Keeping the MoE +intermediate in F32 does not clean up the current default's local `moe_down` +movement, so the remaining residual is still in the routed Tensor matmul +arithmetic path rather than the F16 mid buffer. + +## Attention-Output Stage Profiler Boundary Fix + +Tried a focused attention-output stage profile to split the promoted +attention-output route into its low projection and final Q8 output projection: + +- initial artifact: + `speed-bench/local-runs/20260515-104057-current-attn-out-stage-profile-2048/stage-profile-summary.md` + +The first run exposed a profiler issue rather than a kernel result: +`attn_output.low_proj` reported `3778.693 ms` total (`87.877 ms` per layer), +which was inconsistent with the full-model profile. The attention-output +profiler did not flush the pending command buffer at function entry, so the +first `low_proj` timing in each layer included upstream queued work. + +Patch: make `DS4_METAL_ATTN_OUT_STAGE_PROFILE=1` follow the MoE and +FlashAttention profiler pattern by ending the current batch and starting a new +command buffer before starting the first attention-output stage timer. This is +profiling-only code; normal inference is unchanged unless the profile env is +set. 
+ +Validation: + +```sh +make ds4-bench ds4_test ds4 +``` + +Fixed-profile artifact: + +- `speed-bench/local-runs/20260515-104146-current-attn-out-stage-profile-2048/stage-profile-summary.md` + +Fixed 2048-token profile: + +| Stage | total ms | events | avg ms | share | +| --- | ---: | ---: | ---: | ---: | +| `attn_output.out_proj` | 441.999 | 43 | 10.279 | 41.2% | +| `q8.attn_out` | 436.981 | 43 | 10.162 | 40.7% | +| `attn_output.low_proj` | 195.033 | 43 | 4.536 | 18.2% | + +Conclusion: the promoted attention-output low projection is no longer the +dominant target in this route. The remaining secondary hotspot is the final +generic Q8 `attn_out` output projection. That keeps dense Q8 as the secondary +kernel-family target, but the already rejected Q8 tile/direct-RHS/row-pair +probes still apply; a future attempt needs a genuinely new out-projection Q8 +kernel design, not another host-side profiler or tile switch. + +Refreshed local run index after these artifacts: + +- `speed-bench/local-runs/20260515-104232-local-run-index/local-run-index.md` + +## Current Default Drift Gate After Profiler Fix + +Reran the no-env five-fixture quality drift gate after the +attention-output profiler boundary fix and rebuild. The profiler fix is gated +behind `DS4_METAL_ATTN_OUT_STAGE_PROFILE`, but this refresh keeps the branch +evidence current after touching `ds4_metal.m`. + +Command: + +```sh +python3 speed-bench/run_quality_drift_gate.py \ + --no-fail \ + --out-dir speed-bench/local-runs/20260515-104329-current-default-quality-drift-gate +``` + +Artifacts: + +- `speed-bench/local-runs/20260515-104329-current-default-quality-drift-gate/summary.md` +- `speed-bench/local-runs/20260515-104329-current-default-quality-drift-gate/summary.json` + +Gate result: `OK`. 
+ +| Pair | Top1 mismatches | Greedy mismatches | Min top20 | Worst RMS | Worst top20 abs | +| --- | ---: | ---: | ---: | ---: | ---: | +| `standard_vs_quality` | 0 | 1 | 18/20 | 0.618172 | 2.24006 | +| `tensor_vs_quality` | 0 | 1 | 18/20 | 0.618172 | 2.24006 | +| `tensor_vs_standard` | 0 | 0 | 19/20 | 0.239946 | 0.55422 | + +Conclusion: current default Tensor remains in the established low-drift +envelope after the profiler-only code change. + +Refreshed local run index after this artifact: + +- `speed-bench/local-runs/20260515-104628-local-run-index/local-run-index.md` + +## Routed-MoE Down/Sum Follow-Up Boundary + +This section records a follow-up code inspection done after the +current-default `moe_down` comparator checks and the attention-output profiler +fix. It does not reopen the older +rejected `DS4_METAL_MOE_PREFILL_DIRECT_DOWN_SUM=1` prototype; that artifact +was already strongly negative: + +- `speed-bench/local-runs/20260515-044921-moe-prefill-direct-down-sum/prefill-candidate-summary.md` + (`-19.7%`, `-20.1%`, `-29.6%` prefill at 512/1024/2048 vs Tensor). + +Relevant current path shape: + +- `kernel_mul_mm_id_map0` builds an expert-major token map (`htpe`/`hids`) so + each routed matmul tile reuses one expert's weight rows across the tokens + routed to that expert. +- `kernel_mul_mm_id` then writes each selected expert result into the + token-major expert slot layout, and `kernel_dsv4_moe_sum6_f32` performs the + final six-expert reduction. +- The measured `sum` stage is small compared with the matmuls + (`~0.5-1.1 ms/layer` in the 2048/3844-token profiles), while `moe_down` + itself is still one of the dominant stages. + +Conclusion: the naive direct token-major down/sum kernel direction is closed. +Such a kernel loops over +six experts inside each output tile, removes useful expert-parallel work, and +attacks a small standalone sum cost while losing the grouped prefill matmul. +The next routed-MoE candidate should instead keep the expert-major map and +either: + +1. 
introduce a reference-compatible early-window matmul variant that reduces + staging/pointer overhead while preserving the legacy simdgroup-MMA arithmetic + order, or +2. design a down/sum fused kernel that still dispatches expert-major work and + only changes the final accumulation dataflow after a local `moe_down` + comparator proves it is tight. + +Acceptance remains unchanged: default-off env hook, local route comparator, +speed-only compact gate, then the five-fixture drift gate. + +## Rejected Routed-MoE `ne20=6` Legacy Specialization + +Tried a local default-off prototype, `DS4_METAL_MOE_NE20_6=1`, that +compile-time-specialized the legacy routed-MoE `kernel_mul_mm_id` path for the +DS4 fixed six selected experts. The prototype preserved the existing legacy +simdgroup-MMA arithmetic path and only replaced runtime `args.ne20` division and +modulo with a template constant for the early non-MPP routed-MoE matmuls. + +Local comparator smoke: + +- `speed-bench/local-runs/20260515-151302-moe-ne20-6-compare-long-code/mpp-compare-summary.md` + +The comparator parsed `129` route comparisons on `long_code_audit`. `moe_gate` +and `moe_up` stayed under target. The only breaches were the already-known late +`moe_down` Tensor residuals, with the same worst layer-42 max abs `0.0166016` +and RMS `8.37744e-06`. + +Speed artifact: + +- `speed-bench/local-runs/20260515-151422-moe-ne20-6/prefill-candidate-summary.md` + +Two-repeat median speed versus the current Tensor default: + +| Context | Candidate prefill vs Tensor | Candidate generation vs Tensor | +| ---: | ---: | ---: | +| 512 | +1.1% | +0.1% | +| 1024 | +2.2% | -0.1% | +| 2048 | +1.7% | -1.4% | +| 4096 | +0.0% | -1.0% | +| 8192 | +1.4% | -0.1% | + +Reject before drift gate. The median line is mildly positive, but the strict +repeat floor failed with min repeat prefill `-4.0%` and min repeat generation +`-2.6%`. This is too small and noisy to keep as another default-off production +path. 
The prototype code was removed after the screen. + +Refreshed local run index after this artifact: + +- `speed-bench/local-runs/20260515-152039-local-run-index/local-run-index.md` + +## Rejected Narrow Continuation-Chunk Early MoE Window + +Screened a narrower version of the earlier continuation-chunk idea using the +existing `module@layer` filter syntax. This kept the current conservative +`pos=0` defaults, then added only routed-MoE layers `0..3` on resumed +frontiers `512`, `1024`, `2048`, and `4096`: + +```sh +python3 speed-bench/run_prefill_candidate_gate.py \ + --out-dir speed-bench/local-runs/20260515-152507-mpp-cont-gud0-3 \ + --candidate-label mpp-cont-gud0-3 \ + --set-env DS4_METAL_MPP_FAST=1 \ + --set-env 'DS4_METAL_MPP_MOE_GATE_FILTER=layer=15-42,pos=512 routed_moe@layer=0-3,pos=1024 routed_moe@layer=0-3,pos=2048 routed_moe@layer=0-3,pos=4096 routed_moe@layer=0-3' \ + --set-env 'DS4_METAL_MPP_MOE_UP_FILTER=layer=15-42,pos=512 routed_moe@layer=0-3,pos=1024 routed_moe@layer=0-3,pos=2048 routed_moe@layer=0-3,pos=4096 routed_moe@layer=0-3' \ + --set-env 'DS4_METAL_MPP_MOE_DOWN_FILTER=layer=12-42,pos=512 routed_moe@layer=0-3,pos=1024 routed_moe@layer=0-3,pos=2048 routed_moe@layer=0-3,pos=4096 routed_moe@layer=0-3' \ + --run-drift-gate \ + --no-fail +``` + +Artifact: + +- `speed-bench/local-runs/20260515-152507-mpp-cont-gud0-3/prefill-candidate-summary.md` + +Two-repeat median speed versus the current Tensor default: + +| Context | Candidate prefill vs Tensor | Candidate generation vs Tensor | +| ---: | ---: | ---: | +| 512 | -1.7% | +0.3% | +| 1024 | +2.4% | -0.3% | +| 2048 | +0.4% | -0.4% | +| 4096 | +1.5% | -0.3% | +| 8192 | +1.9% | -0.6% | + +Reject before drift gate. The median line was weakly positive after the first +frontier, but the strict speed screen failed with min median prefill `-1.7%` +and min repeat prefill `-5.8%`. This makes the narrow continuation route too +noisy to pursue into chunked drift coverage. 
+ +Refreshed local run index after this artifact: + +- `speed-bench/local-runs/20260515-152840-local-run-index/local-run-index.md` + +## Rejected Dense Q8 Half-Dequant Probe + +Tried a local default-off prototype, `DS4_METAL_Q8_HALF_DEQUANT=1`, that kept +the existing dense Q8 prefill tile shape but dequantized the packed Q8 blocks +through `half` values instead of the existing float temporary path. + +Local comparator smokes: + +- `speed-bench/local-runs/20260515-153048-q8-half-dequant-compare/mpp-compare-summary.md` +- `speed-bench/local-runs/20260515-153048-q8-half-dequant-compare-attn-out/mpp-compare-summary.md` + +Both comparator smokes parsed `3` Q8 comparisons and found exact zero deltas +for their filtered early-layer checks: + +- `attn_q_b`: worst max abs `0`, RMS `0` +- `attn_out`: worst max abs `0`, RMS `0` + +Speed artifact: + +- `speed-bench/local-runs/20260515-153122-q8-half-dequant/prefill-candidate-summary.md` + +Two-repeat median speed versus the current Tensor default: + +| Context | Candidate prefill vs Tensor | Candidate generation vs Tensor | +| ---: | ---: | ---: | +| 512 | -5.6% | -2.1% | +| 1024 | -9.0% | -4.2% | +| 2048 | -6.8% | -2.3% | +| 4096 | -4.4% | +0.1% | +| 8192 | -0.2% | +0.1% | + +Reject before drift gate. The local comparator was exact on the two smoke +routes, but the speed screen failed badly: min median prefill was `-9.0%` and +min repeat prefill was `-13.5%`. The prototype code was removed after the +screen. 
+ +## Refreshed Persistent Metal Tensor Bench Chart + +Regenerated the current branch Standard Metal / Quality Metal / Tensor Metal +chart using: + +```sh +OPEN_CHART=0 speed-bench/run_metal_tensor_bench.sh +``` + +Artifacts: + +- `speed-bench/local-runs/20260515-153948-metal-tensor-bench/20260515-153948_gen128_ds4_bench_quality.csv` +- `speed-bench/local-runs/20260515-153948-metal-tensor-bench/20260515-153948_gen128_ds4_bench_standard_metal.csv` +- `speed-bench/local-runs/20260515-153948-metal-tensor-bench/20260515-153948_gen128_ds4_bench_tensor_metal.csv` +- `speed-bench/local-runs/20260515-153948-metal-tensor-bench/20260515-153948_gen128_ds4_bench_standard_quality_tensor.png` + +The artifacts live under `speed-bench/local-runs/`, which is ignored by +`speed-bench/.gitignore`, so repeated timestamped charts stay local. + +| Context | Tensor prefill vs Standard | Tensor generation vs Standard | Quality prefill vs Standard | +| ---: | ---: | ---: | ---: | +| 512 | +34.6% | +1.5% | +3.9% | +| 1024 | +36.3% | +1.9% | +17.8% | +| 2048 | +31.0% | +2.4% | +12.1% | +| 4096 | +26.7% | +2.2% | +10.8% | +| 8192 | +25.0% | +1.9% | +5.7% | +| 16384 | +22.8% | +0.3% | -9.4% | +| 32768 | +19.3% | -0.0% | -3.7% | +| 65536 | +14.9% | -1.4% | -6.3% | + +Current persistent chart summary: Tensor prefill remains ahead of Standard by +`+14.9%..+36.3%`; Tensor generation is roughly flat at `-1.4%..+2.4%`. + +Refreshed local run index after these artifacts: + +- `speed-bench/local-runs/20260515-155451-local-run-index/local-run-index.md` + +## Current Default Drift Refresh After Chart Persistence + +Reran the no-env five-fixture quality drift gate after the benchmark chart +script started writing timestamped artifacts under ignored `speed-bench/local-runs/`. 
+The first sandboxed attempt could not access the Metal device; the same command +was rerun with local Metal access: + +```sh +python3 speed-bench/run_quality_drift_gate.py \ + --no-fail \ + --out-dir speed-bench/local-runs/20260515-171007-current-default-quality-drift-refresh +``` + +Artifacts: + +- `speed-bench/local-runs/20260515-171007-current-default-quality-drift-refresh/summary.md` +- `speed-bench/local-runs/20260515-171007-current-default-quality-drift-refresh/summary.json` + +Gate result: `OK`. + +| Pair | Top1 mismatches | Greedy mismatches | Min top20 | Worst RMS | Worst top20 abs | +| --- | ---: | ---: | ---: | ---: | ---: | +| `standard_vs_quality` | 0 | 1 | 18/20 | 0.618172 | 2.24006 | +| `tensor_vs_quality` | 0 | 1 | 18/20 | 0.618172 | 2.24006 | +| `tensor_vs_standard` | 0 | 0 | 19/20 | 0.239946 | 0.55422 | + +Conclusion: the current default Tensor route still matches the established +low-drift envelope while keeping the persistent benchmark artifacts local. + +Refreshed local run index after this artifact: + +- `speed-bench/local-runs/20260515-171500-local-run-index/local-run-index.md` + +## AIME25 Eval Check + +User-reported AIME25 eval result on the current baseline using the +`q2-imatrix` model: + +| Mode | AIME25 score | +| --- | ---: | +| Standard Metal (`q2-imatrix`) | 86.7% | +| Tensor Metal (`q2-imatrix`) | 86.7% | + +Conclusion: the current Tensor Metal baseline is quality-neutral on this eval +relative to Standard Metal, while retaining the measured prefill speed gain and +the clean five-fixture drift gate above. 
+ +## Current 8192-Context Stage Profile Refresh + +Reran a focused current-default profile on the bench prompt at the 8192 context +row with layer, routed-MoE, Q8, FlashAttention, and attention-output stage +profiling enabled: + +```sh +env DS4_METAL_LAYER_STAGE_PROFILE=1 \ + DS4_METAL_MOE_STAGE_PROFILE=1 \ + DS4_METAL_Q8_PREFILL_PROFILE=1 \ + DS4_METAL_FLASH_ATTN_STAGE_PROFILE=1 \ + DS4_METAL_ATTN_OUT_STAGE_PROFILE=1 \ + ./ds4-bench \ + --prompt-file speed-bench/promessi_sposi.txt \ + --ctx-start 8192 \ + --ctx-max 8192 \ + --gen-tokens 16 \ + --csv speed-bench/local-runs/20260515-155652-current-ctx8192-stage-profile/bench.csv +``` + +Artifacts: + +- `speed-bench/local-runs/20260515-155652-current-ctx8192-stage-profile/bench.csv` +- `speed-bench/local-runs/20260515-155652-current-ctx8192-stage-profile/profile.stderr` +- `speed-bench/local-runs/20260515-155652-current-ctx8192-stage-profile/stage-profile-summary.md` +- `speed-bench/local-runs/20260515-155652-current-ctx8192-stage-profile/stage-profile-summary.json` + +The profiled row measured `428.85` prefill tokens/s and `32.69` generation +tokens/s for the single 8192-context run. 
Parsed profile highlights: + +| Stage | total ms | share | +| --- | ---: | ---: | +| `ffn.routed_moe` | 5802.228 | 17.7% | +| `attn.attention` | 4358.051 | 13.3% | +| `attn.output_proj` | 2468.958 | 7.5% | +| `attn.q_path` | 2439.041 | 7.4% | +| `moe_stage.up` | 1906.220 | 5.8% | +| `moe_stage.gate` | 1905.542 | 5.8% | +| `moe_stage.down` | 1735.243 | 5.3% | +| `q8.attn_out` | 1699.754 | 5.2% | +| `q8.attn_q_b` | 1682.686 | 5.1% | + +MoE mask split: + +| MoE mask | top stages | total ms | +| --- | --- | ---: | +| `0/0/0` | `gate`=859.1, `up`=855.5, `down`=852.5 | 2639.113 | +| `1/1/1` | `up`=837.2, `gate`=834.0, `down`=798.2 | 2626.682 | +| `0/0/1` | `up`=213.6, `gate`=212.5, `down`=84.6 | 527.369 | + +Conclusion: dense Q8 `attn_q_b`/`attn_out` remain the largest non-MoE matmuls, +but the corrected generic Q8 MPP route and later Q8 probes are already closed +as slower. The bigger actionable bucket is still early routed-MoE work: the +legacy `0/0/0` layers cost about the same total time as the larger fully-Tensor +`1/1/1` window despite covering fewer events. Any new env screen should target +that early MoE region and must pass the five-fixture drift gate. + +## Rejected Sparse Early Gate/Up Tensor Window + +Screened a sparse early routed-MoE Tensor window based on the 8192-context +profile. 
The candidate left the current conservative `down` route unchanged +and added Tensor `gate`/`up` on early even layers `0,2,4,6,8,10` plus the +current default `15..42` range: + +```sh +python3 speed-bench/run_prefill_candidate_gate.py \ + --out-dir speed-bench/local-runs/20260515-161513-mpp-gateup-even0-10-down12 \ + --candidate-label mpp-gateup-even0-10-down12 \ + --set-env DS4_METAL_MPP_MOE_GATE_START_LAYER=0 \ + --set-env DS4_METAL_MPP_MOE_GATE_FILTER=layer=0,layer=2,layer=4,layer=6,layer=8,layer=10,layer=15-42 \ + --set-env DS4_METAL_MPP_MOE_UP_START_LAYER=0 \ + --set-env DS4_METAL_MPP_MOE_UP_FILTER=layer=0,layer=2,layer=4,layer=6,layer=8,layer=10,layer=15-42 \ + --run-drift-gate \ + --no-fail +``` + +Artifact: + +- `speed-bench/local-runs/20260515-161513-mpp-gateup-even0-10-down12/prefill-candidate-summary.md` + +Two-repeat median speed versus the current Tensor default: + +| Context | Candidate prefill vs Tensor | Candidate generation vs Tensor | +| ---: | ---: | ---: | +| 512 | +3.5% | +0.2% | +| 1024 | +4.1% | +0.0% | +| 2048 | +3.5% | -0.2% | +| 4096 | +4.2% | +0.2% | +| 8192 | +3.4% | -0.9% | + +The speed signal was repeat-stable enough to run the five-fixture drift gate, +but the gate failed: + +| Pair | Top1 mismatches | Greedy mismatches | Min top20 | Worst RMS | Worst top20 abs | +| --- | ---: | ---: | ---: | ---: | ---: | +| `standard_vs_quality` | 0 | 1 | 18/20 | 0.618172 | 2.24006 | +| `tensor_vs_quality` | 1 | 2 | 17/20 | 0.618172 | 2.45835 | +| `tensor_vs_standard` | 1 | 1 | 17/20 | 0.525365 | 2.47542 | + +Reject. The prefill win is real, but the candidate introduces a top-1 mismatch +on `long_memory_archive`, a Tensor-vs-standard greedy mismatch, and a large +`long_code_audit` top20 drift. This is outside the branch's current low-drift +envelope. 
+ +Follow-up narrowed the sparse window to layers `4,6,8,10` only: + +- `speed-bench/local-runs/20260515-162057-mpp-gateup-even4-10-down12/prefill-candidate-summary.md` + +Two-repeat median speed versus the current Tensor default: + +| Context | Candidate prefill vs Tensor | Candidate generation vs Tensor | +| ---: | ---: | ---: | +| 512 | +2.2% | -0.1% | +| 1024 | +3.1% | -0.7% | +| 2048 | +0.6% | -0.6% | +| 4096 | -0.6% | -0.8% | +| 8192 | +0.1% | +0.9% | + +Reject before drift gate. Removing layers `0` and `2` avoids spending more +drift time, but it also loses the speed signal: min median prefill was `-0.6%` +and min repeat prefill was `-2.6%`. The sparse early-layer result therefore +does not expose a promotable speed/drift middle ground. + +Refreshed local run index after these artifacts: + +- `speed-bench/local-runs/20260515-162432-local-run-index/local-run-index.md` + +## Rejected Early Gate/Up Parity Follow-Ups + +Followed up the sparse even-layer result by splitting the early routed-MoE +gate/up additions into the `0,2` and odd-layer halves. Both candidates kept the +current conservative `down` route unchanged and only added Tensor `gate`/`up` +before the default `15..42` gate/up window. + +### Layers `0,2` + +Artifact: + +- `speed-bench/local-runs/20260515-162536-mpp-gateup-even0-2-down12/prefill-candidate-summary.md` + +Two-repeat median speed versus the current Tensor default: + +| Context | Candidate prefill vs Tensor | Candidate generation vs Tensor | +| ---: | ---: | ---: | +| 512 | -2.0% | -0.7% | +| 1024 | -4.5% | -1.7% | +| 2048 | -2.3% | -1.0% | +| 4096 | +0.0% | -0.7% | +| 8192 | +2.6% | +0.7% | + +Reject before drift gate. The isolated `0,2` window was slower through the +compact range, with min median prefill `-4.5%` and min repeat prefill `-6.8%`. 
+ +### Odd Layers `1,3,5,7,9,11` + +Artifact: + +- `speed-bench/local-runs/20260515-162841-mpp-gateup-odd1-11-down12/prefill-candidate-summary.md` + +Two-repeat median speed versus the current Tensor default: + +| Context | Candidate prefill vs Tensor | Candidate generation vs Tensor | +| ---: | ---: | ---: | +| 512 | +3.4% | -1.4% | +| 1024 | +2.2% | -0.8% | +| 2048 | +3.9% | -1.1% | +| 4096 | +1.6% | -0.3% | +| 8192 | +2.4% | -0.3% | + +The speed screen passed, so the five-fixture drift gate ran: + +| Pair | Top1 mismatches | Greedy mismatches | Min top20 | Worst RMS | Worst top20 abs | +| --- | ---: | ---: | ---: | ---: | ---: | +| `standard_vs_quality` | 0 | 1 | 18/20 | 0.618172 | 2.24006 | +| `tensor_vs_quality` | 0 | 1 | 17/20 | 0.618172 | 2.24006 | +| `tensor_vs_standard` | 0 | 0 | 17/20 | 0.54454 | 0.949314 | + +Reject. The odd-layer sparse route is cleaner than the even `0,2,4,6,8,10` +screen because it introduces no top-1 or greedy mismatch, but the local +Tensor-vs-standard envelope is still too wide: RMS `0.54454` on +`long_memory_archive` and top20 abs `0.949314` on `long_code_audit`. + +Conclusion for this direction: sparse early gate/up windows can buy another +`~2-4%` compact prefill, but the only speed-positive variants widen +Tensor-vs-standard drift well beyond the current branch envelope. This closes +the parity-shaped early-window idea unless a new arithmetic path reduces the +routed-MoE Tensor local movement. + +Refreshed local run index after these artifacts: + +- `speed-bench/local-runs/20260515-163440-local-run-index/local-run-index.md` + +## Early Odd Gate/Up Drift Isolation + +Followed the rejected `1,3,5,7,9,11` sparse gate/up candidate with a local +MoE comparator probe and two five-fixture drift splits. The goal was to check +whether the full-logit drift came from an obviously bad Tensor matmul site or +from cumulative early-layer movement. 
+ +Local comparator artifact: + +- `speed-bench/local-runs/20260515-163903-manual-mpp-compare-probe/mpp-compare-summary.md` + +The probe reused the rejected odd candidate filters and compared `moe_gate` and +`moe_up` separately on the two fixtures that drove the full-logit rejection: +`long_memory_archive` and `long_code_audit`. + +| Metric | Value | +| --- | ---: | +| Parsed comparisons | 136 | +| Target breaches | 0 | +| Worst `moe_gate` max abs | 9.15527e-05 | +| Worst `moe_gate` RMS | 2.10598e-06 | +| Worst `moe_up` max abs | 9.91821e-05 | +| Worst `moe_up` RMS | 1.6725e-06 | + +This clears the individual gate/up Tensor matmuls at the local comparator +threshold. The full-model drift is therefore not explained by a single bad +gate/up projection; it is more consistent with cumulative amplification from +moving early routed-MoE projections onto the Tensor path. + +Then split the odd early window into `1,3,5` and `7,9,11`, keeping the current +default `down` route unchanged and retaining the default `15..42` gate/up +window. + +### Layers `1,3,5` + +Artifact: + +- `speed-bench/local-runs/20260515-164155-drift-gate-gateup-odd1-5-down12/summary.md` + +Tensor-vs-standard five-fixture result: + +| Top1 mismatches | Greedy mismatches | Min top20 | Worst RMS | Worst top20 abs | +| ---: | ---: | ---: | ---: | ---: | +| 0 | 0 | 19/20 | 0.569373 | 1.95196 | + +Reject. This half keeps top-1 and greedy stable, but it fails the current +Tensor-vs-standard envelope on `long_memory_archive`: RMS `0.569373` and +top20 abs `1.95196`. + +### Layers `7,9,11` + +Artifact: + +- `speed-bench/local-runs/20260515-164507-drift-gate-gateup-odd7-11-down12/summary.md` + +Tensor-vs-standard five-fixture result: + +| Top1 mismatches | Greedy mismatches | Min top20 | Worst RMS | Worst top20 abs | +| ---: | ---: | ---: | ---: | ---: | +| 1 | 1 | 16/20 | 0.518334 | 1.67467 | + +Reject. 
This half is worse qualitatively: it introduces a top-1 and greedy +mismatch on `long_memory_archive`, and its worst RMS/top20 drift lands on +`long_code_audit`. + +Conclusion: the speed-positive early odd gate/up window cannot be narrowed into +a safe half-window with the current Tensor arithmetic. Since both halves fail +the five-fixture drift gate, further speed benchmarking of these split windows +is not useful. Keep the promoted conservative route and do not add early +gate/up layers unless the underlying routed-MoE Tensor arithmetic changes. + +Refreshed local run index after these artifacts: + +- `speed-bench/local-runs/20260515-164718-local-run-index/local-run-index.md` + +## Routed-MoE Kernel Variant Triage Refresh + +Re-inspected the currently wired routed-MoE and attention-output Tensor +matmul variants after closing the sparse early-layer screens: + +- `metal/moe.metal`: `kernel_mul_mm_id`, the generic MPP function-constant + branch inside it, `kernel_mul_mm_id_mpp_fast_layout`, + `kernel_mul_mm_id_pair_mpp`, and the attention-output low-Q8 MPP direct-RHS + kernels. +- `ds4_metal.m`: `ds4_gpu_routed_mm_pipeline`, + `ds4_gpu_routed_mm_f16_rhs_pipeline`, `ds4_gpu_encode_mul_mm_id_mapped_tile`, + `ds4_gpu_encode_mul_mm_id_pair_mpp`, and the attention-output low-projection + dispatch. + +Status of the existing variants: + +| Variant | Current status | +| --- | --- | +| Attention-output low-Q8 direct RHS | Promoted default; all-layer route passed the five-fixture gate and is part of the current baseline. | +| Attention-output staged RHS / tile-32 | Rejected as slower; keep direct RHS and tile-64 defaults. | +| Routed-MoE first-PR fast layout | Promoted only in the conservative layer window; wider early use is fast but widens Tensor-vs-standard drift. | +| Routed-MoE generic MPP function-constant path | Already screened via `DS4_METAL_MPP_MOE_FAST_LAYOUT=0`; it gives up speed without improving full-model drift.
| +| Routed-MoE gate/up pair MPP | Rejected as consistently slower on both the old and current conservative windows. | +| Routed-MoE tile-64 | Rejected as slower. | + +This leaves no untried source-level switch in the current routed-MoE Tensor +family that is likely to improve the prefill/drift tradeoff. The local +comparator shows individual early gate/up Tensor matmuls are clean at about +`1e-4` max abs, but five-fixture full-logit gates still fail when those early +layers are enabled. That points to cumulative arithmetic movement rather than +a single broken projection. + +Next useful kernel work should be a new arithmetic-preserving routed-MoE +matmul path: keep the legacy simdgroup-MMA accumulation order as close as +possible, then optimize map/output overhead or memory layout around it. Another +`DS4_METAL_MPP_*` layer-window, tile-size, fast-layout, or pair-dispatch sweep +is unlikely to produce a promotable low-drift prefill win without changing the +underlying arithmetic. + +## Rejected Routed-MoE Writeback Offset Simplification + +Tried a local default-on source patch to simplify the final +`kernel_mul_mm_id` scatter address. The expert-major map stores each selected +output slot as `id = token * selected_experts + selected_slot`; in the current +host call shapes `args.ne1 == args.ne20`, so the writeback can algebraically +use `id * args.ne0` instead of recomputing `id % args.ne20` and +`id / args.ne20`. + +This preserved the dequantization, simdgroup-MMA accumulation order, route +selection, and destination layout. It only changed the final destination pointer +calculation, with a fallback for the general `args.ne1 != args.ne20` case. 
+ +Artifacts: + +- Baseline CSV: + `speed-bench/local-runs/20260515-165545-pre-scatter-offset-baseline/tensor.csv` +- Patched CSV: + `speed-bench/local-runs/20260515-165545-scatter-offset-patch/tensor.csv` + +One compact `-mt auto` timing run versus the pre-patch source: + +| Context | Prefill delta | Generation delta | +| ---: | ---: | ---: | +| 512 | -4.8% | +0.1% | +| 1024 | +0.3% | -0.2% | +| 2048 | +0.1% | -0.3% | +| 4096 | -0.4% | +0.5% | +| 8192 | -4.5% | +0.4% | + +Reject before drift gate. The change is algebraically safe, but it did not +produce a speed signal and regressed the smallest and largest compact prefill +points in the smoke run. The patch was reverted and the binaries rebuilt from +the reverted source. Keep the existing writeback code unless a larger +source-level rewrite can remove more than this address arithmetic. + +Refreshed local run index after this artifact: + +- `speed-bench/local-runs/20260515-165926-local-run-index/local-run-index.md` + +## Pin Long-Prompt Chunk to 2048 for the Official Vectors Runner + +After rebasing on `main`, `make test` exposed a `--logprob-vectors` failure on +the `long_memory_archive` fixture. Main at `d0357ec` passes the same +`q2-imatrix` model path, and the branch failure reproduced with Tensor routes +disabled, so this was not a Tensor auto-route issue. + +Bisecting the branch stack found the regression between `8285710` and +`0fc7f33`, where the default long-prompt Metal prefill chunk changed from 2048 +to 4096. Re-running the failing test with +`DS4_METAL_PREFILL_CHUNK=2048` made it pass: + +```sh +env DS4_METAL_MPP_DISABLE=1 DS4_METAL_PREFILL_CHUNK=2048 \ + ./ds4_test --logprob-vectors +``` + +Decision: keep the production default at 4096 because reverting it to 2048 +breaks the current Tensor-vs-standard equivalence baseline, but make the strict +`--logprob-vectors` runner open the standard Metal path and pin +`DS4_METAL_PREFILL_CHUNK=2048`.
This preserves the official vector +checkpoint/logit behavior without weakening the Tensor auto defaults. Tensor +route drift remains covered by `--metal-tensor-equivalence` and the +five-fixture drift gate. diff --git a/speed-bench/metal_tensor_presets.py b/speed-bench/metal_tensor_presets.py new file mode 100644 index 000000000..ded3c0935 --- /dev/null +++ b/speed-bench/metal_tensor_presets.py @@ -0,0 +1,60 @@ +"""Named Metal Tensor prefill candidate environment presets.""" + +from __future__ import annotations + +from dataclasses import dataclass + + +@dataclass(frozen=True) +class CandidatePreset: + label: str + env: dict[str, str] + description: str + + +CANDIDATE_PRESETS: dict[str, CandidatePreset] = { + "mpp-fast": CandidatePreset( + label="mpp-fast", + env={"DS4_METAL_MPP_FAST": "1"}, + description="All-routed-MoE fast Tensor profile.", + ), + "mpp-fast-skip-down26-29-30": CandidatePreset( + label="mpp-fast-skip-down26-29-30", + env={ + "DS4_METAL_MPP_FAST": "1", + "DS4_METAL_MPP_MOE_DOWN_FILTER": "layer=0-25,layer=27-28,layer=31-42", + }, + description="Best current prefill-first default-off candidate.", + ), + "mpp-fast-skip-down26-29-30-mid-f32": CandidatePreset( + label="mpp-fast-skip-down26-29-30-mid-f32", + env={ + "DS4_METAL_MPP_FAST": "1", + "DS4_METAL_MPP_MOE_DOWN_FILTER": "layer=0-25,layer=27-28,layer=31-42", + "DS4_METAL_MOE_MID_F32": "1", + }, + description="Best current balanced default-off candidate for flatter generation timing.", + ), + "mpp-fast-continuation-chunks": CandidatePreset( + label="mpp-fast-continuation-chunks", + env={ + "DS4_METAL_MPP_FAST": "1", + "DS4_METAL_MPP_MOE_GATE_FILTER": "layer=15-42,pos=512,pos=1024,pos=2048,pos=4096", + "DS4_METAL_MPP_MOE_UP_FILTER": "layer=15-42,pos=512,pos=1024,pos=2048,pos=4096", + "DS4_METAL_MPP_MOE_DOWN_FILTER": "layer=12-42,pos=512,pos=1024,pos=2048,pos=4096", + }, + description="Fast routed-MoE only for continuation prefill chunks; needs extra chunked drift coverage.", + ), + 
"experimental-moe-matmul": CandidatePreset( + label="experimental-moe-matmul", + env={"DS4_METAL_EXPERIMENTAL_MOE_MATMUL": "1"}, + description="Experimental all-layer routed-MoE matmul route.", + ), +} + + +def preset_help() -> str: + return "\n".join( + f" {name}: {preset.description}" + for name, preset in sorted(CANDIDATE_PRESETS.items()) + ) diff --git a/speed-bench/run_chunked_prefill_drift_gate.py b/speed-bench/run_chunked_prefill_drift_gate.py new file mode 100644 index 000000000..29a6d3d8d --- /dev/null +++ b/speed-bench/run_chunked_prefill_drift_gate.py @@ -0,0 +1,668 @@ +#!/usr/bin/env python3 +"""Run a resumed-prefill frontier logit drift gate. + +The normal five-fixture quality gate captures logits after a cold prompt +prefill. Candidates that route only nonzero prefill positions need another +check: grow one long prompt through the same frontiers as ds4-bench, dump logits +after each resumed frontier, and compare: + + standard_vs_quality + tensor_vs_quality + tensor_vs_standard + +When tensor-mode environment overrides are supplied, the gate also captures the +plain no-env Tensor baseline as default_tensor and compares: + + default_tensor_vs_quality + default_tensor_vs_standard + tensor_vs_default_tensor +""" + +from __future__ import annotations + +import argparse +import json +import os +import shlex +import subprocess +import sys +import time +from pathlib import Path +from typing import Any + +from compare_logit_drift import compare, load_dump +from metal_tensor_presets import CANDIDATE_PRESETS, preset_help + + +MODES: dict[str, list[str]] = { + "quality": ["--quality"], + "standard": ["-mt", "off"], + "default_tensor": ["-mt", "auto"], + "tensor": ["-mt", "auto"], +} + +BASE_PAIRS = ( + ("standard_vs_quality", "quality", "standard"), + ("tensor_vs_quality", "quality", "tensor"), + ("tensor_vs_standard", "standard", "tensor"), +) + +DEFAULT_TENSOR_PAIRS = ( + ("default_tensor_vs_quality", "quality", "default_tensor"), + 
("default_tensor_vs_standard", "standard", "default_tensor"), + ("tensor_vs_default_tensor", "default_tensor", "tensor"), +) + +DS4_BENCH_FRESHNESS_SOURCES = ( + "ds4.c", + "ds4.h", + "ds4_gpu.h", + "ds4_bench.c", + "ds4_metal.m", + "metal/*.metal", +) + + +def assert_fresh_binary( + binary: Path, + *, + repo_root: Path, + source_patterns: tuple[str, ...], + allow_stale: bool, +) -> None: + if allow_stale: + return + if not binary.exists(): + raise SystemExit(f"{binary}: binary does not exist; run the relevant make target first") + binary_mtime = binary.stat().st_mtime + stale_sources: list[Path] = [] + for pattern in source_patterns: + matches = sorted(repo_root.glob(pattern)) + if not matches: + continue + stale_sources.extend(path for path in matches if path.stat().st_mtime > binary_mtime) + if stale_sources: + newest = max(stale_sources, key=lambda path: path.stat().st_mtime) + rel = newest.relative_to(repo_root) + raise SystemExit( + f"{binary}: stale binary; {rel} is newer. " + "Rebuild before running the chunked drift gate, or pass " + "--allow-stale-binary only when intentionally summarizing old artifacts." 
+ ) + + +def shell_join(argv: list[object]) -> str: + return " ".join(shlex.quote(str(part)) for part in argv) + + +def safe_label(value: str) -> str: + return "".join(ch if ch.isalnum() or ch in "._-" else "-" for ch in value).strip("-") + + +def markdown_escape(value: object) -> str: + return str(value).replace("|", "\\|") + + +def parse_env_overrides(values: list[str]) -> dict[str, str]: + env: dict[str, str] = {} + for value in values: + if "=" not in value: + raise SystemExit(f"--set-env expects NAME=VALUE, got: {value}") + name, env_value = value.split("=", 1) + if not name: + raise SystemExit(f"--set-env expects NAME=VALUE, got: {value}") + env[name] = env_value + return env + + +def candidate_env(args: argparse.Namespace) -> dict[str, str]: + env: dict[str, str] = {} + if args.preset: + env.update(CANDIDATE_PRESETS[args.preset].env) + env.update(parse_env_overrides(args.set_env)) + return env + + +def active_modes(capture_default_tensor: bool) -> list[str]: + if capture_default_tensor: + return ["quality", "standard", "default_tensor", "tensor"] + return ["quality", "standard", "tensor"] + + +def active_pairs(capture_default_tensor: bool) -> list[tuple[str, str, str]]: + pairs = list(BASE_PAIRS) + if capture_default_tensor: + pairs.extend(DEFAULT_TENSOR_PAIRS) + return pairs + + +def mode_dir(out_dir: Path, mode: str) -> Path: + return out_dir / f"{mode}-frontier-logits" + + +def mode_csv(out_dir: Path, mode: str) -> Path: + return out_dir / f"{mode}.csv" + + +def frontier_logits_path(out_dir: Path, mode: str, frontier: int) -> Path: + return mode_dir(out_dir, mode) / f"frontier_{frontier:06d}.logits.json" + + +def run_command( + cmd: list[object], + *, + cwd: Path, + env_overrides: dict[str, str], + dry_run: bool, +) -> None: + printable = [str(part) for part in cmd] + if env_overrides: + env_text = " ".join(f"{name}={shlex.quote(value)}" for name, value in sorted(env_overrides.items())) + print("+", env_text, shell_join(printable), flush=True) + else: + 
print("+", shell_join(printable), flush=True) + if dry_run: + return + env = os.environ.copy() + env.update(env_overrides) + proc = subprocess.run(printable, cwd=cwd, env=env, text=True, capture_output=True) + if proc.returncode != 0: + raise SystemExit( + f"command failed with exit {proc.returncode}: {shell_join(printable)}\n" + f"stdout:\n{proc.stdout[-4000:]}\n" + f"stderr:\n{proc.stderr[-8000:]}" + ) + + +def capture_mode( + args: argparse.Namespace, + mode: str, + *, + tensor_env: dict[str, str], +) -> None: + dump_dir = mode_dir(args.out_dir, mode) + dump_dir.mkdir(parents=True, exist_ok=True) + if args.reuse and all(frontier_logits_path(args.out_dir, mode, f).exists() for f in args.frontiers): + print(f"Reusing {mode} frontier dumps in {dump_dir}", flush=True) + return + + mode_env = tensor_env if mode == "tensor" else {} + cmd: list[object] = [ + args.ds4_bench, + "--prompt-file", + args.prompt_file, + "--ctx-start", + args.ctx_start, + "--ctx-max", + args.ctx_max, + "--step-mul", + args.step_mul, + "--gen-tokens", + args.gen_tokens, + "--dump-frontier-logits-dir", + dump_dir, + "--csv", + mode_csv(args.out_dir, mode), + ] + if args.model: + cmd[1:1] = ["-m", args.model] + cmd.extend(MODES[mode]) + run_command(cmd, cwd=args.repo_root, env_overrides=mode_env, dry_run=args.dry_run) + + +def aggregate(rows: list[dict[str, Any]]) -> dict[str, Any]: + return { + "frontiers": len(rows), + "top1_mismatches": sum(0 if row["same_top1"] else 1 for row in rows), + "min_top5_overlap": min(row["top5_overlap"] for row in rows), + "min_top20_overlap": min(row["top20_overlap"] for row in rows), + "worst_rank_delta": max(row["max_rank_delta"] for row in rows), + "worst_rms": max(row["rms"] for row in rows), + "worst_max_abs": max(row["max_abs"] for row in rows), + "worst_top20_max_abs": max(row["top20_max_abs"] for row in rows), + } + + +def extrema(rows: list[dict[str, Any]]) -> dict[str, Any]: + worst_rms = max(rows, key=lambda row: row["rms"]) + worst_top20 = max(rows, 
key=lambda row: row["top20_max_abs"]) + min_top20 = min(rows, key=lambda row: row["top20_overlap"]) + return { + "worst_rms_frontier": worst_rms["frontier"], + "worst_rms": worst_rms["rms"], + "worst_top20_max_abs_frontier": worst_top20["frontier"], + "worst_top20_max_abs": worst_top20["top20_max_abs"], + "min_top20_overlap_frontier": min_top20["frontier"], + "min_top20_overlap": min_top20["top20_overlap"], + "top1_mismatch_frontiers": [row["frontier"] for row in rows if not row["same_top1"]], + } + + +def summarize(args: argparse.Namespace) -> dict[str, Any]: + pairs: dict[str, Any] = {} + for pair_name, ref_mode, cand_mode in args.pairs: + rows: list[dict[str, Any]] = [] + for frontier in args.frontiers: + ref_path = frontier_logits_path(args.out_dir, ref_mode, frontier) + cand_path = frontier_logits_path(args.out_dir, cand_mode, frontier) + metrics = compare(load_dump(ref_path), load_dump(cand_path), args.top_k) + rows.append({"frontier": frontier, **metrics}) + pairs[pair_name] = { + "rows": rows, + "summary": aggregate(rows), + "extrema": extrema(rows), + } + print_pair_table(pair_name, rows) + return { + "pairs": pairs, + "modes": {mode: MODES[mode] for mode in args.modes}, + "pair_order": [pair_name for pair_name, _, _ in args.pairs], + "frontiers": args.frontiers, + } + + +def print_pair_table(pair_name: str, rows: list[dict[str, Any]]) -> None: + print(f"\n{pair_name}") + print("frontier same_top1 top5 top20 rank rms max_abs top20_abs") + for row in rows: + print( + f"{row['frontier']} " + f"{'yes' if row['same_top1'] else 'no'} " + f"{row['top5_overlap']}/5 " + f"{row['top20_overlap']}/20 " + f"{row['max_rank_delta']} " + f"{row['rms']:.6g} " + f"{row['max_abs']:.6g} " + f"{row['top20_max_abs']:.6g}" + ) + summary = aggregate(rows) + print( + "summary " + f"top1_mismatches={summary['top1_mismatches']} " + f"min_top20={summary['min_top20_overlap']}/20 " + f"worst_rms={summary['worst_rms']:.6g} " + f"worst_top20_max_abs={summary['worst_top20_max_abs']:.6g}" 
+ ) + + +def check_gate( + payload: dict[str, Any], + *, + max_tensor_standard_rms: float | None, + max_tensor_standard_top20_abs: float | None, + max_tensor_default_rms: float | None, + max_tensor_default_top20_abs: float | None, +) -> list[str]: + failures: list[str] = [] + for pair_name in payload.get("pair_order", ("standard_vs_quality", "tensor_vs_quality", "tensor_vs_standard")): + summary = payload["pairs"][pair_name]["summary"] + if summary["top1_mismatches"] != 0: + failures.append(f"{pair_name}: top1_mismatches={summary['top1_mismatches']}") + + tensor_delta = payload["pairs"]["tensor_vs_standard"]["summary"] + tensor_extrema = payload["pairs"]["tensor_vs_standard"]["extrema"] + if max_tensor_standard_rms is not None and tensor_delta["worst_rms"] > max_tensor_standard_rms: + failures.append( + "tensor_vs_standard: worst_rms exceeds configured envelope " + f"({tensor_delta['worst_rms']:.6g} > {max_tensor_standard_rms:.6g}, " + f"frontier={tensor_extrema['worst_rms_frontier']})" + ) + if (max_tensor_standard_top20_abs is not None and + tensor_delta["worst_top20_max_abs"] > max_tensor_standard_top20_abs): + failures.append( + "tensor_vs_standard: worst_top20_max_abs exceeds configured envelope " + f"({tensor_delta['worst_top20_max_abs']:.6g} > " + f"{max_tensor_standard_top20_abs:.6g}, " + f"frontier={tensor_extrema['worst_top20_max_abs_frontier']})" + ) + + if "tensor_vs_default_tensor" in payload["pairs"]: + default_delta = payload["pairs"]["tensor_vs_default_tensor"]["summary"] + default_extrema = payload["pairs"]["tensor_vs_default_tensor"]["extrema"] + if max_tensor_default_rms is not None and default_delta["worst_rms"] > max_tensor_default_rms: + failures.append( + "tensor_vs_default_tensor: worst_rms exceeds configured envelope " + f"({default_delta['worst_rms']:.6g} > {max_tensor_default_rms:.6g}, " + f"frontier={default_extrema['worst_rms_frontier']})" + ) + if (max_tensor_default_top20_abs is not None and + default_delta["worst_top20_max_abs"] > 
max_tensor_default_top20_abs): + failures.append( + "tensor_vs_default_tensor: worst_top20_max_abs exceeds configured envelope " + f"({default_delta['worst_top20_max_abs']:.6g} > " + f"{max_tensor_default_top20_abs:.6g}, " + f"frontier={default_extrema['worst_top20_max_abs_frontier']})" + ) + + standard = payload["pairs"]["standard_vs_quality"]["summary"] + tensor = payload["pairs"]["tensor_vs_quality"]["summary"] + if tensor["worst_rms"] > standard["worst_rms"] * 1.10: + failures.append( + "tensor_vs_quality: worst_rms materially worse than standard " + f"({tensor['worst_rms']:.6g} > {standard['worst_rms']:.6g} * 1.10)" + ) + if tensor["worst_top20_max_abs"] > standard["worst_top20_max_abs"] * 1.10: + failures.append( + "tensor_vs_quality: worst_top20_max_abs materially worse than standard " + f"({tensor['worst_top20_max_abs']:.6g} > " + f"{standard['worst_top20_max_abs']:.6g} * 1.10)" + ) + return failures + + +def markdown_pair_table(pair_name: str, rows: list[dict[str, Any]]) -> str: + lines = [ + f"## {markdown_escape(pair_name)}", + "", + "| Frontier | Same top1 | Top5 | Top20 | Rank delta | RMS | Max abs | Top20 abs |", + "| ---: | --- | ---: | ---: | ---: | ---: | ---: | ---: |", + ] + for row in rows: + lines.append( + "| " + f"{row['frontier']} | " + f"{'yes' if row['same_top1'] else 'no'} | " + f"{row['top5_overlap']}/5 | " + f"{row['top20_overlap']}/20 | " + f"{row['max_rank_delta']} | " + f"{row['rms']:.6g} | " + f"{row['max_abs']:.6g} | " + f"{row['top20_max_abs']:.6g} |" + ) + summary = aggregate(rows) + row_extrema = extrema(rows) + lines.extend( + [ + "", + "| Summary | Value |", + "| --- | ---: |", + f"| Top1 mismatches | {summary['top1_mismatches']} |", + f"| Min top5 overlap | {summary['min_top5_overlap']}/5 |", + f"| Min top20 overlap | {summary['min_top20_overlap']}/20 |", + f"| Worst rank delta | {summary['worst_rank_delta']} |", + f"| Worst RMS | {summary['worst_rms']:.6g} |", + f"| Worst max abs | {summary['worst_max_abs']:.6g} |", + f"| 
Worst top20 max abs | {summary['worst_top20_max_abs']:.6g} |", + "", + "| Worst frontier | Value |", + "| --- | --- |", + f"| Worst RMS frontier | {row_extrema['worst_rms_frontier']} " + f"({row_extrema['worst_rms']:.6g}) |", + f"| Worst top20 abs frontier | {row_extrema['worst_top20_max_abs_frontier']} " + f"({row_extrema['worst_top20_max_abs']:.6g}) |", + f"| Min top20 overlap frontier | {row_extrema['min_top20_overlap_frontier']} " + f"({row_extrema['min_top20_overlap']}/20) |", + "", + ] + ) + return "\n".join(lines) + + +def write_markdown_summary(payload: dict[str, Any], path: Path) -> None: + lines = [ + "# Chunked Prefill Drift Gate", + "", + "This gate dumps logits after resumed `ds4_session_sync()` frontiers from one long prompt.", + "", + "Modes:", + "", + ] + for mode, mode_args in payload["modes"].items(): + lines.append(f"- `{markdown_escape(mode)}`: `{' '.join(mode_args)}`") + if payload["candidate_env"]: + lines.extend(["", "Tensor-mode environment overrides:", ""]) + for name, value in sorted(payload["candidate_env"].items()): + lines.append(f"- `{markdown_escape(name)}={markdown_escape(value)}`") + else: + lines.extend(["", "Tensor-mode environment overrides: none"]) + + config = payload["run_config"] + lines.extend(["", "Run config:", "", "| Setting | Value |", "| --- | --- |"]) + for key in ( + "repo_root", + "ds4_bench", + "model", + "prompt_file", + "out_dir", + "candidate_preset", + "ctx_start", + "ctx_max", + "step_mul", + "gen_tokens", + "top_k", + "reuse", + "max_tensor_standard_rms", + "max_tensor_standard_top20_abs", + "max_tensor_default_rms", + "max_tensor_default_top20_abs", + "capture_default_tensor", + ): + lines.append(f"| `{markdown_escape(key)}` | `{markdown_escape(config.get(key))}` |") + lines.extend(["", "Replay command:", "", "```sh", shell_join(["python3", *config["argv"]]), "```"]) + + envelope = payload.get("drift_envelope") or {} + lines.extend(["", "Tensor-vs-standard drift envelope:", ""]) + if envelope.get("max_rms") 
is not None: + lines.append(f"- Worst RMS <= `{envelope['max_rms']:.6g}`") + if envelope.get("max_top20_abs") is not None: + lines.append(f"- Worst top20 abs <= `{envelope['max_top20_abs']:.6g}`") + if not envelope: + lines.append("- not configured") + default_envelope = payload.get("tensor_default_envelope") or {} + if default_envelope: + lines.extend(["", "Candidate-vs-default-Tensor drift envelope:", ""]) + if default_envelope.get("max_rms") is not None: + lines.append(f"- Worst RMS <= `{default_envelope['max_rms']:.6g}`") + if default_envelope.get("max_top20_abs") is not None: + lines.append(f"- Worst top20 abs <= `{default_envelope['max_top20_abs']:.6g}`") + + failures = payload["gate_failures"] + lines.extend(["", f"Gate: {'FAIL' if failures else 'OK'}", ""]) + if failures: + lines.append("Failures:") + lines.append("") + for failure in failures: + lines.append(f"- {markdown_escape(failure)}") + lines.append("") + + for pair_name in payload.get("pair_order", list(payload["pairs"])): + lines.append(markdown_pair_table(pair_name, payload["pairs"][pair_name]["rows"])) + + path.write_text("\n".join(lines).rstrip() + "\n", encoding="utf-8") + + +def build_run_config(args: argparse.Namespace) -> dict[str, Any]: + return { + "argv": sys.argv, + "repo_root": str(args.repo_root), + "ds4_bench": str(args.ds4_bench), + "model": str(args.model) if args.model else None, + "prompt_file": str(args.prompt_file), + "out_dir": str(args.out_dir), + "candidate_preset": args.preset, + "ctx_start": args.ctx_start, + "ctx_max": args.ctx_max, + "step_mul": args.step_mul, + "gen_tokens": args.gen_tokens, + "top_k": args.top_k, + "reuse": args.reuse, + "dry_run": args.dry_run, + "max_tensor_standard_rms": args.max_tensor_standard_rms, + "max_tensor_standard_top20_abs": args.max_tensor_standard_top20_abs, + "max_tensor_default_rms": args.max_tensor_default_rms, + "max_tensor_default_top20_abs": args.max_tensor_default_top20_abs, + "capture_default_tensor": args.capture_default_tensor, 
def compute_frontiers(ctx_start: int, ctx_max: int, step_mul: float) -> list[int]:
    """Return the increasing context sizes visited between two bounds.

    The list always starts with ``ctx_start``.  While the latest value is
    below ``ctx_max`` the next one is ``latest * step_mul`` rounded up with a
    0.999999 bias, forced to advance by at least one, and capped at
    ``ctx_max``.
    """
    position = ctx_start
    schedule = [position]
    while position < ctx_max:
        grown = int((position * step_mul) + 0.999999)
        # Always make progress, but never step past the upper bound.
        position = min(max(grown, position + 1), ctx_max)
        schedule.append(position)
    return schedule
"--max-tensor-standard-rms", + type=float, + help="Optional maximum Tensor-vs-standard worst RMS allowed by this gate.", + ) + parser.add_argument( + "--max-tensor-standard-top20-abs", + type=float, + help="Optional maximum Tensor-vs-standard worst top-20 absolute drift allowed by this gate.", + ) + parser.add_argument( + "--max-tensor-default-rms", + type=float, + help="Optional maximum candidate Tensor-vs-default Tensor worst RMS allowed by this gate.", + ) + parser.add_argument( + "--max-tensor-default-top20-abs", + type=float, + help="Optional maximum candidate Tensor-vs-default Tensor worst top-20 absolute drift allowed by this gate.", + ) + parser.add_argument( + "--no-default-tensor-baseline", + action="store_true", + help="Do not capture the no-env -mt auto baseline when tensor-mode env overrides are set.", + ) + parser.add_argument( + "--no-fail", + action="store_true", + help="Always exit 0 after reporting gate failures.", + ) + args = parser.parse_args() + + if args.top_k < 20: + raise SystemExit("--top-k must be at least 20") + if args.ctx_start <= 0 or args.ctx_max < args.ctx_start: + raise SystemExit("--ctx-start must be positive and <= --ctx-max") + if args.step_mul < 1.0: + raise SystemExit("--step-mul must be >= 1") + if args.gen_tokens <= 0: + raise SystemExit("--gen-tokens must be positive") + + label = args.preset or "chunked-prefill-drift-gate" + if args.out_dir is None: + run_id = time.strftime("%Y%m%d-%H%M%S") + args.out_dir = Path("speed-bench/local-runs") / f"{run_id}-{safe_label(label)}-chunked-drift-gate" + + args.repo_root = args.repo_root.resolve() + if not args.ds4_bench.is_absolute(): + args.ds4_bench = args.repo_root / args.ds4_bench + args.out_dir.mkdir(parents=True, exist_ok=True) + if not args.dry_run: + assert_fresh_binary( + args.ds4_bench, + repo_root=args.repo_root, + source_patterns=DS4_BENCH_FRESHNESS_SOURCES, + allow_stale=args.allow_stale_binary, + ) + args.frontiers = compute_frontiers(args.ctx_start, args.ctx_max, 
args.step_mul) + tensor_env = candidate_env(args) + args.capture_default_tensor = bool(tensor_env) and not args.no_default_tensor_baseline + args.modes = active_modes(args.capture_default_tensor) + args.pairs = active_pairs(args.capture_default_tensor) + + if tensor_env: + print("Tensor-mode environment overrides:", flush=True) + for name, value in sorted(tensor_env.items()): + print(f" {name}={value}", flush=True) + + for mode in args.modes: + capture_mode(args, mode, tensor_env=tensor_env) + + if args.dry_run: + return 0 + + payload = summarize(args) + payload["candidate_env"] = tensor_env + payload["run_config"] = build_run_config(args) + envelope = { + "max_rms": args.max_tensor_standard_rms, + "max_top20_abs": args.max_tensor_standard_top20_abs, + } + if envelope["max_rms"] is not None or envelope["max_top20_abs"] is not None: + payload["drift_envelope"] = envelope + default_envelope = { + "max_rms": args.max_tensor_default_rms, + "max_top20_abs": args.max_tensor_default_top20_abs, + } + if default_envelope["max_rms"] is not None or default_envelope["max_top20_abs"] is not None: + payload["tensor_default_envelope"] = default_envelope + payload["gate_failures"] = check_gate( + payload, + max_tensor_standard_rms=args.max_tensor_standard_rms, + max_tensor_standard_top20_abs=args.max_tensor_standard_top20_abs, + max_tensor_default_rms=args.max_tensor_default_rms, + max_tensor_default_top20_abs=args.max_tensor_default_top20_abs, + ) + + summary_path = args.out_dir / "summary.json" + with summary_path.open("w", encoding="utf-8") as fp: + json.dump(payload, fp, indent=2) + fp.write("\n") + print(f"\nWrote {summary_path}") + + markdown_path = args.out_dir / "summary.md" + write_markdown_summary(payload, markdown_path) + print(f"Wrote {markdown_path}") + + if payload["gate_failures"]: + print("\nGate failures:") + for failure in payload["gate_failures"]: + print(f" {failure}") + return 0 if args.no_fail else 1 + print("\nGate: OK") + return 0 + + +if __name__ == 
#!/usr/bin/env bash
# Run ds4-bench three times over the same context sweep (--quality, -mt off,
# -mt auto), write one CSV per run, then chart them with compare_bench.py.
# Every upper-case setting below can be overridden from the environment.
set -euo pipefail

# Work from the repository root so relative paths resolve.
cd "$(dirname "${BASH_SOURCE[0]}")/.."

PROMPT_FILE="${PROMPT_FILE:-speed-bench/promessi_sposi.txt}"
CTX_START="${CTX_START:-512}"
CTX_MAX="${CTX_MAX:-65536}"
STEP_MUL="${STEP_MUL:-2}"
GEN_TOKENS="${GEN_TOKENS:-128}"
RUN_ID="${RUN_ID:-$(date +%Y%m%d-%H%M%S)}"
OUT_DIR="${OUT_DIR:-speed-bench/local-runs/${RUN_ID}-metal-tensor-bench}"
PYTHON="${PYTHON:-python3}"
OPEN_CHART="${OPEN_CHART:-1}"
ALLOW_STALE_BINARY="${ALLOW_STALE_BINARY:-0}"

if [[ "$ALLOW_STALE_BINARY" != "1" ]]; then
  if [[ ! -x ./ds4-bench ]]; then
    echo "error: ./ds4-bench does not exist or is not executable; run make ds4-bench first" >&2
    exit 1
  fi
  # Find the first source newer than the binary.  The list is read to EOF
  # through process substitution instead of `producer | while ...; break`:
  # with `set -o pipefail`, an early break can kill the producer with SIGPIPE,
  # and `set -e` would then abort on the assignment before the message below
  # is printed.  `|| true` keeps a missing metal/ directory from being fatal.
  stale_source=""
  while IFS= read -r path; do
    if [[ -z "$stale_source" && "$path" -nt ./ds4-bench ]]; then
      stale_source="$path"
    fi
  done < <(
    printf '%s\n' ds4.c ds4.h ds4_gpu.h ds4_bench.c ds4_metal.m
    find metal -type f -name '*.metal' 2>/dev/null || true
  )
  if [[ -n "$stale_source" ]]; then
    echo "error: ./ds4-bench is stale; $stale_source is newer" >&2
    echo " rebuild first, or set ALLOW_STALE_BINARY=1 to summarize old artifacts intentionally" >&2
    exit 1
  fi
fi

mkdir -p "$OUT_DIR"

ARTIFACT_PREFIX="${RUN_ID}_gen${GEN_TOKENS}"
QUALITY_CSV="$OUT_DIR/${ARTIFACT_PREFIX}_ds4_bench_quality.csv"
STANDARD_CSV="$OUT_DIR/${ARTIFACT_PREFIX}_ds4_bench_standard_metal.csv"
TENSOR_CSV="$OUT_DIR/${ARTIFACT_PREFIX}_ds4_bench_tensor_metal.csv"
CHART="$OUT_DIR/${ARTIFACT_PREFIX}_ds4_bench_standard_quality_tensor.png"

# Sweep arguments shared by all three runs.
COMMON_ARGS=(
  --prompt-file "$PROMPT_FILE"
  --ctx-start "$CTX_START"
  --ctx-max "$CTX_MAX"
  --step-mul "$STEP_MUL"
  --gen-tokens "$GEN_TOKENS"
)

# The runs are sequential; each writes its own CSV.
echo "1/3 Quality Metal -> $QUALITY_CSV"
./ds4-bench --quality "${COMMON_ARGS[@]}" --csv "$QUALITY_CSV"

echo "2/3 Standard Metal -> $STANDARD_CSV"
./ds4-bench -mt off "${COMMON_ARGS[@]}" --csv "$STANDARD_CSV"

echo "3/3 Tensor Metal -> $TENSOR_CSV"
./ds4-bench -mt auto "${COMMON_ARGS[@]}" --csv "$TENSOR_CSV"

echo "Comparing runs -> $CHART"
"$PYTHON" speed-bench/compare_bench.py \
  "$STANDARD_CSV" \
  "$QUALITY_CSV" \
  "$TENSOR_CSV" \
  --labels "Standard Metal" "Quality Metal" "Tensor Metal" \
  --title "ds4-bench: Standard vs Quality vs Tensor (${GEN_TOKENS} generated tokens)" \
  -o "$CHART"

echo
echo "Wrote:"
echo " $QUALITY_CSV"
echo " $STANDARD_CSV"
echo " $TENSOR_CSV"
echo " $CHART"

# Open the chart with whichever opener is available unless OPEN_CHART=0.
if [[ "$OPEN_CHART" != "0" ]]; then
  if command -v open >/dev/null 2>&1; then
    open "$CHART"
  elif command -v xdg-open >/dev/null 2>&1; then
    xdg-open "$CHART" >/dev/null 2>&1 &
  else
    echo "No opener found; set OPEN_CHART=0 to skip this step."
  fi
fi
+""" + +from __future__ import annotations + +import argparse +import json +import os +import shlex +import subprocess +import sys +import time +from pathlib import Path +from typing import Any + +from metal_tensor_presets import CANDIDATE_PRESETS, preset_help +from run_quality_drift_gate import CASES +from summarize_mpp_compare import as_json, merge_summaries, parse_log, render_markdown + + +CASE_BY_ID = {case.case_id: case for case in CASES} + +DS4_FRESHNESS_SOURCES = ( + "ds4.c", + "ds4.h", + "ds4_gpu.h", + "ds4_cli.c", + "ds4_metal.m", + "metal/*.metal", +) + + +def assert_fresh_binary( + binary: Path, + *, + repo_root: Path, + source_patterns: tuple[str, ...], + allow_stale: bool, +) -> None: + if allow_stale: + return + if not binary.exists(): + raise SystemExit(f"{binary}: binary does not exist; run the relevant make target first") + binary_mtime = binary.stat().st_mtime + stale_sources: list[Path] = [] + for pattern in source_patterns: + matches = sorted(repo_root.glob(pattern)) + if not matches: + continue + stale_sources.extend(path for path in matches if path.stat().st_mtime > binary_mtime) + if stale_sources: + newest = max(stale_sources, key=lambda path: path.stat().st_mtime) + rel = newest.relative_to(repo_root) + raise SystemExit( + f"{binary}: stale binary; {rel} is newer. " + "Rebuild before running the comparator probe, or pass " + "--allow-stale-binary only when intentionally summarizing old artifacts." 
def normalize_routes(values: list[str]) -> list[str]:
    """Flatten repeatable ``--route`` values into a list of route names.

    Each value may hold several names separated by commas or pipes.  Names
    are stripped of surrounding whitespace and empty names are dropped.  When
    nothing was given, or nothing survives the cleanup, ``["all"]`` is
    returned.
    """
    cleaned = [
        token.strip()
        for chunk in (values or ["all"])
        for token in chunk.replace("|", ",").split(",")
        if token.strip()
    ]
    return cleaned if cleaned else ["all"]
def build_run_config(
    args: argparse.Namespace,
    *,
    env_overrides: dict[str, dict[str, str]],
    commands: dict[str, list[str]],
    logs: dict[str, str],
) -> dict[str, Any]:
    """Collect the probe invocation into one JSON-friendly mapping.

    Paths from ``args`` are stored as strings, ``model`` becomes ``None`` when
    it is unset, and ``sys.argv`` is recorded under ``"argv"``.  The
    per-route environments, commands and log paths passed by the caller are
    stored unchanged.  Key order is fixed because the mapping is meant to be
    dumped as-is.
    """
    config: dict[str, Any] = {"argv": sys.argv}
    config["repo_root"] = str(args.repo_root)
    config["ds4"] = str(args.ds4)
    config["model"] = str(args.model) if args.model else None
    config["out_dir"] = str(args.out_dir)
    config["preset"] = args.preset
    # The namespace uses singular names; the config uses plural ones.
    config["cases"] = args.case
    config["routes"] = args.route

    # Options stored under the same name they have on the namespace.
    passthrough = (
        "q8_filter",
        "flash_attn_filter",
        "compare_max",
        "continue_after_breach",
        "verbose",
        "gen_tokens",
        "max_abs_target",
        "rms_target",
    )
    for option in passthrough:
        config[option] = getattr(args, option)

    config["env"] = env_overrides
    config["commands"] = commands
    config["logs"] = logs
    config["dry_run"] = args.dry_run
    config["allow_stale_binary"] = args.allow_stale_binary
    return config
def main() -> int:
    """Run the comparator probe for every selected route and case.

    Validates the parsed options, checks binary freshness (skipped on dry
    runs), runs one probe per (route, case) pair with its log under the output
    directory, writes the run config JSON, and - unless this is a dry run -
    writes the merged Markdown and JSON comparator summaries.

    Returns:
        0 on completion.

    Raises:
        SystemExit: on invalid option values; helpers called here may also
            raise it (the freshness check and a failing probe do).
    """
    args = parse_args()
    if args.compare_max < 1:
        raise SystemExit("--compare-max must be >= 1")
    if args.gen_tokens < 1:
        raise SystemExit("--gen-tokens must be >= 1")
    if args.top < 1:
        raise SystemExit("--top must be >= 1")
    # --all-cases wins over explicit --case values; with neither, probe the
    # single default case.
    if args.all_cases:
        args.case = [case.case_id for case in CASES]
    elif not args.case:
        args.case = ["long_memory_archive"]
    args.route = normalize_routes(args.route)
    # A filter is only applied for its own route, so reject it when that route
    # was not requested.
    if args.q8_filter and "q8" not in args.route:
        raise SystemExit("--q8-filter requires --route q8")
    if args.flash_attn_filter and "flash_attn" not in args.route:
        raise SystemExit("--flash-attn-filter requires --route flash_attn")

    args.repo_root = args.repo_root.resolve()
    # A relative binary path is taken relative to the repository root.
    if not args.ds4.is_absolute():
        args.ds4 = args.repo_root / args.ds4
    if not args.dry_run:
        assert_fresh_binary(
            args.ds4,
            repo_root=args.repo_root,
            source_patterns=DS4_FRESHNESS_SOURCES,
            allow_stale=args.allow_stale_binary,
        )
    if args.out_dir is None:
        # Default output directory: timestamp plus the preset name (or "manual").
        run_id = time.strftime("%Y%m%d-%H%M%S")
        preset_label = args.preset or "manual"
        args.out_dir = Path("speed-bench/local-runs") / f"{run_id}-{safe_label(preset_label)}-mpp-compare-probe"
    args.out_dir.mkdir(parents=True, exist_ok=True)

    # All three maps are recorded in the run config; commands and logs are
    # keyed by "<case_id>:<route>", environments by route.
    commands: dict[str, list[str]] = {}
    logs: dict[str, str] = {}
    env_for_config: dict[str, dict[str, str]] = {}
    for route in args.route:
        # The environment depends only on the route, so build it once per route.
        env_overrides = probe_env(args, route)
        env_for_config[route] = env_overrides
        for case_id in args.case:
            cmd = ds4_command(args, case_id)
            run_key = f"{case_id}:{route}"
            log_path = args.out_dir / f"{case_id}.{safe_label(route)}.log"
            commands[run_key] = cmd
            logs[run_key] = str(log_path)
            run_probe(
                cmd,
                cwd=args.repo_root,
                env_overrides=env_overrides,
                log_path=log_path,
                dry_run=args.dry_run,
            )

    run_config = build_run_config(
        args,
        env_overrides=env_for_config,
        commands=commands,
        logs=logs,
    )
    # The run config is written even on a dry run.
    config_path = args.out_dir / "mpp-compare-run-config.json"
    config_path.write_text(json.dumps(run_config, indent=2) + "\n", encoding="utf-8")
    print(f"Wrote {config_path}")

    if args.dry_run:
        print(f"Dry run only; would write {args.out_dir / 'mpp-compare-summary.md'}")
        print(f"Dry run only; would write {args.out_dir / 'mpp-compare-summary.json'}")
        return 0

    # Parse every probe log and merge the results into one summary.
    summaries = [parse_log(Path(path)) for path in logs.values()]
    summary = merge_summaries(summaries)
    markdown_path = args.out_dir / "mpp-compare-summary.md"
    json_path = args.out_dir / "mpp-compare-summary.json"
    markdown_path.write_text(
        render_markdown(
            summary,
            max_abs_target=args.max_abs_target,
            rms_target=args.rms_target,
            top=args.top,
        ),
        encoding="utf-8",
    )
    # The JSON summary embeds the run config next to the summary itself.
    json_path.write_text(
        json.dumps(
            {
                "run_config": run_config,
                "summary": as_json(
                    summary,
                    max_abs_target=args.max_abs_target,
                    rms_target=args.rms_target,
                ),
            },
            indent=2,
        )
        + "\n",
        encoding="utf-8",
    )
    print(f"Wrote {markdown_path}")
    print(f"Wrote {json_path}")
    return 0
+""" + +from __future__ import annotations + +import argparse +import csv +import json +import os +import re +import shlex +import statistics +import subprocess +import sys +import time +from dataclasses import dataclass +from pathlib import Path +from typing import Any + +from metal_tensor_presets import CANDIDATE_PRESETS, preset_help + + +@dataclass(frozen=True) +class BenchRun: + name: str + label: str + mode_args: list[str] + env: dict[str, str] + + +DS4_BENCH_FRESHNESS_SOURCES = ( + "ds4.c", + "ds4.h", + "ds4_gpu.h", + "ds4_bench.c", + "ds4_metal.m", + "metal/*.metal", +) + + +def assert_fresh_binary( + binary: Path, + *, + repo_root: Path, + source_patterns: tuple[str, ...], + allow_stale: bool, +) -> None: + if allow_stale: + return + if not binary.exists(): + raise SystemExit(f"{binary}: binary does not exist; run the relevant make target first") + binary_mtime = binary.stat().st_mtime + stale_sources: list[Path] = [] + for pattern in source_patterns: + matches = sorted(repo_root.glob(pattern)) + if not matches: + continue + stale_sources.extend(path for path in matches if path.stat().st_mtime > binary_mtime) + if stale_sources: + newest = max(stale_sources, key=lambda path: path.stat().st_mtime) + rel = newest.relative_to(repo_root) + raise SystemExit( + f"{binary}: stale binary; {rel} is newer. " + "Rebuild before running the candidate gate, or pass --allow-stale-binary " + "only when intentionally summarizing old artifacts." 
def parse_env_overrides(values: list[str]) -> dict[str, str]:
    """Turn ``NAME=VALUE`` strings into an environment mapping.

    Each item is split at its first ``=``, so the value part may itself
    contain ``=`` and may be empty.  Later items override earlier ones with
    the same name.

    Raises:
        SystemExit: if an item has no ``=`` or an empty name.
    """
    overrides: dict[str, str] = {}
    for item in values:
        key, separator, payload = item.partition("=")
        if not separator or not key:
            raise SystemExit(f"--set-env expects NAME=VALUE, got: {item}")
        overrides[key] = payload
    return overrides
int(row["ctx_tokens"]) + rows[ctx] = { + "prefill_tps": float(row["prefill_tps"]), + "gen_tps": float(row["gen_tps"]), + } + if not rows: + raise SystemExit(f"{path}: no data rows") + return rows + + +def summarize_repeats( + csv_paths: dict[str, list[Path]], + *, + baseline_name: str, + tensor_name: str, + candidate_name: str, +) -> dict[str, Any]: + raw: dict[str, list[dict[int, dict[str, float]]]] = { + name: [read_bench_csv(path) for path in paths] + for name, paths in csv_paths.items() + } + context_sets = [ + set().union(*(run.keys() for run in repeats)) + for repeats in raw.values() + ] + contexts = sorted(set.intersection(*context_sets)) + if not contexts: + raise SystemExit("benchmark CSVs have no shared ctx_tokens values") + + runs: dict[str, dict[str, Any]] = {} + for name, repeats in raw.items(): + by_context: dict[str, Any] = {} + for ctx in contexts: + prefill = [run[ctx]["prefill_tps"] for run in repeats if ctx in run] + gen = [run[ctx]["gen_tps"] for run in repeats if ctx in run] + by_context[str(ctx)] = { + "prefill_tps_median": statistics.median(prefill), + "gen_tps_median": statistics.median(gen), + "prefill_tps_values": prefill, + "gen_tps_values": gen, + } + runs[name] = {"contexts": by_context} + + gains: dict[str, dict[str, Any]] = {} + for other_name, base_name in ( + (tensor_name, baseline_name), + (candidate_name, baseline_name), + (candidate_name, tensor_name), + ): + pair = f"{other_name}_vs_{base_name}" + gains[pair] = {} + for ctx in contexts: + ctx_key = str(ctx) + other = runs[other_name]["contexts"][ctx_key] + base = runs[base_name]["contexts"][ctx_key] + base_prefill = base["prefill_tps_median"] + base_gen = base["gen_tps_median"] + gains[pair][ctx_key] = { + "prefill_gain_pct": ((other["prefill_tps_median"] / base_prefill) - 1.0) * 100.0 + if base_prefill + else 0.0, + "gen_gain_pct": ((other["gen_tps_median"] / base_gen) - 1.0) * 100.0 + if base_gen + else 0.0, + } + + return { + "contexts": contexts, + "runs": runs, + "gains": 
def evaluate_prefill_speed(
    summary: dict[str, Any],
    *,
    candidate_name: str,
    min_prefill_gain_pct: float,
    min_repeat_prefill_gain_pct: float,
    min_generation_gain_pct: float,
) -> dict[str, Any]:
    """Score the candidate against the ``tensor`` run at every context.

    For each context the median gains are read from
    ``summary["gains"]["<candidate_name>_vs_tensor"]`` and per-repeat gains
    are recomputed by pairing the candidate and tensor value lists by index.
    When a context has no paired repeats, the median gain stands in for the
    repeat minimum.  The result carries the thresholds, the worst gains
    across contexts, three overall pass flags and the per-context rows.
    """

    def relative_gains(candidate_values: list[float], tensor_values: list[float]) -> list[float]:
        # Percentage gain per paired repeat; a zero baseline counts as 0%.
        return [
            ((measured / reference) - 1.0) * 100.0 if reference else 0.0
            for measured, reference in zip(candidate_values, tensor_values)
        ]

    pair_gains = summary["gains"][f"{candidate_name}_vs_tensor"]
    per_context: list[dict[str, Any]] = []
    for ctx in summary["contexts"]:
        key = str(ctx)
        median_gain = pair_gains[key]
        tensor_stats = summary["runs"]["tensor"]["contexts"][key]
        candidate_stats = summary["runs"][candidate_name]["contexts"][key]

        prefill_repeats = relative_gains(
            candidate_stats["prefill_tps_values"],
            tensor_stats["prefill_tps_values"],
        )
        generation_repeats = relative_gains(
            candidate_stats["gen_tps_values"],
            tensor_stats["gen_tps_values"],
        )

        if prefill_repeats:
            worst_prefill_repeat = min(prefill_repeats)
        else:
            worst_prefill_repeat = median_gain["prefill_gain_pct"]
        if generation_repeats:
            worst_generation_repeat = min(generation_repeats)
        else:
            worst_generation_repeat = median_gain["gen_gain_pct"]

        record: dict[str, Any] = {"ctx": ctx}
        record["prefill_gain_pct"] = median_gain["prefill_gain_pct"]
        record["gen_gain_pct"] = median_gain["gen_gain_pct"]
        record["repeat_prefill_gain_pct_values"] = prefill_repeats
        record["repeat_generation_gain_pct_values"] = generation_repeats
        record["min_repeat_prefill_gain_pct"] = worst_prefill_repeat
        record["min_repeat_generation_gain_pct"] = worst_generation_repeat
        record["prefill_ok"] = median_gain["prefill_gain_pct"] >= min_prefill_gain_pct
        record["repeat_prefill_ok"] = worst_prefill_repeat >= min_repeat_prefill_gain_pct
        record["generation_ok"] = median_gain["gen_gain_pct"] >= min_generation_gain_pct
        per_context.append(record)

    def worst(field: str) -> float:
        # Smallest value of one per-context field across all contexts.
        return min(record[field] for record in per_context)

    def every(flag: str) -> bool:
        return all(record[flag] for record in per_context)

    return {
        "min_prefill_gain_pct_required": min_prefill_gain_pct,
        "min_repeat_prefill_gain_pct_required": min_repeat_prefill_gain_pct,
        "min_generation_gain_pct_required": min_generation_gain_pct,
        "min_prefill_gain_pct": worst("prefill_gain_pct"),
        "min_repeat_prefill_gain_pct": worst("min_repeat_prefill_gain_pct"),
        "min_repeat_generation_gain_pct": worst("min_repeat_generation_gain_pct"),
        "min_generation_gain_pct": worst("gen_gain_pct"),
        "all_prefill_contexts_ok": every("prefill_ok"),
        "all_repeat_prefill_contexts_ok": every("repeat_prefill_ok"),
        "all_generation_contexts_ok": every("generation_ok"),
        "contexts": per_context,
    }
) + if not speed_gate["all_repeat_prefill_contexts_ok"]: + reasons.append( + "candidate prefill is not above the repeat-level Tensor baseline floor " + f"(min repeat={speed_gate['min_repeat_prefill_gain_pct']:.1f}%, " + f"required={speed_gate['min_repeat_prefill_gain_pct_required']:.1f}%)" + ) + if not speed_gate["all_generation_contexts_ok"]: + reasons.append( + "candidate generation is below the allowed Tensor-baseline floor " + f"(min={speed_gate['min_generation_gain_pct']:.1f}%, " + f"required={speed_gate['min_generation_gain_pct_required']:.1f}%)" + ) + return "; ".join(reasons) if reasons else "speed screen failed" + + +def candidate_env_requires_chunked_drift(candidate_env: dict[str, str]) -> bool: + for value in candidate_env.values(): + for match in re.finditer(r"\bpos\s*[:=]\s*(\d+)", value): + if int(match.group(1)) != 0: + return True + return False + + +def load_drift_payload(path: str | None) -> dict[str, Any] | None: + if not path: + return None + try: + with Path(path).open("r", encoding="utf-8") as fp: + return json.load(fp) + except (FileNotFoundError, json.JSONDecodeError): + return None + + +def tensor_pair_summary_for_gate( + gate_payload: dict[str, Any], + *, + pair_name: str, + max_tensor_standard_rms: float, + max_tensor_standard_top20_abs: float, +) -> dict[str, Any]: + tensor_delta = gate_payload["pairs"][pair_name]["summary"] + tensor_extrema = gate_payload["pairs"][pair_name].get("extrema", {}) + failures = list(gate_payload.get("gate_failures", [])) + result = { + "pair": pair_name, + "ok": len(failures) == 0, + "failures": failures, + "max_tensor_standard_rms": max_tensor_standard_rms, + "max_tensor_standard_top20_abs": max_tensor_standard_top20_abs, + "tensor_vs_standard_top1_mismatches": tensor_delta["top1_mismatches"], + "tensor_vs_standard_greedy_mismatches": tensor_delta.get("greedy_mismatches"), + "tensor_vs_standard_min_top20_overlap": tensor_delta["min_top20_overlap"], + "tensor_vs_standard_worst_rms": 
tensor_delta["worst_rms"], + "tensor_vs_standard_worst_top20_max_abs": tensor_delta["worst_top20_max_abs"], + "tensor_vs_standard_worst_rms_case": ( + tensor_extrema.get("worst_rms_case") or + tensor_extrema.get("worst_rms_frontier") + ), + "tensor_vs_standard_worst_top20_max_abs_case": ( + tensor_extrema.get("worst_top20_max_abs_case") or + tensor_extrema.get("worst_top20_max_abs_frontier") + ), + "tensor_vs_standard_min_top20_overlap_case": ( + tensor_extrema.get("min_top20_overlap_case") or + tensor_extrema.get("min_top20_overlap_frontier") + ), + } + rms_failure_present = any("worst_rms exceeds configured envelope" in failure or + "worst RMS exceeds configured envelope" in failure + for failure in failures) + top20_failure_present = any("worst_top20_max_abs exceeds configured envelope" in failure or + "worst top20 abs exceeds configured envelope" in failure + for failure in failures) + if tensor_delta["worst_rms"] > max_tensor_standard_rms: + result["ok"] = False + if not rms_failure_present: + failures.append( + f"{pair_name} worst RMS exceeds configured envelope " + f"({tensor_delta['worst_rms']:.6g} > {max_tensor_standard_rms:.6g})" + ) + if tensor_delta["worst_top20_max_abs"] > max_tensor_standard_top20_abs: + result["ok"] = False + if not top20_failure_present: + failures.append( + f"{pair_name} worst top20 abs exceeds configured envelope " + f"({tensor_delta['worst_top20_max_abs']:.6g} > " + f"{max_tensor_standard_top20_abs:.6g})" + ) + result["failures"] = failures + return result + + +def evaluate_candidate( + payload: dict[str, Any], + *, + min_prefill_gain_pct: float, + min_repeat_prefill_gain_pct: float, + min_generation_gain_pct: float, + max_tensor_standard_rms: float, + max_tensor_standard_top20_abs: float, +) -> dict[str, Any]: + speed = payload.get("speed_summary") + speed_gate = None + if speed is not None: + speed_gate = evaluate_prefill_speed(speed, + candidate_name=payload["candidate_name"], + min_prefill_gain_pct=min_prefill_gain_pct, + 
min_repeat_prefill_gain_pct=min_repeat_prefill_gain_pct, + min_generation_gain_pct=min_generation_gain_pct) + + drift_path = payload.get("quality_drift_gate_summary") + drift_payload = load_drift_payload(drift_path) + drift_gate = { + "run": drift_payload is not None, + "ok": False, + "failures": ["drift gate was not run"] if drift_payload is None else + list(drift_payload.get("gate_failures", [])), + } + if drift_payload is not None: + tensor_gate = tensor_pair_summary_for_gate( + drift_payload, + pair_name="tensor_vs_standard", + max_tensor_standard_rms=max_tensor_standard_rms, + max_tensor_standard_top20_abs=max_tensor_standard_top20_abs, + ) + drift_gate.update({ + "ok": tensor_gate["ok"], + "failures": tensor_gate["failures"], + **{ + key: value + for key, value in tensor_gate.items() + if key not in {"ok", "failures"} + }, + }) + + failures: list[str] = [] + if speed_gate is None: + failures.append("speed summary missing") + elif not speed_gate["all_prefill_contexts_ok"]: + failures.append( + "candidate prefill is not above Tensor baseline at every measured context " + f"(min={speed_gate['min_prefill_gain_pct']:.1f}%, " + f"required={speed_gate['min_prefill_gain_pct_required']:.1f}%)" + ) + if speed_gate is not None and not speed_gate["all_repeat_prefill_contexts_ok"]: + failures.append( + "candidate prefill is not above the repeat-level Tensor baseline floor " + f"(min repeat={speed_gate['min_repeat_prefill_gain_pct']:.1f}%, " + f"required={speed_gate['min_repeat_prefill_gain_pct_required']:.1f}%)" + ) + if speed_gate is not None and not speed_gate["all_generation_contexts_ok"]: + failures.append( + "candidate generation is below the allowed Tensor-baseline floor " + f"(min={speed_gate['min_generation_gain_pct']:.1f}%, " + f"required={speed_gate['min_generation_gain_pct_required']:.1f}%)" + ) + if not drift_gate["ok"]: + failures.extend(drift_gate["failures"]) + + chunked_required = candidate_env_requires_chunked_drift(payload.get("candidate_env", {})) + 
chunked_payload = load_drift_payload(payload.get("chunked_drift_gate_summary")) + coverage_gate: dict[str, Any] = { + "required": chunked_required, + "run": chunked_payload is not None, + "ok": True, + "failures": [], + } + if chunked_required and chunked_payload is None: + coverage_gate["ok"] = False + coverage_gate["failures"].append( + "candidate uses nonzero pos= route filters; the five-fixture drift " + "gate does not prove those continuation-prefill chunks, so run the " + "chunked frontier drift gate before promotion" + ) + elif chunked_payload is not None: + coverage_pair = ( + "tensor_vs_default_tensor" + if "tensor_vs_default_tensor" in chunked_payload.get("pairs", {}) + else "tensor_vs_standard" + ) + chunked_gate = tensor_pair_summary_for_gate( + chunked_payload, + pair_name=coverage_pair, + max_tensor_standard_rms=max_tensor_standard_rms, + max_tensor_standard_top20_abs=max_tensor_standard_top20_abs, + ) + coverage_gate.update({ + "ok": chunked_gate["ok"], + **{ + key: value + for key, value in chunked_gate.items() + if key not in {"ok"} + }, + }) + coverage_gate["failures"] = [ + f"chunked drift gate: {failure}" + for failure in chunked_gate["failures"] + ] + coverage_failures = coverage_gate["failures"] + failures.extend(coverage_failures) + + return { + "promotion_safe": len(failures) == 0, + "failures": failures, + "speed_gate": speed_gate, + "drift_gate": drift_gate, + "coverage_gate": coverage_gate, + } + + +def markdown_escape(value: object) -> str: + return str(value).replace("|", "\\|") + + +def shell_join(argv: list[object]) -> str: + return " ".join(shlex.quote(str(part)) for part in argv) + + +def fmt_pct(value: float) -> str: + return f"{value:+.1f}%" + + +def fmt_pct_list(values: list[float]) -> str: + return ", ".join(fmt_pct(value) for value in values) + + +def markdown_speed_summary(summary: dict[str, Any], *, candidate_name: str) -> str: + lines = [ + "## Median Speed", + "", + "| Ctx | Standard prefill | Tensor prefill | Candidate 
prefill | Candidate vs Tensor prefill | Candidate vs Tensor generation |", + "| ---: | ---: | ---: | ---: | ---: | ---: |", + ] + gains = summary["gains"][f"{candidate_name}_vs_tensor"] + for ctx in summary["contexts"]: + ctx_key = str(ctx) + standard = summary["runs"]["standard"]["contexts"][ctx_key] + tensor = summary["runs"]["tensor"]["contexts"][ctx_key] + candidate = summary["runs"][candidate_name]["contexts"][ctx_key] + gain = gains[ctx_key] + lines.append( + "| " + f"{ctx} | " + f"{standard['prefill_tps_median']:.2f} | " + f"{tensor['prefill_tps_median']:.2f} | " + f"{candidate['prefill_tps_median']:.2f} | " + f"{fmt_pct(gain['prefill_gain_pct'])} | " + f"{fmt_pct(gain['gen_gain_pct'])} |" + ) + return "\n".join(lines) + + +def markdown_drift_summary(payload: dict[str, Any]) -> str: + summary_path = payload.get("quality_drift_gate_summary") + markdown_path = payload.get("quality_drift_gate_markdown") + if not summary_path: + skip_reason = payload.get("quality_drift_gate_skipped_reason") + if skip_reason: + return "\n".join( + [ + "## Drift Gate", + "", + "Skipped because the speed screen failed.", + "", + f"Reason: {markdown_escape(skip_reason)}", + ] + ) + return "\n".join( + [ + "## Drift Gate", + "", + "Not run. 
Use `--run-drift-gate` after the speed screen passes before promoting a prefill candidate.", + ] + ) + + lines = ["## Drift Gate", ""] + drift_payload: dict[str, Any] | None = None + try: + with Path(summary_path).open("r", encoding="utf-8") as fp: + drift_payload = json.load(fp) + except FileNotFoundError: + lines.append(f"Summary JSON not found: `{markdown_escape(summary_path)}`") + except json.JSONDecodeError as exc: + lines.append(f"Could not parse `{markdown_escape(summary_path)}`: {exc}") + + if drift_payload is not None: + failures = drift_payload.get("gate_failures", []) + lines.append(f"Gate: {'FAIL' if failures else 'OK'}") + lines.append("") + if failures: + lines.append("Failures:") + lines.append("") + for failure in failures: + lines.append(f"- {markdown_escape(failure)}") + lines.append("") + + lines.extend( + [ + "| Pair | Top1 mismatches | Greedy mismatches | Min top20 | Worst RMS | Worst top20 abs |", + "| --- | ---: | ---: | ---: | ---: | ---: |", + ] + ) + for pair_name in drift_payload.get( + "pair_order", + ("standard_vs_quality", "tensor_vs_quality", "tensor_vs_standard"), + ): + pair_payload = drift_payload["pairs"][pair_name] + pair_summary = pair_payload["summary"] + lines.append( + "| " + f"{markdown_escape(pair_name)} | " + f"{pair_summary['top1_mismatches']} | " + f"{pair_summary['greedy_mismatches']} | " + f"{pair_summary['min_top20_overlap']}/20 | " + f"{pair_summary['worst_rms']:.6g} | " + f"{pair_summary['worst_top20_max_abs']:.6g} |" + ) + target_extrema = drift_payload["pairs"].get("tensor_vs_standard", {}).get("extrema") + if target_extrema: + lines.extend( + [ + "", + "| Tensor-vs-standard target | Fixture | Value |", + "| --- | --- | ---: |", + "| Worst RMS | " + f"{markdown_escape(target_extrema.get('worst_rms_case'))} | " + f"{target_extrema['worst_rms']:.6g} |", + "| Worst top20 abs | " + f"{markdown_escape(target_extrema.get('worst_top20_max_abs_case'))} | " + f"{target_extrema['worst_top20_max_abs']:.6g} |", + "| Min top20 
overlap | " + f"{markdown_escape(target_extrema.get('min_top20_overlap_case'))} | " + f"{target_extrema['min_top20_overlap']}/20 |", + ] + ) + lines.extend(["", "Artifacts:", ""]) + lines.append(f"- JSON: `{markdown_escape(summary_path)}`") + if markdown_path: + lines.append(f"- Markdown: `{markdown_escape(markdown_path)}`") + return "\n".join(lines) + + +def markdown_chunked_drift_summary(payload: dict[str, Any]) -> str: + required = candidate_env_requires_chunked_drift(payload.get("candidate_env", {})) + summary_path = payload.get("chunked_drift_gate_summary") + markdown_path = payload.get("chunked_drift_gate_markdown") + skip_reason = payload.get("chunked_drift_gate_skipped_reason") + if not required and not summary_path and not skip_reason: + return "" + + if not summary_path: + lines = ["## Chunked Drift Gate", ""] + if skip_reason: + lines.extend([ + "Skipped because the speed screen failed.", + "", + f"Reason: {markdown_escape(skip_reason)}", + ]) + elif required: + lines.append( + "Not run. This candidate uses nonzero `pos=` filters, so run " + "`--run-drift-gate` to capture resumed-prefill frontier drift before promotion." 
+ ) + else: + lines.append("Not run.") + return "\n".join(lines) + + lines = ["## Chunked Drift Gate", ""] + drift_payload: dict[str, Any] | None = None + try: + with Path(summary_path).open("r", encoding="utf-8") as fp: + drift_payload = json.load(fp) + except FileNotFoundError: + lines.append(f"Summary JSON not found: `{markdown_escape(summary_path)}`") + except json.JSONDecodeError as exc: + lines.append(f"Could not parse `{markdown_escape(summary_path)}`: {exc}") + + if drift_payload is not None: + failures = drift_payload.get("gate_failures", []) + lines.append(f"Gate: {'FAIL' if failures else 'OK'}") + lines.append("") + if failures: + lines.append("Failures:") + lines.append("") + for failure in failures: + lines.append(f"- {markdown_escape(failure)}") + lines.append("") + lines.extend( + [ + "| Pair | Top1 mismatches | Min top20 | Worst RMS | Worst RMS frontier | Worst top20 abs | Worst top20 abs frontier |", + "| --- | ---: | ---: | ---: | ---: | ---: | ---: |", + ] + ) + for pair_name in drift_payload.get( + "pair_order", + ("standard_vs_quality", "tensor_vs_quality", "tensor_vs_standard"), + ): + pair_payload = drift_payload["pairs"][pair_name] + pair_summary = pair_payload["summary"] + pair_extrema = pair_payload.get("extrema", {}) + lines.append( + "| " + f"{markdown_escape(pair_name)} | " + f"{pair_summary['top1_mismatches']} | " + f"{pair_summary['min_top20_overlap']}/20 | " + f"{pair_summary['worst_rms']:.6g} | " + f"{markdown_escape(pair_extrema.get('worst_rms_frontier', 'n/a'))} | " + f"{pair_summary['worst_top20_max_abs']:.6g} | " + f"{markdown_escape(pair_extrema.get('worst_top20_max_abs_frontier', 'n/a'))} |" + ) + lines.extend(["", "Artifacts:", ""]) + lines.append(f"- JSON: `{markdown_escape(summary_path)}`") + if markdown_path: + lines.append(f"- Markdown: `{markdown_escape(markdown_path)}`") + return "\n".join(lines) + + +def markdown_promotion_summary(payload: dict[str, Any]) -> str: + decision = payload.get("promotion_decision") + if not 
decision: + return "\n".join(["## Promotion Decision", "", "Not evaluated."]) + + lines = [ + "## Promotion Decision", + "", + f"Promotion-safe: {'yes' if decision['promotion_safe'] else 'no'}", + "", + ] + if decision["failures"]: + lines.append("Reasons:") + lines.append("") + for failure in decision["failures"]: + lines.append(f"- {markdown_escape(failure)}") + lines.append("") + + speed_gate = decision.get("speed_gate") + if speed_gate: + lines.extend( + [ + "| Speed gate | Value |", + "| --- | ---: |", + f"| Required min prefill gain | {fmt_pct(speed_gate['min_prefill_gain_pct_required'])} |", + f"| Required min repeat prefill gain | {fmt_pct(speed_gate['min_repeat_prefill_gain_pct_required'])} |", + f"| Required min generation gain | {fmt_pct(speed_gate['min_generation_gain_pct_required'])} |", + f"| Observed min prefill gain | {fmt_pct(speed_gate['min_prefill_gain_pct'])} |", + f"| Observed min repeat prefill gain | {fmt_pct(speed_gate['min_repeat_prefill_gain_pct'])} |", + f"| Observed min generation gain | {fmt_pct(speed_gate['min_generation_gain_pct'])} |", + f"| Observed min repeat generation gain | {fmt_pct(speed_gate['min_repeat_generation_gain_pct'])} |", + f"| All prefill contexts pass | {'yes' if speed_gate['all_prefill_contexts_ok'] else 'no'} |", + f"| All repeat prefill contexts pass | {'yes' if speed_gate['all_repeat_prefill_contexts_ok'] else 'no'} |", + f"| All generation contexts pass | {'yes' if speed_gate['all_generation_contexts_ok'] else 'no'} |", + "", + ] + ) + lines.extend( + [ + "| Ctx | Median prefill | Repeat prefill | Median generation | Repeat generation |", + "| ---: | ---: | --- | ---: | --- |", + ] + ) + for row in speed_gate["contexts"]: + lines.append( + "| " + f"{row['ctx']} | " + f"{fmt_pct(row['prefill_gain_pct'])} | " + f"{markdown_escape(fmt_pct_list(row['repeat_prefill_gain_pct_values']))} | " + f"{fmt_pct(row['gen_gain_pct'])} | " + f"{markdown_escape(fmt_pct_list(row['repeat_generation_gain_pct_values']))} |" + ) + 
lines.append("") + + drift_gate = decision.get("drift_gate") + if drift_gate: + lines.extend( + [ + "| Drift gate | Value |", + "| --- | ---: |", + f"| Run | {'yes' if drift_gate['run'] else 'no'} |", + f"| OK | {'yes' if drift_gate['ok'] else 'no'} |", + ] + ) + if drift_gate.get("run"): + lines.extend( + [ + f"| Max Tensor-vs-standard RMS | {drift_gate['max_tensor_standard_rms']:.6g} |", + f"| Max Tensor-vs-standard top20 abs | {drift_gate['max_tensor_standard_top20_abs']:.6g} |", + f"| Tensor-vs-standard top1 mismatches | {drift_gate['tensor_vs_standard_top1_mismatches']} |", + f"| Tensor-vs-standard greedy mismatches | {drift_gate['tensor_vs_standard_greedy_mismatches']} |", + f"| Tensor-vs-standard min top20 | {drift_gate['tensor_vs_standard_min_top20_overlap']}/20 |", + f"| Tensor-vs-standard worst RMS | {drift_gate['tensor_vs_standard_worst_rms']:.6g} |", + f"| Tensor-vs-standard worst RMS case | {markdown_escape(drift_gate.get('tensor_vs_standard_worst_rms_case') or 'n/a')} |", + f"| Tensor-vs-standard worst top20 abs | {drift_gate['tensor_vs_standard_worst_top20_max_abs']:.6g} |", + f"| Tensor-vs-standard worst top20 abs case | {markdown_escape(drift_gate.get('tensor_vs_standard_worst_top20_max_abs_case') or 'n/a')} |", + ] + ) + lines.append("") + coverage_gate = decision.get("coverage_gate") + if coverage_gate: + lines.extend( + [ + "", + "| Coverage gate | Value |", + "| --- | ---: |", + f"| Requires chunked drift coverage | {'yes' if coverage_gate.get('required') else 'no'} |", + f"| Chunked drift run | {'yes' if coverage_gate.get('run') else 'no'} |", + f"| OK | {'yes' if coverage_gate['ok'] else 'no'} |", + ] + ) + if coverage_gate.get("run") and "tensor_vs_standard_worst_rms" in coverage_gate: + lines.extend( + [ + f"| Coverage pair | {markdown_escape(coverage_gate.get('pair') or 'n/a')} |", + f"| Max coverage RMS | {coverage_gate['max_tensor_standard_rms']:.6g} |", + f"| Max coverage top20 abs | {coverage_gate['max_tensor_standard_top20_abs']:.6g} 
|", + f"| Coverage top1 mismatches | {coverage_gate['tensor_vs_standard_top1_mismatches']} |", + f"| Coverage min top20 | {coverage_gate['tensor_vs_standard_min_top20_overlap']}/20 |", + f"| Coverage worst RMS | {coverage_gate['tensor_vs_standard_worst_rms']:.6g} |", + f"| Coverage worst RMS frontier | {markdown_escape(coverage_gate.get('tensor_vs_standard_worst_rms_case') or 'n/a')} |", + f"| Coverage worst top20 abs | {coverage_gate['tensor_vs_standard_worst_top20_max_abs']:.6g} |", + f"| Coverage worst top20 abs frontier | {markdown_escape(coverage_gate.get('tensor_vs_standard_worst_top20_max_abs_case') or 'n/a')} |", + ] + ) + return "\n".join(lines) + + +def markdown_run_config(payload: dict[str, Any]) -> str: + config = payload.get("run_config") + if not config: + return "" + lines = [ + "## Run Config", + "", + "| Setting | Value |", + "| --- | --- |", + ] + for key in ( + "repo_root", + "ds4_bench", + "ds4", + "model", + "prompt_file", + "out_dir", + "ctx_start", + "ctx_max", + "step_mul", + "gen_tokens", + "repeat", + "candidate_preset", + "candidate_mode", + "reuse", + "run_drift_gate", + "min_prefill_gain_pct", + "min_repeat_prefill_gain_pct", + "min_generation_gain_pct", + "max_tensor_standard_rms", + "max_tensor_standard_top20_abs", + ): + if key in config: + lines.append(f"| `{markdown_escape(key)}` | `{markdown_escape(config[key])}` |") + if config.get("argv"): + lines.extend( + [ + "", + "Replay command:", + "", + "```sh", + shell_join(["python3", *config["argv"]]), + "```", + ] + ) + return "\n".join(lines) + + +def write_candidate_markdown_summary(payload: dict[str, Any], path: Path) -> None: + lines = [ + "# Prefill Candidate Gate", + "", + f"Candidate: `{markdown_escape(payload['candidate_label'])}`", + f"Mode: `-mt {markdown_escape(payload['candidate_mode'])}`", + "", + ] + if payload.get("candidate_preset"): + lines.append(f"Preset: `{markdown_escape(payload['candidate_preset'])}`") + lines.append("") + candidate_env = payload["candidate_env"] 
+ if candidate_env: + lines.append("Environment overrides:") + lines.append("") + for name, value in sorted(candidate_env.items()): + lines.append(f"- `{markdown_escape(name)}={markdown_escape(value)}`") + else: + lines.append("Environment overrides: none") + lines.append("") + run_config = markdown_run_config(payload) + if run_config: + lines.append(run_config) + lines.append("") + lines.append(markdown_promotion_summary(payload)) + lines.append("") + + if "speed_summary" in payload: + lines.append(markdown_speed_summary(payload["speed_summary"], + candidate_name=payload["candidate_name"])) + else: + lines.append("## Median Speed") + lines.append("") + lines.append("Not available in dry-run mode.") + lines.append("") + lines.append(markdown_drift_summary(payload)) + chunked_drift_summary = markdown_chunked_drift_summary(payload) + if chunked_drift_summary: + lines.append("") + lines.append(chunked_drift_summary) + lines.append("") + lines.append("## CSV Inputs") + lines.append("") + for name, paths in payload["csv_paths"].items(): + for csv_path in paths: + lines.append(f"- `{markdown_escape(name)}`: `{markdown_escape(csv_path)}`") + + path.write_text("\n".join(lines).rstrip() + "\n", encoding="utf-8") + + +def build_run_config(args: argparse.Namespace) -> dict[str, Any]: + return { + "argv": sys.argv, + "repo_root": str(args.repo_root), + "ds4_bench": str(args.ds4_bench), + "ds4": str(args.ds4), + "python": str(args.python), + "model": str(args.model) if args.model else None, + "prompt_file": str(args.prompt_file), + "out_dir": str(args.out_dir), + "candidate_preset": args.preset, + "candidate_label": args.candidate_label, + "candidate_mode": args.candidate_mode, + "ctx_start": args.ctx_start, + "ctx_max": args.ctx_max, + "step_mul": args.step_mul, + "gen_tokens": args.gen_tokens, + "repeat": args.repeat, + "min_prefill_gain_pct": args.min_prefill_gain_pct, + "min_repeat_prefill_gain_pct": args.min_repeat_prefill_gain_pct, + "min_generation_gain_pct": 
args.min_generation_gain_pct, + "max_tensor_standard_rms": args.max_tensor_standard_rms, + "max_tensor_standard_top20_abs": args.max_tensor_standard_top20_abs, + "run_drift_gate": args.run_drift_gate, + "fail_on_quality_greedy": args.fail_on_quality_greedy, + "allow_stale_binary": args.allow_stale_binary, + "reuse": args.reuse, + "no_fail": args.no_fail, + "dry_run": args.dry_run, + } + + +def run_benchmarks(args: argparse.Namespace, candidate_env: dict[str, str]) -> dict[str, list[Path]]: + candidate_name = safe_label(args.candidate_label) + if candidate_name in {"standard", "tensor"}: + raise SystemExit("--candidate-label must not resolve to 'standard' or 'tensor'") + runs = ( + BenchRun("standard", "Standard Metal", ["-mt", "off"], {}), + BenchRun("tensor", "Tensor Metal", ["-mt", "auto"], {}), + BenchRun(candidate_name, args.candidate_label, ["-mt", args.candidate_mode], candidate_env), + ) + common_args = [ + "--prompt-file", + str(args.prompt_file), + "--ctx-start", + str(args.ctx_start), + "--ctx-max", + str(args.ctx_max), + "--step-mul", + str(args.step_mul), + "--gen-tokens", + str(args.gen_tokens), + ] + if args.model: + common_args[:0] = ["-m", str(args.model)] + + csv_paths: dict[str, list[Path]] = {run.name: [] for run in runs} + for repeat in range(1, args.repeat + 1): + repeat_dir = args.out_dir / f"repeat-{repeat}" + repeat_dir.mkdir(parents=True, exist_ok=True) + chart_inputs: list[Path] = [] + chart_labels: list[str] = [] + for run in runs: + csv_path = repeat_dir / f"{run.name}.csv" + csv_paths[run.name].append(csv_path) + cmd = [str(args.ds4_bench)] + run.mode_args + common_args + ["--csv", str(csv_path)] + print(f"\nrepeat {repeat}/{args.repeat}: {run.label} -> {csv_path}") + if args.reuse and csv_path.exists(): + print(f"reuse {csv_path}", flush=True) + else: + run_command(cmd, cwd=args.repo_root, env_overrides=run.env, dry_run=args.dry_run) + chart_inputs.append(csv_path) + chart_labels.append(run.label) + + chart_path = repeat_dir / 
"prefill-candidate.png" + compare_cmd = [ + str(args.python), + "speed-bench/compare_bench.py", + *[str(path) for path in chart_inputs], + "--labels", + *chart_labels, + "--title", + f"Prefill candidate: {args.candidate_label} (repeat {repeat})", + "-o", + str(chart_path), + ] + if args.reuse and chart_path.exists(): + print(f"reuse {chart_path}", flush=True) + else: + run_command(compare_cmd, cwd=args.repo_root, env_overrides={}, dry_run=args.dry_run) + + return csv_paths + + +def run_drift_gate(args: argparse.Namespace, candidate_env: dict[str, str]) -> Path: + gate_dir = args.out_dir / "quality-drift-gate" + cmd = [ + str(args.python), + "speed-bench/run_quality_drift_gate.py", + "--repo-root", + str(args.repo_root), + "--ds4", + str(args.ds4), + "--out-dir", + str(gate_dir), + ] + if args.model: + cmd += ["--model", str(args.model)] + if args.fail_on_quality_greedy: + cmd.append("--fail-on-quality-greedy") + cmd.append("--no-fail") + if args.reuse: + cmd.append("--reuse") + if args.allow_stale_binary: + cmd.append("--allow-stale-binary") + cmd += ["--max-tensor-standard-rms", str(args.max_tensor_standard_rms)] + cmd += ["--max-tensor-standard-top20-abs", str(args.max_tensor_standard_top20_abs)] + for name, value in sorted(candidate_env.items()): + cmd += ["--set-env", f"{name}={value}"] + run_command(cmd, cwd=args.repo_root, env_overrides={}, dry_run=args.dry_run) + return gate_dir + + +def run_chunked_drift_gate(args: argparse.Namespace, candidate_env: dict[str, str]) -> Path: + gate_dir = args.out_dir / "chunked-drift-gate" + cmd = [ + str(args.python), + "speed-bench/run_chunked_prefill_drift_gate.py", + "--repo-root", + str(args.repo_root), + "--ds4-bench", + str(args.ds4_bench), + "--prompt-file", + str(args.prompt_file), + "--out-dir", + str(gate_dir), + "--ctx-start", + str(args.ctx_start), + "--ctx-max", + str(args.ctx_max), + "--step-mul", + str(args.step_mul), + "--gen-tokens", + "1", + "--max-tensor-default-rms", + str(args.max_tensor_standard_rms), 
+ "--max-tensor-default-top20-abs", + str(args.max_tensor_standard_top20_abs), + "--no-fail", + ] + if args.model: + cmd += ["--model", str(args.model)] + if args.reuse: + cmd.append("--reuse") + for name, value in sorted(candidate_env.items()): + cmd += ["--set-env", f"{name}={value}"] + run_command(cmd, cwd=args.repo_root, env_overrides={}, dry_run=args.dry_run) + return gate_dir + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=f"Candidate presets:\n{preset_help()}", + ) + parser.add_argument("--repo-root", type=Path, default=Path(".")) + parser.add_argument("--ds4-bench", type=Path, default=Path("./ds4-bench")) + parser.add_argument("--ds4", type=Path, default=Path("./ds4")) + parser.add_argument("--python", type=Path, default=Path(sys.executable)) + parser.add_argument("--model", type=Path) + parser.add_argument("--prompt-file", type=Path, default=Path("speed-bench/promessi_sposi.txt")) + parser.add_argument("--out-dir", type=Path) + parser.add_argument( + "--preset", + choices=sorted(CANDIDATE_PRESETS), + help="Use a named default-off candidate environment preset.", + ) + parser.add_argument("--candidate-label") + parser.add_argument("--candidate-mode", choices=("auto", "on", "off"), default="auto") + parser.add_argument("--ctx-start", type=int, default=512) + parser.add_argument("--ctx-max", type=int, default=8192) + parser.add_argument("--step-mul", type=int, default=2) + parser.add_argument("--gen-tokens", type=int, default=16) + parser.add_argument("--repeat", type=int, default=2) + parser.add_argument( + "--min-prefill-gain-pct", + type=float, + default=0.0, + help="Minimum candidate-vs-Tensor prefill gain required at every measured context for promotion.", + ) + parser.add_argument( + "--min-repeat-prefill-gain-pct", + type=float, + default=0.0, + help="Minimum candidate-vs-Tensor prefill gain required for every repeat/context pair.", + 
) + parser.add_argument( + "--min-generation-gain-pct", + type=float, + default=-5.0, + help="Minimum candidate-vs-Tensor generation gain allowed at every measured context for promotion.", + ) + parser.add_argument( + "--max-tensor-standard-rms", + type=float, + default=0.30, + help="Maximum Tensor-vs-standard worst RMS allowed for production promotion.", + ) + parser.add_argument( + "--max-tensor-standard-top20-abs", + type=float, + default=0.60, + help="Maximum Tensor-vs-standard worst top-20 absolute drift allowed for production promotion.", + ) + parser.add_argument( + "--set-env", + action="append", + default=[], + metavar="NAME=VALUE", + help="Set an environment variable only for the candidate bench and drift gate.", + ) + parser.add_argument("--run-drift-gate", action="store_true") + parser.add_argument("--fail-on-quality-greedy", action="store_true") + parser.add_argument( + "--reuse", + action="store_true", + help="Reuse existing benchmark CSVs/charts and drift-gate dumps in --out-dir when present.", + ) + parser.add_argument( + "--allow-stale-binary", + action="store_true", + help="Skip source-vs-binary freshness checks.", + ) + parser.add_argument( + "--no-fail", + action="store_true", + help="Always exit 0 after writing the promotion decision.", + ) + parser.add_argument("--dry-run", action="store_true") + return parser.parse_args() + + +def main() -> int: + args = parse_args() + if args.repeat < 1: + raise SystemExit("--repeat must be >= 1") + candidate_env = candidate_env_from_args(args) + if args.out_dir is None: + run_id = time.strftime("%Y%m%d-%H%M%S") + args.out_dir = Path("speed-bench/local-runs") / f"{run_id}-{safe_label(args.candidate_label)}" + args.repo_root = args.repo_root.resolve() + if not args.ds4_bench.is_absolute(): + args.ds4_bench = args.repo_root / args.ds4_bench + if not args.ds4.is_absolute(): + args.ds4 = args.repo_root / args.ds4 + args.out_dir.mkdir(parents=True, exist_ok=True) + if not args.dry_run: + assert_fresh_binary( + 
args.ds4_bench, + repo_root=args.repo_root, + source_patterns=DS4_BENCH_FRESHNESS_SOURCES, + allow_stale=args.allow_stale_binary, + ) + + candidate_name = safe_label(args.candidate_label) + if candidate_name in {"standard", "tensor"}: + raise SystemExit("--candidate-label must not resolve to 'standard' or 'tensor'") + csv_paths = run_benchmarks(args, candidate_env) + + payload: dict[str, Any] = { + "candidate_label": args.candidate_label, + "candidate_name": candidate_name, + "candidate_preset": args.preset, + "candidate_mode": args.candidate_mode, + "candidate_env": candidate_env, + "run_config": build_run_config(args), + "csv_paths": {name: [str(path) for path in paths] for name, paths in csv_paths.items()}, + } + if not args.dry_run: + speed_summary = summarize_repeats( + csv_paths, + baseline_name="standard", + tensor_name="tensor", + candidate_name=candidate_name, + ) + payload["speed_summary"] = speed_summary + print_summary(speed_summary, candidate_name=candidate_name) + payload["speed_screen"] = evaluate_prefill_speed( + speed_summary, + candidate_name=candidate_name, + min_prefill_gain_pct=args.min_prefill_gain_pct, + min_repeat_prefill_gain_pct=args.min_repeat_prefill_gain_pct, + min_generation_gain_pct=args.min_generation_gain_pct, + ) + + if args.run_drift_gate: + speed_screen = payload.get("speed_screen") + if args.dry_run or speed_gate_is_ok(speed_screen): + gate_dir = run_drift_gate(args, candidate_env) + payload["quality_drift_gate_summary"] = str(gate_dir / "summary.json") + payload["quality_drift_gate_markdown"] = str(gate_dir / "summary.md") + if candidate_env_requires_chunked_drift(candidate_env): + chunked_gate_dir = run_chunked_drift_gate(args, candidate_env) + payload["chunked_drift_gate_summary"] = str(chunked_gate_dir / "summary.json") + payload["chunked_drift_gate_markdown"] = str(chunked_gate_dir / "summary.md") + else: + skip_reason = speed_gate_skip_reason(speed_screen) + payload["quality_drift_gate_skipped_reason"] = skip_reason + if 
candidate_env_requires_chunked_drift(candidate_env): + payload["chunked_drift_gate_skipped_reason"] = skip_reason + print(f"\nSkipping drift gate because the speed screen failed: {skip_reason}") + elif args.reuse: + gate_dir = args.out_dir / "quality-drift-gate" + if (gate_dir / "summary.json").exists(): + payload["quality_drift_gate_summary"] = str(gate_dir / "summary.json") + if (gate_dir / "summary.md").exists(): + payload["quality_drift_gate_markdown"] = str(gate_dir / "summary.md") + chunked_gate_dir = args.out_dir / "chunked-drift-gate" + if (chunked_gate_dir / "summary.json").exists(): + payload["chunked_drift_gate_summary"] = str(chunked_gate_dir / "summary.json") + if (chunked_gate_dir / "summary.md").exists(): + payload["chunked_drift_gate_markdown"] = str(chunked_gate_dir / "summary.md") + + if not args.dry_run: + payload["promotion_decision"] = evaluate_candidate( + payload, + min_prefill_gain_pct=args.min_prefill_gain_pct, + min_repeat_prefill_gain_pct=args.min_repeat_prefill_gain_pct, + min_generation_gain_pct=args.min_generation_gain_pct, + max_tensor_standard_rms=args.max_tensor_standard_rms, + max_tensor_standard_top20_abs=args.max_tensor_standard_top20_abs, + ) + + summary_path = args.out_dir / "prefill-candidate-summary.json" + markdown_path = args.out_dir / "prefill-candidate-summary.md" + if not args.dry_run: + with summary_path.open("w", encoding="utf-8") as fp: + json.dump(payload, fp, indent=2) + fp.write("\n") + write_candidate_markdown_summary(payload, markdown_path) + print(f"\nWrote {summary_path}") + print(f"Wrote {markdown_path}") + else: + print(f"\nDry run only; would write {summary_path}") + print(f"Dry run only; would write {markdown_path}") + if (not args.dry_run and + args.run_drift_gate and + not args.no_fail and + not payload["promotion_decision"]["promotion_safe"]): + return 1 + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/speed-bench/run_quality_drift_gate.py 
b/speed-bench/run_quality_drift_gate.py new file mode 100644 index 000000000..d8a48f8b5 --- /dev/null +++ b/speed-bench/run_quality_drift_gate.py @@ -0,0 +1,651 @@ +#!/usr/bin/env python3 +"""Run the five-fixture Metal quality drift gate. + +The gate captures first-token full logits and 16-token greedy continuations for +three modes: + + quality -> --metal --quality + standard -> --metal -mt off + tensor -> --metal -mt auto + +It reports: + + standard_vs_quality + tensor_vs_quality + tensor_vs_standard + +The third comparison isolates the Tensor-route delta. The first two show +whether Tensor Metal is materially worse than the existing non-quality Metal +path when both are judged against --quality. +""" + +from __future__ import annotations + +import argparse +import json +import os +import shlex +import subprocess +import sys +import time +from dataclasses import dataclass +from pathlib import Path +from typing import Any + +from compare_logit_drift import compare, load_dump +from metal_tensor_presets import CANDIDATE_PRESETS, preset_help + + +@dataclass(frozen=True) +class Case: + case_id: str + ctx: int + prompt_path: str + + +CASES = ( + Case("short_italian_fact", 16384, "tests/test-vectors/prompts/short_italian_fact.txt"), + Case("short_code_completion", 4096, "tests/test-vectors/prompts/short_code_completion.txt"), + Case("short_reasoning_plain", 4096, "tests/test-vectors/prompts/short_reasoning_plain.txt"), + Case("long_memory_archive", 16384, "tests/test-vectors/prompts/long_memory_archive.txt"), + Case("long_code_audit", 16384, "tests/test-vectors/prompts/long_code_audit.txt"), +) + +MODES: dict[str, list[str]] = { + "quality": ["--quality"], + "standard": ["-mt", "off"], + "tensor": ["-mt", "auto"], +} + +PAIRS = ( + ("standard_vs_quality", "quality", "standard"), + ("tensor_vs_quality", "quality", "tensor"), + ("tensor_vs_standard", "standard", "tensor"), +) + +DS4_FRESHNESS_SOURCES = ( + "ds4.c", + "ds4.h", + "ds4_gpu.h", + "ds4_cli.c", + "ds4_metal.m", 
+ "metal/*.metal", +) + + +def assert_fresh_binary( + binary: Path, + *, + repo_root: Path, + source_patterns: tuple[str, ...], + allow_stale: bool, +) -> None: + if allow_stale: + return + if not binary.exists(): + raise SystemExit(f"{binary}: binary does not exist; run the relevant make target first") + binary_mtime = binary.stat().st_mtime + stale_sources: list[Path] = [] + for pattern in source_patterns: + matches = sorted(repo_root.glob(pattern)) + if not matches: + continue + stale_sources.extend(path for path in matches if path.stat().st_mtime > binary_mtime) + if stale_sources: + newest = max(stale_sources, key=lambda path: path.stat().st_mtime) + rel = newest.relative_to(repo_root) + raise SystemExit( + f"{binary}: stale binary; {rel} is newer. " + "Rebuild before running the drift gate, or pass --allow-stale-binary " + "only when intentionally summarizing old artifacts." + ) + + +def run_command(cmd: list[str], *, cwd: Path, dry_run: bool) -> None: + print("+", " ".join(cmd), flush=True) + if dry_run: + return + proc = subprocess.run(cmd, cwd=cwd, text=True, capture_output=True) + if proc.returncode != 0: + raise SystemExit( + f"command failed with exit {proc.returncode}: {' '.join(cmd)}\n" + f"stdout:\n{proc.stdout[-4000:]}\n" + f"stderr:\n{proc.stderr[-8000:]}" + ) + + +def dump_paths(out_dir: Path, case: Case, mode: str) -> tuple[Path, Path]: + stem = f"{case.case_id}.{mode}" + return out_dir / f"{stem}.logits.json", out_dir / f"{stem}.logprobs.json" + + +def ds4_base_cmd(args: argparse.Namespace, case: Case) -> list[str]: + cmd = [ + str(args.ds4), + "--metal", + "--temp", + "0", + "--nothink", + "--system", + "", + "-c", + str(case.ctx), + "--prompt-file", + case.prompt_path, + ] + if args.model: + cmd[1:1] = ["-m", str(args.model)] + return cmd + + +def capture_case(args: argparse.Namespace, case: Case, mode: str) -> None: + logits_path, logprobs_path = dump_paths(args.out_dir, case, mode) + mode_args = MODES[mode] + base = ds4_base_cmd(args, case) 
+ + if not args.reuse or not logits_path.exists(): + run_command( + base + mode_args + ["--dump-logits", str(logits_path)], + cwd=args.repo_root, + dry_run=args.dry_run, + ) + + if not args.reuse or not logprobs_path.exists(): + run_command( + base + + mode_args + + [ + "-n", + str(args.greedy_tokens), + "--dump-logprobs", + str(logprobs_path), + "--logprobs-top-k", + str(args.top_k), + ], + cwd=args.repo_root, + dry_run=args.dry_run, + ) + + +def selected_ids(path: Path) -> list[int]: + with path.open("r", encoding="utf-8") as fp: + data = json.load(fp) + return [int(step["selected"]["id"]) for step in data.get("steps", [])] + + +def greedy_diff(ref_path: Path, cand_path: Path) -> dict[str, Any]: + ref = selected_ids(ref_path) + cand = selected_ids(cand_path) + first_diff = None + for i, (ref_id, cand_id) in enumerate(zip(ref, cand)): + if ref_id != cand_id: + first_diff = i + break + if first_diff is None and len(ref) != len(cand): + first_diff = min(len(ref), len(cand)) + return { + "same": first_diff is None, + "first_diff": first_diff, + "ref_tokens": ref, + "cand_tokens": cand, + } + + +def aggregate(rows: list[dict[str, Any]]) -> dict[str, Any]: + return { + "cases": len(rows), + "top1_mismatches": sum(0 if row["same_top1"] else 1 for row in rows), + "greedy_mismatches": sum(0 if row["greedy_same"] else 1 for row in rows), + "min_top5_overlap": min(row["top5_overlap"] for row in rows), + "min_top20_overlap": min(row["top20_overlap"] for row in rows), + "worst_rank_delta": max(row["max_rank_delta"] for row in rows), + "worst_rms": max(row["rms"] for row in rows), + "worst_max_abs": max(row["max_abs"] for row in rows), + "worst_top20_max_abs": max(row["top20_max_abs"] for row in rows), + } + + +def extrema(rows: list[dict[str, Any]]) -> dict[str, Any]: + worst_rms = max(rows, key=lambda row: row["rms"]) + worst_top20 = max(rows, key=lambda row: row["top20_max_abs"]) + worst_max_abs = max(rows, key=lambda row: row["max_abs"]) + worst_rank_delta = max(rows, 
key=lambda row: row["max_rank_delta"]) + min_top20 = min(rows, key=lambda row: row["top20_overlap"]) + return { + "worst_rms_case": worst_rms["case"], + "worst_rms": worst_rms["rms"], + "worst_top20_max_abs_case": worst_top20["case"], + "worst_top20_max_abs": worst_top20["top20_max_abs"], + "worst_max_abs_case": worst_max_abs["case"], + "worst_max_abs": worst_max_abs["max_abs"], + "worst_rank_delta_case": worst_rank_delta["case"], + "worst_rank_delta": worst_rank_delta["max_rank_delta"], + "min_top20_overlap_case": min_top20["case"], + "min_top20_overlap": min_top20["top20_overlap"], + "top1_mismatch_cases": [row["case"] for row in rows if not row["same_top1"]], + "greedy_mismatch_cases": [ + { + "case": row["case"], + "first_diff": row["greedy_first_diff"], + } + for row in rows + if not row["greedy_same"] + ], + } + + +def greedy_label(row: dict[str, Any]) -> str: + return "same" if row["greedy_same"] else f"diff@{row['greedy_first_diff']}" + + +def print_pair_table(pair_name: str, rows: list[dict[str, Any]]) -> None: + print(f"\n{pair_name}") + print("case same_top1 top5 top20 rank rms max_abs top20_abs greedy") + for row in rows: + print( + f"{row['case']} " + f"{'yes' if row['same_top1'] else 'no'} " + f"{row['top5_overlap']}/5 " + f"{row['top20_overlap']}/20 " + f"{row['max_rank_delta']} " + f"{row['rms']:.6g} " + f"{row['max_abs']:.6g} " + f"{row['top20_max_abs']:.6g} " + f"{greedy_label(row)}" + ) + summary = aggregate(rows) + print( + "summary " + f"top1_mismatches={summary['top1_mismatches']} " + f"greedy_mismatches={summary['greedy_mismatches']} " + f"min_top20={summary['min_top20_overlap']}/20 " + f"worst_rms={summary['worst_rms']:.6g} " + f"worst_top20_max_abs={summary['worst_top20_max_abs']:.6g}" + ) + + +def markdown_escape(value: object) -> str: + return str(value).replace("|", "\\|") + + +def shell_join(argv: list[object]) -> str: + return " ".join(shlex.quote(str(part)) for part in argv) + + +def markdown_pair_table(pair_name: str, rows: 
list[dict[str, Any]]) -> str: + lines = [ + f"## {markdown_escape(pair_name)}", + "", + "| Case | Same top1 | Top5 | Top20 | Rank delta | RMS | Max abs | Top20 abs | Greedy |", + "| --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | --- |", + ] + for row in rows: + lines.append( + "| " + f"{markdown_escape(row['case'])} | " + f"{'yes' if row['same_top1'] else 'no'} | " + f"{row['top5_overlap']}/5 | " + f"{row['top20_overlap']}/20 | " + f"{row['max_rank_delta']} | " + f"{row['rms']:.6g} | " + f"{row['max_abs']:.6g} | " + f"{row['top20_max_abs']:.6g} | " + f"{greedy_label(row)} |" + ) + summary = aggregate(rows) + row_extrema = extrema(rows) + lines.extend( + [ + "", + "| Summary | Value |", + "| --- | ---: |", + f"| Top1 mismatches | {summary['top1_mismatches']} |", + f"| Greedy mismatches | {summary['greedy_mismatches']} |", + f"| Min top5 overlap | {summary['min_top5_overlap']}/5 |", + f"| Min top20 overlap | {summary['min_top20_overlap']}/20 |", + f"| Worst rank delta | {summary['worst_rank_delta']} |", + f"| Worst RMS | {summary['worst_rms']:.6g} |", + f"| Worst max abs | {summary['worst_max_abs']:.6g} |", + f"| Worst top20 max abs | {summary['worst_top20_max_abs']:.6g} |", + "", + "| Worst fixture | Value |", + "| --- | --- |", + f"| Worst RMS case | {markdown_escape(row_extrema['worst_rms_case'])} " + f"({row_extrema['worst_rms']:.6g}) |", + f"| Worst top20 abs case | {markdown_escape(row_extrema['worst_top20_max_abs_case'])} " + f"({row_extrema['worst_top20_max_abs']:.6g}) |", + f"| Worst max abs case | {markdown_escape(row_extrema['worst_max_abs_case'])} " + f"({row_extrema['worst_max_abs']:.6g}) |", + f"| Worst rank delta case | {markdown_escape(row_extrema['worst_rank_delta_case'])} " + f"({row_extrema['worst_rank_delta']}) |", + f"| Min top20 overlap case | {markdown_escape(row_extrema['min_top20_overlap_case'])} " + f"({row_extrema['min_top20_overlap']}/20) |", + "", + ] + ) + return "\n".join(lines) + + +def write_markdown_summary(payload: dict[str, 
Any], path: Path) -> None: + lines = [ + "# Quality Drift Gate", + "", + "Modes:", + "", + ] + for mode, mode_args in payload["modes"].items(): + lines.append(f"- `{markdown_escape(mode)}`: `{' '.join(mode_args)}`") + if payload["env"]: + lines.extend(["", "Environment overrides:", ""]) + for name, value in sorted(payload["env"].items()): + lines.append(f"- `{markdown_escape(name)}={markdown_escape(value)}`") + else: + lines.extend(["", "Environment overrides: none"]) + + config = payload.get("run_config") + if config: + lines.extend(["", "Run config:", ""]) + lines.extend(["| Setting | Value |", "| --- | --- |"]) + for key in ( + "repo_root", + "ds4", + "model", + "out_dir", + "candidate_preset", + "top_k", + "greedy_tokens", + "reuse", + "fail_on_quality_greedy", + "max_tensor_standard_rms", + "max_tensor_standard_top20_abs", + ): + if key in config: + lines.append(f"| `{markdown_escape(key)}` | `{markdown_escape(config[key])}` |") + if config.get("argv"): + lines.extend( + [ + "", + "Replay command:", + "", + "```sh", + shell_join(["python3", *config["argv"]]), + "```", + ] + ) + + envelope = payload.get("drift_envelope") or {} + if envelope: + lines.extend(["", "Tensor-vs-standard drift envelope:", ""]) + if envelope.get("max_rms") is not None: + lines.append(f"- Worst RMS <= `{envelope['max_rms']:.6g}`") + if envelope.get("max_top20_abs") is not None: + lines.append(f"- Worst top20 abs <= `{envelope['max_top20_abs']:.6g}`") + else: + lines.extend(["", "Tensor-vs-standard drift envelope: not configured"]) + + failures = payload["gate_failures"] + lines.extend(["", f"Gate: {'FAIL' if failures else 'OK'}", ""]) + if failures: + lines.append("Failures:") + lines.append("") + for failure in failures: + lines.append(f"- {markdown_escape(failure)}") + lines.append("") + + for pair_name, _, _ in PAIRS: + lines.append(markdown_pair_table(pair_name, payload["pairs"][pair_name]["rows"])) + + path.write_text("\n".join(lines).rstrip() + "\n", encoding="utf-8") + + +def 
summarize(args: argparse.Namespace) -> dict[str, Any]: + pairs: dict[str, Any] = {} + for pair_name, ref_mode, cand_mode in PAIRS: + rows: list[dict[str, Any]] = [] + for case in CASES: + ref_logits, ref_logprobs = dump_paths(args.out_dir, case, ref_mode) + cand_logits, cand_logprobs = dump_paths(args.out_dir, case, cand_mode) + metrics = compare(load_dump(ref_logits), load_dump(cand_logits), args.top_k) + greedy = greedy_diff(ref_logprobs, cand_logprobs) + row = { + "case": case.case_id, + "ctx": case.ctx, + **metrics, + "greedy_same": greedy["same"], + "greedy_first_diff": greedy["first_diff"], + "greedy_ref_tokens": greedy["ref_tokens"], + "greedy_cand_tokens": greedy["cand_tokens"], + } + rows.append(row) + pairs[pair_name] = { + "rows": rows, + "summary": aggregate(rows), + "extrema": extrema(rows), + } + print_pair_table(pair_name, rows) + return { + "cases": [case.__dict__ for case in CASES], + "modes": MODES, + "pairs": pairs, + } + + +def check_gate( + payload: dict[str, Any], + *, + fail_on_quality_greedy: bool, + max_tensor_standard_rms: float | None, + max_tensor_standard_top20_abs: float | None, +) -> list[str]: + failures: list[str] = [] + for pair_name in ("standard_vs_quality", "tensor_vs_quality"): + summary = payload["pairs"][pair_name]["summary"] + if summary["top1_mismatches"] != 0: + failures.append(f"{pair_name}: top1_mismatches={summary['top1_mismatches']}") + if fail_on_quality_greedy and summary["greedy_mismatches"] != 0: + failures.append(f"{pair_name}: greedy_mismatches={summary['greedy_mismatches']}") + + tensor_delta = payload["pairs"]["tensor_vs_standard"]["summary"] + if tensor_delta["top1_mismatches"] != 0: + failures.append( + f"tensor_vs_standard: top1_mismatches={tensor_delta['top1_mismatches']}" + ) + if tensor_delta["greedy_mismatches"] != 0: + failures.append( + f"tensor_vs_standard: greedy_mismatches={tensor_delta['greedy_mismatches']}" + ) + if (max_tensor_standard_rms is not None and + tensor_delta["worst_rms"] > 
max_tensor_standard_rms): + tensor_extrema = payload["pairs"]["tensor_vs_standard"]["extrema"] + failures.append( + "tensor_vs_standard: worst_rms exceeds configured envelope " + f"({tensor_delta['worst_rms']:.6g} > {max_tensor_standard_rms:.6g}, " + f"case={tensor_extrema['worst_rms_case']})" + ) + if (max_tensor_standard_top20_abs is not None and + tensor_delta["worst_top20_max_abs"] > max_tensor_standard_top20_abs): + tensor_extrema = payload["pairs"]["tensor_vs_standard"]["extrema"] + failures.append( + "tensor_vs_standard: worst_top20_max_abs exceeds configured envelope " + f"({tensor_delta['worst_top20_max_abs']:.6g} > " + f"{max_tensor_standard_top20_abs:.6g}, " + f"case={tensor_extrema['worst_top20_max_abs_case']})" + ) + + standard = payload["pairs"]["standard_vs_quality"]["summary"] + tensor = payload["pairs"]["tensor_vs_quality"]["summary"] + if tensor["worst_rms"] > standard["worst_rms"] * 1.10: + failures.append( + "tensor_vs_quality: worst_rms materially worse than standard " + f"({tensor['worst_rms']:.6g} > {standard['worst_rms']:.6g} * 1.10)" + ) + if tensor["worst_top20_max_abs"] > standard["worst_top20_max_abs"] * 1.10: + failures.append( + "tensor_vs_quality: worst_top20_max_abs materially worse than standard " + f"({tensor['worst_top20_max_abs']:.6g} > " + f"{standard['worst_top20_max_abs']:.6g} * 1.10)" + ) + return failures + + +def build_run_config(args: argparse.Namespace) -> dict[str, Any]: + return { + "argv": sys.argv, + "repo_root": str(args.repo_root), + "ds4": str(args.ds4), + "model": str(args.model) if args.model else None, + "out_dir": str(args.out_dir), + "candidate_preset": args.preset, + "top_k": args.top_k, + "greedy_tokens": args.greedy_tokens, + "reuse": args.reuse, + "dry_run": args.dry_run, + "allow_stale_binary": args.allow_stale_binary, + "fail_on_quality_greedy": args.fail_on_quality_greedy, + "max_tensor_standard_rms": args.max_tensor_standard_rms, + "max_tensor_standard_top20_abs": args.max_tensor_standard_top20_abs, + 
"no_fail": args.no_fail, + } + + +def parse_env_overrides(values: list[str]) -> dict[str, str]: + env: dict[str, str] = {} + for value in values: + if "=" not in value: + raise SystemExit(f"--set-env expects NAME=VALUE, got: {value}") + name, env_value = value.split("=", 1) + if not name: + raise SystemExit(f"--set-env expects NAME=VALUE, got: {value}") + env[name] = env_value + return env + + +def apply_env_overrides(args: argparse.Namespace) -> dict[str, str]: + overrides: dict[str, str] = {} + if args.preset: + overrides.update(CANDIDATE_PRESETS[args.preset].env) + overrides.update(parse_env_overrides(args.set_env)) + for name, value in overrides.items(): + os.environ[name] = value + return overrides + + +def main() -> int: + parser = argparse.ArgumentParser( + description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=f"Candidate presets:\n{preset_help()}", + ) + parser.add_argument("--repo-root", type=Path, default=Path(".")) + parser.add_argument("--ds4", type=Path, default=Path("./ds4")) + parser.add_argument("--model", type=Path) + parser.add_argument("--out-dir", type=Path) + parser.add_argument("--top-k", type=int, default=20) + parser.add_argument("--greedy-tokens", type=int, default=16) + parser.add_argument("--reuse", action="store_true", help="Reuse existing dumps in --out-dir.") + parser.add_argument("--dry-run", action="store_true", help="Print commands without running them.") + parser.add_argument( + "--allow-stale-binary", + action="store_true", + help="Skip the source-vs-binary freshness check.", + ) + parser.add_argument( + "--preset", + choices=sorted(CANDIDATE_PRESETS), + help="Use a named default-off candidate environment preset for the tensor mode.", + ) + parser.add_argument( + "--set-env", + action="append", + default=[], + metavar="NAME=VALUE", + help="Set an environment variable for all ds4 captures; repeatable.", + ) + parser.add_argument( + "--fail-on-quality-greedy", + action="store_true", + help="Fail when 
standard/tensor differs from --quality in greedy continuation.", + ) + parser.add_argument( + "--max-tensor-standard-rms", + type=float, + help="Optional maximum Tensor-vs-standard worst RMS allowed by this gate.", + ) + parser.add_argument( + "--max-tensor-standard-top20-abs", + type=float, + help="Optional maximum Tensor-vs-standard worst top-20 absolute drift allowed by this gate.", + ) + parser.add_argument( + "--no-fail", + action="store_true", + help="Always exit 0 after reporting gate failures.", + ) + args = parser.parse_args() + + if args.top_k < 20: + raise SystemExit("--top-k must be at least 20") + if args.out_dir is None: + run_id = time.strftime("%Y%m%d-%H%M%S") + label = f"{args.preset}-quality-drift-gate" if args.preset else "quality-drift-gate" + args.out_dir = Path("speed-bench/local-runs") / f"{run_id}-{label}" + + args.repo_root = args.repo_root.resolve() + if not args.ds4.is_absolute(): + args.ds4 = args.repo_root / args.ds4 + args.out_dir.mkdir(parents=True, exist_ok=True) + if not args.dry_run: + assert_fresh_binary( + args.ds4, + repo_root=args.repo_root, + source_patterns=DS4_FRESHNESS_SOURCES, + allow_stale=args.allow_stale_binary, + ) + env_overrides = apply_env_overrides(args) + if env_overrides: + print("Environment overrides:", flush=True) + for name, value in sorted(env_overrides.items()): + print(f" {name}={value}", flush=True) + + for case in CASES: + for mode in MODES: + capture_case(args, case, mode) + + if args.dry_run: + return 0 + + payload = summarize(args) + payload["env"] = env_overrides + payload["run_config"] = build_run_config(args) + envelope = { + "max_rms": args.max_tensor_standard_rms, + "max_top20_abs": args.max_tensor_standard_top20_abs, + } + if envelope["max_rms"] is not None or envelope["max_top20_abs"] is not None: + payload["drift_envelope"] = envelope + payload["gate_failures"] = check_gate( + payload, + fail_on_quality_greedy=args.fail_on_quality_greedy, + max_tensor_standard_rms=args.max_tensor_standard_rms, 
+ max_tensor_standard_top20_abs=args.max_tensor_standard_top20_abs, + ) + summary_path = args.out_dir / "summary.json" + with summary_path.open("w", encoding="utf-8") as fp: + json.dump(payload, fp, indent=2) + fp.write("\n") + print(f"\nWrote {summary_path}") + markdown_path = args.out_dir / "summary.md" + write_markdown_summary(payload, markdown_path) + print(f"Wrote {markdown_path}") + + if payload["gate_failures"]: + print("\nGate failures:") + for failure in payload["gate_failures"]: + print(f" {failure}") + return 0 if args.no_fail else 1 + print("\nGate: OK") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/speed-bench/summarize_mpp_compare.py b/speed-bench/summarize_mpp_compare.py new file mode 100644 index 000000000..7a1b3928c --- /dev/null +++ b/speed-bench/summarize_mpp_compare.py @@ -0,0 +1,420 @@ +#!/usr/bin/env python3 +"""Summarize DS4 Metal Tensor comparator logs. + +This parses stderr/stdout from runs with DS4_METAL_MPP_COMPARE_ROUTE set. The +comparator reports local projection deltas between the legacy path and the +candidate Tensor path; this helper turns those raw lines into persistent +Markdown/JSON summaries for prefill optimization notes. +""" + +from __future__ import annotations + +import argparse +import json +import re +from collections import Counter, defaultdict +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any + + +COMPARE_RE = re.compile( + r"Metal Tensor compare route=(?P<route>\w+) module=(?P<module>.*?) " + r"shape=(?P<dim0>\d+)x(?P<dim1>\d+)x(?P<dim2>\d+) " + r"max_abs=(?P<max_abs>[0-9.eE+-]+) rms=(?P<rms>[0-9.eE+-]+) " + r"nonfinite=(?P<nonfinite>\d+) max_index=(?P<max_index>\d+)" +) +DELTA_RE = re.compile( + r"Metal Tensor compare route=(?P<route>\w+) module=(?P<module>.*?) " + r"largest deltas:(?P<deltas>.*)" +) +DELTA_ITEM_RE = re.compile( + r"idx=(?P<idx>\d+) ref=(?P<ref>[0-9.eE+-]+) " + r"cand=(?P<cand>[0-9.eE+-]+) abs=(?P<abs>[0-9.eE+-]+)" +) +BREACH_RE = re.compile( + r"Metal Tensor compare route=(?P<route>\w+) module=(?P<module>.*?) 
" + r"exceeded target max_abs<=0.001 rms<=0.0001" +) +LIMIT_RE = re.compile( + r"Metal Tensor compare reached DS4_METAL_MPP_COMPARE_MAX=(?P<max>\d+) " + r"without a target breach" +) +LAYER_RE = re.compile(r"layer=(?P<layer>\d+)") + + +@dataclass +class DeltaItem: + idx: int + ref: float + cand: float + abs_delta: float + + +@dataclass +class CompareItem: + source: Path + route: str + module: str + dim0: int + dim1: int + dim2: int + max_abs: float + rms: float + nonfinite: int + max_index: int + deltas: list[DeltaItem] = field(default_factory=list) + + @property + def layer(self) -> int | None: + match = LAYER_RE.search(self.module) + return int(match.group("layer")) if match else None + + @property + def shape(self) -> str: + return f"{self.dim0}x{self.dim1}x{self.dim2}" + + +@dataclass +class CompareSummary: + items: list[CompareItem] = field(default_factory=list) + breaches: list[dict[str, Any]] = field(default_factory=list) + limit_hits: list[dict[str, Any]] = field(default_factory=list) + + +def parse_log(path: Path) -> CompareSummary: + summary = CompareSummary() + pending: dict[tuple[str, str], CompareItem] = {} + text = path.read_text(encoding="utf-8", errors="ignore") + for line in text.splitlines(): + if match := COMPARE_RE.search(line): + item = CompareItem( + source=path, + route=match.group("route"), + module=match.group("module"), + dim0=int(match.group("dim0")), + dim1=int(match.group("dim1")), + dim2=int(match.group("dim2")), + max_abs=float(match.group("max_abs")), + rms=float(match.group("rms")), + nonfinite=int(match.group("nonfinite")), + max_index=int(match.group("max_index")), + ) + summary.items.append(item) + pending[(item.route, item.module)] = item + if match := DELTA_RE.search(line): + key = (match.group("route"), match.group("module")) + item = pending.get(key) + if item is not None: + item.deltas = [ + DeltaItem( + idx=int(delta.group("idx")), + ref=float(delta.group("ref")), + cand=float(delta.group("cand")), + 
abs_delta=float(delta.group("abs")), + ) + for delta in DELTA_ITEM_RE.finditer(match.group("deltas")) + ] + if match := BREACH_RE.search(line): + summary.breaches.append( + { + "source": str(path), + "route": match.group("route"), + "module": match.group("module"), + } + ) + if match := LIMIT_RE.search(line): + summary.limit_hits.append( + { + "source": str(path), + "max": int(match.group("max")), + } + ) + return summary + + +def merge_summaries(summaries: list[CompareSummary]) -> CompareSummary: + merged = CompareSummary() + for summary in summaries: + merged.items.extend(summary.items) + merged.breaches.extend(summary.breaches) + merged.limit_hits.extend(summary.limit_hits) + return merged + + +def pct(part: int, total: int) -> float: + return 100.0 * part / total if total else 0.0 + + +def item_to_json(item: CompareItem) -> dict[str, Any]: + return { + "source": str(item.source), + "route": item.route, + "module": item.module, + "layer": item.layer, + "shape": item.shape, + "max_abs": item.max_abs, + "rms": item.rms, + "nonfinite": item.nonfinite, + "max_index": item.max_index, + "largest_deltas": [ + { + "idx": delta.idx, + "ref": delta.ref, + "cand": delta.cand, + "abs": delta.abs_delta, + } + for delta in item.deltas + ], + } + + +def as_json(summary: CompareSummary, *, max_abs_target: float, rms_target: float) -> dict[str, Any]: + route_counts = Counter(item.route for item in summary.items) + layer_counts = Counter(item.layer for item in summary.items if item.layer is not None) + route_worst: dict[str, dict[str, Any]] = {} + for route in sorted(route_counts): + route_items = [item for item in summary.items if item.route == route] + route_worst[route] = { + "count": len(route_items), + "worst_max_abs": item_to_json(max(route_items, key=lambda item: item.max_abs)), + "worst_rms": item_to_json(max(route_items, key=lambda item: item.rms)), + } + threshold_breaches = [ + item + for item in summary.items + if item.nonfinite or item.max_abs > max_abs_target or 
item.rms > rms_target + ] + return { + "targets": { + "max_abs": max_abs_target, + "rms": rms_target, + }, + "count": len(summary.items), + "route_counts": dict(route_counts), + "layer_counts": {str(layer): count for layer, count in sorted(layer_counts.items())}, + "breaches": summary.breaches, + "limit_hits": summary.limit_hits, + "threshold_breaches": [item_to_json(item) for item in threshold_breaches], + "top_max_abs": [ + item_to_json(item) + for item in sorted(summary.items, key=lambda item: item.max_abs, reverse=True) + ], + "top_rms": [ + item_to_json(item) + for item in sorted(summary.items, key=lambda item: item.rms, reverse=True) + ], + "route_worst": route_worst, + } + + +def markdown_escape(value: object) -> str: + return str(value).replace("|", "\\|") + + +def render_item_row(item: CompareItem) -> str: + return ( + "| " + f"`{markdown_escape(item.route)}` | " + f"`{markdown_escape(item.module)}` | " + f"{item.layer if item.layer is not None else 'n/a'} | " + f"`{item.shape}` | " + f"{item.max_abs:.6g} | " + f"{item.rms:.6g} | " + f"{item.nonfinite} | " + f"{item.max_index} |" + ) + + +def render_markdown( + summary: CompareSummary, + *, + max_abs_target: float, + rms_target: float, + top: int, +) -> str: + route_counts = Counter(item.route for item in summary.items) + layer_counts = Counter(item.layer for item in summary.items if item.layer is not None) + threshold_breaches = [ + item + for item in summary.items + if item.nonfinite or item.max_abs > max_abs_target or item.rms > rms_target + ] + + blocks: list[str] = [ + "# DS4 Metal Tensor Comparator Summary", + "", + f"Parsed comparisons: `{len(summary.items)}`", + f"Targets: max abs `<= {max_abs_target:.6g}`, RMS `<= {rms_target:.6g}`", + "", + ] + if route_counts: + blocks.append( + "Routes: " + + ", ".join(f"`{route}`={count}" for route, count in route_counts.most_common()) + ) + blocks.append("") + if layer_counts: + blocks.append( + "Layers with comparisons: " + + ", ".join(f"`{layer}`={count}" 
for layer, count in sorted(layer_counts.items())) + ) + blocks.append("") + + if threshold_breaches: + blocks.extend( + [ + "## Target Breaches", + "", + "| Route | Module | Layer | Shape | Max abs | RMS | Nonfinite | Max index |", + "| --- | --- | ---: | --- | ---: | ---: | ---: | ---: |", + ] + ) + for item in sorted(threshold_breaches, key=lambda item: item.max_abs, reverse=True): + blocks.append(render_item_row(item)) + blocks.append("") + else: + blocks.extend(["## Target Breaches", "", "None.", ""]) + + if summary.breaches: + blocks.extend(["Comparator breach lines:", ""]) + for breach in summary.breaches: + blocks.append( + f"- `{markdown_escape(breach['route'])}` " + f"`{markdown_escape(breach['module'])}` in `{markdown_escape(breach['source'])}`" + ) + blocks.append("") + if summary.limit_hits: + blocks.extend(["Comparator limit lines:", ""]) + for hit in summary.limit_hits: + blocks.append( + f"- reached `DS4_METAL_MPP_COMPARE_MAX={hit['max']}` without breach " + f"in `{markdown_escape(hit['source'])}`" + ) + blocks.append("") + + blocks.extend( + [ + "## Worst Max Abs", + "", + "| Route | Module | Layer | Shape | Max abs | RMS | Nonfinite | Max index |", + "| --- | --- | ---: | --- | ---: | ---: | ---: | ---: |", + ] + ) + for item in sorted(summary.items, key=lambda item: item.max_abs, reverse=True)[:top]: + blocks.append(render_item_row(item)) + blocks.append("") + + blocks.extend( + [ + "## Worst RMS", + "", + "| Route | Module | Layer | Shape | Max abs | RMS | Nonfinite | Max index |", + "| --- | --- | ---: | --- | ---: | ---: | ---: | ---: |", + ] + ) + for item in sorted(summary.items, key=lambda item: item.rms, reverse=True)[:top]: + blocks.append(render_item_row(item)) + blocks.append("") + + blocks.extend( + [ + "## Route Summary", + "", + "| Route | Count | Share | Worst max abs | Worst max abs module | Worst RMS | Worst RMS module |", + "| --- | ---: | ---: | ---: | --- | ---: | --- |", + ] + ) + for route, count in route_counts.most_common(): 
+ route_items = [item for item in summary.items if item.route == route] + max_abs_item = max(route_items, key=lambda item: item.max_abs) + rms_item = max(route_items, key=lambda item: item.rms) + blocks.append( + "| " + f"`{markdown_escape(route)}` | " + f"{count} | " + f"{pct(count, len(summary.items)):.1f}% | " + f"{max_abs_item.max_abs:.6g} | " + f"`{markdown_escape(max_abs_item.module)}` | " + f"{rms_item.rms:.6g} | " + f"`{markdown_escape(rms_item.module)}` |" + ) + blocks.append("") + + top_delta_items = [item for item in sorted(summary.items, key=lambda item: item.max_abs, reverse=True) if item.deltas] + if top_delta_items: + blocks.extend(["## Largest Delta Details", ""]) + for item in top_delta_items[: min(top, 5)]: + blocks.append( + f"### `{markdown_escape(item.route)}` `{markdown_escape(item.module)}`" + ) + blocks.append("") + blocks.append("| Idx | Ref | Cand | Abs |") + blocks.append("| ---: | ---: | ---: | ---: |") + for delta in item.deltas: + blocks.append( + f"| {delta.idx} | {delta.ref:.6g} | {delta.cand:.6g} | {delta.abs_delta:.6g} |" + ) + blocks.append("") + return "\n".join(blocks).rstrip() + "\n" + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("logs", nargs="+", type=Path, help="comparator log/stderr files") + parser.add_argument("--top", type=int, default=20, help="number of rows to show in top tables") + parser.add_argument( + "--max-abs-target", + type=float, + default=1.0e-3, + help="local comparator max-abs target", + ) + parser.add_argument( + "--rms-target", + type=float, + default=1.0e-4, + help="local comparator RMS target", + ) + parser.add_argument("--output", type=Path, help="write Markdown summary here") + parser.add_argument("--json-output", type=Path, help="write JSON summary here") + return parser.parse_args() + + +def main() -> int: + args = parse_args() + if args.top < 1: + raise SystemExit("--top must be >= 1") + summaries = [parse_log(path) for 
path in args.logs] + summary = merge_summaries(summaries) + markdown = render_markdown( + summary, + max_abs_target=args.max_abs_target, + rms_target=args.rms_target, + top=args.top, + ) + if args.output: + args.output.parent.mkdir(parents=True, exist_ok=True) + args.output.write_text(markdown, encoding="utf-8") + print(f"Wrote {args.output}") + else: + print(markdown) + if args.json_output: + args.json_output.parent.mkdir(parents=True, exist_ok=True) + args.json_output.write_text( + json.dumps( + as_json( + summary, + max_abs_target=args.max_abs_target, + rms_target=args.rms_target, + ), + indent=2, + ) + + "\n", + encoding="utf-8", + ) + print(f"Wrote {args.json_output}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/speed-bench/summarize_stage_profile.py b/speed-bench/summarize_stage_profile.py new file mode 100755 index 000000000..48ba0e96a --- /dev/null +++ b/speed-bench/summarize_stage_profile.py @@ -0,0 +1,355 @@ +#!/usr/bin/env python3 +"""Summarize DS4 Metal stage-profile logs. + +This parses stderr/stdout from runs with profiling envs such as +DS4_METAL_LAYER_PROFILE=1, DS4_METAL_MOE_STAGE_PROFILE=1, and +DS4_METAL_Q8_PREFILL_PROFILE=1. The output is intentionally simple Markdown so +local optimization notes can be pasted into the experiment log. +""" + +from __future__ import annotations + +import argparse +import json +import re +from collections import Counter, defaultdict +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any + + +LAYER_STAGE_RE = re.compile( + r"metal layer stage part=(?P\w+) layer=(?P\d+) " + r"pos=(?P\d+) tokens=(?P\d+) " + r"(?P[a-z_]+)=(?P[0-9.]+) ms" +) +MOE_STAGE_RE = re.compile( + r"Metal routed MoE stage layer=(?P\d+) tokens=(?P\d+) " + r"pairs=(?P\d+) experts=(?P\d+) .*? 
" + r"path=(?P\w+) mpp=(?P[0-9/]+) tile=(?P[0-9/]+) " + r"mid=(?P\w+) (?P[a-z_]+)=(?P[0-9.]+) ms" +) +Q8_STAGE_RE = re.compile( + r"Metal Q8_0 prefill profile layer=(?P\d+) pos=(?P\d+) " + r"(?P[a-z0-9_]+) in=(?P\d+) out=(?P\d+) " + r"tok=(?P\d+) (?P[0-9.]+) ms" +) +ATTN_OUTPUT_RE = re.compile( + r"Metal attention output stage tokens=(?P\d+) " + r"(?P[a-z_]+)=(?P[0-9.]+) ms" +) +FLASH_ATTN_RE = re.compile( + r"Metal FlashAttention prefill stage mode=(?P\w+) " + r"tokens=(?P\d+) comp=(?P\d+) keys=(?P\d+) " + r"heads=(?P\d+) dim=(?P\d+) window=(?P\d+) " + r"ratio=(?P\d+) (?P[a-z_]+)=(?P[0-9.]+) ms" +) +THROUGHPUT_RE = re.compile( + r"prefill: (?P[0-9.]+) t/s, generation: (?P[0-9.]+) t/s" +) + + +@dataclass +class StageSummary: + total_ms: float = 0.0 + count: int = 0 + + def add(self, ms: float) -> None: + self.total_ms += ms + self.count += 1 + + @property + def avg_ms(self) -> float: + return self.total_ms / self.count if self.count else 0.0 + + +@dataclass +class ProfileSummary: + path: Path + events: int = 0 + stages: dict[str, StageSummary] = field(default_factory=lambda: defaultdict(StageSummary)) + layers: dict[int, Counter[str]] = field(default_factory=lambda: defaultdict(Counter)) + moe_paths: Counter[str] = field(default_factory=Counter) + moe_mpp: Counter[str] = field(default_factory=Counter) + moe_mpp_stages: dict[str, dict[str, StageSummary]] = field( + default_factory=lambda: defaultdict(lambda: defaultdict(StageSummary)) + ) + q8_shapes: dict[str, StageSummary] = field(default_factory=lambda: defaultdict(StageSummary)) + flash_shapes: dict[str, StageSummary] = field(default_factory=lambda: defaultdict(StageSummary)) + throughput: list[dict[str, float]] = field(default_factory=list) + + def add(self, key: str, layer: int | None, ms: float) -> None: + self.events += 1 + self.stages[key].add(ms) + if layer is not None: + self.layers[layer][key] += ms + + +def parse_profile(path: Path) -> ProfileSummary: + summary = ProfileSummary(path=path) + for line in 
path.read_text(encoding="utf-8", errors="ignore").splitlines(): + if match := LAYER_STAGE_RE.search(line): + key = f"{match.group('part')}.{match.group('stage')}" + summary.add(key, int(match.group("layer")), float(match.group("ms"))) + continue + if match := MOE_STAGE_RE.search(line): + key = f"moe_stage.{match.group('stage')}" + summary.add(key, int(match.group("layer")), float(match.group("ms"))) + summary.moe_paths[match.group("path")] += 1 + mpp_mask = match.group("mpp") + summary.moe_mpp[mpp_mask] += 1 + summary.moe_mpp_stages[mpp_mask][match.group("stage")].add(float(match.group("ms"))) + continue + if match := Q8_STAGE_RE.search(line): + key = f"q8.{match.group('route')}" + ms = float(match.group("ms")) + summary.add(key, int(match.group("layer")), ms) + shape = ( + f"{match.group('route')} in={match.group('input')} " + f"out={match.group('output')} tok={match.group('tokens')}" + ) + summary.q8_shapes[shape].add(ms) + continue + if match := ATTN_OUTPUT_RE.search(line): + key = f"attn_output.{match.group('stage')}" + summary.add(key, None, float(match.group("ms"))) + continue + if match := FLASH_ATTN_RE.search(line): + key = f"flash_attn.{match.group('mode')}.{match.group('stage')}" + ms = float(match.group("ms")) + summary.add(key, None, ms) + shape = ( + f"{match.group('mode')} tokens={match.group('tokens')} " + f"comp={match.group('comp')} keys={match.group('keys')} " + f"heads={match.group('heads')} dim={match.group('dim')} " + f"window={match.group('window')} ratio={match.group('ratio')}" + ) + summary.flash_shapes[shape].add(ms) + continue + if match := THROUGHPUT_RE.search(line): + summary.throughput.append( + { + "prefill_tps": float(match.group("prefill")), + "generation_tps": float(match.group("generation")), + } + ) + return summary + + +def pct(part: float, total: float) -> float: + return 100.0 * part / total if total else 0.0 + + +def as_json(summary: ProfileSummary) -> dict[str, Any]: + total_ms = sum(stage.total_ms for stage in 
summary.stages.values()) + return { + "path": str(summary.path), + "events": summary.events, + "total_ms": total_ms, + "throughput": summary.throughput, + "moe_paths": dict(summary.moe_paths), + "moe_mpp": dict(summary.moe_mpp), + "moe_mpp_stages": { + mask: { + stage_name: { + "total_ms": stage.total_ms, + "count": stage.count, + "avg_ms": stage.avg_ms, + "share_pct": pct(stage.total_ms, total_ms), + } + for stage_name, stage in sorted( + stages.items(), + key=lambda item: item[1].total_ms, + reverse=True, + ) + } + for mask, stages in sorted(summary.moe_mpp_stages.items()) + }, + "q8_shapes": { + key: { + "total_ms": shape.total_ms, + "count": shape.count, + "avg_ms": shape.avg_ms, + "share_pct": pct(shape.total_ms, total_ms), + } + for key, shape in sorted( + summary.q8_shapes.items(), + key=lambda item: item[1].total_ms, + reverse=True, + ) + }, + "flash_shapes": { + key: { + "total_ms": shape.total_ms, + "count": shape.count, + "avg_ms": shape.avg_ms, + "share_pct": pct(shape.total_ms, total_ms), + } + for key, shape in sorted( + summary.flash_shapes.items(), + key=lambda item: item[1].total_ms, + reverse=True, + ) + }, + "stages": { + key: { + "total_ms": stage.total_ms, + "count": stage.count, + "avg_ms": stage.avg_ms, + "share_pct": pct(stage.total_ms, total_ms), + } + for key, stage in sorted( + summary.stages.items(), + key=lambda item: item[1].total_ms, + reverse=True, + ) + }, + "layers": { + str(layer): { + "total_ms": sum(counter.values()), + "stages": dict(counter.most_common()), + } + for layer, counter in sorted(summary.layers.items()) + }, + } + + +def render_markdown(summaries: list[ProfileSummary], top: int) -> str: + blocks: list[str] = [ + "# DS4 Metal Stage Profile Summary", + "", + "Note: some profile lines are nested views of the same work, such as", + "`ffn.routed_moe` and `moe_stage.*`, or `attn.output_proj` and", + "`attn_output.*`. 
Treat percentages as ranking aids, not exclusive", + "wall-time shares.", + "", + ] + for summary in summaries: + total_ms = sum(stage.total_ms for stage in summary.stages.values()) + blocks.append(f"## {summary.path}") + blocks.append("") + if summary.throughput: + last = summary.throughput[-1] + blocks.append( + "Throughput: " + f"prefill `{last['prefill_tps']:.2f} t/s`, " + f"generation `{last['generation_tps']:.2f} t/s`" + ) + blocks.append("") + blocks.append(f"Parsed events: `{summary.events}`, parsed stage total: `{total_ms:.3f} ms`") + if summary.moe_paths: + path_counts = ", ".join(f"`{name}`={count}" for name, count in summary.moe_paths.most_common()) + blocks.append(f"MoE paths: {path_counts}") + if summary.moe_mpp: + mpp_counts = ", ".join(f"`{name}`={count}" for name, count in summary.moe_mpp.most_common()) + blocks.append(f"MoE mpp masks: {mpp_counts}") + blocks.append("") + if summary.moe_mpp_stages: + blocks.append("| MoE mpp mask | top stages | total ms | share |") + blocks.append("| --- | --- | ---: | ---: |") + mask_totals = [ + (sum(stage.total_ms for stage in stages.values()), mask, stages) + for mask, stages in summary.moe_mpp_stages.items() + ] + for mask_total, mask, stages in sorted(mask_totals, reverse=True): + top_stages = ", ".join( + f"`{name}`={stage.total_ms:.1f}" + for name, stage in sorted( + stages.items(), + key=lambda item: item[1].total_ms, + reverse=True, + )[:5] + ) + blocks.append( + f"| `{mask}` | {top_stages} | {mask_total:.3f} | " + f"{pct(mask_total, total_ms):.1f}% |" + ) + blocks.append("") + blocks.append("| Stage | total ms | events | avg ms | share |") + blocks.append("| --- | ---: | ---: | ---: | ---: |") + for key, stage in sorted( + summary.stages.items(), + key=lambda item: item[1].total_ms, + reverse=True, + )[:top]: + blocks.append( + f"| `{key}` | {stage.total_ms:.3f} | {stage.count} | " + f"{stage.avg_ms:.3f} | {pct(stage.total_ms, total_ms):.1f}% |" + ) + blocks.append("") + if summary.q8_shapes: + 
blocks.append("| Q8 shape | total ms | events | avg ms | share |") + blocks.append("| --- | ---: | ---: | ---: | ---: |") + for key, shape in sorted( + summary.q8_shapes.items(), + key=lambda item: item[1].total_ms, + reverse=True, + )[:top]: + blocks.append( + f"| `{key}` | {shape.total_ms:.3f} | {shape.count} | " + f"{shape.avg_ms:.3f} | {pct(shape.total_ms, total_ms):.1f}% |" + ) + blocks.append("") + if summary.flash_shapes: + blocks.append("| FlashAttention shape | total ms | events | avg ms | share |") + blocks.append("| --- | ---: | ---: | ---: | ---: |") + for key, shape in sorted( + summary.flash_shapes.items(), + key=lambda item: item[1].total_ms, + reverse=True, + )[:top]: + blocks.append( + f"| `{key}` | {shape.total_ms:.3f} | {shape.count} | " + f"{shape.avg_ms:.3f} | {pct(shape.total_ms, total_ms):.1f}% |" + ) + blocks.append("") + blocks.append("| Layer | total ms | top stages |") + blocks.append("| ---: | ---: | --- |") + layer_totals = [ + (sum(counter.values()), layer, counter) + for layer, counter in summary.layers.items() + ] + for layer_total, layer, counter in sorted(layer_totals, reverse=True)[:top]: + top_stages = ", ".join(f"`{name}`={value:.1f}" for name, value in counter.most_common(4)) + blocks.append(f"| {layer} | {layer_total:.3f} | {top_stages} |") + blocks.append("") + return "\n".join(blocks) + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("logs", nargs="+", type=Path, help="profile log/stderr files to summarize") + parser.add_argument("--top", type=int, default=18, help="number of stages/layers to print") + parser.add_argument("--output", type=Path, help="write Markdown summary to this file") + parser.add_argument( + "--json", + "--json-output", + dest="json", + type=Path, + help="write machine-readable summary JSON", + ) + return parser.parse_args() + + +def main() -> int: + args = parse_args() + summaries = [parse_profile(path) for path in args.logs] + 
markdown = render_markdown(summaries, args.top) + if args.output: + args.output.parent.mkdir(parents=True, exist_ok=True) + args.output.write_text(markdown + "\n", encoding="utf-8") + print(f"Wrote {args.output}") + else: + print(markdown) + if args.json: + args.json.parent.mkdir(parents=True, exist_ok=True) + args.json.write_text( + json.dumps([as_json(summary) for summary in summaries], indent=2) + "\n", + encoding="utf-8", + ) + print(f"Wrote {args.json}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/tests/ds4_test.c b/tests/ds4_test.c index ea1e52487..06127acde 100644 --- a/tests/ds4_test.c +++ b/tests/ds4_test.c @@ -1,9 +1,11 @@ #define DS4_SERVER_TEST #define DS4_SERVER_TEST_NO_MAIN #include "../ds4_server.c" +#include "../ds4_dspark_runtime.h" #ifndef DS4_NO_GPU #include "../ds4_gpu.h" #include +#include static ds4_engine *test_engine_fast; static ds4_engine *test_engine_quality; @@ -85,11 +87,24 @@ static void test_restore_canonical_streaming_prefill( saved.batch_selected_addr); } +static ds4_backend test_backend(void) { +#ifdef __APPLE__ + return DS4_BACKEND_METAL; +#else + return DS4_BACKEND_CUDA; +#endif +} + + static ds4_engine *test_open_engine(bool quality) { ds4_engine *engine = NULL; - /* DS4_TEST_MTP loads the MTP head on the fast engine so the speculative - * verify regression can reuse it; draft=4 hits the multi-row verify path. */ - const char *mtp = getenv("DS4_TEST_MTP"); + /* DS4_TEST_MTP loads the legacy MTP head on the fast engine so the speculative + * verify regression can reuse it; draft=4 hits the multi-row verify path. + * DS4_TEST_DSPARK loads an official DSpark draft GGUF and lets metadata choose + * the block size. */ + const char *dspark = getenv("DS4_TEST_DSPARK"); + const char *mtp = (dspark && dspark[0]) ? 
dspark : getenv("DS4_TEST_MTP"); + const bool use_mtp = mtp && mtp[0] && !quality; ds4_engine_options opt = { .model_path = test_model_path(), #ifdef __APPLE__ @@ -106,8 +121,8 @@ static ds4_engine *test_open_engine(bool quality) { test_env_gib("DS4_TEST_SSD_STREAMING_CACHE_GB"), .ssd_streaming_preload_experts = test_env_u32("DS4_TEST_SSD_STREAMING_PRELOAD_EXPERTS"), - .mtp_path = (mtp && mtp[0] && !quality) ? mtp : NULL, - .mtp_draft_tokens = (mtp && mtp[0] && !quality) ? 4 : 0, + .mtp_path = use_mtp ? mtp : NULL, + .mtp_draft_tokens = use_mtp && !(dspark && dspark[0]) ? 4 : 0, }; TEST_ASSERT(ds4_engine_open(&engine, &opt) == 0); return engine; @@ -2053,7 +2068,7 @@ static bool test_mtp_capture_speculative(ds4_engine *engine, const ds4_tokens *p const int ntok = ds4_session_eval_speculative_argmax( session, token, max_tokens - n, eos, toks, (int)(sizeof(toks) / sizeof(toks[0])), err, sizeof(err)); - if (ntok < 0) { ok = false; TEST_ASSERT(false); break; } + if (ntok < 0) { fprintf(stderr, "ds4-test speculative error: %s\n", err); ok = false; TEST_ASSERT(false); break; } if (ntok > *max_chunk) *max_chunk = ntok; for (int j = 0; j < ntok; j++) { @@ -2174,8 +2189,381 @@ static void test_mtp_verify_depth(void) { free(spec); ds4_tokens_free(&prompt); } + +static void test_dspark_speculative_block(void) { + const char *dspark = getenv("DS4_TEST_DSPARK"); + if (!dspark || !dspark[0]) { + fprintf(stderr, "ds4-test: dspark-speculative-block skipped (set DS4_TEST_DSPARK to a DSpark GGUF)\n"); + return; + } + + ds4_engine *engine = test_get_engine(false); + const ds4_mtp_draft_kind draft_kind = ds4_engine_mtp_draft_kind(engine); + TEST_ASSERT(draft_kind == DS4_MTP_DRAFT_DSPARK); + if (!ds4_mtp_draft_runtime_supported(test_backend(), draft_kind)) { + fprintf(stderr, "ds4-test: dspark-speculative-block skipped (backend does not support DSpark runtime)\n"); + return; + } + TEST_ASSERT(ds4_engine_has_mtp(engine)); + TEST_ASSERT(ds4_engine_mtp_draft_tokens(engine) == 5); + + 
ds4_tokens prompt = {0}; + ds4_chat_begin(engine, &prompt); + ds4_chat_append_message(engine, &prompt, "user", test_mtp_copy_prompt()); + ds4_chat_append_assistant_prefix(engine, &prompt, DS4_THINK_NONE); + TEST_ASSERT(prompt.len > 0); + + int *spec = malloc((size_t)TEST_MTP_MAXGEN * sizeof(*spec)); + TEST_ASSERT(spec != NULL); + if (spec && prompt.len > 0) { + int nspec = 0, max_chunk = 0; + const bool ok_spec = test_mtp_capture_speculative(engine, &prompt, 96, + spec, &nspec, &max_chunk); + TEST_ASSERT(ok_spec); + TEST_ASSERT(max_chunk > 1); + + float worst_gap = 0.0f; + int worst_at = -1; + const bool ok_check = test_mtp_worst_argmax_gap(engine, &prompt, spec, nspec, + &worst_gap, &worst_at); + TEST_ASSERT(ok_check); + fprintf(stderr, "ds4-test: dspark-speculative-block nspec=%d max_chunk=%d worst_argmax_gap=%.3f at=%d\n", + nspec, max_chunk, worst_gap, worst_at); + TEST_ASSERT(worst_gap <= 2.0f); + } + + free(spec); + ds4_tokens_free(&prompt); +} + + #endif +static void test_dspark_binder_helpers(void) { + ds4_dspark_config cfg; + ds4_dspark_config_init_defaults(&cfg); + TEST_ASSERT(cfg.n_mtp_layers == 3); + TEST_ASSERT(cfg.block_size == 5); + TEST_ASSERT(cfg.noise_token_id == 128799u); + TEST_ASSERT(cfg.markov_rank == 256); + TEST_ASSERT(cfg.target_layer_ids[0] == 40); + TEST_ASSERT(cfg.target_layer_ids[1] == 41); + TEST_ASSERT(cfg.target_layer_ids[2] == 42); + + TEST_ASSERT(ds4_mtp_draft_kind_guess(false, false, false) == DS4_MTP_DRAFT_NONE); + TEST_ASSERT(ds4_mtp_draft_kind_guess(true, false, false) == DS4_MTP_DRAFT_LEGACY); + TEST_ASSERT(ds4_mtp_draft_kind_guess(false, true, true) == DS4_MTP_DRAFT_DSPARK); + TEST_ASSERT(ds4_mtp_draft_kind_guess_ex(false, true, false, true, 0) == + DS4_MTP_DRAFT_DSPARK_NONSEQ); + TEST_ASSERT(ds4_mtp_draft_kind_guess_ex(false, true, false, false, 0) == + DS4_MTP_DRAFT_NONE); + TEST_ASSERT(ds4_mtp_draft_kind_guess_ex(false, true, false, true, 256) == + DS4_MTP_DRAFT_NONE); + 
TEST_ASSERT(!strcmp(ds4_mtp_draft_kind_name(DS4_MTP_DRAFT_DSPARK_NONSEQ), + "dspark-nonseq")); + TEST_ASSERT(!strcmp(ds4_mtp_draft_kind_name(DS4_MTP_DRAFT_DSPARK), "dspark")); + TEST_ASSERT(!strcmp(ds4_mtp_draft_kind_name(DS4_MTP_DRAFT_LEGACY), "legacy-mtp")); +} + +static void test_dspark_markov_bf16_helpers(void) { + TEST_ASSERT(fabsf(ds4_dspark_bf16_to_f32(0x3fc0u) - 1.5f) < 0.001f); + TEST_ASSERT(fabsf(ds4_dspark_bf16_to_f32(0xbe80u) + 0.25f) < 0.001f); +} + + +static void test_dspark_runtime_helpers(void) { + ds4_dspark_config cfg; + ds4_dspark_config_init_defaults(&cfg); + TEST_ASSERT(ds4_dspark_speculative_gate(DS4_MTP_DRAFT_LEGACY, true, 4) == + DS4_DSPARK_SPEC_LEGACY_MTP); + TEST_ASSERT(ds4_dspark_speculative_gate(DS4_MTP_DRAFT_DSPARK, true, 5) == + DS4_DSPARK_SPEC_DSPARK_ENABLED); + TEST_ASSERT(ds4_dspark_speculative_gate(DS4_MTP_DRAFT_DSPARK_NONSEQ, true, 5) == + DS4_DSPARK_SPEC_DSPARK_ENABLED); + TEST_ASSERT(ds4_dspark_speculative_gate(DS4_MTP_DRAFT_DSPARK, true, 1) == + DS4_DSPARK_SPEC_DISABLED); + TEST_ASSERT(strstr(ds4_dspark_spec_gate_reason(DS4_DSPARK_SPEC_DSPARK_ENABLED), + "enabled") != NULL); + TEST_ASSERT(ds4_mtp_speculative_draft_ready(DS4_MTP_DRAFT_LEGACY)); + TEST_ASSERT(!ds4_mtp_speculative_draft_ready(DS4_MTP_DRAFT_NONE)); + TEST_ASSERT(ds4_mtp_speculative_draft_ready(DS4_MTP_DRAFT_DSPARK)); + TEST_ASSERT(ds4_mtp_speculative_draft_ready(DS4_MTP_DRAFT_DSPARK_NONSEQ)); + TEST_ASSERT(ds4_mtp_draft_runtime_supported(DS4_BACKEND_METAL, + DS4_MTP_DRAFT_DSPARK)); + TEST_ASSERT(ds4_mtp_draft_runtime_supported(DS4_BACKEND_METAL, + DS4_MTP_DRAFT_DSPARK_NONSEQ)); + TEST_ASSERT(!ds4_mtp_draft_runtime_supported(DS4_BACKEND_CUDA, + DS4_MTP_DRAFT_DSPARK)); + TEST_ASSERT(!ds4_mtp_draft_runtime_supported(DS4_BACKEND_CUDA, + DS4_MTP_DRAFT_DSPARK_NONSEQ)); + TEST_ASSERT(!ds4_mtp_draft_runtime_supported(DS4_BACKEND_CPU, + DS4_MTP_DRAFT_LEGACY)); + + const int eos_drafts[] = { 101, 102, 2, 103 }; + TEST_ASSERT(ds4_dspark_draft_len_until_eos(eos_drafts, 4, 2) 
== 3); + TEST_ASSERT(ds4_dspark_draft_len_until_eos(eos_drafts, 4, 999) == 4); + TEST_ASSERT(ds4_dspark_draft_len_until_eos(eos_drafts, 0, 2) == 0); + TEST_ASSERT(ds4_dspark_prefix_slot_for_accept(1, 5) == 0); + TEST_ASSERT(ds4_dspark_prefix_slot_for_accept(2, 5) == 1); + TEST_ASSERT(ds4_dspark_prefix_slot_for_accept(4, 5) == 3); + TEST_ASSERT(ds4_dspark_prefix_slot_for_accept(5, 5) == -1); + TEST_ASSERT(ds4_dspark_prefix_slot_for_accept(0, 5) == -1); + TEST_ASSERT(ds4_dspark_prefix_slot_for_accept(1, 1) == -1); + TEST_ASSERT(ds4_dspark_prefix_slot_count(DS4_MTP_DRAFT_LEGACY, 0, 15) == 1); + TEST_ASSERT(ds4_dspark_prefix_slot_count(DS4_MTP_DRAFT_DSPARK, 5, 15) == 4); + TEST_ASSERT(ds4_dspark_prefix_slot_count(DS4_MTP_DRAFT_DSPARK_NONSEQ, 16, 15) == 15); + TEST_ASSERT(ds4_dspark_prefix_slot_count(DS4_MTP_DRAFT_DSPARK, 32, 15) == 15); + TEST_ASSERT(ds4_dspark_prefix_slot_count(DS4_MTP_DRAFT_NONE, 5, 15) == 0); + TEST_ASSERT(ds4_dspark_prefix_slot_count(DS4_MTP_DRAFT_DSPARK, 5, 0) == 0); + TEST_ASSERT(ds4_engine_has_mtp(NULL) == false); +} + +static uint32_t test_le32(const unsigned char *p) { + return (uint32_t)p[0] | + ((uint32_t)p[1] << 8) | + ((uint32_t)p[2] << 16) | + ((uint32_t)p[3] << 24); +} + +static uint64_t test_le64(const unsigned char *p) { + return (uint64_t)p[0] | + ((uint64_t)p[1] << 8) | + ((uint64_t)p[2] << 16) | + ((uint64_t)p[3] << 24) | + ((uint64_t)p[4] << 32) | + ((uint64_t)p[5] << 40) | + ((uint64_t)p[6] << 48) | + ((uint64_t)p[7] << 56); +} + +static bool test_file_size(const char *path, uint64_t *size_out) { + struct stat st; + if (stat(path, &st) != 0 || st.st_size < 0) return false; + *size_out = (uint64_t)st.st_size; + return true; +} +static bool test_bf16_region_nonzero_finite(const char *path, + uint64_t offset, + uint64_t bytes) { + if (!path || bytes == 0 || (bytes & 1u) != 0) return false; + FILE *fp = fopen(path, "rb"); + if (!fp) return false; + if (fseeko(fp, (off_t)offset, SEEK_SET) != 0) { + fclose(fp); + return false; + } + 
unsigned char buf[4096]; + uint64_t remaining = bytes; + uint64_t values = 0; + uint64_t nonzero = 0; + while (remaining > 0) { + size_t chunk = remaining < sizeof(buf) ? (size_t)remaining : sizeof(buf); + if ((chunk & 1u) != 0) chunk--; + if (chunk == 0 || fread(buf, 1, chunk, fp) != chunk) { + fclose(fp); + return false; + } + for (size_t i = 0; i < chunk; i += 2) { + uint16_t u = (uint16_t)buf[i] | ((uint16_t)buf[i + 1] << 8); + if ((u & 0x7f80u) == 0x7f80u) { + fclose(fp); + return false; + } + if (u != 0) nonzero++; + values++; + } + remaining -= chunk; + } + return fclose(fp) == 0 && values == bytes / 2 && nonzero > 0; +} + + +static bool test_write_dspark_target_cache_dataset(const char *path) { + FILE *fp = fopen(path, "wb"); + if (!fp) return false; + const bool ok = fputs("===== DS4_IMATRIX_PROMPT 0 =====\n" + "Explain target cache export in one short sentence.\n", + fp) >= 0; + return fclose(fp) == 0 && ok; +} + +static int test_run_dspark_target_cache_cli(const char *dataset_path, + const char *output_dir) { + pid_t pid = fork(); + if (pid < 0) return -1; + if (pid == 0) { + execl("./ds4", "./ds4", + "-m", test_model_path(), + "--metal", + "--dspark-target-cache-dataset", dataset_path, + "--dspark-target-cache-out", output_dir, + "--dspark-target-cache-target-model", "deepseek-ai/DeepSeek-V4-Flash", + "--dspark-target-cache-chat-template", "deepseek_v4_rendered", + "--dspark-target-cache-max-prompts", "1", + "--dspark-target-cache-max-tokens", "8", + "--ctx", "128", + (char *)NULL); + _exit(127); + } + int status = 0; + while (waitpid(pid, &status, 0) < 0) { + if (errno != EINTR) return -1; + } + if (!WIFEXITED(status)) return -1; + return WEXITSTATUS(status); +} + +static int test_run_dspark_target_cache_cli_missing_target_model(const char *dataset_path, + const char *output_dir) { + pid_t pid = fork(); + if (pid < 0) return -1; + if (pid == 0) { + execl("./ds4", "./ds4", + "-m", test_model_path(), + "--metal", + "--dspark-target-cache-dataset", 
dataset_path, + "--dspark-target-cache-out", output_dir, + "--dspark-target-cache-chat-template", "deepseek_v4_rendered", + "--dspark-target-cache-max-prompts", "1", + "--dspark-target-cache-max-tokens", "8", + "--ctx", "128", + (char *)NULL); + _exit(127); + } + int status = 0; + while (waitpid(pid, &status, 0) < 0) { + if (errno != EINTR) return -1; + } + if (!WIFEXITED(status)) return -1; + return WEXITSTATUS(status); +} +static bool test_json_u64_field(const char *json, const char *key, uint64_t *out) { + const char *p = strstr(json, key); + if (!p) return false; + p += strlen(key); + while (*p == ' ' || *p == '\t') p++; + char *end = NULL; + unsigned long long v = strtoull(p, &end, 10); + if (end == p) return false; + *out = (uint64_t)v; + return true; +} + + +static void test_dspark_target_cache_export(void) { + char root_template[PATH_MAX]; + snprintf(root_template, sizeof(root_template), "%s", + "/tmp/ds4-target-cache-test-XXXXXX"); + char *root = mkdtemp(root_template); + TEST_ASSERT(root != NULL); + if (!root) return; + + char dataset_path[PATH_MAX]; + char output_dir[PATH_MAX]; + char missing_target_output_dir[PATH_MAX]; + char manifest_path[PATH_MAX]; + char lock_path[PATH_MAX]; + char index_path[PATH_MAX]; + char shard_path[PATH_MAX]; + TEST_ASSERT(snprintf(dataset_path, sizeof(dataset_path), "%s/prompts.txt", root) < + (int)sizeof(dataset_path)); + TEST_ASSERT(snprintf(output_dir, sizeof(output_dir), "%s/cache", root) < + (int)sizeof(output_dir)); + TEST_ASSERT(snprintf(missing_target_output_dir, sizeof(missing_target_output_dir), + "%s/missing-target-cache", root) < + (int)sizeof(missing_target_output_dir)); + TEST_ASSERT(snprintf(manifest_path, sizeof(manifest_path), "%s/manifest.json", + output_dir) < (int)sizeof(manifest_path)); + TEST_ASSERT(snprintf(index_path, sizeof(index_path), "%s/samples.idx", output_dir) < + (int)sizeof(index_path)); + TEST_ASSERT(snprintf(shard_path, sizeof(shard_path), "%s/shard-00000.bin", + output_dir) < 
(int)sizeof(shard_path)); + TEST_ASSERT(snprintf(lock_path, sizeof(lock_path), "%s/ds4.lock", root) < + (int)sizeof(lock_path)); + TEST_ASSERT(setenv("DS4_LOCK_FILE", lock_path, 1) == 0); + TEST_ASSERT(test_write_dspark_target_cache_dataset(dataset_path)); + const int missing_target_rc = + test_run_dspark_target_cache_cli_missing_target_model(dataset_path, + missing_target_output_dir); + TEST_ASSERT(missing_target_rc != 0); + + const int rc = test_run_dspark_target_cache_cli(dataset_path, output_dir); + TEST_ASSERT(rc == 0); + if (rc != 0) return; + + char *manifest = test_read_file(manifest_path); + TEST_ASSERT(manifest != NULL); + if (!manifest) return; + uint64_t hidden_size = 0; + uint64_t target_hidden_layers = 0; + TEST_ASSERT(strstr(manifest, "\"version\": 2") != NULL); + TEST_ASSERT(strstr(manifest, "\"format\": \"deepspec-target-cache\"") != NULL); + TEST_ASSERT(strstr(manifest, "\"producer\": \"ds4\"") != NULL); + TEST_ASSERT(strstr(manifest, "\"target_model_name_or_path\": \"deepseek-ai/DeepSeek-V4-Flash\"") != NULL); + TEST_ASSERT(strstr(manifest, "\"source_gguf_path\": \"") != NULL); + TEST_ASSERT(strstr(manifest, "\"chat_template\": \"deepseek_v4_rendered\"") != NULL); + TEST_ASSERT(strstr(manifest, "\"target_layer_ids\": [40, 41, 42]") != NULL); + TEST_ASSERT(strstr(manifest, "\"hidden_dtype\": \"bfloat16\"") != NULL); + TEST_ASSERT(strstr(manifest, "\"token_dtype\": \"int32\"") != NULL); + TEST_ASSERT(strstr(manifest, "\"mask_dtype\": \"uint8\"") != NULL); + TEST_ASSERT(strstr(manifest, "\"index_record_size\": 56") != NULL); + TEST_ASSERT(test_json_u64_field(manifest, "\"target_hidden_layers\": ", + &target_hidden_layers)); + TEST_ASSERT(target_hidden_layers == 3); + TEST_ASSERT(strstr(manifest, "\"sample_split_marker\": \"===== DS4_IMATRIX_PROMPT\"") != NULL); + TEST_ASSERT(strstr(manifest, "\"shard-00000.bin\"") != NULL); + TEST_ASSERT(test_json_u64_field(manifest, "\"hidden_size\": ", &hidden_size)); + TEST_ASSERT(hidden_size > 0); + 
free(manifest); + + uint64_t index_size = 0; + uint64_t shard_size = 0; + TEST_ASSERT(test_file_size(index_path, &index_size)); + TEST_ASSERT(index_size == 56); + TEST_ASSERT(test_file_size(shard_path, &shard_size)); + TEST_ASSERT(shard_size > 0); + if (index_size != 56 || shard_size == 0) return; + + FILE *idx = fopen(index_path, "rb"); + TEST_ASSERT(idx != NULL); + if (!idx) return; + unsigned char rec[56]; + TEST_ASSERT(fread(rec, 1, sizeof(rec), idx) == sizeof(rec)); + TEST_ASSERT(fclose(idx) == 0); + + const uint64_t sample_id = test_le64(rec + 0); + const uint32_t shard_id = test_le32(rec + 8); + const uint32_t seq_len = test_le32(rec + 12); + const uint64_t input_ids_offset = test_le64(rec + 16); + const uint64_t attention_mask_offset = test_le64(rec + 24); + const uint64_t loss_mask_offset = test_le64(rec + 32); + const uint64_t target_hidden_states_offset = test_le64(rec + 40); + const uint64_t target_last_hidden_states_offset = test_le64(rec + 48); + + TEST_ASSERT(sample_id == 0); + TEST_ASSERT(seq_len > 0 && seq_len <= 8); + TEST_ASSERT(shard_id == 0); + TEST_ASSERT(input_ids_offset == 0); + TEST_ASSERT(attention_mask_offset == (uint64_t)seq_len * sizeof(int32_t)); + TEST_ASSERT(loss_mask_offset == attention_mask_offset + seq_len); + TEST_ASSERT(target_hidden_states_offset == loss_mask_offset + seq_len); + const uint64_t target_hidden_bytes = + (uint64_t)seq_len * target_hidden_layers * hidden_size * sizeof(uint16_t); + TEST_ASSERT(target_last_hidden_states_offset == + target_hidden_states_offset + target_hidden_bytes); + TEST_ASSERT(test_bf16_region_nonzero_finite(shard_path, + target_hidden_states_offset, + target_hidden_bytes)); + const uint64_t target_last_hidden_bytes = + (uint64_t)seq_len * hidden_size * sizeof(uint16_t); + TEST_ASSERT(shard_size == target_last_hidden_states_offset + target_last_hidden_bytes); + TEST_ASSERT(test_bf16_region_nonzero_finite(shard_path, + target_last_hidden_states_offset, + target_last_hidden_bytes)); +} + + + static 
void test_server_unit_group(void) { ds4_server_unit_tests_run(); } @@ -2202,18 +2590,31 @@ static const ds4_test_entry test_entries[] = { {"--metal-tensor-equivalence", "metal-tensor-equivalence", "fast/quality Metal prompt-logit and greedy equivalence", test_metal_mpp_equivalence}, {"--streaming-decode-prefill-correctness", "streaming-decode-prefill-correctness", "streaming decode-style cold prefill drift and repeatability", test_streaming_decode_prefill_correctness}, {"--mtp-verify-depth", "mtp-verify-depth", "MTP speculative verify commits autoregressive-identical tokens at draft depth > 2", test_mtp_verify_depth}, + {"--dspark-speculative-block", "dspark-speculative-block", "DSpark block drafts commit only target-verified tokens", test_dspark_speculative_block}, #endif + {"--dspark-binder", "dspark-binder", "DSpark draft kind/config defaults without GGUF", test_dspark_binder_helpers}, + {"--dspark-markov-bf16", "dspark-markov-bf16", "DSpark Markov BF16 tensor decoding", test_dspark_markov_bf16_helpers}, + {"--dspark-runtime", "dspark-runtime", "DSpark capture plan and speculative gate helpers", test_dspark_runtime_helpers}, + {"--server", "server", "server parser/rendering/cache unit tests", test_server_unit_group}, }; +static const ds4_test_entry manual_test_entries[] = { + {"--dspark-target-cache-export", "dspark-target-cache-export", "DeepSpec target-cache exporter smoke", test_dspark_target_cache_export}, +}; + static void test_print_help(const char *prog) { printf("Usage: %s [--all | TEST...]\n\n", prog); puts("Tests:"); puts(" --all"); - puts(" Run every test. This is the default, ordered from slower to faster."); + puts(" Run every default test. 
This is the default, ordered from slower to faster."); for (size_t i = 0; i < sizeof(test_entries) / sizeof(test_entries[0]); i++) { printf(" %-20s %s\n", test_entries[i].flag, test_entries[i].desc); } + puts("\nManual tests:"); + for (size_t i = 0; i < sizeof(manual_test_entries) / sizeof(manual_test_entries[0]); i++) { + printf(" %-20s %s\n", manual_test_entries[i].flag, manual_test_entries[i].desc); + } puts(" --list"); puts(" Print test names only."); #ifndef DS4_NO_GPU @@ -2247,6 +2648,13 @@ static const ds4_test_entry *test_find_entry(const char *arg) { return NULL; } +static const ds4_test_entry *test_find_manual_entry(const char *arg) { + for (size_t i = 0; i < sizeof(manual_test_entries) / sizeof(manual_test_entries[0]); i++) { + if (!strcmp(arg, manual_test_entries[i].flag)) return &manual_test_entries[i]; + } + return NULL; +} + static void test_run_entry(const ds4_test_entry *entry) { int before = test_failures; fprintf(stderr, "%s:\n", entry->name); @@ -2262,6 +2670,7 @@ static void test_run_entry(const ds4_test_entry *entry) { int main(int argc, char **argv) { bool run_all = argc == 1; bool selected[sizeof(test_entries) / sizeof(test_entries[0])] = {0}; + bool selected_manual[sizeof(manual_test_entries) / sizeof(manual_test_entries[0])] = {0}; for (int i = 1; i < argc; i++) { if (!strcmp(argv[i], "--all")) { @@ -2270,18 +2679,27 @@ int main(int argc, char **argv) { for (size_t j = 0; j < sizeof(test_entries) / sizeof(test_entries[0]); j++) { puts(test_entries[j].flag); } + for (size_t j = 0; j < sizeof(manual_test_entries) / sizeof(manual_test_entries[0]); j++) { + puts(manual_test_entries[j].flag); + } return 0; } else if (!strcmp(argv[i], "-h") || !strcmp(argv[i], "--help")) { test_print_help(argv[0]); return 0; } else { const ds4_test_entry *entry = test_find_entry(argv[i]); - if (!entry) { - fprintf(stderr, "ds4-test: unknown test switch: %s\n", argv[i]); - test_print_help(argv[0]); - return 2; + if (entry) { + selected[(size_t)(entry - 
test_entries)] = true; + continue; } - selected[(size_t)(entry - test_entries)] = true; + entry = test_find_manual_entry(argv[i]); + if (entry) { + selected_manual[(size_t)(entry - manual_test_entries)] = true; + continue; + } + fprintf(stderr, "ds4-test: unknown test switch: %s\n", argv[i]); + test_print_help(argv[0]); + return 2; } } @@ -2293,6 +2711,9 @@ int main(int argc, char **argv) { for (size_t i = 0; i < sizeof(test_entries) / sizeof(test_entries[0]); i++) { if (selected[i]) test_run_entry(&test_entries[i]); } + for (size_t i = 0; i < sizeof(manual_test_entries) / sizeof(manual_test_entries[0]); i++) { + if (selected_manual[i]) test_run_entry(&manual_test_entries[i]); + } } #ifndef DS4_NO_GPU diff --git a/tests/test-vectors/README.md b/tests/test-vectors/README.md index de3acaace..fac05dfc0 100644 --- a/tests/test-vectors/README.md +++ b/tests/test-vectors/README.md @@ -1,26 +1,31 @@ # DeepSeek V4 Flash Test Vectors -These vectors were captured from the official DeepSeek V4 Flash API using -`deepseek-v4-flash`, greedy decoding, thinking disabled, and -`top_logprobs=20`. The hosted API does not expose full logits, so these files -store the best logprob slice the API provides. +The compact fixture consumed by `ds4_test` is generated from the local default +CyberNeurova abliterated GGUF using greedy decoding, thinking disabled, and +`top_logprobs=20`. It is a local regression fixture for the model currently +linked by `ds4flash.gguf`. + +The raw `official/*.official.json` captures from the hosted DeepSeek V4 Flash +API are still kept for auditing and comparison, but they are not the default +C test fixture. Files: - `prompts/*.txt`: exact user prompts. - `official/*.official.json`: official API continuations and top-logprobs. -- `official.vec`: compact C-test fixture generated from the official JSON. +- `official.vec`: compact C-test fixture generated from the local GGUF. 
- `local-golden.vec`: local top-k/logit fixture captured from a known-sane DS4 Flash run. It is used to catch substantial backend drift that can keep the same greedy token while damaging the logits distribution. -Regenerate official vectors: +Regenerate the official API captures: ```sh DEEPSEEK_API_KEY=... ./tests/test-vectors/fetch_official_vectors.py ``` -Running the fetcher without `--only` also regenerates `official.vec`. +The fetcher preserves the hosted API captures. Regenerate `official.vec` from a +local model dump when the default GGUF changes. The C runner consumes `official.vec` directly: @@ -50,9 +55,7 @@ routes disabled and pins `DS4_METAL_PREFILL_CHUNK=2048` for this strict official-vector check. `official.vec` is intentionally trivial to parse from C: each case points to a -prompt file and each expected token is hex-encoded by bytes. The official JSON -files remain in the tree so the compact fixture can be audited against the raw -API response. +prompt file and each expected token is hex-encoded by bytes. 
To inspect a local top-logprob dump manually: diff --git a/tests/test-vectors/official.vec b/tests/test-vectors/official.vec index 4076e0fd5..bf4c06e74 100644 --- a/tests/test-vectors/official.vec +++ b/tests/test-vectors/official.vec @@ -1,53 +1,397 @@ -# ds4-official-logprob-vectors-v1 +# ds4-local-cyberneurova-abliterated-logprob-vectors-v2 # case # step -# top +# top case short_italian_fact 16384 4 tests/test-vectors/prompts/short_italian_fact.txt -step 0 416461 1 -top 416461 0 -step 1 204c6f76 1 -top 204c6f76 0 -step 2 656c 1 -top 656c 0 -step 3 616365 1 -top 616365 0 +step 0 416461 20 +top 416461 -0.00223207683 +top 2a2a -6.15240526 +top 556e61 -10.3973818 +top 4c616479 -10.6614237 +top 45 -10.6680689 +top 436869 -11.1814814 +top 53 -13.149621 +top 4c61 -13.2641306 +top 4e61747572616c -13.6965952 +top 43657274 -13.9891729 +top 417567757374 -14.5222082 +top c388 -14.6669817 +top 43 -14.7921152 +top 20416461 -14.8195429 +top 4d69 -15.118453 +top 4164 -15.1551867 +top 5365636f6e64 -15.1630163 +top 46 -15.2650843 +top 4d6174 -15.5450182 +top 42 -16.2139282 +step 1 204c6f76 20 +top 204c6f76 -1.94158645e-07 +top 204279726f6e -16.2622414 +top c2a0 -16.9429817 +top 2041756775737461 -17.4329414 +top 20416461 -17.613081 +top 206c6f76 -18.8897514 +top e280 -19.8141136 +top 204c -20.0510406 +top 204c6f766564 -20.5304527 +top 204c75 -21.3707199 +top 204c616479 -21.8961372 +top 20657261 -22.2522278 +top 2028 -22.3919601 +top 2c -22.4892654 +top 204c6176 -22.614727 +top 206469 -22.6896515 +top 2d4c -22.9386253 +top 2042 -23.0224323 +top 204b696e67 -23.5577602 +top 20c3a8 -23.7326317 +step 2 656c 20 +top 656c -3.73509081e-08 +top 656c79 -18.1356659 +top 656c657373 -18.360281 +top 656c61 -19.344656 +top 656c616e64 -19.4052773 +top 656c6179 -20.3470535 +top 6574 -20.6374168 +top 656c616765 -20.8781471 +top 6c65 -21.6413364 +top 6c -21.7200813 +top 6c616365 -21.871603 +top 616c -21.9618225 +top 616365 -22.295929 +top 656c796e -22.729847 +top 6f6c -22.9921799 +top 656c6f7065 
-23.0618496 +top c3a8 -23.6282539 +top 454c -24.0764503 +top 656c6465 -24.0828209 +top 656c6f77 -24.0907631 +step 3 616365 20 +top 616365 -4.32595471e-07 +top 61636865 -15.0795364 +top 6163 -16.5640869 +top 616765 -17.1593399 +top 6365 -17.257225 +top 617465 -19.1280441 +top 617665 -19.299263 +top 616e6365 -19.9278831 +top 61637265 -20.2412186 +top 61636b -20.3034439 +top 616465 -20.464489 +top 616665 -21.0095863 +top 6165 -21.2127686 +top 616b65 -21.9579582 +top 414345 -22.4687233 +top 696365 -22.4710159 +top 616361 -22.4848404 +top 616379 -22.7641106 +top 6565 -22.8046398 +top 61636573 -23.0705185 end case short_code_completion 4096 4 tests/test-vectors/prompts/short_code_completion.txt -step 0 606060 1 -top 606060 0 -step 1 63 1 -top 63 0 -step 2 0a 1 -top 0a 0 -step 3 72657475726e 1 -top 72657475726e 0 +step 0 546865 20 +top 546865 -0.585739911 +top 72657475726e -1.42208183 +top 606060 -2.50733829 +top 0a -2.92043567 +top 60 -3.63202357 +top 6060600a -4.15579128 +top 48657265 -5.3454771 +top 202020 -5.48643589 +top 746865 -5.53705311 +top 736e -5.84971905 +top 6e657874 -6.60867214 +top 436f6d706c657465 -7.12569714 +top 49 -7.28526974 +top 3cefbd9c656e64e296816f66e2968173656e74656e6365efbd9c3e -7.67529774 +top 22 -7.77495861 +top 5765 -7.94059467 +top 4261736564 -8.14421749 +top 756e646566696e6564 -8.24009991 +top 6060 -8.243186 +top 53696e6365 -8.27500439 +step 1 206e657874 20 +top 206e657874 -0.0034993405 +top 206578616374 -6.21677923 +top 20636f7272656374 -7.87953138 +top 206d697373696e67 -8.23515034 +top 20746f6b656e -8.47098827 +top 20636f6d706c657465 -8.49135494 +top 2043 -8.98648357 +top 20726571756573746564 -9.48330593 +top 206578706563746564 -10.3076849 +top 207265717569726564 -10.3263998 +top 20636f6d706c6574696f6e -10.3525057 +top 206f6e6c79 -10.4702768 +top 20616e73776572 -10.5865335 +top 20636f6d706c65746564 -11.0404902 +top 2073746174656d656e74 -11.2674074 +top 2070726f7669646564 -11.6076918 +top 2060 -12.3725309 +top 20636f6e74696e756174696f6e 
-12.4003801 +top 6e657874 -12.4465799 +top 20636f6465 -13.0128622 +step 2 206578616374 20 +top 206578616374 -0.0312528573 +top 20746f6b656e -3.48288631 +top 206578706563746564 -10.440753 +top 2076616c6964 -11.6716032 +top 2065786163746c79 -12.8131495 +top 20636f6d706c657465 -13.1101809 +top 20746f6b656e73 -13.2724962 +top 20636f7272656374 -13.3769178 +top 206c6f676963616c -13.5655546 +top 2070726563697365 -14.4975195 +top 202a2a -14.6095209 +top 206578706c69636974 -15.0913286 +top 207265717569726564 -15.127799 +top 2028 -15.3104734 +top 206163637572617465 -15.3454237 +top 2043 -15.3630495 +top 20616e64 -15.4616613 +top 204558 -16.1131496 +top 206578636c7573697665 -16.1462631 +top 6578 -16.3691845 +step 3 20746f6b656e 20 +top 20746f6b656e -5.48701246e-06 +top 2043 -12.3162327 +top 20746f6b656e73 -13.965971 +top 20746f6b -17.2101574 +top 746f6b656e -17.4936848 +top 206578706563746564 -17.9039345 +top 206973 -18.4164562 +top 20746f -18.5629253 +top 2073686f756c64 -18.7419815 +top 5f746f6b656e -19.1551247 +top 2076616c6964 -19.2326775 +top 20636f6d706c6574696f6e -19.3115616 +top 20636f6d706c657465 -19.4702454 +top 206c6f676963616c -19.7332821 +top 20616e64 -19.7751026 +top 20546f6b656e -19.8149071 +top 20776f756c64 -19.8325329 +top 3cefbd9c656e64e296816f66e2968173656e74656e6365efbd9c3e -20.0939617 +top 2c -20.3838921 +top 206166746572 -20.4462605 end -case short_reasoning_plain 4096 1 tests/test-vectors/prompts/short_reasoning_plain.txt -step 0 3136 1 -top 3136 0 +case short_reasoning_plain 4096 2 tests/test-vectors/prompts/short_reasoning_plain.txt +step 0 3136 20 +top 3136 -0.00172282755 +top 323034 -6.6006074 +top 546865 -8.98028469 +top 313238 -10.5100775 +top 3634 -10.7039862 +top 546f -10.7105932 +top 323536 -10.8948469 +top 38 -11.0259409 +top 3332 -11.5996084 +top 313633 -11.6718969 +top 36 -11.7362967 +top 4c6574 -11.8519773 +top 34 -11.9897318 +top 313634 -12.1409979 +top 5765 -12.2059736 +top 3cefbd9c656e64e296816f66e2968173656e74656e6365efbd9c3e -12.2186594 
+top 49 -12.2935553 +top 3135 -12.4659204 +top 313032 -12.5832701 +top 3137 -12.6820211 +step 1 3cefbd9c656e64e296816f66e2968173656e74656e6365efbd9c3e 20 +top 3cefbd9c656e64e296816f66e2968173656e74656e6365efbd9c3e -1.68589904e-05 +top 0a -11.4170742 +top 3c2f -13.2628803 +top 2e -13.394062 +top 0d -13.4614353 +top 0a0a -14.3499622 +top 3c -16.2068195 +top 200a -16.3556709 +top 20200a -16.6053371 +top 2020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020 -17.4370213 +top 3c5c2f -17.5513058 +top 606060 -17.5718803 +top 2028 -17.5755367 +top 5d5d -17.6659451 +top 7d -17.7317963 +top 60 -17.7695713 +top 5c2e -17.8806343 +top 205c5c -17.9232235 +top e280 -18.1226139 +top 5c29 -18.304369 end case long_memory_archive 16384 4 tests/test-vectors/prompts/long_memory_archive.txt -step 0 436f6d706f6e656e74 1 -top 436f6d706f6e656e74 0 -step 1 2067616d6d61 1 -top 2067616d6d61 0 -step 2 207265706f727473 1 -top 207265706f727473 0 -step 3 20616e6f6d616c696573 1 -top 20616e6f6d616c696573 0 +step 0 436f6d706f6e656e74 20 +top 436f6d706f6e656e74 -0.105898418 +top 47616d6d61 -2.75666666 +top 546865 -4.35264063 +top 67616d6d61 -4.80262041 +top 636f6d706f6e656e74 -4.82719278 +top 4261736564 -5.11432123 +top 4163636f7264696e67 -6.68487024 +top 5265636f7264 -8.14289379 +top ceb3 -10.0843534 +top 416c706861 -10.1029425 +top 20636f6d706f6e656e74 -10.5193949 +top 496e -10.6188078 +top 4166746572 -10.6198263 +top 4f6e6c79 -10.6530161 +top 616c706861 -10.7818213 +top 20436f6d706f6e656e74 -11.0202341 +top 476976656e -11.0461378 +top 2067616d6d61 -11.4705858 +top 746865 -11.5287313 +top 53696e6365 -11.9913778 +step 1 2067616d6d61 20 +top 2067616d6d61 -2.30464434e-06 +top 2047616d6d61 -13.8503323 +top 20616c706861 -13.8811016 +top 20ceb3 -15.5139942 +top 2062657461 -15.9878101 +top 207265706f727473 
-17.2890797 +top 2067 -18.252676 +top 202a2a -18.3319645 +top 2e -18.4390163 +top 20657073696c6f6e -18.5010357 +top 67616d6d61 -19.0277481 +top 20 -19.0355511 +top 3cefbd9c656e64e296816f66e2968173656e74656e6365efbd9c3e -19.0749569 +top c2a0 -19.2594585 +top 0a -19.3265781 +top 207369676d61 -20.1498203 +top e280 -20.4137096 +top 2c -20.5958786 +top 2064656c7461 -21.1152782 +top 206f6d656761 -21.5610752 +step 2 207265706f727473 20 +top 207265706f727473 -0.00494612288 +top 2e -5.31201029 +top 3cefbd9c656e64e296816f66e2968173656e74656e6365efbd9c3e -13.8389263 +top 2e0a0a -15.1493416 +top 2e0a -15.2668438 +top 207265706f72746564 -15.9509525 +top 20646f6573 -16.5551472 +top 2028 -16.6683502 +top 206973 -16.801199 +top 2c -16.8323765 +top 207265636f726473 -17.2387829 +top 207265706f7274 -17.7172985 +top 2072657475726e73 -19.0998821 +top 206f6e6c79 -19.1626606 +top 20686173 -19.9670811 +top 2073686f7773 -20.1530476 +top 207265706f727465646c79 -20.1649094 +top 207265706f7274696e67 -20.7063942 +top 20636865636b73 -20.7853947 +top 2072656c6561736573 -20.9100227 +step 3 20616e6f6d616c696573 20 +top 20616e6f6d616c696573 -3.39562547e-08 +top 20616e6f6d616c6f7573 -17.9823303 +top 2061626e6f726d616c6974696573 -18.4510269 +top 20746865 -19.7427635 +top 206166746572 -19.9412899 +top 206f6e6c79 -20.0774651 +top 20616e -21.0316296 +top 20616e6f6d616c -21.1744709 +top 20616e6f6d616c79 -22.0913544 +top 2074686f7365 -22.90341 +top 3cefbd9c656e64e296816f66e2968173656e74656e6365efbd9c3e -23.4925079 +top 2e -24.2962589 +top 20616e79 -24.6827545 +top 20657863657074696f6e73 -24.6888847 +top e280 -24.7788696 +top c2a0 -24.9924545 +top 206f75746c69657273 -25.2077751 +top 20616c6c -25.2229881 +top 206f62736572766174696f6e73 -25.580471 +top 206572726f7273 -25.6657715 end case long_code_audit 16384 4 tests/test-vectors/prompts/long_code_audit.txt -step 0 546865 1 -top 546865 0 -step 1 206d6f7374 1 -top 206d6f7374 0 -step 2 20696d706f7274616e74 1 -top 20696d706f7274616e74 0 -step 3 20636f6465 1 
-top 20636f6465 0 +step 0 546865 20 +top 546865 -0.00386784854 +top 4c6f6f6b696e67 -6.12391615 +top 5468657265 -7.30433989 +top 4261736564 -7.91420317 +top 48657265 -8.7578907 +top 2a2a -8.97183228 +top 54686973 -9.24920273 +top 2323 -9.81762123 +top 20546865 -10.0437698 +top 5468657365 -10.4235611 +top 4974 -10.5219955 +top 496e -11.5369816 +top 7265 -11.6720638 +top 2e2e2e -11.842844 +top 476976656e -12.0138741 +top 4166746572 -12.3674946 +top 54686174 -12.5671959 +top 52656164696e67 -12.5910645 +top 5f5f -12.7797279 +top 746865 -12.9396172 +step 1 206d6f7374 20 +top 206d6f7374 -0.000208983809 +top 2066756e6374696f6e73 -9.23119068 +top 2067656e657261746564 -10.4861059 +top 206c6f67 -10.7199526 +top 20636f6465 -11.335803 +top 206175646974 -11.3678656 +top 2072657065746974696f6e -11.5716124 +top 20636f6d706c6574696f6e -11.8759604 +top 207061747465726e -12.533824 +top 207265706561746564 -12.9276762 +top 206d61696e -13.3294611 +top 20656e74697265 -13.4693804 +top 202a2a -13.7008543 +top 2072657065746974697665 -14.0882254 +top 2066756e6374696f6e -14.1555948 +top 20636f6d706c657465 -14.19596 +top 2070726f7669646564 -14.3754339 +top 207061747465726e73 -14.5542402 +top 206b6579 -14.5827017 +top 6d6f7374 -14.7641459 +step 2 20696d706f7274616e74 20 +top 20696d706f7274616e74 -2.73004594e-06 +top 206c696b656c79 -14.2245531 +top 206f6276696f7573 -14.7665071 +top 20636f6d6d6f6e -14.9531012 +top 20696d706f7274 -15.2255716 +top 202a2a -15.2469683 +top 20737472696b696e67 -15.2953634 +top 20696d70 -15.7600451 +top 207369676e69666963616e74 -16.2959881 +top 207265706561746564 -16.5497494 +top 696d706f7274616e74 -16.5566616 +top 20696d7072657373697665 -16.6703777 +top 20696d706f7274616e7465 -17.123682 +top 20637269746963616c -17.3378696 +top 206e6f7461626c65 -17.5797157 +top 2070726f6d696e656e74 -17.622797 +top 2072656c6576616e74 -17.7419834 +top 20696e746572657374696e67 -17.8331661 +top 206d6f7374 -18.4256649 +top 2072657065746974697665 -19.0499516 +step 3 20636f6465 20 +top 
20636f6465 -2.71721746e-07 +top 206973737565 -15.9092083 +top 202a2a -16.893919 +top 207175616c697479 -16.9112167 +top 20436f6465 -18.4074783 +top 636f6465 -18.6753731 +top 20636f6d6d6f6e -18.9950409 +top 207468696e67 -19.5752048 +top e4bba3e7a081 -19.6258354 +top 0a -19.7425041 +top 3cefbd9c656e64e296816f66e2968173656e74656e6365efbd9c3e -19.9056091 +top 20636f64696e67 -19.9664783 +top 20726563757272696e67 -20.2803802 +top 5f636f6465 -20.2913589 +top 20636f7265 -20.2972527 +top 20ecbd94eb939c -20.592741 +top 20616e64 -20.9018459 +top e280 -21.0480499 +top 20726561736f6e -21.189888 +top 0a0a -21.214201 end diff --git a/tests/test-vectors/regen_local_vectors.py b/tests/test-vectors/regen_local_vectors.py new file mode 100755 index 000000000..8d9811a43 --- /dev/null +++ b/tests/test-vectors/regen_local_vectors.py @@ -0,0 +1,134 @@ +#!/usr/bin/env python3 +"""Regenerate tests/test-vectors/official.vec from the local ds4flash.gguf. + +Runs ./ds4 --dump-logprobs with the same strict configuration that +test_local_logprob_vectors() uses in the C runner (MPP off, prefill chunk 2048), +then emits the compact v2 vec format. + +Per-case ctx and step count come from the prompts table below, matching the +existing official.vec layout. 
"""
# ^ closes the module docstring that begins on the preceding lines: """

from __future__ import annotations

import argparse
import json
import os
import shutil
import subprocess
import sys
import tempfile
from collections.abc import Iterable
from pathlib import Path

# (prompt id, context size passed as --ctx, number of generated steps to record)
CASES = [
    ("short_italian_fact", 16384, 4),
    ("short_code_completion", 4096, 4),
    ("short_reasoning_plain", 4096, 2),
    ("long_memory_archive", 16384, 4),
    ("long_code_audit", 16384, 4),
]


def hex_bytes(values: Iterable[int]) -> str:
    """Return *values* (byte values) as a lowercase, two-digit-per-byte hex string."""
    return "".join(f"{int(b):02x}" for b in values)


def capture_case(ds4_bin: Path, root: Path, prompt_id: str, ctx: int, steps: int,
                 lock_file: str) -> dict:
    """Run ds4 once for one prompt and return its parsed ``--dump-logprobs`` JSON.

    Args:
        ds4_bin: path to the ds4 executable.
        root: directory that contains the ``prompts/`` folder.
        prompt_id: prompt file stem; ``prompts/<prompt_id>.txt`` is fed to ds4.
        ctx: value passed as ``--ctx``.
        steps: value passed as ``-n``.
        lock_file: value exported as ``DS4_LOCK_FILE`` for the child process.

    Raises:
        SystemExit: if ds4 exits non-zero or its dump cannot be read or parsed.
    """
    prompt_path = root / "prompts" / f"{prompt_id}.txt"

    # Environment pinned for the capture run. The module docstring describes
    # this as the strict configuration of the C runner (MPP off, prefill chunk
    # 2048); DS4_METAL_DISABLE_METAL4 is presumably the "MPP off" switch.
    env = os.environ.copy()
    env["DS4_METAL_PREFILL_CHUNK"] = "2048"
    env["DS4_METAL_DISABLE_METAL4"] = "1"
    env["DS4_LOCK_FILE"] = lock_file

    tmp_dir = Path(tempfile.mkdtemp(prefix=f"ds4-vec-{prompt_id}-"))
    try:
        out_path = tmp_dir / "logprobs.json"
        cmd = [
            str(ds4_bin),
            "--metal",
            "--system", "",
            "--prompt-file", str(prompt_path),
            "--ctx", str(ctx),
            "-n", str(steps),
            "--temp", "0",
            "--nothink",
            "--logprobs-top-k", "20",
            "--dump-logprobs", str(out_path),
        ]
        print(f"-> {prompt_id} ctx={ctx} steps={steps}", file=sys.stderr)
        proc = subprocess.run(cmd, env=env, check=False)
        if proc.returncode != 0:
            raise SystemExit(f"ds4 failed for {prompt_id} (exit {proc.returncode})")
        try:
            with out_path.open("r", encoding="utf-8") as fp:
                return json.load(fp)
        except (OSError, ValueError) as err:
            # ds4 reported success but the dump is missing or not valid JSON.
            raise SystemExit(
                f"{prompt_id}: could not read logprob dump {out_path}: {err}"
            ) from err
    finally:
        # Best-effort cleanup on every exit path, including failures.
        shutil.rmtree(tmp_dir, ignore_errors=True)


def build_vec(records, root: Path) -> str:
    """Render captured dumps as the text of a compact v2 ``.vec`` fixture.

    Args:
        records: iterable of ``(prompt_id, ctx, steps, dump)`` tuples, where
            ``dump`` is the JSON object returned by :func:`capture_case`.
        root: accepted for interface compatibility; not used by the renderer.

    Returns:
        The fixture text. Each case ends with ``end`` and a blank line, so the
        text ends with a newline.

    Raises:
        SystemExit: if a dump holds fewer steps than the case requires.
    """
    # Header comment lines are emitted verbatim to match the checked-in fixture.
    lines = [
        "# ds4-local-cyberneurova-abliterated-logprob-vectors-v2",
        "# case ",
        "# step ",
        "# top ",
        "",
    ]
    for prompt_id, ctx, steps, dump in records:
        prompt_rel = f"tests/test-vectors/prompts/{prompt_id}.txt"
        dumped_steps = dump["steps"]
        actual_steps = len(dumped_steps)
        if actual_steps < steps:
            raise SystemExit(
                f"{prompt_id}: expected {steps} steps, ds4 produced {actual_steps}"
            )
        lines.append(f"case {prompt_id} {ctx} {steps} {prompt_rel}")
        # Extra steps beyond the requested count are ignored.
        for i, step in enumerate(dumped_steps[:steps]):
            selected_hex = hex_bytes(step["selected"]["bytes"])
            # Candidates with an empty byte sequence are dropped, so the count
            # written on the step line is the number of "top" lines that follow.
            top = [
                (hex_bytes(t["token"]["bytes"]), float(t["logprob"]))
                for t in step["top_logprobs"]
                if t["token"]["bytes"]
            ]
            lines.append(f"step {i} {selected_hex} {len(top)}")
            lines.extend(f"top {token_hex} {lp:.9g}" for token_hex, lp in top)
        lines.append("end")
        lines.append("")
    return "\n".join(lines)


def main(argv: list[str] | None = None) -> int:
    """Capture the selected cases with ds4 and write the ``.vec`` fixture.

    Args:
        argv: command-line arguments without the program name; ``None`` (the
            default) makes argparse read ``sys.argv``.

    Returns:
        0 on success.

    Raises:
        SystemExit: on an unknown ``--only`` id, a missing ds4 binary, a failed
            capture, or when no case was captured.
    """
    here = Path(__file__).resolve().parent
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--ds4", default=str(here.parent.parent / "ds4"),
                        help="path to ds4 binary")
    parser.add_argument("--out", default=str(here / "official.vec"),
                        help="output vec file path")
    parser.add_argument("--only", action="append",
                        help="capture only the named prompt id (repeatable)")
    parser.add_argument("--lock-file", default="/tmp/ds4-regen-vectors.lock",
                        help="DS4_LOCK_FILE override so a running ds4-server does not block")
    args = parser.parse_args(argv)

    selected = set(args.only) if args.only else None
    if selected:
        # Reject misspelled ids up front instead of silently skipping them.
        known = {prompt_id for prompt_id, _, _ in CASES}
        unknown = sorted(selected - known)
        if unknown:
            raise SystemExit(f"unknown prompt id(s) for --only: {', '.join(unknown)}")

    ds4_bin = Path(args.ds4)
    if not ds4_bin.exists():
        raise SystemExit(f"missing ds4 binary at {ds4_bin}")

    records = []
    for prompt_id, ctx, steps in CASES:
        if selected and prompt_id not in selected:
            continue
        dump = capture_case(ds4_bin, here, prompt_id, ctx, steps, args.lock_file)
        records.append((prompt_id, ctx, steps, dump))

    if not records:
        raise SystemExit("no cases captured")

    if len(records) < len(CASES):
        # The output file is replaced wholesale, so a filtered run drops the
        # cases that were not captured from it.
        print(
            f"warning: --only given; {args.out} will contain only "
            f"{len(records)} of {len(CASES)} cases",
            file=sys.stderr,
        )

    vec_text = build_vec(records, here)
    Path(args.out).write_text(vec_text, encoding="ascii")
    print(f"wrote {args.out} ({len(records)} cases)", file=sys.stderr)
    return 0


if __name__ == "__main__":
    raise SystemExit(main())