diff --git a/Makefile b/Makefile index 9711dc1a4..de5dc185b 100644 --- a/Makefile +++ b/Makefile @@ -17,8 +17,8 @@ ROCM_SRCS := $(wildcard rocm/*.cuh) ifeq ($(UNAME_S),Darwin) METAL_LDLIBS := $(LDLIBS) -framework Foundation -framework Metal -CORE_OBJS = ds4.o ds4_distributed.o ds4_ssd.o ds4_metal.o -CPU_CORE_OBJS = ds4_cpu.o ds4_distributed.o ds4_ssd.o +CORE_OBJS = ds4.o ds4_distributed.o ds4_ssd.o ds4_dspark_runtime.o ds4_metal.o +CPU_CORE_OBJS = ds4_cpu.o ds4_distributed.o ds4_ssd.o ds4_dspark_runtime.o else CFLAGS += -D_GNU_SOURCE -fno-finite-math-only CUDA_HOME ?= /usr/local/cuda @@ -28,8 +28,8 @@ ifneq ($(strip $(CUDA_ARCH)),) NVCC_ARCH_FLAGS := -arch=$(CUDA_ARCH) endif NVCCFLAGS ?= -O3 -g -lineinfo --use_fast_math $(NVCC_ARCH_FLAGS) -Xcompiler $(NATIVE_CPU_FLAG) -Xcompiler -pthread -CORE_OBJS = ds4.o ds4_distributed.o ds4_ssd.o ds4_cuda.o -CPU_CORE_OBJS = ds4_cpu.o ds4_distributed.o ds4_ssd.o +CORE_OBJS = ds4.o ds4_distributed.o ds4_ssd.o ds4_dspark_runtime.o ds4_cuda.o +CPU_CORE_OBJS = ds4_cpu.o ds4_distributed.o ds4_ssd.o ds4_dspark_runtime.o CUDA_LDLIBS ?= -lm -Xcompiler -pthread -L$(CUDA_HOME)/targets/sbsa-linux/lib -L$(CUDA_HOME)/lib64 -lcudart -lcublas HIPCC ?= $(shell command -v hipcc 2>/dev/null || echo /opt/rocm/bin/hipcc) ROCM_ARCH ?= gfx1151 @@ -106,7 +106,7 @@ cuda: strix-halo: $(MAKE) -B ds4 ds4-server ds4-bench ds4-eval ds4-agent \ - CORE_OBJS="ds4.o ds4_distributed.o ds4_ssd.o ds4_rocm.o" \ + CORE_OBJS="ds4.o ds4_distributed.o ds4_ssd.o ds4_dspark_runtime.o ds4_rocm.o" \ CFLAGS="$(CFLAGS) -DDS4_ROCM_BUILD" \ DS4_LINK="$(HIPCC) $(ROCM_CFLAGS)" \ DS4_LINK_LIBS="$(ROCM_LDLIBS)" @@ -139,11 +139,14 @@ cuda-regression: tests/cuda_long_context_smoke ./tests/cuda_long_context_smoke endif -ds4.o: ds4.c ds4.h ds4_ssd.h ds4_distributed.h ds4_gpu.h +ds4.o: ds4.c ds4.h ds4_ssd.h ds4_distributed.h ds4_dspark_runtime.h ds4_gpu.h $(CC) $(CFLAGS) -c -o $@ ds4.c ds4_ssd.o: ds4_ssd.c ds4_ssd.h $(CC) $(CFLAGS) -c -o $@ ds4_ssd.c + +ds4_dspark_runtime.o: 
ds4_dspark_runtime.c ds4_dspark_runtime.h ds4.h + $(CC) $(CFLAGS) -c -o $@ ds4_dspark_runtime.c ds4_cli.o: ds4_cli.c ds4.h ds4_ssd.h ds4_distributed.h ds4_help.h linenoise.h $(CC) $(CFLAGS) -c -o $@ ds4_cli.c @@ -187,7 +189,7 @@ rax.o: rax.c rax.h rax_malloc.h linenoise.o: linenoise.c linenoise.h $(CC) $(CFLAGS) -c -o $@ linenoise.c -ds4_cpu.o: ds4.c ds4.h ds4_ssd.h ds4_distributed.h ds4_gpu.h +ds4_cpu.o: ds4.c ds4.h ds4_ssd.h ds4_distributed.h ds4_dspark_runtime.h ds4_gpu.h $(CC) $(CFLAGS) -DDS4_NO_GPU -c -o $@ ds4.c ds4_cli_cpu.o: ds4_cli.c ds4.h ds4_ssd.h ds4_distributed.h ds4_help.h linenoise.h diff --git a/README.md b/README.md index 785695284..5508ea8df 100644 --- a/README.md +++ b/README.md @@ -133,11 +133,37 @@ weights. Flash GGUF generation is supported by the local tools. PRO GGUF production currently still depends on the external `llama.cpp`-based workflow; native tooling can be added later. -`./download_model.sh mtp` fetches the optional speculative decoding support -GGUF for Flash. It can be used with q2-imatrix, q2-q4-imatrix, and q4-imatrix, -but must be enabled explicitly with `--mtp`. The current MTP/speculative -decoding path is still experimental: it is correctness-gated and currently -provides at most a slight speedup, not a meaningful generation-speed win. +`./download_model.sh mtp` fetches the optional legacy speculative decoding +support GGUF for Flash. It can be used with q2-imatrix, q2-q4-imatrix, and +q4-imatrix, but must be enabled explicitly with `--mtp`. Legacy one-step MTP is +correctness-gated and experimental: it currently provides at most a slight +speedup, not a meaningful generation-speed win. Official DeepSeek-V4-Flash +DSpark/DeepSpec Markov draft shards can be converted with +`gguf-tools/deepseek4-quantize --dspark-only`. 
Passing the converted DSpark GGUF +with `--mtp DSpark.gguf` enables an experimental Metal block speculative decode +path: draft blocks are target-verified before commit, but acceptance and speed +depend on the base/draft quantization and prompt. DSpark GGUFs are additional +draft-model weights, so higher draft precision trades directly against +long-context headroom. CPU builds do not run MTP, and CUDA/ROCm currently load +DSpark GGUFs without enabling the DSpark runtime. + +For DeepSpec training experiments, `ds4 --dspark-target-cache-dataset FILE +--dspark-target-cache-out DIR --dspark-target-cache-target-model HF_OR_PATH` +consumes the same rendered prompt dataset format used by imatrix collection and +writes a DeepSpec-compatible target cache (`manifest.json`, `samples.idx`, and +shard data) containing prompt token ids, attention/loss masks, target-layer +hidden states, and last hidden states. Use +`--dspark-target-cache-chat-template NAME` to stamp the cache manifest with the +DeepSpec training template identity. +Validate the cache contract with +`python3 gguf-tools/deepspec/ds4_deepspec.py DIR --target-model HF_OR_PATH` +before handing it to a DeepSpec checkout. The same helper can emit the DS4-side +non-Markov DeepSpec config scaffold with +`python3 gguf-tools/deepspec/ds4_deepspec.py --emit-nonseq-config dspark_v4_nonseq.py --target-cache DIR`. +This target-cache export path remains useful for DSpark/DeepSpec training +experiments; the built-in Metal runtime uses already converted official DSpark +Markov draft GGUFs and should still be benchmarked with `DS4_MTP_TIMING=1` on +the exact base/draft quant pair before treating it as a throughput win. Then build: @@ -689,10 +715,12 @@ conversation. Useful commands are `/help`, `/think`, `/think-max`, `/nothink`, and returns to `ds4>`. The CLI defaults to thinking mode. Use `/nothink` or `--nothink` for direct -answers. 
`--mtp MTP.gguf --mtp-draft 2` enables the optional MTP speculative -path; it is useful only for greedy decoding, currently uses a confidence gate -(`--mtp-margin`) to avoid slow partial accepts, and should be treated as an -experimental slight-speedup path. +answers. `--mtp MTP.gguf --mtp-draft 2` enables the optional legacy one-step +MTP speculative path. Passing a converted official DSpark/DeepSpec Markov GGUF +with `--mtp DSpark.gguf` opts into the experimental Metal block-draft runtime, +which verifies proposed blocks against the target model before committing them. +It is correctness-gated, not a guaranteed speedup; measure acceptance and wall +time for the exact quantized base/draft pair. ## Server diff --git a/download_model.sh b/download_model.sh index 51d368a58..b9f410232 100755 --- a/download_model.sh +++ b/download_model.sh @@ -65,9 +65,9 @@ Targets: Downloads both PRO Q4 split files into the download directory. About 838 GB total. This target does not update ./ds4flash.gguf. - mtp Optional speculative decoding component, about 3.5 GB on disk. - It is useful with q2-imatrix, q2-q4-imatrix, and q4-imatrix, but must be - enabled explicitly with --mtp when running ds4 or ds4-server. + mtp Optional legacy one-step speculative decoding component, about 3.5 GB on + disk. It is useful with q2-imatrix, q2-q4-imatrix, and q4-imatrix, but + must be enabled explicitly with --mtp when running ds4 or ds4-server. Options: --token TOKEN Hugging Face token. Otherwise HF_TOKEN or the local HF token @@ -259,9 +259,10 @@ fi if [ "$MODEL" = "mtp" ]; then echo - echo "MTP is an optional component for q2-imatrix, q2-q4-imatrix, and q4-imatrix." + echo "MTP is an optional legacy one-step component for q2-imatrix, q2-q4-imatrix, and q4-imatrix." echo "Enable it explicitly, for example:" echo " ./ds4 --mtp $OUT_DIR/$MTP_FILE --mtp-draft 2" + echo "Converted official DSpark/DeepSpec Markov GGUFs are detected separately by the loader; passing one with --mtp enables the experimental block-draft path on Metal only." 
elif [ "$MODEL" = "pro-q4-layers00-30" ] || [ "$MODEL" = "pro-q4-layers31-output" ] || [ "$MODEL" = "pro-q4-split" ]; then echo echo "Downloaded PRO Q4 distributed split file(s). Use them with --layers," diff --git a/ds4.c b/ds4.c index 640511eb0..ec206a0c9 100644 --- a/ds4.c +++ b/ds4.c @@ -15,6 +15,7 @@ */ #include +#include #include #include #include @@ -38,6 +39,11 @@ #include "ds4.h" #include "ds4_distributed.h" +#include "ds4_dspark_runtime.h" + +#ifndef DS4_GIT_COMMIT +#define DS4_GIT_COMMIT "unknown" +#endif #ifndef DS4_NO_GPU #include "ds4_gpu.h" @@ -322,6 +328,7 @@ static uint32_t g_ds4_compress_ratios[DS4_MAX_LAYER] = {0}; #define DS4_COMPRESS_ROPE_FREQ_BASE (g_ds4_shape.compress_rope_freq_base) #define DS4_ROPE_ORIG_CTX (g_ds4_shape.rope_orig_ctx) +enum { DS4_DSPARK_MAX_BLOCK_SIZE = 16 }; static int g_ds4_lock_fd = -1; #if defined(__GNUC__) || defined(__clang__) @@ -607,6 +614,9 @@ typedef struct { } ds4_str; typedef ds4_tokens token_vec; +static void token_vec_push(token_vec *tv, int token); +static void token_vec_free(token_vec *tv); + typedef struct { const uint8_t *base; @@ -1594,6 +1604,7 @@ enum { DS4_TENSOR_Q4_K = 12, DS4_TENSOR_IQ2_XXS = 16, DS4_TENSOR_I32 = 26, + DS4_TENSOR_BF16 = 30, }; typedef struct { @@ -1617,6 +1628,7 @@ typedef struct { int fd; const uint8_t *map; uint64_t size; + char *path; uint32_t version; uint64_t n_kv; @@ -1824,6 +1836,7 @@ static void model_close(ds4_model *m) { if (!m) return; free(m->kv); free(m->tensors); + free(m->path); if (m->map) munmap((void *)m->map, (size_t)m->size); if (m->fd >= 0) close(m->fd); memset(m, 0, sizeof(*m)); @@ -1973,6 +1986,7 @@ static void model_open(ds4_model *m, const char *path, bool metal_mapping, m->fd = fd; m->map = map; m->size = (uint64_t)st.st_size; + m->path = ds4_strdup(path); ds4_cursor c = cursor_at(m, 0); uint32_t magic; @@ -2437,6 +2451,14 @@ static inline uint16_t f32_to_f16(float f) { #endif } +static inline uint16_t f32_to_bf16(float f) { + uint32_t bits; + memcpy(&bits, 
&f, sizeof(bits)); + if ((bits & 0x7fffffffu) > 0x7f800000u) return (uint16_t)((bits >> 16) | 0x0040u); /* NaN: keep it a quiet NaN; the rounding carry below could turn it into Inf or zero */ + bits += 0x7fffu + ((bits >> 16) & 1u); /* round to nearest, ties to even */ + return (uint16_t)(bits >> 16); +} + static void f16_round_inplace_cpu(float *x, uint32_t n) { for (uint32_t i = 0; i < n; i++) x[i] = f16_to_f32(f32_to_f16(x[i])); } @@ -3061,16 +3083,26 @@ typedef struct { ds4_layer_weights layer[DS4_MAX_LAYER]; } ds4_weights; +enum { DS4_DSPARK_MTP_LAYERS = 3 }; + typedef struct { - ds4_tensor *e_proj; - ds4_tensor *h_proj; - ds4_tensor *enorm; - ds4_tensor *hnorm; - ds4_tensor *norm; - ds4_tensor *hc_head_base; - ds4_tensor *hc_head_fn; - ds4_tensor *hc_head_scale; - ds4_layer_weights block; + ds4_mtp_draft_kind kind; + ds4_dspark_config dspark; + ds4_tensor *e_proj; + ds4_tensor *h_proj; + ds4_tensor *enorm; + ds4_tensor *hnorm; + ds4_tensor *norm; + ds4_tensor *hc_head_base; + ds4_tensor *hc_head_fn; + ds4_tensor *hc_head_scale; + ds4_tensor *main_proj; + ds4_tensor *main_norm; + ds4_tensor *markov_w1; + ds4_tensor *markov_w2; + ds4_tensor *confidence_proj; + ds4_layer_weights block; + ds4_layer_weights stage[DS4_DSPARK_MTP_LAYERS]; } ds4_mtp_weights; /* ========================================================================= @@ -3202,6 +3234,29 @@ static void tensor_expect_plain_layout( tensor_expect_layout(t, t->type, ndim, d0, d1, d2); } +static bool tensor_type_is_plain_or_bf16(uint32_t type) { + return type == DS4_TENSOR_F16 || type == DS4_TENSOR_F32 || + type == DS4_TENSOR_BF16; +} + +static void tensor_expect_plain_or_bf16_layout( + const ds4_tensor *t, + uint32_t ndim, + uint64_t d0, + uint64_t d1, + uint64_t d2) { + if (!t) ds4_die("internal error: missing tensor while validating layout"); + if (!tensor_type_is_plain_or_bf16(t->type)) { + fprintf(stderr, + "ds4: tensor %.*s has type %s, expected F16, F32, or BF16\n", + (int)t->name.len, + t->name.ptr, + tensor_type_name(t->type)); + exit(1); + } + tensor_expect_layout(t, t->type, ndim, d0, d1, d2); +} + static bool tensor_type_is_f16_or_q8_0(uint32_t type) { return type == 
DS4_TENSOR_F16 || type == DS4_TENSOR_Q8_0; } @@ -3639,21 +3694,106 @@ static void weights_validate_layout( } } -static void mtp_weights_validate_layout(const ds4_mtp_weights *w) { + +void ds4_dspark_config_init_defaults(ds4_dspark_config *cfg) { + if (!cfg) return; + memset(cfg, 0, sizeof(*cfg)); + cfg->n_mtp_layers = 3; + cfg->block_size = 5; + cfg->noise_token_id = 128799u; + cfg->markov_rank = 256; + cfg->target_layer_ids[0] = 40; + cfg->target_layer_ids[1] = 41; + cfg->target_layer_ids[2] = 42; +} + +const char *ds4_mtp_draft_kind_name(ds4_mtp_draft_kind kind) { + switch (kind) { + case DS4_MTP_DRAFT_LEGACY: return "legacy-mtp"; + case DS4_MTP_DRAFT_DSPARK: return "dspark"; + case DS4_MTP_DRAFT_DSPARK_NONSEQ: return "dspark-nonseq"; + default: return "none"; + } +} + +ds4_mtp_draft_kind ds4_mtp_draft_kind_guess_ex(bool has_e_proj, + bool has_main_proj, + bool has_markov_w1, + bool markov_rank_set, + uint32_t markov_rank) { + if (has_main_proj && has_markov_w1) return DS4_MTP_DRAFT_DSPARK; + if (has_main_proj && markov_rank_set && markov_rank == 0) return DS4_MTP_DRAFT_DSPARK_NONSEQ; + if (has_e_proj) return DS4_MTP_DRAFT_LEGACY; + return DS4_MTP_DRAFT_NONE; +} + +ds4_mtp_draft_kind ds4_mtp_draft_kind_guess(bool has_e_proj, bool has_main_proj, bool has_markov_w1) { + return ds4_mtp_draft_kind_guess_ex(has_e_proj, has_main_proj, has_markov_w1, false, 0); +} + +static void dspark_config_apply_metadata(ds4_dspark_config *cfg, const ds4_model *m) { + ds4_dspark_config_init_defaults(cfg); + uint32_t v = 0; + if (model_get_u32(m, "deepseek4.dspark.n_mtp_layers", &v)) { + if (v != DS4_DSPARK_MTP_LAYERS) { + fprintf(stderr, "ds4: DSpark draft expects %u stages, GGUF has n_mtp_layers=%u\n", + DS4_DSPARK_MTP_LAYERS, v); + exit(1); + } + cfg->n_mtp_layers = v; + } + if (model_get_u32(m, "deepseek4.dspark.block_size", &v) && v > 0) cfg->block_size = v; + if (model_get_u32(m, "deepseek4.dspark.noise_token_id", &v)) cfg->noise_token_id = v; + if (model_get_u32(m, 
"deepseek4.dspark.markov_rank", &v)) cfg->markov_rank = v; + for (uint32_t i = 0; i < 3; i++) { + char key[64]; + snprintf(key, sizeof(key), "deepseek4.dspark.target_layer_ids.%u", i); + if (model_get_u32(m, key, &v)) cfg->target_layer_ids[i] = v; + } +} + +static ds4_mtp_draft_kind mtp_model_detect_kind(const ds4_model *m) { + uint32_t markov_rank = 0; + const bool markov_rank_set = model_get_u32(m, "deepseek4.dspark.markov_rank", &markov_rank); + const bool has_e_proj = model_find_tensor(m, "mtp.0.e_proj.weight") != NULL; + const bool has_main_proj = model_find_tensor(m, "mtp.0.main_proj.weight") != NULL; + const bool has_markov = model_find_tensor(m, "mtp.2.markov_head.markov_w1.weight") != NULL; + return ds4_mtp_draft_kind_guess_ex(has_e_proj, has_main_proj, has_markov, + markov_rank_set, markov_rank); +} + +static void mtp_weights_bind_mtp_layer(ds4_layer_weights *l, const ds4_model *m, uint32_t stage) { + l->hc_attn_fn = required_tensorf(m, "mtp.%u.hc_attn_fn.weight", stage); + l->hc_attn_scale = required_tensorf(m, "mtp.%u.hc_attn_scale.weight", stage); + l->hc_attn_base = required_tensorf(m, "mtp.%u.hc_attn_base.weight", stage); + l->attn_norm = required_tensorf(m, "mtp.%u.attn_norm.weight", stage); + l->attn_q_a = required_tensorf(m, "mtp.%u.attn_q_a.weight", stage); + l->attn_q_a_norm = required_tensorf(m, "mtp.%u.attn_q_a_norm.weight", stage); + l->attn_q_b = required_tensorf(m, "mtp.%u.attn_q_b.weight", stage); + l->attn_kv = required_tensorf(m, "mtp.%u.attn_kv.weight", stage); + l->attn_kv_a_norm = required_tensorf(m, "mtp.%u.attn_kv_a_norm.weight", stage); + l->attn_sinks = required_tensorf(m, "mtp.%u.attn_sinks.weight", stage); + l->attn_output_a = required_tensorf(m, "mtp.%u.attn_output_a.weight", stage); + l->attn_output_b = required_tensorf(m, "mtp.%u.attn_output_b.weight", stage); + l->hc_ffn_fn = required_tensorf(m, "mtp.%u.hc_ffn_fn.weight", stage); + l->hc_ffn_scale = required_tensorf(m, "mtp.%u.hc_ffn_scale.weight", stage); + l->hc_ffn_base = 
required_tensorf(m, "mtp.%u.hc_ffn_base.weight", stage); + l->ffn_norm = required_tensorf(m, "mtp.%u.ffn_norm.weight", stage); + l->ffn_gate_inp = required_tensorf(m, "mtp.%u.ffn_gate_inp.weight", stage); + l->ffn_exp_probs_b = tensor_by_namef(m, "mtp.%u.exp_probs_b.bias", stage); + l->ffn_gate_exps = required_tensorf(m, "mtp.%u.ffn_gate_exps.weight", stage); + l->ffn_up_exps = required_tensorf(m, "mtp.%u.ffn_up_exps.weight", stage); + l->ffn_down_exps = required_tensorf(m, "mtp.%u.ffn_down_exps.weight", stage); + l->ffn_gate_shexp = required_tensorf(m, "mtp.%u.ffn_gate_shexp.weight", stage); + l->ffn_up_shexp = required_tensorf(m, "mtp.%u.ffn_up_shexp.weight", stage); + l->ffn_down_shexp = required_tensorf(m, "mtp.%u.ffn_down_shexp.weight", stage); +} + +static void mtp_layer_validate_layout(const ds4_layer_weights *l, bool require_exp_probs_b) { const uint64_t hc_dim = (uint64_t)DS4_N_EMBD * DS4_N_HC; const uint64_t hc_mix_dim = 2u * DS4_N_HC + (uint64_t)DS4_N_HC * DS4_N_HC; const uint64_t q_dim = (uint64_t)DS4_N_HEAD * DS4_N_HEAD_DIM; const uint64_t out_low_dim = (uint64_t)DS4_N_OUT_GROUP * DS4_N_LORA_O; - const ds4_layer_weights *l = &w->block; - - tensor_expect_layout(w->hc_head_base, DS4_TENSOR_F32, 1, DS4_N_HC, 0, 0); - tensor_expect_plain_layout(w->hc_head_fn, 2, hc_dim, DS4_N_HC, 0); - tensor_expect_layout(w->hc_head_scale, DS4_TENSOR_F32, 1, 1, 0, 0); - tensor_expect_layout(w->e_proj, DS4_TENSOR_Q8_0, 2, DS4_N_EMBD, DS4_N_EMBD, 0); - tensor_expect_layout(w->h_proj, DS4_TENSOR_Q8_0, 2, DS4_N_EMBD, DS4_N_EMBD, 0); - tensor_expect_layout(w->enorm, DS4_TENSOR_F32, 1, DS4_N_EMBD, 0, 0); - tensor_expect_layout(w->hnorm, DS4_TENSOR_F32, 1, DS4_N_EMBD, 0, 0); - tensor_expect_layout(w->norm, DS4_TENSOR_F32, 1, DS4_N_EMBD, 0, 0); tensor_expect_plain_layout(l->hc_attn_fn, 2, hc_dim, hc_mix_dim, 0); tensor_expect_layout(l->hc_attn_scale, DS4_TENSOR_F32, 1, 3, 0, 0); @@ -3667,13 +3807,16 @@ static void mtp_weights_validate_layout(const ds4_mtp_weights *w) { 
tensor_expect_layout(l->attn_sinks, DS4_TENSOR_F32, 1, DS4_N_HEAD, 0, 0); tensor_expect_layout(l->attn_output_a, DS4_TENSOR_Q8_0, 2, DS4_N_HEAD_DIM * (DS4_N_HEAD / DS4_N_OUT_GROUP), out_low_dim, 0); tensor_expect_layout(l->attn_output_b, DS4_TENSOR_Q8_0, 2, out_low_dim, DS4_N_EMBD, 0); - tensor_expect_plain_layout(l->hc_ffn_fn, 2, hc_dim, hc_mix_dim, 0); tensor_expect_layout(l->hc_ffn_scale, DS4_TENSOR_F32, 1, 3, 0, 0); tensor_expect_layout(l->hc_ffn_base, DS4_TENSOR_F32, 1, hc_mix_dim, 0, 0); tensor_expect_layout(l->ffn_norm, DS4_TENSOR_F32, 1, DS4_N_EMBD, 0, 0); tensor_expect_plain_layout(l->ffn_gate_inp, 2, DS4_N_EMBD, DS4_N_EXPERT, 0); - tensor_expect_layout(l->ffn_exp_probs_b, DS4_TENSOR_F32, 1, DS4_N_EXPERT, 0, 0); + if (require_exp_probs_b) { + tensor_expect_layout(l->ffn_exp_probs_b, DS4_TENSOR_F32, 1, DS4_N_EXPERT, 0, 0); + } else { + tensor_expect_optional(l->ffn_exp_probs_b, DS4_TENSOR_F32, 1, DS4_N_EXPERT, 0, 0); + } tensor_expect_routed_expert(l->ffn_gate_exps, 3, DS4_N_EMBD, DS4_N_FF_EXP, DS4_N_EXPERT); tensor_expect_routed_expert(l->ffn_up_exps, 3, DS4_N_EMBD, DS4_N_FF_EXP, DS4_N_EXPERT); tensor_expect_routed_expert(l->ffn_down_exps, 3, DS4_N_FF_EXP, DS4_N_EMBD, DS4_N_EXPERT); @@ -3685,6 +3828,93 @@ static void mtp_weights_validate_layout(const ds4_mtp_weights *w) { tensor_expect_layout(l->ffn_down_shexp, DS4_TENSOR_Q8_0, 2, DS4_N_FF_EXP, DS4_N_EMBD, 0); } +static void mtp_weights_validate_legacy_layout(const ds4_mtp_weights *w) { + const uint64_t hc_dim = (uint64_t)DS4_N_EMBD * DS4_N_HC; + + tensor_expect_layout(w->hc_head_base, DS4_TENSOR_F32, 1, DS4_N_HC, 0, 0); + tensor_expect_plain_layout(w->hc_head_fn, 2, hc_dim, DS4_N_HC, 0); + tensor_expect_layout(w->hc_head_scale, DS4_TENSOR_F32, 1, 1, 0, 0); + tensor_expect_layout(w->e_proj, DS4_TENSOR_Q8_0, 2, DS4_N_EMBD, DS4_N_EMBD, 0); + tensor_expect_layout(w->h_proj, DS4_TENSOR_Q8_0, 2, DS4_N_EMBD, DS4_N_EMBD, 0); + tensor_expect_layout(w->enorm, DS4_TENSOR_F32, 1, DS4_N_EMBD, 0, 0); + 
tensor_expect_layout(w->hnorm, DS4_TENSOR_F32, 1, DS4_N_EMBD, 0, 0); + tensor_expect_layout(w->norm, DS4_TENSOR_F32, 1, DS4_N_EMBD, 0, 0); + mtp_layer_validate_layout(&w->block, true); +} + +static void mtp_weights_validate_dspark_layout(const ds4_mtp_weights *w) { + const uint64_t hc_dim = (uint64_t)DS4_N_EMBD * DS4_N_HC; + const uint64_t main_in = 3u * DS4_N_EMBD; + const bool has_markov_head = w->kind == DS4_MTP_DRAFT_DSPARK; + if (w->dspark.block_size == 0 || w->dspark.block_size > 16) { + ds4_die("DSpark block_size must be in 1..16"); + } + + tensor_expect_layout(w->main_proj, DS4_TENSOR_Q8_0, 2, main_in, DS4_N_EMBD, 0); + tensor_expect_layout(w->main_norm, DS4_TENSOR_F32, 1, DS4_N_EMBD, 0, 0); + for (uint32_t s = 0; s < w->dspark.n_mtp_layers; s++) { + mtp_layer_validate_layout(&w->stage[s], false); + } + tensor_expect_layout(w->norm, DS4_TENSOR_F32, 1, DS4_N_EMBD, 0, 0); + tensor_expect_layout(w->hc_head_base, DS4_TENSOR_F32, 1, DS4_N_HC, 0, 0); + tensor_expect_plain_layout(w->hc_head_fn, 2, hc_dim, DS4_N_HC, 0); + tensor_expect_layout(w->hc_head_scale, DS4_TENSOR_F32, 1, 1, 0, 0); + if (has_markov_head) { + const uint64_t conf_in = DS4_N_EMBD + (uint64_t)w->dspark.markov_rank; + if (w->dspark.markov_rank == 0) ds4_die("official DSpark Markov head has zero markov rank"); + tensor_expect_plain_or_bf16_layout(w->markov_w1, 2, w->dspark.markov_rank, DS4_N_VOCAB, 0); + tensor_expect_plain_or_bf16_layout(w->markov_w2, 2, w->dspark.markov_rank, DS4_N_VOCAB, 0); + if (!w->confidence_proj) ds4_die("internal error: missing DSpark confidence projection"); + if (w->confidence_proj->ndim == 1) { + tensor_expect_plain_or_bf16_layout(w->confidence_proj, 1, conf_in, 0, 0); + } else { + tensor_expect_plain_or_bf16_layout(w->confidence_proj, 2, conf_in, 1, 0); + } + } else if (w->dspark.markov_rank != 0) { + ds4_die("nonseq DSpark draft must declare deepseek4.dspark.markov_rank=0"); + } +} + +static void mtp_weights_bind_legacy(ds4_mtp_weights *w, const ds4_model *m) { + 
w->kind = DS4_MTP_DRAFT_LEGACY; + w->hc_head_base = required_tensor(m, "mtp.0.hc_head_base.weight"); + w->hc_head_fn = required_tensor(m, "mtp.0.hc_head_fn.weight"); + w->hc_head_scale = required_tensor(m, "mtp.0.hc_head_scale.weight"); + w->e_proj = required_tensor(m, "mtp.0.e_proj.weight"); + w->h_proj = required_tensor(m, "mtp.0.h_proj.weight"); + w->enorm = required_tensor(m, "mtp.0.enorm.weight"); + w->hnorm = required_tensor(m, "mtp.0.hnorm.weight"); + w->norm = required_tensor(m, "mtp.0.norm.weight"); + mtp_weights_bind_mtp_layer(&w->block, m, 0); + mtp_weights_validate_legacy_layout(w); +} + +static void mtp_weights_bind_dspark(ds4_mtp_weights *w, const ds4_model *m) { + w->kind = mtp_model_detect_kind(m); + dspark_config_apply_metadata(&w->dspark, m); + if (w->dspark.n_mtp_layers != DS4_DSPARK_MTP_LAYERS) { + fprintf(stderr, "ds4: DSpark draft expects %u stages, GGUF has n_mtp_layers=%u\n", + DS4_DSPARK_MTP_LAYERS, w->dspark.n_mtp_layers); + exit(1); + } + w->main_proj = required_tensor(m, "mtp.0.main_proj.weight"); + w->main_norm = required_tensor(m, "mtp.0.main_norm.weight"); + for (uint32_t s = 0; s < DS4_DSPARK_MTP_LAYERS; s++) { + mtp_weights_bind_mtp_layer(&w->stage[s], m, s); + } + w->norm = required_tensor(m, "mtp.2.norm.weight"); + w->hc_head_base = required_tensor(m, "mtp.2.hc_head_base.weight"); + w->hc_head_fn = required_tensor(m, "mtp.2.hc_head_fn.weight"); + w->hc_head_scale = required_tensor(m, "mtp.2.hc_head_scale.weight"); + if (w->kind == DS4_MTP_DRAFT_DSPARK) { + w->markov_w1 = required_tensor(m, "mtp.2.markov_head.markov_w1.weight"); + w->markov_w2 = required_tensor(m, "mtp.2.markov_head.markov_w2.weight"); + w->confidence_proj = required_tensor(m, "mtp.2.confidence_head.proj.weight"); + } + mtp_weights_validate_dspark_layout(w); +} + + static bool ds4_shape_matches_metadata( const ds4_shape *s, uint32_t n_layer, @@ -4433,45 +4663,34 @@ static DS4_MAYBE_UNUSED bool weights_model_map_output_spans( return model_map_span_vec_finish(spans); 
} -static void mtp_weights_bind(ds4_mtp_weights *w, const ds4_model *m) { - memset(w, 0, sizeof(*w)); - w->hc_head_base = required_tensor(m, "mtp.0.hc_head_base.weight"); - w->hc_head_fn = required_tensor(m, "mtp.0.hc_head_fn.weight"); - w->hc_head_scale = required_tensor(m, "mtp.0.hc_head_scale.weight"); - w->e_proj = required_tensor(m, "mtp.0.e_proj.weight"); - w->h_proj = required_tensor(m, "mtp.0.h_proj.weight"); - w->enorm = required_tensor(m, "mtp.0.enorm.weight"); - w->hnorm = required_tensor(m, "mtp.0.hnorm.weight"); - w->norm = required_tensor(m, "mtp.0.norm.weight"); +bool ds4_mtp_speculative_draft_ready(ds4_mtp_draft_kind kind) { + return kind == DS4_MTP_DRAFT_LEGACY || kind == DS4_MTP_DRAFT_DSPARK; +} - ds4_layer_weights *l = &w->block; - l->hc_attn_fn = required_tensor(m, "mtp.0.hc_attn_fn.weight"); - l->hc_attn_scale = required_tensor(m, "mtp.0.hc_attn_scale.weight"); - l->hc_attn_base = required_tensor(m, "mtp.0.hc_attn_base.weight"); - l->attn_norm = required_tensor(m, "mtp.0.attn_norm.weight"); - l->attn_q_a = required_tensor(m, "mtp.0.attn_q_a.weight"); - l->attn_q_a_norm = required_tensor(m, "mtp.0.attn_q_a_norm.weight"); - l->attn_q_b = required_tensor(m, "mtp.0.attn_q_b.weight"); - l->attn_kv = required_tensor(m, "mtp.0.attn_kv.weight"); - l->attn_kv_a_norm = required_tensor(m, "mtp.0.attn_kv_a_norm.weight"); - l->attn_sinks = required_tensor(m, "mtp.0.attn_sinks.weight"); - l->attn_output_a = required_tensor(m, "mtp.0.attn_output_a.weight"); - l->attn_output_b = required_tensor(m, "mtp.0.attn_output_b.weight"); - l->hc_ffn_fn = required_tensor(m, "mtp.0.hc_ffn_fn.weight"); - l->hc_ffn_scale = required_tensor(m, "mtp.0.hc_ffn_scale.weight"); - l->hc_ffn_base = required_tensor(m, "mtp.0.hc_ffn_base.weight"); - l->ffn_norm = required_tensor(m, "mtp.0.ffn_norm.weight"); - l->ffn_gate_inp = required_tensor(m, "mtp.0.ffn_gate_inp.weight"); - l->ffn_exp_probs_b = required_tensor(m, "mtp.0.exp_probs_b.bias"); - l->ffn_gate_exps = required_tensor(m, 
"mtp.0.ffn_gate_exps.weight"); - l->ffn_up_exps = required_tensor(m, "mtp.0.ffn_up_exps.weight"); - l->ffn_down_exps = required_tensor(m, "mtp.0.ffn_down_exps.weight"); - l->ffn_gate_shexp = required_tensor(m, "mtp.0.ffn_gate_shexp.weight"); - l->ffn_up_shexp = required_tensor(m, "mtp.0.ffn_up_shexp.weight"); - l->ffn_down_shexp = required_tensor(m, "mtp.0.ffn_down_shexp.weight"); - - mtp_weights_validate_layout(w); +bool ds4_mtp_draft_runtime_supported(ds4_backend backend, ds4_mtp_draft_kind kind) { + if (backend == DS4_BACKEND_CPU) return false; + if (!ds4_mtp_speculative_draft_ready(kind)) return false; + const bool dspark_family = kind == DS4_MTP_DRAFT_DSPARK || + kind == DS4_MTP_DRAFT_DSPARK_NONSEQ; + if (dspark_family && backend != DS4_BACKEND_METAL) return false; + return true; +} + +static void mtp_weights_bind(ds4_mtp_weights *w, const ds4_model *m) { + memset(w, 0, sizeof(*w)); + const ds4_mtp_draft_kind kind = mtp_model_detect_kind(m); + if (kind == DS4_MTP_DRAFT_DSPARK || kind == DS4_MTP_DRAFT_DSPARK_NONSEQ) { + mtp_weights_bind_dspark(w, m); + return; + } + if (kind == DS4_MTP_DRAFT_LEGACY) { + mtp_weights_bind_legacy(w, m); + return; + } + fprintf(stderr, + "ds4: unsupported draft GGUF: need legacy mtp.0.e_proj, official DSpark mtp.0.main_proj + mtp.2.markov_head, or nonseq DSpark mtp.0.main_proj + deepseek4.dspark.markov_rank=0\n"); + exit(1); } static void weights_free(ds4_weights *w) { @@ -4592,6 +4811,115 @@ static void matvec_f16_serial(float *out, const ds4_model *m, const ds4_tensor * } } +static inline float tensor_plain_value(const ds4_model *m, const ds4_tensor *w, uint64_t idx) { + const void *data = tensor_data(m, w); + if (w->type == DS4_TENSOR_F32) { + const float *x = data; + return x[idx]; + } + if (w->type == DS4_TENSOR_F16) { + const uint16_t *x = data; + return f16_to_f32(x[idx]); + } + if (w->type == DS4_TENSOR_BF16) { + const uint16_t *x = data; + return ds4_dspark_bf16_to_f32(x[idx]); + } + ds4_die("expected an F16, F32, or BF16 
tensor"); + return 0.0f; +} + +static void tensor_plain_row_to_f32(float *out, + const ds4_model *m, + const ds4_tensor *w, + uint64_t row) { + if (w->ndim != 2) ds4_die("expected a 2D plain tensor"); + const uint64_t n = w->dim[0]; + const uint64_t offset = row * n; + for (uint64_t i = 0; i < n; i++) out[i] = tensor_plain_value(m, w, offset + i); +} + +typedef struct { + float *logits; + const void *weights; + const float *latent; + uint64_t rank; + uint32_t type; +} dspark_markov_bias_ctx; + +static void dspark_markov_bias_worker(void *vctx, uint64_t row0, uint64_t row1) { + dspark_markov_bias_ctx *ctx = vctx; + const uint64_t rank = ctx->rank; + + if (ctx->type == DS4_TENSOR_F32) { + const float *w = ctx->weights; + for (uint64_t vocab = row0; vocab < row1; vocab++) { + const float *row = w + vocab * rank; + float bias = 0.0f; + for (uint64_t i = 0; i < rank; i++) bias += row[i] * ctx->latent[i]; + ctx->logits[vocab] += bias; + } + return; + } + + if (ctx->type == DS4_TENSOR_F16) { + const uint16_t *w = ctx->weights; + for (uint64_t vocab = row0; vocab < row1; vocab++) { + const uint16_t *row = w + vocab * rank; + float bias = 0.0f; + for (uint64_t i = 0; i < rank; i++) bias += f16_to_f32(row[i]) * ctx->latent[i]; + ctx->logits[vocab] += bias; + } + return; + } + + if (ctx->type == DS4_TENSOR_BF16) { + const uint16_t *w = ctx->weights; + for (uint64_t vocab = row0; vocab < row1; vocab++) { + const uint16_t *row = w + vocab * rank; + float bias = 0.0f; + for (uint64_t i = 0; i < rank; i++) bias += ds4_dspark_bf16_to_f32(row[i]) * ctx->latent[i]; + ctx->logits[vocab] += bias; + } + return; + } + + ds4_die("expected an F16, F32, or BF16 tensor"); +} + +static void dspark_apply_markov_bias(float *logits, + const ds4_model *m, + const ds4_mtp_weights *mtp, + int prev_token) { + if (!logits || !m || !mtp || !mtp->markov_w1 || !mtp->markov_w2 || + prev_token < 0 || prev_token >= (int)DS4_N_VOCAB) { + return; + } + + const uint64_t rank = mtp->dspark.markov_rank; + if 
(rank == 0) return; + if (mtp->markov_w1->ndim != 2 || mtp->markov_w2->ndim != 2 || + mtp->markov_w1->dim[0] != rank || mtp->markov_w1->dim[1] != DS4_N_VOCAB || + mtp->markov_w2->dim[0] != rank || mtp->markov_w2->dim[1] != DS4_N_VOCAB) { + ds4_die("invalid DSpark Markov tensor layout"); + } + + float latent[512]; + if (rank > sizeof(latent) / sizeof(latent[0])) { + ds4_die("DSpark Markov rank exceeds local buffer"); + } + tensor_plain_row_to_f32(latent, m, mtp->markov_w1, (uint64_t)prev_token); + + dspark_markov_bias_ctx ctx = { + .logits = logits, + .weights = tensor_data(m, mtp->markov_w2), + .latent = latent, + .rank = rank, + .type = mtp->markov_w2->type, + }; + ds4_parallel_for_min_rows(DS4_N_VOCAB, dspark_markov_bias_worker, &ctx, 1024); +} + typedef struct { float *out; const uint8_t *data; @@ -8320,6 +8648,7 @@ typedef struct { uint32_t head_dim; } ds4_kv_cache; + static uint32_t ds4_default_raw_cap(uint32_t ctx_size) { uint32_t raw_cap = DS4_N_SWA; if (raw_cap > ctx_size) raw_cap = ctx_size; @@ -10421,6 +10750,18 @@ typedef struct { ds4_gpu_tensor *mtp_next_hc; ds4_gpu_tensor *mtp_raw_cache; uint32_t mtp_n_raw; + + /* Optional DSpark block-draft state. The target decoder captures mean-HC + * hidden rows at the configured target layers, then the drafter consumes + * that 3-row feature to propose a block of candidate tokens. 
*/ + ds4_gpu_tensor *dspark_main_hidden; + ds4_gpu_tensor *dspark_main_x; + ds4_gpu_tensor *dspark_verify_hidden; + ds4_gpu_tensor *dspark_verify_main_x; + ds4_gpu_tensor *dspark_mean_weights; + ds4_gpu_tensor *dspark_kv_cache[DS4_DSPARK_MTP_LAYERS]; + uint32_t dspark_target_layer_ids[DS4_DSPARK_MTP_LAYERS]; + uint32_t dspark_n_real; uint32_t prefill_cap; uint32_t raw_window; @@ -10491,6 +10832,7 @@ typedef struct { bool ssd_streaming_cold; bool streaming_static_decode_map_current; bool mtp_enabled; + bool dspark_enabled; float *cpu_router_norm; } ds4_gpu_graph; @@ -10530,7 +10872,6 @@ static void graph_power_note_decode_token(ds4_gpu_graph *g, double elapsed_sec) graph_power_update_avg(g->decode_token_avg_sec, elapsed_sec); graph_power_sleep(g->decode_token_avg_sec, g->power_percent); } - /* Release every Metal tensor owned by the whole-model graph runtime. */ static void metal_graph_free(ds4_gpu_graph *g) { ds4_gpu_tensor_free(g->directional_steering_dirs); @@ -10575,6 +10916,14 @@ static void metal_graph_free(ds4_gpu_graph *g) { ds4_gpu_tensor_free(g->batch_next_hc); ds4_gpu_tensor_free(g->batch_cur_hc); ds4_gpu_tensor_free(g->prefill_tokens); + for (uint32_t s = 0; s < DS4_DSPARK_MTP_LAYERS; s++) { + ds4_gpu_tensor_free(g->dspark_kv_cache[s]); + } + ds4_gpu_tensor_free(g->dspark_mean_weights); + ds4_gpu_tensor_free(g->dspark_main_x); + ds4_gpu_tensor_free(g->dspark_verify_main_x); + ds4_gpu_tensor_free(g->dspark_verify_hidden); + ds4_gpu_tensor_free(g->dspark_main_hidden); ds4_gpu_tensor_free(g->logits); ds4_gpu_tensor_free(g->mtp_raw_cache); ds4_gpu_tensor_free(g->mtp_next_hc); @@ -10956,14 +11305,23 @@ static bool metal_graph_ensure_batch_ffn_out(ds4_gpu_graph *g) { * weights are not copied here; tensors reference the mapped GGUF. 
*/ static bool metal_graph_alloc_raw_cap( ds4_gpu_graph *g, - const ds4_weights *weights, + const ds4_weights *weights, const ds4_layer_weights *layer, - uint32_t raw_cap, - uint32_t ctx_size, - uint32_t prefill_cap, - bool enable_mtp) { + const ds4_mtp_weights *mtp_weights, + uint32_t raw_cap, + uint32_t ctx_size, + uint32_t prefill_cap, + bool enable_mtp) { memset(g, 0, sizeof(*g)); g->mtp_enabled = enable_mtp; + const bool enable_dspark = + enable_mtp && mtp_weights && mtp_weights->kind == DS4_MTP_DRAFT_DSPARK; + g->dspark_enabled = enable_dspark; + if (enable_dspark) { + for (uint32_t s = 0; s < DS4_DSPARK_MTP_LAYERS; s++) { + g->dspark_target_layer_ids[s] = mtp_weights->dspark.target_layer_ids[s]; + } + } if (raw_cap == 0) raw_cap = 1; if (ctx_size == 0) ctx_size = raw_cap; if (prefill_cap == 0) prefill_cap = 1; @@ -11169,6 +11527,30 @@ static bool metal_graph_alloc_raw_cap( g->spec_logits = ds4_gpu_tensor_alloc((uint64_t)16 * DS4_N_VOCAB * sizeof(float)); g->mtp_n_raw = 0; } + if (enable_dspark) { + g->dspark_main_hidden = ds4_gpu_tensor_alloc( + (uint64_t)DS4_DSPARK_MTP_LAYERS * DS4_N_EMBD * sizeof(float)); + g->dspark_main_x = ds4_gpu_tensor_alloc((uint64_t)DS4_N_EMBD * sizeof(float)); + g->dspark_verify_hidden = ds4_gpu_tensor_alloc( + (uint64_t)DS4_DSPARK_MAX_BLOCK_SIZE * + DS4_DSPARK_MTP_LAYERS * DS4_N_EMBD * sizeof(float)); + g->dspark_verify_main_x = ds4_gpu_tensor_alloc( + (uint64_t)DS4_DSPARK_MAX_BLOCK_SIZE * DS4_N_EMBD * sizeof(float)); + g->dspark_mean_weights = ds4_gpu_tensor_alloc((uint64_t)DS4_N_HC * sizeof(float)); + for (uint32_t s = 0; s < DS4_DSPARK_MTP_LAYERS; s++) { + g->dspark_kv_cache[s] = metal_graph_alloc_kv_cache_tensor( + managed_kv_cache, + (uint64_t)(DS4_N_SWA + mtp_weights->dspark.block_size) * + DS4_N_HEAD_DIM * sizeof(float)); + } + if (g->dspark_mean_weights) { + state_init_ok = state_init_ok && + metal_tensor_fill_f32(g->dspark_mean_weights, + 1.0f / (float)DS4_N_HC, + DS4_N_HC); + } + g->dspark_n_real = 0; + } 
g->prefill_tokens = ds4_gpu_tensor_alloc(pc * sizeof(int32_t)); g->batch_cur_hc = ds4_gpu_tensor_alloc(pc * hc_dim * sizeof(float)); @@ -11265,6 +11647,12 @@ static bool metal_graph_alloc_raw_cap( g->mtp_eproj_hc && g->mtp_hnorm_hc && g->mtp_hproj_hc && g->mtp_input_hc && g->mtp_state_hc && g->mtp_next_hc && g->mtp_raw_cache && g->spec_logits)) && + (!enable_dspark || + (g->dspark_main_hidden && g->dspark_main_x && + g->dspark_verify_hidden && g->dspark_verify_main_x && + g->dspark_mean_weights && + g->dspark_kv_cache[0] && g->dspark_kv_cache[1] && + g->dspark_kv_cache[2])) && g->prefill_tokens && g->batch_cur_hc && g->batch_next_hc && g->batch_flat_hc && g->batch_hc_mix && g->batch_hc_split && @@ -11292,7 +11680,8 @@ static bool metal_graph_alloc( ds4_gpu_graph *g, const ds4_weights *weights, const ds4_layer_weights *layer) { - return metal_graph_alloc_raw_cap(g, weights, layer, DS4_N_SWA, DS4_N_SWA, 1, false); + return metal_graph_alloc_raw_cap(g, weights, layer, NULL, + DS4_N_SWA, DS4_N_SWA, 1, false); } static bool metal_graph_install_model_spans( @@ -16312,6 +16701,79 @@ static bool metal_graph_encode_output_head_mtp( return ok; } +static DS4_MAYBE_UNUSED bool metal_graph_encode_output_head_mtp_batch( + ds4_gpu_graph *g, + const ds4_model *base_model, + const ds4_weights *base_weights, + const ds4_model *mtp_model, + const ds4_mtp_weights *mtp, + uint32_t n_tokens, + uint64_t vocab_dim) { + if (n_tokens == 0 || n_tokens > g->prefill_cap || !g->spec_logits) return false; + + const uint64_t hc_dim = (uint64_t)DS4_N_HC * DS4_N_EMBD; + ds4_gpu_tensor *output_pre = ds4_gpu_tensor_view( + g->batch_hc_mix, 0, (uint64_t)n_tokens * DS4_N_HC * sizeof(float)); + ds4_gpu_tensor *output_weights = ds4_gpu_tensor_view( + g->batch_hc_split, 0, (uint64_t)n_tokens * DS4_N_HC * sizeof(float)); + ds4_gpu_tensor *output_embd = ds4_gpu_tensor_view( + g->batch_ffn_cur, 0, (uint64_t)n_tokens * DS4_N_EMBD * sizeof(float)); + ds4_gpu_tensor *output_norm = ds4_gpu_tensor_view( + 
g->batch_ffn_norm, 0, (uint64_t)n_tokens * DS4_N_EMBD * sizeof(float)); + ds4_gpu_tensor *logits = ds4_gpu_tensor_view( + g->spec_logits, 0, (uint64_t)n_tokens * vocab_dim * sizeof(float)); + bool ok = output_pre && output_weights && output_embd && output_norm && logits; + + if (ok) ok = ds4_gpu_rms_norm_plain_rows_tensor(g->batch_flat_hc, + g->batch_cur_hc, + (uint32_t)hc_dim, + n_tokens, + DS4_RMS_EPS) != 0; + if (ok) ok = metal_graph_matmul_plain_tensor(output_pre, + mtp_model, + mtp->hc_head_fn, + hc_dim, + DS4_N_HC, + g->batch_flat_hc, + n_tokens); + if (ok) ok = ds4_gpu_output_hc_weights_tensor(output_weights, + output_pre, + mtp_model->map, + mtp_model->size, + mtp->hc_head_scale->abs_offset, + mtp->hc_head_base->abs_offset, + DS4_N_HC, + DS4_HC_EPS) != 0; + if (ok) ok = ds4_gpu_hc_weighted_sum_tensor(output_embd, + g->batch_cur_hc, + output_weights, + DS4_N_EMBD, + DS4_N_HC) != 0; + if (ok) ok = ds4_gpu_rms_norm_weight_rows_tensor(output_norm, + output_embd, + mtp_model->map, + mtp_model->size, + mtp->norm->abs_offset, + DS4_N_EMBD, + n_tokens, + DS4_RMS_EPS) != 0; + if (ok) ok = ds4_gpu_matmul_q8_0_tensor(logits, + base_model->map, + base_model->size, + base_weights->output->abs_offset, + DS4_N_EMBD, + vocab_dim, + output_norm, + n_tokens) != 0; + + ds4_gpu_tensor_free(logits); + ds4_gpu_tensor_free(output_norm); + ds4_gpu_tensor_free(output_embd); + ds4_gpu_tensor_free(output_weights); + ds4_gpu_tensor_free(output_pre); + return ok; +} + /* ========================================================================= * Metal Diagnostic Comparisons. 
* ========================================================================= @@ -16941,6 +17403,67 @@ static uint32_t metal_graph_token_split_after_layers(void) { return split_after_layers; } +static bool metal_graph_capture_dspark_main_hidden(ds4_gpu_graph *g, uint32_t il) { + if (!g || !g->dspark_enabled) return true; + if (!g->cur_hc || !g->dspark_main_hidden || !g->dspark_mean_weights) return false; + + for (uint32_t s = 0; s < DS4_DSPARK_MTP_LAYERS; s++) { + if (g->dspark_target_layer_ids[s] != il) continue; + ds4_gpu_tensor *dst = ds4_gpu_tensor_view( + g->dspark_main_hidden, + (uint64_t)s * DS4_N_EMBD * sizeof(float), + (uint64_t)DS4_N_EMBD * sizeof(float)); + const bool ok = dst && + ds4_gpu_hc_weighted_sum_tensor(dst, + g->cur_hc, + g->dspark_mean_weights, + DS4_N_EMBD, + DS4_N_HC) != 0; + ds4_gpu_tensor_free(dst); + return ok; + } + return true; +} + +static bool metal_graph_capture_dspark_batch_main_hidden(ds4_gpu_graph *g, + uint32_t il, + uint32_t n_tokens) { + if (!g || !g->dspark_enabled) return true; + if (!g->batch_cur_hc || !g->dspark_verify_hidden || !g->dspark_mean_weights || + n_tokens == 0 || n_tokens > DS4_DSPARK_MAX_BLOCK_SIZE) { + return false; + } + + const uint64_t hc_dim = (uint64_t)DS4_N_HC * DS4_N_EMBD; + const uint64_t hidden_row_bytes = + (uint64_t)DS4_DSPARK_MTP_LAYERS * DS4_N_EMBD * sizeof(float); + const uint64_t stage_bytes = (uint64_t)DS4_N_EMBD * sizeof(float); + + for (uint32_t s = 0; s < DS4_DSPARK_MTP_LAYERS; s++) { + if (g->dspark_target_layer_ids[s] != il) continue; + for (uint32_t row = 0; row < n_tokens; row++) { + ds4_gpu_tensor *src = ds4_gpu_tensor_view( + g->batch_cur_hc, + (uint64_t)row * hc_dim * sizeof(float), + hc_dim * sizeof(float)); + ds4_gpu_tensor *dst = ds4_gpu_tensor_view( + g->dspark_verify_hidden, + (uint64_t)row * hidden_row_bytes + (uint64_t)s * stage_bytes, + stage_bytes); + const bool ok = src && dst && + ds4_gpu_hc_weighted_sum_tensor(dst, + src, + g->dspark_mean_weights, + DS4_N_EMBD, + DS4_N_HC) != 
0; + ds4_gpu_tensor_free(dst); + ds4_gpu_tensor_free(src); + if (!ok) return false; + } + } + return true; +} + /* Encode a full single-token decode step on Metal. This is the generation * hot path: update caches, run all layers, then produce logits. */ static bool metal_graph_encode_token_raw_swa( @@ -16990,6 +17513,7 @@ static bool metal_graph_encode_token_raw_swa( ds4_gpu_tensor *tmp = g->cur_hc; g->cur_hc = g->after_ffn_hc; g->after_ffn_hc = tmp; + if (ok) ok = metal_graph_capture_dspark_main_hidden(g, il); if (ok && allow_split_flush && split_after_layers != 0 && il + 1u == split_after_layers) { ok = ds4_gpu_flush_commands() != 0; } @@ -19283,45 +19807,650 @@ static bool metal_graph_encode_layer_batch( return ok; } -static bool metal_graph_eval_token_raw_swa_streaming( - ds4_gpu_graph *g, - const ds4_model *model, - const ds4_weights *weights, - int token, - uint32_t pos, - float *logits) { - if (g->raw_cap == 0) { - fprintf(stderr, "ds4: Metal graph raw KV cache is not allocated\n"); +static bool metal_graph_dspark_input_stage( + ds4_gpu_graph *g, + const ds4_model *target_model, + const ds4_weights *target_weights, + const ds4_model *dspark_model, + const ds4_mtp_weights *mtp, + int anchor_token, + uint32_t block_size) { + if (!g || !target_model || !target_weights || !dspark_model || !mtp || + !g->dspark_main_hidden || !g->dspark_main_x || !g->batch_cur_hc || + block_size == 0 || block_size > g->prefill_cap) { return false; } - const bool profile = getenv("DS4_METAL_GRAPH_TOKEN_PROFILE") != NULL; - const bool throttle = graph_power_throttle_enabled(g); - const double t0 = (profile || throttle) ? 
now_sec() : 0.0; - const uint32_t raw_row = pos % g->raw_cap; - const uint32_t n_raw = metal_graph_raw_span_for_batch(g, pos, 1); + bool ok = ds4_gpu_begin_commands() != 0; + if (ok) ok = ds4_gpu_matmul_q8_0_tensor(g->dspark_main_x, + dspark_model->map, + dspark_model->size, + mtp->main_proj->abs_offset, + 3ull * DS4_N_EMBD, + (uint64_t)DS4_N_EMBD, + g->dspark_main_hidden, + 1) != 0; + if (ok) ok = ds4_gpu_rms_norm_weight_tensor(g->dspark_main_x, + g->dspark_main_x, + dspark_model->map, + dspark_model->size, + mtp->main_norm->abs_offset, + DS4_N_EMBD, + DS4_RMS_EPS) != 0; + if (ds4_gpu_end_commands() == 0) ok = false; + if (!ok) return false; - const bool static_decode_map = metal_graph_stream_decode_static_map_enabled(); - const bool static_map_state_cache = - static_decode_map && metal_graph_stream_decode_static_map_state_cache_enabled(); - const bool batch_static_decode = - static_decode_map && metal_graph_stream_decode_layer_batch_enabled(g); - bool ok = true; - if (static_decode_map) { - if (!static_map_state_cache || !g->streaming_static_decode_map_current) { - ok = metal_graph_stream_map_decode_static_all(model, weights); - if (ok) g->streaming_static_decode_map_current = static_map_state_cache; - } - } else { - g->streaming_static_decode_map_current = false; - ok = metal_graph_stream_map_token(model, weights); - } - if (ok && !static_decode_map && DS4_N_LAYER > 0) { - metal_graph_stream_readahead_layer_decode(model, weights, 0); + token_vec draft_ids = {0}; + token_vec_push(&draft_ids, anchor_token); + for (uint32_t i = 1; i < block_size; i++) { + token_vec_push(&draft_ids, (int)mtp->dspark.noise_token_id); } - if (ok) ok = ds4_gpu_begin_commands() != 0; - if (ok) { - ok = ds4_gpu_embed_token_hc_tensor(g->cur_hc, + + ok = metal_graph_upload_prompt_tokens(g->prefill_tokens, &draft_ids, 0u, block_size); + if (ok) ok = metal_graph_upload_prompt_embeddings_hc(g->batch_cur_hc, + g->prefill_tokens, + target_model, + target_weights, + &draft_ids, + 0u, + 
block_size); + token_vec_free(&draft_ids); + return ok; +} + +static bool metal_graph_dspark_encode_attention( + ds4_gpu_graph *g, + const ds4_model *dspark_model, + const ds4_layer_weights *layer, + uint32_t stage, + uint32_t start_pos, + uint32_t n_tokens) { + if (!g || !dspark_model || !layer || stage >= DS4_DSPARK_MTP_LAYERS || + n_tokens == 0 || n_tokens > g->prefill_cap || + !g->dspark_kv_cache[stage] || !g->batch_cur_hc || !g->dspark_main_x) { + return false; + } + + const uint64_t hc_dim = (uint64_t)DS4_N_HC * DS4_N_EMBD; + const uint64_t mix_hc = 2ull * DS4_N_HC + (uint64_t)DS4_N_HC * DS4_N_HC; + const uint64_t q_rank = layer->attn_q_a->dim[1]; + const uint64_t q_dim = (uint64_t)DS4_N_HEAD * DS4_N_HEAD_DIM; + const uint32_t n_groups = DS4_N_OUT_GROUP; + const uint32_t group_heads = DS4_N_HEAD / n_groups; + const uint32_t group_dim = DS4_N_HEAD_DIM * group_heads; + const uint32_t rank = DS4_N_LORA_O; + const uint32_t raw_cap = DS4_N_SWA + n_tokens; + uint32_t n_real = g->dspark_n_real; + if (n_real + 1u + n_tokens > raw_cap) n_real = raw_cap - 1u - n_tokens; + + ds4_gpu_tensor *hc_mix_view = ds4_gpu_tensor_view( + g->batch_hc_mix, 0, (uint64_t)n_tokens * mix_hc * sizeof(float)); + ds4_gpu_tensor *hc_split_view = ds4_gpu_tensor_view( + g->batch_hc_split, 0, (uint64_t)n_tokens * mix_hc * sizeof(float)); + ds4_gpu_tensor *attn_cur_view = ds4_gpu_tensor_view( + g->batch_attn_cur, 0, (uint64_t)n_tokens * DS4_N_EMBD * sizeof(float)); + ds4_gpu_tensor *after_attn_hc_view = ds4_gpu_tensor_view( + g->batch_after_attn_hc, 0, (uint64_t)n_tokens * hc_dim * sizeof(float)); + bool ok = hc_mix_view && hc_split_view && attn_cur_view && after_attn_hc_view; + + if (ok) ok = ds4_gpu_rms_norm_plain_rows_tensor(g->batch_flat_hc, + g->batch_cur_hc, + (uint32_t)hc_dim, + n_tokens, + DS4_RMS_EPS) != 0; + if (ok) ok = metal_graph_matmul_plain_tensor(hc_mix_view, + dspark_model, + layer->hc_attn_fn, + hc_dim, + mix_hc, + g->batch_flat_hc, + n_tokens); + if (ok) ok = 
ds4_gpu_hc_split_weighted_sum_tensor(attn_cur_view, + hc_split_view, + hc_mix_view, + g->batch_cur_hc, + dspark_model->map, + dspark_model->size, + layer->hc_attn_scale->abs_offset, + layer->hc_attn_base->abs_offset, + DS4_N_EMBD, + DS4_N_HC, + DS4_N_HC_SINKHORN_ITER, + DS4_HC_EPS) != 0; + if (ok) ok = ds4_gpu_rms_norm_weight_rows_tensor(g->batch_attn_norm, + g->batch_attn_cur, + dspark_model->map, + dspark_model->size, + layer->attn_norm->abs_offset, + DS4_N_EMBD, + n_tokens, + DS4_RMS_EPS) != 0; + + if (ok) ok = ds4_gpu_matmul_q8_0_tensor(g->batch_qr, + dspark_model->map, + dspark_model->size, + layer->attn_q_a->abs_offset, + DS4_N_EMBD, + q_rank, + g->batch_attn_norm, + n_tokens) != 0; + if (ok) ok = ds4_gpu_rms_norm_weight_rows_tensor(g->batch_qr_norm, + g->batch_qr, + dspark_model->map, + dspark_model->size, + layer->attn_q_a_norm->abs_offset, + (uint32_t)q_rank, + n_tokens, + DS4_RMS_EPS) != 0; + if (ok) ok = ds4_gpu_matmul_q8_0_tensor(g->batch_q, + dspark_model->map, + dspark_model->size, + layer->attn_q_b->abs_offset, + q_rank, + q_dim, + g->batch_qr_norm, + n_tokens) != 0; + if (ok) ok = ds4_gpu_head_rms_norm_tensor(g->batch_q, + n_tokens, + DS4_N_HEAD, + DS4_N_HEAD_DIM, + DS4_RMS_EPS) != 0; + if (ok) ok = ds4_gpu_rope_tail_tensor(g->batch_q, + n_tokens, + DS4_N_HEAD, + DS4_N_HEAD_DIM, + DS4_N_ROT, + start_pos + 1u, + 0u, + false, + DS4_ROPE_FREQ_BASE, + 1.0f, + 0.0f, + 1.0f, + DS4_ROPE_YARN_BETA_FAST, + DS4_ROPE_YARN_BETA_SLOW) != 0; + + if (ok) ok = ds4_gpu_matmul_q8_0_tensor(g->batch_kv_raw, + dspark_model->map, + dspark_model->size, + layer->attn_kv->abs_offset, + DS4_N_EMBD, + DS4_N_HEAD_DIM, + g->batch_attn_norm, + n_tokens) != 0; + if (ok) ok = ds4_gpu_rms_norm_weight_rows_tensor(g->batch_kv, + g->batch_kv_raw, + dspark_model->map, + dspark_model->size, + layer->attn_kv_a_norm->abs_offset, + DS4_N_HEAD_DIM, + n_tokens, + DS4_RMS_EPS) != 0; + if (ok) ok = ds4_gpu_rope_tail_tensor(g->batch_kv, + n_tokens, + DS4_N_HEAD_KV, + DS4_N_HEAD_DIM, + 
DS4_N_ROT, + start_pos + 1u, + 0u, + false, + DS4_ROPE_FREQ_BASE, + 1.0f, + 0.0f, + 1.0f, + DS4_ROPE_YARN_BETA_FAST, + DS4_ROPE_YARN_BETA_SLOW) != 0; + if (ok) ok = ds4_gpu_dsv4_fp8_kv_quantize_tensor(g->batch_kv, + n_tokens, + DS4_N_HEAD_DIM, + DS4_N_ROT) != 0; + if (ok) ok = ds4_gpu_store_raw_kv_batch_tensor(g->dspark_kv_cache[stage], + g->batch_kv, + raw_cap, + n_real + 1u, + n_tokens, + DS4_N_HEAD_DIM) != 0; + + if (ok) ok = ds4_gpu_matmul_q8_0_tensor(g->batch_kv_raw, + dspark_model->map, + dspark_model->size, + layer->attn_kv->abs_offset, + DS4_N_EMBD, + DS4_N_HEAD_DIM, + g->dspark_main_x, + 1) != 0; + if (ok) ok = ds4_gpu_rms_norm_weight_rows_tensor(g->batch_kv, + g->batch_kv_raw, + dspark_model->map, + dspark_model->size, + layer->attn_kv_a_norm->abs_offset, + DS4_N_HEAD_DIM, + 1, + DS4_RMS_EPS) != 0; + if (ok) ok = ds4_gpu_rope_tail_tensor(g->batch_kv, + 1, + DS4_N_HEAD_KV, + DS4_N_HEAD_DIM, + DS4_N_ROT, + start_pos, + 0u, + false, + DS4_ROPE_FREQ_BASE, + 1.0f, + 0.0f, + 1.0f, + DS4_ROPE_YARN_BETA_FAST, + DS4_ROPE_YARN_BETA_SLOW) != 0; + if (ok) ok = ds4_gpu_dsv4_fp8_kv_quantize_tensor(g->batch_kv, + 1, + DS4_N_HEAD_DIM, + DS4_N_ROT) != 0; + if (ok) ok = ds4_gpu_store_raw_kv_batch_tensor(g->dspark_kv_cache[stage], + g->batch_kv, + raw_cap, + n_real, + 1, + DS4_N_HEAD_DIM) != 0; + + if (ok) ok = ds4_gpu_attention_decode_raw_batch_heads_noncausal_tensor( + g->batch_heads, + dspark_model->map, + dspark_model->size, + layer->attn_sinks->abs_offset, + g->batch_q, + g->dspark_kv_cache[stage], + n_tokens, + n_real + 1u + n_tokens, + raw_cap, + 0u, + DS4_N_HEAD, + DS4_N_HEAD_DIM) != 0; + if (ok) ok = ds4_gpu_rope_tail_tensor(g->batch_heads, + n_tokens, + DS4_N_HEAD, + DS4_N_HEAD_DIM, + DS4_N_ROT, + start_pos + 1u, + 0u, + true, + DS4_ROPE_FREQ_BASE, + 1.0f, + 0.0f, + 1.0f, + DS4_ROPE_YARN_BETA_FAST, + DS4_ROPE_YARN_BETA_SLOW) != 0; + if (ok) ok = ds4_gpu_attention_output_q8_batch_tensor(g->batch_attn_out, + g->batch_attn_low, + g->batch_group_tmp, + 
g->batch_low_tmp, + dspark_model->map, + dspark_model->size, + layer->attn_output_a->abs_offset, + layer->attn_output_b->abs_offset, + group_dim, + rank, + n_groups, + DS4_N_EMBD, + g->batch_heads, + n_tokens) != 0; + if (ok) ok = ds4_gpu_hc_expand_split_tensor(after_attn_hc_view, + g->batch_attn_out, + g->batch_cur_hc, + hc_split_view, + DS4_N_EMBD, + DS4_N_HC) != 0; + + ds4_gpu_tensor_free(after_attn_hc_view); + ds4_gpu_tensor_free(attn_cur_view); + ds4_gpu_tensor_free(hc_split_view); + ds4_gpu_tensor_free(hc_mix_view); + return ok; +} + +static bool metal_graph_dspark_refresh_main_rows( + ds4_gpu_graph *g, + const ds4_model *dspark_model, + const ds4_mtp_weights *mtp, + ds4_gpu_tensor *main_hidden, + ds4_gpu_tensor *main_x, + uint32_t pos0, + uint32_t row0, + uint32_t n_tokens, + bool keep_last_hidden) { + if (n_tokens == 0) return true; + if (!g || !g->dspark_enabled || !dspark_model || !mtp || !main_hidden || + !main_x || !g->batch_kv_raw || !g->batch_kv || + n_tokens > DS4_DSPARK_MAX_BLOCK_SIZE || + row0 + n_tokens > DS4_N_SWA + mtp->dspark.block_size) { + return false; + } + + bool ok = ds4_gpu_begin_commands() != 0; + if (ok) ok = ds4_gpu_matmul_q8_0_tensor(main_x, + dspark_model->map, + dspark_model->size, + mtp->main_proj->abs_offset, + 3ull * DS4_N_EMBD, + (uint64_t)DS4_N_EMBD, + main_hidden, + n_tokens) != 0; + if (ok) ok = ds4_gpu_rms_norm_weight_rows_tensor(main_x, + main_x, + dspark_model->map, + dspark_model->size, + mtp->main_norm->abs_offset, + DS4_N_EMBD, + n_tokens, + DS4_RMS_EPS) != 0; + + for (uint32_t stage = 0; ok && stage < mtp->dspark.n_mtp_layers; stage++) { + const ds4_layer_weights *layer = &mtp->stage[stage]; + ok = ds4_gpu_matmul_q8_0_tensor(g->batch_kv_raw, + dspark_model->map, + dspark_model->size, + layer->attn_kv->abs_offset, + DS4_N_EMBD, + DS4_N_HEAD_DIM, + main_x, + n_tokens) != 0; + if (ok) ok = ds4_gpu_rms_norm_weight_rows_tensor(g->batch_kv, + g->batch_kv_raw, + dspark_model->map, + dspark_model->size, + 
layer->attn_kv_a_norm->abs_offset, + DS4_N_HEAD_DIM, + n_tokens, + DS4_RMS_EPS) != 0; + if (ok) ok = ds4_gpu_rope_tail_tensor(g->batch_kv, + n_tokens, + DS4_N_HEAD_KV, + DS4_N_HEAD_DIM, + DS4_N_ROT, + pos0, + 0u, + false, + DS4_ROPE_FREQ_BASE, + 1.0f, + 0.0f, + 1.0f, + DS4_ROPE_YARN_BETA_FAST, + DS4_ROPE_YARN_BETA_SLOW) != 0; + if (ok) ok = ds4_gpu_dsv4_fp8_kv_quantize_tensor(g->batch_kv, + n_tokens, + DS4_N_HEAD_DIM, + DS4_N_ROT) != 0; + if (ok) ok = ds4_gpu_store_raw_kv_batch_tensor( + g->dspark_kv_cache[stage], + g->batch_kv, + DS4_N_SWA + mtp->dspark.block_size, + row0, + n_tokens, + DS4_N_HEAD_DIM) != 0; + } + + if (ok && keep_last_hidden && g->dspark_main_hidden) { + const uint64_t stage_bytes = (uint64_t)DS4_N_EMBD * sizeof(float); + const uint64_t hidden_row_bytes = + (uint64_t)DS4_DSPARK_MTP_LAYERS * DS4_N_EMBD * sizeof(float); + const uint64_t src_row = (uint64_t)(n_tokens - 1u) * hidden_row_bytes; + for (uint32_t s = 0; ok && s < DS4_DSPARK_MTP_LAYERS; s++) { + ok = ds4_gpu_tensor_copy(g->dspark_main_hidden, + (uint64_t)s * stage_bytes, + main_hidden, + src_row + (uint64_t)s * stage_bytes, + stage_bytes) != 0; + } + } + + if (ds4_gpu_end_commands() == 0) ok = false; + if (!ok) (void)ds4_gpu_synchronize(); + return ok; +} + +static bool metal_graph_dspark_refresh_verified_rows( + ds4_gpu_graph *g, + const ds4_model *dspark_model, + const ds4_mtp_weights *mtp, + uint32_t row0, + uint32_t pos0, + uint32_t n_tokens) { + return metal_graph_dspark_refresh_main_rows(g, + dspark_model, + mtp, + g ? g->dspark_verify_hidden : NULL, + g ? g->dspark_verify_main_x : NULL, + pos0, + row0, + n_tokens, + true); +} + +static bool metal_graph_dspark_refresh_current_row( + ds4_gpu_graph *g, + const ds4_model *dspark_model, + const ds4_mtp_weights *mtp, + uint32_t row, + uint32_t pos) { + return metal_graph_dspark_refresh_main_rows(g, + dspark_model, + mtp, + g ? g->dspark_main_hidden : NULL, + g ? 
g->dspark_main_x : NULL, + pos, + row, + 1, + false); +} +static bool metal_graph_encode_output_head_dspark_batch( + ds4_gpu_graph *g, + const ds4_model *target_model, + const ds4_weights *target_weights, + const ds4_model *dspark_model, + const ds4_mtp_weights *mtp, + uint32_t n_tokens) { + if (!g || !target_model || !target_weights || !dspark_model || !mtp || + n_tokens == 0 || n_tokens > g->prefill_cap || !g->spec_logits) { + return false; + } + + const uint64_t hc_dim = (uint64_t)DS4_N_HC * DS4_N_EMBD; + ds4_gpu_tensor *output_pre = ds4_gpu_tensor_view( + g->batch_hc_mix, 0, (uint64_t)n_tokens * DS4_N_HC * sizeof(float)); + ds4_gpu_tensor *output_weights = ds4_gpu_tensor_view( + g->batch_hc_split, 0, (uint64_t)n_tokens * DS4_N_HC * sizeof(float)); + ds4_gpu_tensor *output_embd = ds4_gpu_tensor_view( + g->batch_ffn_cur, 0, (uint64_t)n_tokens * DS4_N_EMBD * sizeof(float)); + ds4_gpu_tensor *output_norm = ds4_gpu_tensor_view( + g->batch_ffn_norm, 0, (uint64_t)n_tokens * DS4_N_EMBD * sizeof(float)); + ds4_gpu_tensor *logits = ds4_gpu_tensor_view( + g->spec_logits, 0, (uint64_t)n_tokens * DS4_N_VOCAB * sizeof(float)); + bool ok = output_pre && output_weights && output_embd && output_norm && logits; + + if (ok) ok = ds4_gpu_rms_norm_plain_rows_tensor(g->batch_flat_hc, + g->batch_cur_hc, + (uint32_t)hc_dim, + n_tokens, + DS4_RMS_EPS) != 0; + if (ok) ok = metal_graph_matmul_plain_tensor(output_pre, + dspark_model, + mtp->hc_head_fn, + hc_dim, + DS4_N_HC, + g->batch_flat_hc, + n_tokens); + if (ok) ok = ds4_gpu_output_hc_weights_tensor(output_weights, + output_pre, + dspark_model->map, + dspark_model->size, + mtp->hc_head_scale->abs_offset, + mtp->hc_head_base->abs_offset, + DS4_N_HC, + DS4_HC_EPS) != 0; + if (ok) ok = ds4_gpu_hc_weighted_sum_tensor(output_embd, + g->batch_cur_hc, + output_weights, + DS4_N_EMBD, + DS4_N_HC) != 0; + if (ok) ok = ds4_gpu_rms_norm_weight_rows_tensor(output_norm, + output_embd, + dspark_model->map, + dspark_model->size, + 
mtp->norm->abs_offset, + DS4_N_EMBD, + n_tokens, + DS4_RMS_EPS) != 0; + if (ok) ok = ds4_gpu_matmul_q8_0_tensor(logits, + target_model->map, + target_model->size, + target_weights->output->abs_offset, + DS4_N_EMBD, + DS4_N_VOCAB, + output_norm, + n_tokens) != 0; + + ds4_gpu_tensor_free(logits); + ds4_gpu_tensor_free(output_norm); + ds4_gpu_tensor_free(output_embd); + ds4_gpu_tensor_free(output_weights); + ds4_gpu_tensor_free(output_pre); + return ok; +} + +static bool metal_graph_eval_dspark_draft_block( + ds4_gpu_graph *g, + const ds4_model *target_model, + const ds4_weights *target_weights, + const ds4_model *dspark_model, + const ds4_mtp_weights *mtp, + int anchor_token, + uint32_t pos, + uint32_t max_tokens, + int *drafts, + int *draft_n, + uint32_t *base_real_out, + float *last_logits, + float *all_draft_logits) { + if (draft_n) *draft_n = 0; + if (base_real_out) *base_real_out = 0; + if (!g || !target_model || !target_weights || !dspark_model || !mtp || + !drafts || !draft_n || mtp->kind != DS4_MTP_DRAFT_DSPARK) { + return false; + } + + uint32_t block_size = mtp->dspark.block_size; + if (block_size > max_tokens) block_size = max_tokens; + if (block_size > g->prefill_cap) block_size = g->prefill_cap; + if (block_size == 0 || block_size > 16) return true; + if (g->dspark_n_real >= DS4_N_SWA) g->dspark_n_real = 0; + if (base_real_out) *base_real_out = g->dspark_n_real; + + bool ok = metal_graph_dspark_input_stage(g, + target_model, + target_weights, + dspark_model, + mtp, + anchor_token, + block_size); + bool commands_open = false; + if (ok) { + ok = ds4_gpu_begin_commands() != 0; + commands_open = ok; + } + for (uint32_t stage = 0; ok && stage < mtp->dspark.n_mtp_layers; stage++) { + const ds4_layer_weights *layer = &mtp->stage[stage]; + ok = metal_graph_dspark_encode_attention(g, + dspark_model, + layer, + stage, + pos, + block_size); + if (ok) ok = metal_graph_encode_layer_ffn_batch(g, + dspark_model, + layer, + stage, + pos + 1u, + block_size); + if (ok) { 
+ ds4_gpu_tensor *tmp = g->batch_cur_hc; + g->batch_cur_hc = g->batch_next_hc; + g->batch_next_hc = tmp; + } + } + if (ok) ok = metal_graph_encode_output_head_dspark_batch(g, + target_model, + target_weights, + dspark_model, + mtp, + block_size); + if (commands_open && ds4_gpu_end_commands() == 0) ok = false; + if (!ok) { + (void)ds4_gpu_synchronize(); + return false; + } + + const uint64_t row_bytes = (uint64_t)DS4_N_VOCAB * sizeof(float); + float *row_logits = xmalloc((size_t)row_bytes); + for (uint32_t i = 0; ok && i < block_size; i++) { + ok = ds4_gpu_tensor_read(g->spec_logits, + (uint64_t)i * row_bytes, + row_logits, + row_bytes) != 0; + if (!ok) break; + const int prev = i == 0 ? anchor_token : drafts[i - 1u]; + dspark_apply_markov_bias(row_logits, dspark_model, mtp, prev); + drafts[i] = sample_argmax(row_logits, DS4_N_VOCAB); + if (all_draft_logits) { + memcpy(all_draft_logits + (uint64_t)i * DS4_N_VOCAB, row_logits, (size_t)row_bytes); + } + if (last_logits && i + 1u == block_size) { + memcpy(last_logits, row_logits, (size_t)row_bytes); + } + } + free(row_logits); + if (!ok) return false; + *draft_n = (int)block_size; + return true; +} + +static bool metal_graph_eval_token_raw_swa_streaming( + ds4_gpu_graph *g, + const ds4_model *model, + const ds4_weights *weights, + int token, + uint32_t pos, + float *logits) { + if (g->raw_cap == 0) { + fprintf(stderr, "ds4: Metal graph raw KV cache is not allocated\n"); + return false; + } + + const bool profile = getenv("DS4_METAL_GRAPH_TOKEN_PROFILE") != NULL; + const bool throttle = graph_power_throttle_enabled(g); + const double t0 = (profile || throttle) ? 
now_sec() : 0.0; + const uint32_t raw_row = pos % g->raw_cap; + const uint32_t n_raw = metal_graph_raw_span_for_batch(g, pos, 1); + + const bool static_decode_map = metal_graph_stream_decode_static_map_enabled(); + const bool static_map_state_cache = + static_decode_map && metal_graph_stream_decode_static_map_state_cache_enabled(); + const bool batch_static_decode = + static_decode_map && metal_graph_stream_decode_layer_batch_enabled(g); + bool ok = true; + if (static_decode_map) { + if (!static_map_state_cache || !g->streaming_static_decode_map_current) { + ok = metal_graph_stream_map_decode_static_all(model, weights); + if (ok) g->streaming_static_decode_map_current = static_map_state_cache; + } + } else { + g->streaming_static_decode_map_current = false; + ok = metal_graph_stream_map_token(model, weights); + } + if (ok && !static_decode_map && DS4_N_LAYER > 0) { + metal_graph_stream_readahead_layer_decode(model, weights, 0); + } + if (ok) ok = ds4_gpu_begin_commands() != 0; + if (ok) { + ok = ds4_gpu_embed_token_hc_tensor(g->cur_hc, model->map, model->size, weights->token_embd->abs_offset, @@ -19457,7 +20586,8 @@ static bool metal_graph_eval_token_raw_swa_streaming( return ok; } -/* Execute one Metal decode token and read back logits. */ +/* Execute one Metal decode token and optionally capture the target hidden states + * that DSpark uses as the draft model's cross-token input. */ static bool metal_graph_eval_token_raw_swa( ds4_gpu_graph *g, const ds4_model *model, @@ -19474,7 +20604,8 @@ static bool metal_graph_eval_token_raw_swa( const double t0 = (profile || throttle) ? now_sec() : 0.0; bool ok = ds4_gpu_begin_commands() != 0; - if (ok) ok = metal_graph_encode_token_raw_swa(g, model, weights, token, pos, logits != NULL, true); + if (ok) ok = metal_graph_encode_token_raw_swa(g, model, weights, token, pos, + logits != NULL, true); const double t_encoded = (profile || throttle) ? 
now_sec() : 0.0; if (ok) ok = ds4_gpu_end_commands() != 0; const double t_done = (profile || throttle) ? now_sec() : 0.0; @@ -19502,6 +20633,8 @@ static bool metal_graph_eval_token_raw_swa( return ok; } +/* Execute one Metal decode token and read back logits. */ + static bool metal_graph_streaming_decode_prefill_wide_default( const ds4_weights *weights) { return DS4_MODEL_VARIANT == DS4_VARIANT_FLASH && @@ -20272,6 +21405,7 @@ static bool metal_graph_reset_prefill_state(ds4_gpu_graph *g) { memset(g->layer_n_comp, 0, sizeof(g->layer_n_comp)); memset(g->layer_n_index_comp, 0, sizeof(g->layer_n_index_comp)); g->mtp_n_raw = 0; + g->dspark_n_real = 0; for (uint32_t il = 0; il < DS4_N_LAYER; il++) { const uint32_t ratio = ds4_layer_compress_ratio(il); if (ratio == 0) continue; @@ -21150,6 +22284,7 @@ static bool metal_graph_verify_suffix_tops( il, start, n_tokens); + if (ok) ok = metal_graph_capture_dspark_batch_main_hidden(g, il, n_tokens); } if (ok) ok = ds4_gpu_end_commands() != 0; else (void)ds4_gpu_synchronize(); @@ -21510,7 +22645,8 @@ static int metal_graph_prompt_logits_test( ds4_gpu_graph g; bool ok = metal_graph_alloc_raw_cap(&g, weights, &weights->layer[0], - raw_cap, (uint32_t)ctx_size, (uint32_t)n_test, false); + NULL, raw_cap, (uint32_t)ctx_size, + (uint32_t)n_test, false); if (!ok) { metal_graph_free(&g); fprintf(stderr, "ds4: failed to initialize Metal graph prompt test runtime\n"); @@ -22602,6 +23738,174 @@ static float sample_rng_f32(uint64_t *state) { return (float)((x >> 40) & 0xffffffu) / 16777216.0f; } +/* ========================================================================= + * B2 Rejection Sampling for DSpark Speculative Decoding. + * ========================================================================= + * + * Implements Chen et al. (2023) / Leviathan et al. (2023) rejection sampling. + * At temp=0: pure argmax matching (token-identical to non-speculative decode). + * At temp>0: lossless samples from the target model's distribution. 
+ * + * All computations use log-probabilities to avoid overflow on 129K vocab. + * Activated via DS4_SPEC_TEMP env var; greedy path is the unchanged default. + */ + +/* Stable log-softmax: log_probs[i] = logits[i] - log(sum(exp(logits))). + * Uses max-subtraction for numerical stability on 129K vocab. */ +static void b2_log_softmax(const float *logits, uint32_t vocab, float *log_probs) { + float max_val = DS4_NEG_INF; + for (uint32_t i = 0; i < vocab; i++) { + if (logits[i] > max_val) max_val = logits[i]; + } + float sum_exp = 0.0f; + for (uint32_t i = 0; i < vocab; i++) { + sum_exp += expf(logits[i] - max_val); + } + const float log_denom = max_val + logf(sum_exp); + for (uint32_t i = 0; i < vocab; i++) { + log_probs[i] = logits[i] - log_denom; + } +} + +/* Sample from CDF of a log-probability vector. */ +static int b2_sample_from_log_probs(const float *log_probs, uint32_t vocab, + uint64_t *rng) { + const float u = sample_rng_f32(rng); + float cumsum = 0.0f; + for (uint32_t i = 0; i < vocab; i++) { + cumsum += expf(log_probs[i]); + if (cumsum >= u) return (int)i; + } + return (int)(vocab - 1); +} + +/* Sample from the residual distribution max(0, target_prob - draft_prob). + * Both inputs are log-probability vectors. */ +static int b2_sample_residual(const float *log_target, const float *log_draft, + uint32_t vocab, uint64_t *rng) { + /* Compute residual in probability space. Use a stack allocation guard: + * 129280 * 4 = ~504 KB, too large for stack. Heap allocate. */ + float *residual = xmalloc((size_t)vocab * sizeof(float)); + float residual_sum = 0.0f; + + for (uint32_t i = 0; i < vocab; i++) { + const float t = expf(log_target[i]); + const float d = expf(log_draft[i]); + const float r = t - d; + residual[i] = r > 0.0f ? r : 0.0f; + residual_sum += residual[i]; + } + + int result; + if (residual_sum < 1e-10f) { + /* Residual is effectively zero — fall back to target distribution. 
*/ + free(residual); + return b2_sample_from_log_probs(log_target, vocab, rng); + } + + /* CDF inversion over the unnormalized residual. */ + const float threshold = sample_rng_f32(rng) * residual_sum; + float cumsum = 0.0f; + result = (int)(vocab - 1); + for (uint32_t i = 0; i < vocab; i++) { + cumsum += residual[i]; + if (cumsum >= threshold) { + result = (int)i; + break; + } + } + + free(residual); + return result; +} + +typedef struct { + int n_accepted; + int accepted_tokens[16]; + int correction_token; + bool has_correction; +} b2_result; + +/* B2 rejection sampling for DSpark speculative decode. + * + * draft_tokens: [n_draft] token ids proposed by the drafter + * draft_logits: [n_draft * vocab] raw logits from the drafter (post-markov-bias) + * target_logits: [n_draft * vocab] raw logits from the target model (batch verify) + * vocab: vocabulary size (DS4_N_VOCAB) + * n_draft: number of draft tokens + * temperature: sampling temperature (<=0 falls back to argmax matching) + * rng: pointer to xorshift64* state (mutated) + */ +static b2_result b2_rejection_sample( + const int *draft_tokens, + const float *draft_logits, + const float *target_logits, + uint32_t vocab, + int n_draft, + float temperature, + uint64_t *rng) +{ + b2_result result; + memset(&result, 0, sizeof(result)); + if (n_draft <= 0 || n_draft > 16) return result; + + /* Greedy path: pure argmax matching (temp <= 0). */ + if (temperature <= 0.0f) { + for (int i = 0; i < n_draft; i++) { + const float *t_logits = target_logits + (uint64_t)i * vocab; + const int targ = sample_argmax(t_logits, vocab); + if (targ == draft_tokens[i]) { + result.accepted_tokens[result.n_accepted++] = draft_tokens[i]; + } else { + result.correction_token = targ; + result.has_correction = true; + break; + } + } + return result; + } + + /* Stochastic path: rejection sampling (temp > 0). */ + const float inv_temp = 1.0f / temperature; + + /* Scratch buffers for temperature-scaled log-softmax. 
*/ + float *log_draft = xmalloc((size_t)vocab * sizeof(float)); + float *log_target = xmalloc((size_t)vocab * sizeof(float)); + float *scaled = xmalloc((size_t)vocab * sizeof(float)); + + for (int i = 0; i < n_draft; i++) { + const float *d_logits = draft_logits + (uint64_t)i * vocab; + const float *t_logits = target_logits + (uint64_t)i * vocab; + + /* Apply temperature scaling before log-softmax. */ + for (uint32_t v = 0; v < vocab; v++) scaled[v] = d_logits[v] * inv_temp; + b2_log_softmax(scaled, vocab, log_draft); + for (uint32_t v = 0; v < vocab; v++) scaled[v] = t_logits[v] * inv_temp; + b2_log_softmax(scaled, vocab, log_target); + + const int token = draft_tokens[i]; + const float log_ratio = log_target[token] - log_draft[token]; + + /* Accept with probability min(1, target_prob / draft_prob). + * In log space: accept if log(u) < min(0, log_ratio). */ + const float u = sample_rng_f32(rng); + if (logf(u + 1e-30f) < fminf(0.0f, log_ratio)) { + result.accepted_tokens[result.n_accepted++] = token; + } else { + /* Reject: sample correction from residual(target - draft). 
*/ + result.correction_token = b2_sample_residual( + log_target, log_draft, vocab, rng); + result.has_correction = true; + break; + } + } + + free(scaled); + free(log_target); + free(log_draft); + return result; +} + typedef struct { int id; float logit; @@ -22956,7 +24260,8 @@ static int generate_metal_graph_raw_swa( } ds4_gpu_graph g; bool ok = metal_graph_alloc_raw_cap(&g, weights, &weights->layer[0], - raw_cap, (uint32_t)ctx_size, prefill_cap, false); + NULL, raw_cap, (uint32_t)ctx_size, + prefill_cap, false); if (!ok) { fprintf(stderr, "ds4: failed to allocate GPU graph runtime\n"); return 1; @@ -23268,6 +24573,13 @@ struct ds4_session { float *logits; float *mtp_logits; int mtp_draft_token; + int dspark_draft_tokens[16]; + int dspark_draft_count; + uint32_t dspark_draft_base_real; + float *dspark_b2_draft_logits; /* [block_size * DS4_N_VOCAB] post-markov-bias logits for B2 */ + uint64_t dspark_b2_rng; /* xorshift64* state for B2 rejection sampling (persisted across calls) */ + int dspark_prev_accepted; /* previous cycle accepted count (for adaptive block size) */ + int dspark_prev_drafted; /* previous cycle drafted count */ uint64_t mtp_probe_total; uint64_t mtp_probe_hit; ds4_session_progress_fn progress; @@ -24040,12 +25352,18 @@ bool ds4_engine_has_output_head(ds4_engine *e) { return e && weights_have_output_head(&e->weights); } +ds4_mtp_draft_kind ds4_engine_mtp_draft_kind(ds4_engine *e) { + return (e && e->mtp_ready) ? e->mtp_weights.kind : DS4_MTP_DRAFT_NONE; +} + bool ds4_engine_has_mtp(ds4_engine *e) { return e && e->backend != DS4_BACKEND_CPU && e->distributed.role == DS4_DISTRIBUTED_NONE && - e->mtp_ready; + e->mtp_ready && + ds4_mtp_draft_runtime_supported(e->backend, e->mtp_weights.kind); } + int ds4_engine_mtp_draft_tokens(ds4_engine *e) { return ds4_engine_has_mtp(e) ? 
e->mtp_draft_tokens : 0; } @@ -24986,13 +26304,381 @@ static char *imatrix_trim_block(char *p, char *end) { *end = '\0'; return p; } -#endif -int ds4_engine_collect_imatrix(ds4_engine *e, - const char *dataset_path, - const char *output_path, - int ctx_size, - int max_prompts, +static bool dspark_target_cache_join_path(char *dst, size_t dst_size, const char *dir, const char *name) { + if (!dst || dst_size == 0 || !dir || !name) return false; + const int n = snprintf(dst, dst_size, "%s/%s", dir, name); + return n > 0 && (size_t)n < dst_size; +} + +static bool dspark_target_cache_output_dir_prepare(const char *path) { + struct stat st; + if (stat(path, &st) == 0) { + if (!S_ISDIR(st.st_mode)) { + fprintf(stderr, "ds4: DSpark target cache output path is not a directory: %s\n", path); + return false; + } + DIR *dir = opendir(path); + if (!dir) { + fprintf(stderr, "ds4: failed to inspect DSpark target cache output dir %s: %s\n", + path, strerror(errno)); + return false; + } + bool empty = true; + struct dirent *ent = NULL; + while ((ent = readdir(dir)) != NULL) { + if (strcmp(ent->d_name, ".") && strcmp(ent->d_name, "..")) { + empty = false; + break; + } + } + closedir(dir); + if (!empty) { + fprintf(stderr, "ds4: DSpark target cache output dir is not empty: %s\n", path); + return false; + } + return true; + } + if (errno != ENOENT) { + fprintf(stderr, "ds4: failed to stat DSpark target cache output dir %s: %s\n", + path, strerror(errno)); + return false; + } + if (mkdir(path, 0777) != 0) { + fprintf(stderr, "ds4: failed to create DSpark target cache output dir %s: %s\n", + path, strerror(errno)); + return false; + } + return true; +} + +static bool dspark_target_cache_file_pos(FILE *fp, uint64_t *out) { + if (!fp || !out) return false; + off_t pos = ftello(fp); + if (pos < 0) return false; + *out = (uint64_t)pos; + return true; +} + +static bool dspark_target_cache_write_all(FILE *fp, const void *ptr, size_t bytes, const char *what) { + if (bytes == 0) return true; + 
if (fwrite(ptr, 1, bytes, fp) != bytes) { + fprintf(stderr, "ds4: failed to write DSpark target cache %s: %s\n", + what ? what : "payload", strerror(errno)); + return false; + } + return true; +} + +static void dspark_target_cache_store_le32(uint8_t *p, uint32_t v) { + p[0] = (uint8_t)(v & 0xffu); + p[1] = (uint8_t)((v >> 8) & 0xffu); + p[2] = (uint8_t)((v >> 16) & 0xffu); + p[3] = (uint8_t)((v >> 24) & 0xffu); +} + +static void dspark_target_cache_store_le64(uint8_t *p, uint64_t v) { + for (uint32_t i = 0; i < 8; i++) p[i] = (uint8_t)((v >> (8u * i)) & 0xffu); +} + +static bool dspark_target_cache_write_index_record(FILE *fp, + uint64_t sample_id, + uint32_t shard_id, + uint32_t seq_len, + uint64_t input_ids_offset, + uint64_t attention_mask_offset, + uint64_t loss_mask_offset, + uint64_t target_hidden_states_offset, + uint64_t target_last_hidden_states_offset) { + uint8_t rec[56]; + dspark_target_cache_store_le64(rec + 0, sample_id); + dspark_target_cache_store_le32(rec + 8, shard_id); + dspark_target_cache_store_le32(rec + 12, seq_len); + dspark_target_cache_store_le64(rec + 16, input_ids_offset); + dspark_target_cache_store_le64(rec + 24, attention_mask_offset); + dspark_target_cache_store_le64(rec + 32, loss_mask_offset); + dspark_target_cache_store_le64(rec + 40, target_hidden_states_offset); + dspark_target_cache_store_le64(rec + 48, target_last_hidden_states_offset); + return dspark_target_cache_write_all(fp, rec, sizeof(rec), "samples.idx record"); +} + +static bool dspark_target_cache_write_json_string(FILE *fp, const char *s) { + if (fputc('"', fp) == EOF) return false; + for (const unsigned char *p = (const unsigned char *)(s ? 
s : ""); *p; p++) { + switch (*p) { + case '\\': + case '"': + if (fprintf(fp, "\\%c", *p) < 0) return false; + break; + case '\n': + if (fputs("\\n", fp) == EOF) return false; + break; + case '\r': + if (fputs("\\r", fp) == EOF) return false; + break; + case '\t': + if (fputs("\\t", fp) == EOF) return false; + break; + default: + if (*p < 0x20) { + if (fprintf(fp, "\\u%04x", (unsigned)*p) < 0) return false; + } else if (fputc((int)*p, fp) == EOF) { + return false; + } + break; + } + } + return fputc('"', fp) != EOF; +} + +static const char *dspark_target_cache_quant_family(const ds4_weights *weights) { + if (!weights || DS4_N_LAYER == 0) return "unknown"; + const ds4_layer_weights *layer = &weights->layer[0]; + if (!layer->ffn_gate_exps || !layer->ffn_up_exps || !layer->ffn_down_exps) return "unknown"; + if (layer->ffn_gate_exps->type == DS4_TENSOR_Q4_K && + layer->ffn_up_exps->type == DS4_TENSOR_Q4_K && + layer->ffn_down_exps->type == DS4_TENSOR_Q4_K) { + return "q4_k_routed_experts"; + } + if (layer->ffn_gate_exps->type == DS4_TENSOR_IQ2_XXS && + layer->ffn_up_exps->type == DS4_TENSOR_IQ2_XXS && + layer->ffn_down_exps->type == DS4_TENSOR_Q2_K) { + return "iq2_xxs_gate_up_q2_k_down_routed_experts"; + } + return "mixed_routed_experts"; +} + +static bool dspark_target_cache_write_tensor_type_counts(FILE *fp, const ds4_model *model) { + uint64_t counts[32] = {0}; + uint64_t unknown = 0; + if (model) { + for (uint64_t i = 0; i < model->n_tensors; i++) { + uint32_t type = model->tensors[i].type; + if (type < (uint32_t)(sizeof(counts) / sizeof(counts[0]))) { + counts[type]++; + } else { + unknown++; + } + } + } + if (fprintf(fp, "{") < 0) return false; + bool first = true; + for (uint32_t type = 0; type < (uint32_t)(sizeof(counts) / sizeof(counts[0])); type++) { + if (!counts[type]) continue; + if (!first && fprintf(fp, ", ") < 0) return false; + first = false; + if (fprintf(fp, "\"%s\": %llu", + tensor_type_name(type), + (unsigned long long)counts[type]) < 0) { + 
return false; + } + } + if (unknown) { + if (!first && fprintf(fp, ", ") < 0) return false; + if (fprintf(fp, "\"unknown\": %llu", (unsigned long long)unknown) < 0) return false; + } + return fprintf(fp, "}") >= 0; +} + +static bool dspark_target_cache_write_manifest(const char *output_dir, + const char *dataset_path, + const char *target_model_name_or_path, + const char *chat_template, + const ds4_model *model, + const ds4_weights *weights, + const ds4_dspark_config *cfg, + uint64_t num_samples, + uint64_t num_tokens) { + char path[PATH_MAX]; + if (!dspark_target_cache_join_path(path, sizeof(path), output_dir, "manifest.json")) { + fprintf(stderr, "ds4: DSpark target cache manifest path is too long\n"); + return false; + } + FILE *fp = fopen(path, "wb"); + if (!fp) { + fprintf(stderr, "ds4: failed to create DSpark target cache manifest %s: %s\n", + path, strerror(errno)); + return false; + } + const char *source_gguf_path = (model && model->path && model->path[0]) ? model->path : DS4_MODEL_SHAPE_NAME; + const char *target_model = target_model_name_or_path; + const char *template_name = (chat_template && chat_template[0]) ? 
+ chat_template : + "ds4_tokenize_rendered_chat"; + bool ok = true; + ok = ok && fprintf(fp, "{\n") >= 0; + ok = ok && fprintf(fp, " \"version\": 2,\n") >= 0; + ok = ok && fprintf(fp, " \"format\": \"deepspec-target-cache\",\n") >= 0; + ok = ok && fprintf(fp, " \"producer\": \"ds4\",\n") >= 0; + ok = ok && fprintf(fp, " \"producer_commit\": ") >= 0; + ok = ok && dspark_target_cache_write_json_string(fp, DS4_GIT_COMMIT); + ok = ok && fprintf(fp, ",\n \"source_dataset_path\": ") >= 0; + ok = ok && dspark_target_cache_write_json_string(fp, dataset_path); + ok = ok && fprintf(fp, ",\n \"source_gguf_path\": ") >= 0; + ok = ok && dspark_target_cache_write_json_string(fp, source_gguf_path); + ok = ok && fprintf(fp, ",\n \"target_model_name_or_path\": ") >= 0; + ok = ok && dspark_target_cache_write_json_string(fp, target_model); + ok = ok && fprintf(fp, ",\n \"model_shape\": ") >= 0; + ok = ok && dspark_target_cache_write_json_string(fp, DS4_MODEL_SHAPE_NAME); + ok = ok && fprintf(fp, ",\n \"quantization_family\": ") >= 0; + ok = ok && dspark_target_cache_write_json_string(fp, dspark_target_cache_quant_family(weights)); + ok = ok && fprintf(fp, ",\n \"num_samples\": %llu,\n", (unsigned long long)num_samples) >= 0; + ok = ok && fprintf(fp, " \"num_tokens\": %llu,\n", (unsigned long long)num_tokens) >= 0; + ok = ok && fprintf(fp, " \"num_shards\": %u,\n", num_samples ? 
1u : 0u) >= 0; + ok = ok && fprintf(fp, " \"target_layer_ids\": [%u, %u, %u],\n", + cfg->target_layer_ids[0], + cfg->target_layer_ids[1], + cfg->target_layer_ids[2]) >= 0; + ok = ok && fprintf(fp, " \"hidden_size\": %u,\n", DS4_N_EMBD) >= 0; + ok = ok && fprintf(fp, " \"target_hidden_size\": %u,\n", DS4_N_EMBD) >= 0; + ok = ok && fprintf(fp, " \"target_hidden_layers\": %u,\n", cfg->n_mtp_layers) >= 0; + ok = ok && fprintf(fp, " \"hidden_dtype\": \"bfloat16\",\n") >= 0; + ok = ok && fprintf(fp, " \"token_dtype\": \"int32\",\n") >= 0; + ok = ok && fprintf(fp, " \"mask_dtype\": \"uint8\",\n") >= 0; + ok = ok && fprintf(fp, " \"index_record_size\": 56,\n") >= 0; + ok = ok && fprintf(fp, " \"input_convention\": {\n") >= 0; + ok = ok && fprintf(fp, " \"tokenization\": \"ds4_tokenize_rendered_chat\",\n") >= 0; + ok = ok && fprintf(fp, " \"chat_template\": ") >= 0; + ok = ok && dspark_target_cache_write_json_string(fp, template_name); + ok = ok && fprintf(fp, ",\n \"sample_split_marker\": \"===== DS4_IMATRIX_PROMPT\",\n") >= 0; + ok = ok && fprintf(fp, " \"loss_mask\": \"1 for every exported prompt token\"\n") >= 0; + ok = ok && fprintf(fp, " },\n") >= 0; + ok = ok && fprintf(fp, " \"hidden_convention\": {\n") >= 0; + ok = ok && fprintf(fp, " \"target_hidden_states\": \"bfloat16 mean over DS4 HC heads after each target layer; row-major [seq_len, target_hidden_layers, hidden_size]\",\n") >= 0; + ok = ok && fprintf(fp, " \"target_last_hidden_states\": \"bfloat16 output-HC projection plus final RMSNorm; row-major [seq_len, hidden_size]\"\n") >= 0; + ok = ok && fprintf(fp, " },\n") >= 0; + ok = ok && fprintf(fp, " \"gguf_tensor_type_counts\": ") >= 0; + ok = ok && dspark_target_cache_write_tensor_type_counts(fp, model); + ok = ok && fprintf(fp, ",\n \"shards\": [") >= 0; + if (num_samples) { + ok = ok && fprintf(fp, "\n {\n \"file_name\": \"shard-00000.bin\",\n \"shard_id\": 0\n }\n ") >= 0; + } + ok = ok && fprintf(fp, "]\n}\n") >= 0; + if (fclose(fp) != 0) ok = false; + if 
(!ok) fprintf(stderr, "ds4: failed to write DSpark target cache manifest %s\n", path); + return ok; +} + +static uint32_t dspark_target_cache_layer_slot(const ds4_dspark_config *cfg, uint32_t layer_id) { + for (uint32_t i = 0; i < cfg->n_mtp_layers && i < 3; i++) { + if (cfg->target_layer_ids[i] == layer_id) return i; + } + return UINT32_MAX; +} + +static void dspark_target_cache_hc_mean_bf16(uint16_t *out, + const float *hc_rows, + uint32_t rows, + uint32_t slot, + uint32_t n_slots) { + const float inv_hc = 1.0f / (float)DS4_N_HC; + const uint64_t hc_dim = (uint64_t)DS4_N_HC * DS4_N_EMBD; + for (uint32_t row = 0; row < rows; row++) { + const float *hc = hc_rows + (uint64_t)row * hc_dim; + uint16_t *dst = out + ((uint64_t)row * n_slots + slot) * DS4_N_EMBD; + for (uint32_t d = 0; d < DS4_N_EMBD; d++) { + float sum = 0.0f; + for (uint32_t h = 0; h < DS4_N_HC; h++) { + sum += hc[(uint64_t)h * DS4_N_EMBD + d]; + } + dst[d] = f32_to_bf16(sum * inv_hc); + } + } +} + +static void dspark_target_cache_last_hidden_bf16(uint16_t *out, + const ds4_model *model, + const ds4_weights *weights, + const float *hc_rows, + uint32_t rows) { + const uint64_t hc_dim = (uint64_t)DS4_N_HC * DS4_N_EMBD; + float *embd = xmalloc((size_t)DS4_N_EMBD * sizeof(embd[0])); + float *norm = xmalloc((size_t)DS4_N_EMBD * sizeof(norm[0])); + const float *norm_weight = tensor_data(model, weights->output_norm); + for (uint32_t row = 0; row < rows; row++) { + const float *hc = hc_rows + (uint64_t)row * hc_dim; + output_hc_head_one(embd, model, weights, hc); + rms_norm_weight(norm, embd, norm_weight, DS4_N_EMBD, DS4_RMS_EPS); + uint16_t *dst = out + (uint64_t)row * DS4_N_EMBD; + for (uint32_t d = 0; d < DS4_N_EMBD; d++) dst[d] = f32_to_bf16(norm[d]); + } + free(norm); + free(embd); +} + +static bool dspark_target_cache_encode_chunk(ds4_gpu_graph *g, + const ds4_model *model, + const ds4_weights *weights, + const ds4_dspark_config *cfg, + const token_vec *prompt, + uint32_t pos0, + uint32_t n_tokens, + 
float *hc_rows, + uint16_t *target_chunk, + uint16_t *last_chunk) { + const uint64_t hc_dim = (uint64_t)DS4_N_HC * DS4_N_EMBD; + bool ok = metal_graph_upload_prompt_tokens(g->prefill_tokens, prompt, pos0, n_tokens); + if (ok) ok = metal_graph_upload_prompt_embeddings_hc(g->batch_cur_hc, + g->prefill_tokens, + model, + weights, + prompt, + pos0, + n_tokens); + for (uint32_t il = 0; ok && il < DS4_N_LAYER; il++) { + ok = ds4_gpu_begin_commands() != 0; + if (ok) { + ok = metal_graph_encode_layer_batch(g, + model, + &weights->layer[il], + il, + pos0, + n_tokens); + } + if (ok) ok = ds4_gpu_end_commands() != 0; + if (!ok) { + fprintf(stderr, "ds4: DSpark target cache layer %u encode failed\n", il); + return false; + } + const uint32_t slot = dspark_target_cache_layer_slot(cfg, il); + if (slot != UINT32_MAX) { + if (ds4_gpu_tensor_read(g->batch_cur_hc, + 0, + hc_rows, + (uint64_t)n_tokens * hc_dim * sizeof(float)) == 0) { + fprintf(stderr, "ds4: failed to read DSpark target layer %u hidden states\n", il); + return false; + } + dspark_target_cache_hc_mean_bf16(target_chunk, + hc_rows, + n_tokens, + slot, + cfg->n_mtp_layers); + } + } + if (ok && ds4_gpu_tensor_read(g->batch_cur_hc, + 0, + hc_rows, + (uint64_t)n_tokens * hc_dim * sizeof(float)) == 0) { + fprintf(stderr, "ds4: failed to read DSpark target final hidden states\n"); + ok = false; + } + if (ok) { + dspark_target_cache_last_hidden_bf16(last_chunk, + model, + weights, + hc_rows, + n_tokens); + } + return ok; +} +#endif + +int ds4_engine_collect_imatrix(ds4_engine *e, + const char *dataset_path, + const char *output_path, + int ctx_size, + int max_prompts, int max_tokens) { #ifdef DS4_NO_GPU (void)e; @@ -25023,7 +26709,8 @@ int ds4_engine_collect_imatrix(ds4_engine *e, ds4_gpu_graph g; bool ok = metal_graph_alloc_raw_cap(&g, weights, &weights->layer[0], - raw_cap, (uint32_t)ctx_size, prefill_cap, false); + NULL, raw_cap, (uint32_t)ctx_size, + prefill_cap, false); if (!ok) { fprintf(stderr, "ds4: failed to allocate 
imatrix Metal graph runtime\n"); free(dataset); @@ -25140,6 +26827,315 @@ int ds4_engine_collect_imatrix(ds4_engine *e, #endif } +int ds4_engine_collect_dspark_target_cache(ds4_engine *e, + const char *dataset_path, + const char *output_dir, + const char *target_model_name_or_path, + const char *chat_template, + int ctx_size, + int max_prompts, + int max_tokens) { +#ifdef DS4_NO_GPU + (void)e; + (void)dataset_path; + (void)output_dir; + (void)target_model_name_or_path; + (void)chat_template; + (void)ctx_size; + (void)max_prompts; + (void)max_tokens; + fprintf(stderr, "ds4: DSpark target cache export requires a graph backend build\n"); + return 1; +#else + if (!e || !dataset_path || !output_dir) return 1; + if (!target_model_name_or_path || !target_model_name_or_path[0]) { + fprintf(stderr, + "ds4: DSpark target cache export requires --dspark-target-cache-target-model\n"); + return 1; + } + if (e->backend != DS4_BACKEND_METAL || !e->metal_ready) { + fprintf(stderr, "ds4: DSpark target cache export currently requires --metal\n"); + return 1; + } + if (e->ssd_streaming) { + fprintf(stderr, "ds4: DSpark target cache export requires non-streaming Metal weights\n"); + return 1; + } + if (ctx_size <= 0) ctx_size = 32768; + + ds4_dspark_config cfg; + ds4_dspark_config_init_defaults(&cfg); + if (cfg.n_mtp_layers == 0 || cfg.n_mtp_layers > 3) { + fprintf(stderr, "ds4: unsupported DSpark target layer count %u\n", cfg.n_mtp_layers); + return 1; + } + for (uint32_t i = 0; i < cfg.n_mtp_layers; i++) { + if (cfg.target_layer_ids[i] >= DS4_N_LAYER) { + fprintf(stderr, + "ds4: DSpark target layer %u is outside the loaded %u-layer model\n", + cfg.target_layer_ids[i], + DS4_N_LAYER); + return 1; + } + for (uint32_t j = i + 1; j < cfg.n_mtp_layers; j++) { + if (cfg.target_layer_ids[i] == cfg.target_layer_ids[j]) { + fprintf(stderr, "ds4: duplicate DSpark target layer %u\n", cfg.target_layer_ids[i]); + return 1; + } + } + } + + char *dataset = NULL; + size_t dataset_len = 0; + if 
(!imatrix_read_text_file(dataset_path, &dataset, &dataset_len)) return 1; + if (!dspark_target_cache_output_dir_prepare(output_dir)) { + free(dataset); + return 1; + } + + char shard_path[PATH_MAX]; + char index_path[PATH_MAX]; + if (!dspark_target_cache_join_path(shard_path, sizeof(shard_path), output_dir, "shard-00000.bin") || + !dspark_target_cache_join_path(index_path, sizeof(index_path), output_dir, "samples.idx")) { + fprintf(stderr, "ds4: DSpark target cache output path is too long\n"); + free(dataset); + return 1; + } + + FILE *shard = fopen(shard_path, "wb"); + if (!shard) { + fprintf(stderr, "ds4: failed to create DSpark target cache shard %s: %s\n", + shard_path, strerror(errno)); + free(dataset); + return 1; + } + FILE *index = fopen(index_path, "wb"); + if (!index) { + fprintf(stderr, "ds4: failed to create DSpark target cache index %s: %s\n", + index_path, strerror(errno)); + fclose(shard); + free(dataset); + return 1; + } + + const ds4_model *model = &e->model; + const ds4_weights *weights = &e->weights; + const uint32_t prefill_cap = + metal_graph_prefill_cap_for_prompt(ctx_size, e->prefill_chunk); + const uint32_t raw_cap = metal_graph_raw_cap_for_context(ctx_size, prefill_cap); + + ds4_gpu_graph g; + bool ok = metal_graph_alloc_raw_cap(&g, weights, &weights->layer[0], + NULL, raw_cap, (uint32_t)ctx_size, + prefill_cap, false); + if (!ok) { + fprintf(stderr, "ds4: failed to allocate DSpark target cache Metal graph runtime\n"); + fclose(index); + fclose(shard); + free(dataset); + return 1; + } + g.quality = e->quality; + g.ssd_streaming = false; + g.ssd_streaming_cold = false; + g.streaming_preload_experts = 0; + g.power_percent = (uint32_t)e->power_percent; + + const uint64_t hc_dim = (uint64_t)DS4_N_HC * DS4_N_EMBD; + float *hc_rows = xmalloc((size_t)prefill_cap * (size_t)hc_dim * sizeof(hc_rows[0])); + uint16_t *target_chunk = xmalloc((size_t)prefill_cap * + (size_t)cfg.n_mtp_layers * + (size_t)DS4_N_EMBD * + sizeof(target_chunk[0])); + uint16_t 
*last_chunk = xmalloc((size_t)prefill_cap * + (size_t)DS4_N_EMBD * + sizeof(last_chunk[0])); + + fprintf(stderr, + "ds4: exporting DeepSpec DSpark target cache from %s (model=%s, target_layers=[%u,%u,%u], ctx=%d, chunk=%u)\n", + dataset_path, + DS4_MODEL_SHAPE_NAME, + cfg.target_layer_ids[0], + cfg.target_layer_ids[1], + cfg.target_layer_ids[2], + ctx_size, + prefill_cap); + + int prompts_done = 0; + int tokens_done = 0; + char *cursor = dataset; + const char *marker_lit = "===== DS4_IMATRIX_PROMPT"; + while (ok && *cursor) { + if (max_prompts > 0 && prompts_done >= max_prompts) break; + if (max_tokens > 0 && tokens_done >= max_tokens) break; + + char *start = cursor; + char *marker = strstr(cursor, marker_lit); + if (marker) { + char *nl = strchr(marker, '\n'); + if (!nl) break; + start = nl + 1; + } else if (prompts_done != 0) { + break; + } + + char *next = strstr(start, marker_lit); + char *end = next ? next : dataset + dataset_len; + char saved = *end; + char *prompt_text = imatrix_trim_block(start, end); + if (prompt_text[0] != '\0') { + token_vec prompt = {0}; + ds4_tokenize_rendered_chat(e, prompt_text, &prompt); + if (prompt.len > ctx_size) prompt.len = ctx_size; + if (max_tokens > 0 && prompt.len > max_tokens - tokens_done) { + prompt.len = max_tokens - tokens_done; + } + if (prompt.len > 0) { + uint16_t *last_full = xmalloc((size_t)prompt.len * + (size_t)DS4_N_EMBD * + sizeof(last_full[0])); + int32_t *ids = xmalloc((size_t)prompt.len * sizeof(ids[0])); + uint8_t *mask = xmalloc((size_t)prompt.len * sizeof(mask[0])); + for (int i = 0; i < prompt.len; i++) { + ids[i] = (int32_t)prompt.v[i]; + mask[i] = 1; + } + + uint64_t input_ids_offset = 0; + uint64_t attention_mask_offset = 0; + uint64_t loss_mask_offset = 0; + uint64_t target_hidden_states_offset = 0; + uint64_t target_last_hidden_states_offset = 0; + ok = dspark_target_cache_file_pos(shard, &input_ids_offset) && + dspark_target_cache_write_all(shard, + ids, + (size_t)prompt.len * sizeof(ids[0]), + 
"input_ids"); + ok = ok && dspark_target_cache_file_pos(shard, &attention_mask_offset) && + dspark_target_cache_write_all(shard, + mask, + (size_t)prompt.len * sizeof(mask[0]), + "attention_mask"); + ok = ok && dspark_target_cache_file_pos(shard, &loss_mask_offset) && + dspark_target_cache_write_all(shard, + mask, + (size_t)prompt.len * sizeof(mask[0]), + "loss_mask"); + ok = ok && dspark_target_cache_file_pos(shard, &target_hidden_states_offset); + + if (ok && !metal_graph_reset_prefill_state(&g)) { + fprintf(stderr, "ds4: failed to reset DSpark target cache graph state\n"); + ok = false; + } + for (uint32_t pos = 0; ok && pos < (uint32_t)prompt.len;) { + uint32_t chunk = (uint32_t)prompt.len - pos; + if (chunk > prefill_cap) chunk = prefill_cap; + memset(target_chunk, + 0, + (size_t)chunk * (size_t)cfg.n_mtp_layers * + (size_t)DS4_N_EMBD * sizeof(target_chunk[0])); + ok = dspark_target_cache_encode_chunk(&g, + model, + weights, + &cfg, + &prompt, + pos, + chunk, + hc_rows, + target_chunk, + last_chunk); + if (ok) { + ok = dspark_target_cache_write_all(shard, + target_chunk, + (size_t)chunk * + (size_t)cfg.n_mtp_layers * + (size_t)DS4_N_EMBD * + sizeof(target_chunk[0]), + "target_hidden_states"); + } + if (ok) { + memcpy(last_full + (uint64_t)pos * DS4_N_EMBD, + last_chunk, + (size_t)chunk * (size_t)DS4_N_EMBD * sizeof(last_chunk[0])); + } + pos += chunk; + } + ok = ok && dspark_target_cache_file_pos(shard, &target_last_hidden_states_offset) && + dspark_target_cache_write_all(shard, + last_full, + (size_t)prompt.len * + (size_t)DS4_N_EMBD * + sizeof(last_full[0]), + "target_last_hidden_states"); + ok = ok && dspark_target_cache_write_index_record(index, + (uint64_t)prompts_done, + 0, + (uint32_t)prompt.len, + input_ids_offset, + attention_mask_offset, + loss_mask_offset, + target_hidden_states_offset, + target_last_hidden_states_offset); + if (ok) { + prompts_done++; + tokens_done += prompt.len; + fprintf(stderr, + "ds4: DSpark target cache prompts=%d 
tokens=%d\r", + prompts_done, + tokens_done); + fflush(stderr); + } + free(mask); + free(ids); + free(last_full); + } + token_vec_free(&prompt); + } + *end = saved; + if (!next) break; + cursor = next; + } + fputc('\n', stderr); + + if (fflush(shard) != 0 || fsync(fileno(shard)) != 0) { + fprintf(stderr, "ds4: failed to flush DSpark target cache shard %s: %s\n", + shard_path, strerror(errno)); + ok = false; + } + if (fflush(index) != 0 || fsync(fileno(index)) != 0) { + fprintf(stderr, "ds4: failed to flush DSpark target cache index %s: %s\n", + index_path, strerror(errno)); + ok = false; + } + if (fclose(index) != 0) ok = false; + if (fclose(shard) != 0) ok = false; + + if (ok) ok = dspark_target_cache_write_manifest(output_dir, + dataset_path, + target_model_name_or_path, + chat_template, + model, + weights, + &cfg, + (uint64_t)prompts_done, + (uint64_t)tokens_done); + if (ok) { + fprintf(stderr, + "ds4: wrote DeepSpec DSpark target cache %s from %d prompts and %d tokens\n", + output_dir, + prompts_done, + tokens_done); + } + + free(last_chunk); + free(target_chunk); + free(hc_rows); + metal_graph_free(&g); + free(dataset); + return ok ? 
0 : 1; +#endif +} + int ds4_engine_generate_argmax( ds4_engine *e, const ds4_tokens *prompt, @@ -25690,9 +27686,22 @@ int ds4_engine_open(ds4_engine **out, const ds4_engine_options *opt) { model_open(&e->mtp_model, opt->mtp_path, graph_backend, true); mtp_weights_bind(&e->mtp_weights, &e->mtp_model); e->mtp_ready = true; - fprintf(stderr, "ds4: MTP support model loaded: %s (draft=%d)\n", + if ((e->mtp_weights.kind == DS4_MTP_DRAFT_DSPARK || e->mtp_weights.kind == DS4_MTP_DRAFT_DSPARK_NONSEQ) && + (opt->mtp_draft_tokens <= 0 || opt->mtp_draft_tokens == 1)) { + e->mtp_draft_tokens = (int)e->mtp_weights.dspark.block_size; + } + fprintf(stderr, "ds4: draft model loaded: %s (kind=%s, draft=%d, runtime_mtp=%s)\n", opt->mtp_path, - e->mtp_draft_tokens); + ds4_mtp_draft_kind_name(e->mtp_weights.kind), + e->mtp_draft_tokens, + ds4_engine_has_mtp(e) ? "yes" : "no"); + const ds4_dspark_spec_gate spec_gate = ds4_dspark_speculative_gate(e->mtp_weights.kind, + e->mtp_ready, + e->mtp_draft_tokens); + if (spec_gate == DS4_DSPARK_SPEC_DSPARK_NOT_READY || + spec_gate == DS4_DSPARK_SPEC_DSPARK_NONSEQ_NOT_READY) { + fprintf(stderr, "ds4: %s\n", ds4_dspark_spec_gate_reason(spec_gate)); + } } #ifndef DS4_NO_GPU @@ -25902,7 +27911,7 @@ int ds4_engine_open(ds4_engine **out, const ds4_engine_options *opt) { *out = NULL; return 1; } - if (e->mtp_ready && + if (ds4_engine_has_mtp(e) && !ds4_gpu_set_model_map_range(e->mtp_model.map, e->mtp_model.size, e->mtp_model.tensor_data_pos, @@ -25945,7 +27954,7 @@ int ds4_engine_open(ds4_engine **out, const ds4_engine_options *opt) { free(load_sizes); /* Also apply explicit optional Q8 preload settings to the MTP support * model when loaded. 
*/ - if (e->mtp_ready) { + if (ds4_engine_has_mtp(e)) { (void)ds4_gpu_set_model_fd_for_map(e->mtp_model.fd, e->mtp_model.map); if (!accelerator_cache_model_tensors(e->backend, &e->mtp_model, NULL, NULL, 0)) { @@ -26072,7 +28081,8 @@ int ds4_session_create(ds4_session **out, ds4_engine *e, int ctx_size) { return 1; } if (!metal_graph_alloc_raw_cap(&s->graph, &e->weights, shape_layer, - raw_cap, (uint32_t)ctx_size, s->prefill_cap, e->mtp_ready)) + &e->mtp_weights, raw_cap, (uint32_t)ctx_size, + s->prefill_cap, ds4_engine_has_mtp(e))) { free(s); return 1; @@ -26091,9 +28101,16 @@ int ds4_session_create(ds4_session **out, ds4_engine *e, int ctx_size) { return 1; } s->logits = xmalloc((size_t)DS4_N_VOCAB * sizeof(s->logits[0])); - if (e->mtp_ready) { + if (ds4_engine_has_mtp(e)) { s->mtp_logits = xmalloc((size_t)DS4_N_VOCAB * sizeof(s->mtp_logits[0])); s->mtp_draft_token = -1; + /* Allocate B2 draft logits buffer when DS4_SPEC_TEMP is set and DSpark is active. */ + if (e->mtp_weights.kind == DS4_MTP_DRAFT_DSPARK && getenv("DS4_SPEC_TEMP")) { + const uint32_t block_size = e->mtp_weights.dspark.block_size > 0 + ? 
e->mtp_weights.dspark.block_size : 16; + s->dspark_b2_draft_logits = xmalloc( + (size_t)block_size * DS4_N_VOCAB * sizeof(float)); + } } if (e->distributed.role == DS4_DISTRIBUTED_COORDINATOR) { char err[256]; @@ -26110,6 +28127,7 @@ int ds4_session_create(ds4_session **out, ds4_engine *e, int ctx_size) { metal_graph_free(&s->graph); free(s->logits); free(s->mtp_logits); + free(s->dspark_b2_draft_logits); free(s); return 1; } @@ -26134,6 +28152,7 @@ void ds4_session_free(ds4_session *s) { token_vec_free(&s->checkpoint); free(s->logits); free(s->mtp_logits); + free(s->dspark_b2_draft_logits); free(s); } @@ -27107,7 +29126,7 @@ static int ds4_session_eval_internal(ds4_session *s, int token, bool probe_mtp, ds4_engine *e = s->engine; const bool mtp_probe_log = getenv("DS4_MTP_PROBE") != NULL; const bool mtp_should_draft = - probe_mtp && e->mtp_ready && s->mtp_logits && + probe_mtp && ds4_engine_has_mtp(e) && s->mtp_logits && (e->mtp_draft_tokens > 1 || mtp_probe_log); if (probe_mtp && s->mtp_draft_valid) { if (mtp_probe_log) { @@ -27133,20 +29152,45 @@ static int ds4_session_eval_internal(ds4_session *s, int token, bool probe_mtp, } token_vec_push(&s->checkpoint, token); if (mtp_should_draft) { - int mtp_top = -1; - if (metal_graph_eval_mtp_draft(&s->graph, - &e->model, - &e->weights, - &e->mtp_model, - &e->mtp_weights, - token, - (uint32_t)(s->checkpoint.len - 1), - getenv("DS4_MTP_FULL_LOGITS") ? s->mtp_logits : NULL, - &mtp_top)) { - s->mtp_draft_token = mtp_top >= 0 ? 
mtp_top : sample_argmax(s->mtp_logits, DS4_N_VOCAB); - s->mtp_draft_valid = true; - } else if (getenv("DS4_MTP_PROBE")) { - fprintf(stderr, "ds4: mtp probe draft failed\n"); + if (e->mtp_weights.kind == DS4_MTP_DRAFT_DSPARK) { + int draft_n = 0; + uint32_t base_real = 0; + if (metal_graph_eval_dspark_draft_block(&s->graph, + &e->model, + &e->weights, + &e->mtp_model, + &e->mtp_weights, + token, + (uint32_t)(s->checkpoint.len - 1), + (uint32_t)e->mtp_draft_tokens, + s->dspark_draft_tokens, + &draft_n, + &base_real, + getenv("DS4_MTP_FULL_LOGITS") ? s->mtp_logits : NULL, + s->dspark_b2_draft_logits)) { + s->dspark_draft_count = draft_n; + s->dspark_draft_base_real = base_real; + s->mtp_draft_token = draft_n > 0 ? s->dspark_draft_tokens[0] : -1; + s->mtp_draft_valid = draft_n > 0; + } else if (getenv("DS4_MTP_PROBE") || getenv("DS4_MTP_SPEC_LOG")) { + fprintf(stderr, "ds4: DSpark draft block failed\n"); + } + } else { + int mtp_top = -1; + if (metal_graph_eval_mtp_draft(&s->graph, + &e->model, + &e->weights, + &e->mtp_model, + &e->mtp_weights, + token, + (uint32_t)(s->checkpoint.len - 1), + getenv("DS4_MTP_FULL_LOGITS") ? s->mtp_logits : NULL, + &mtp_top)) { + s->mtp_draft_token = mtp_top >= 0 ? 
mtp_top : sample_argmax(s->mtp_logits, DS4_N_VOCAB); + s->mtp_draft_valid = true; + } else if (getenv("DS4_MTP_PROBE")) { + fprintf(stderr, "ds4: mtp probe draft failed\n"); + } } } return 0; @@ -27204,7 +29248,7 @@ int ds4_session_eval_speculative_argmax(ds4_session *s, int first_token, accepted[n_accept++] = first_token; if (first_token == eos_token || max_tokens == 1 || n_accept >= accepted_cap) return n_accept; - if (!e->mtp_ready || !s->mtp_draft_valid || e->mtp_draft_tokens <= 1) return n_accept; + if (!ds4_engine_has_mtp(e) || !s->mtp_draft_valid || e->mtp_draft_tokens <= 1) return n_accept; int draft_cap = e->mtp_draft_tokens; if (draft_cap > max_tokens - n_accept) draft_cap = max_tokens - n_accept; @@ -27213,6 +29257,303 @@ int ds4_session_eval_speculative_argmax(ds4_session *s, int first_token, if (draft_cap > room - 1) draft_cap = room - 1; if (draft_cap <= 0) return n_accept; + if (e->mtp_weights.kind == DS4_MTP_DRAFT_DSPARK) { + int drafts[16]; + int draft_n = s->dspark_draft_count; + if (draft_n > draft_cap) draft_n = draft_cap; + + /* Adaptive block size: conservative-then-aggressive. + * Start at block=2 (near-baseline, safe). Escalate to full block + * ONLY after seeing a full commit (high acceptance detected). + * Drop back to block=2 on any partial commit. + * + * This makes DSpark net-positive across ALL workloads: + * structured: escalates to block=5 after 1st full commit → +8% speedup + * creative: stays at block=2 → ~95-100% of baseline (no waste) + * DS4_DSPARK_ADAPTIVE=1 enables this. 
*/ + if (getenv("DS4_DSPARK_ADAPTIVE") && draft_n > 2) { + if (s->dspark_prev_drafted == 0) { + draft_n = 2; /* first cycle → conservative */ + } else if (s->dspark_prev_accepted == s->dspark_prev_drafted) { + /* previous was full commit → escalate (keep full block) */ + } else { + draft_n = 2; /* previous was partial → conservative */ + } + } + if (draft_n <= 0) { + s->mtp_draft_valid = false; + return n_accept; + } + memcpy(drafts, s->dspark_draft_tokens, (size_t)draft_n * sizeof(drafts[0])); + s->mtp_draft_valid = false; + s->dspark_draft_count = 0; + + const bool mtp_timing = getenv("DS4_MTP_TIMING") != NULL; + const double mtp_t0 = mtp_timing ? now_sec() : 0.0; +#define DS4_DSPARK_KEEP_ACCEPTED(n_) do { \ + uint32_t keep_ = s->dspark_draft_base_real + 1u + (uint32_t)(n_); \ + if (keep_ > DS4_N_SWA) keep_ = 0; \ + s->graph.dspark_n_real = keep_; \ + s->dspark_prev_accepted = (int)(n_); \ + s->dspark_prev_drafted = draft_n; \ + } while (0) + + /* B2 rejection sampling: parse DS4_SPEC_TEMP for stochastic path. + * When set (temp > 0), uses B2 to produce lossless samples from + * the target model's distribution. Default (unset or <=0): greedy. + * + * ARDD adversarial review fix: RNG state persisted in session struct + * (not reseeded per call) to avoid correlated random sequences when + * multiple speculative eval calls happen within the same second. 
*/ + float b2_temp = 0.0f; + const char *spec_temp_env = getenv("DS4_SPEC_TEMP"); + if (spec_temp_env && spec_temp_env[0]) { + char *end = NULL; + float v = strtof(spec_temp_env, &end); + if (end != spec_temp_env && v > 0.0f) b2_temp = v; + } + if (b2_temp > 0.0f && s->dspark_b2_rng == 0) { + const char *seed_env = getenv("DS4_SPEC_RNG_SEED"); + if (seed_env && seed_env[0]) { + s->dspark_b2_rng = (uint64_t)strtoull(seed_env, NULL, 0); + } + if (s->dspark_b2_rng == 0) { + s->dspark_b2_rng = (uint64_t)time(NULL) ^ + ((uint64_t)getpid() << 32) ^ + (uint64_t)clock(); + } + } + + /* Greedy first-draft check (greedy path only; skipped when b2_temp > 0). + * At temp=0 the first draft must equal the target argmax exactly; at temp>0 + * drafts[0] is not pre-filtered here and is left to the acceptance logic below. */ + if (b2_temp <= 0.0f && sample_argmax(s->logits, DS4_N_VOCAB) != drafts[0]) { + DS4_DSPARK_KEEP_ACCEPTED(0); + if (getenv("DS4_MTP_SPEC_LOG")) { + fprintf(stderr, "ds4: dspark spec miss first draft=%d\n", drafts[0]); + } + return n_accept; + } + if (drafts[0] == eos_token) draft_n = 1; + + ds4_spec_frontier frontier; + memset(&frontier, 0, sizeof(frontier)); + int *row_tops = xmalloc((size_t)draft_n * sizeof(row_tops[0])); + float *row_logits = xmalloc((size_t)DS4_N_VOCAB * sizeof(row_logits[0])); + + /* For B2 at temp>0, allocate target logits buffer for ALL draft positions. + * At temp=0 this stays NULL and we use the existing argmax path. + * + * CRITICAL (ARDD adversarial review fix — off-by-one in target logits): + * metal_graph_verify_suffix_tops row[i] = target logits AFTER processing + * drafts[i] → predicts drafts[i+1], NOT drafts[i]. + * Correct mapping: drafts[0] → s->logits (previous target eval), + * drafts[j>0] → verify_row[j-1]. + * We store raw verify output in b2_verify_logits, then shift into + * b2_target_logits with s->logits prepended as row 0. 
*/ + const uint64_t row_bytes = (uint64_t)DS4_N_VOCAB * sizeof(float); + const uint64_t all_logits_bytes = (uint64_t)draft_n * row_bytes; + float *b2_verify_logits = (b2_temp > 0.0f) + ? xmalloc((size_t)all_logits_bytes) : NULL; + float *b2_target_logits = (b2_temp > 0.0f) + ? xmalloc((size_t)all_logits_bytes) : NULL; + + const int start = s->checkpoint.len; + const double snapshot_t0 = mtp_timing ? now_sec() : 0.0; + bool have_frontier = spec_frontier_snapshot(&frontier, s); + bool ok = have_frontier; + const double snapshot_done = mtp_timing ? now_sec() : 0.0; + if (ok) { + for (int i = 0; i < draft_n; i++) token_vec_push(&s->checkpoint, drafts[i]); + ok = metal_graph_verify_suffix_tops(&s->graph, + &e->model, + &e->weights, + &s->checkpoint, + (uint32_t)start, + (uint32_t)draft_n, + false, + row_tops, + b2_verify_logits); + } + /* Assemble shifted target logits for B2: + * row 0 = s->logits (target prediction for drafts[0]) + * row j = verify_row[j-1] (target prediction for drafts[j]) */ + if (ok && b2_verify_logits && b2_target_logits) { + memcpy(b2_target_logits, s->logits, (size_t)row_bytes); + if (draft_n > 1) { + memcpy(b2_target_logits + DS4_N_VOCAB, + b2_verify_logits, + (size_t)(draft_n - 1) * (size_t)row_bytes); + } + } + const double verify_done = mtp_timing ? now_sec() : 0.0; + if (ok) { + int commit_drafts; + + if (b2_temp > 0.0f && b2_target_logits && s->dspark_b2_draft_logits) { + /* ---- B2 stochastic path ---- */ + b2_result b2r = b2_rejection_sample( + drafts, + s->dspark_b2_draft_logits, + b2_target_logits, + DS4_N_VOCAB, + draft_n, + b2_temp, + &s->dspark_b2_rng); + commit_drafts = b2r.n_accepted; + + if (getenv("DS4_MTP_SPEC_LOG")) { + fprintf(stderr, + "ds4: dspark b2 accepted=%d/%d correction=%s temp=%.2f\n", + b2r.n_accepted, draft_n, + b2r.has_correction ? "yes" : "no", + b2_temp); + } + + if (commit_drafts == draft_n && !b2r.has_correction) { + /* All draft tokens accepted — fast commit path. 
*/ + ok = metal_graph_dspark_refresh_verified_rows(&s->graph, + &e->mtp_model, + &e->mtp_weights, + s->dspark_draft_base_real + 1u, + (uint32_t)start, + (uint32_t)draft_n); + if (ok) ok = metal_graph_read_spec_logits_row(&s->graph, + (uint32_t)(draft_n - 1), + row_logits); + if (ok) { + memcpy(s->logits, row_logits, (size_t)DS4_N_VOCAB * sizeof(s->logits[0])); + for (int i = 0; i < draft_n && n_accept < accepted_cap; i++) { + accepted[n_accept++] = drafts[i]; + if (drafts[i] == eos_token) break; + } + s->checkpoint_valid = true; + s->mtp_draft_valid = false; + DS4_DSPARK_KEEP_ACCEPTED(draft_n); + if (mtp_timing) { + fprintf(stderr, + "ds4: dspark b2 timing drafted=%d committed=%d snapshot=%.3f ms verify=%.3f ms total=%.3f ms\n", + draft_n, draft_n, + (snapshot_done - snapshot_t0) * 1000.0, + (verify_done - snapshot_done) * 1000.0, + (now_sec() - mtp_t0) * 1000.0); + } + spec_frontier_free(&frontier); + free(row_logits); + free(row_tops); + free(b2_target_logits); free(b2_verify_logits); + return n_accept; + } + } + /* B2 partial accept or correction: fall through to replay path. + * If B2 produced a correction token, replace the first rejected + * draft with it so the replay commits the corrected sequence. 
*/ + if (b2r.has_correction && commit_drafts < draft_n) { + drafts[commit_drafts] = b2r.correction_token; + commit_drafts++; /* include the correction in replay */ + } + } else { + /* ---- Greedy argmax path (unchanged) ---- */ + commit_drafts = 1; + for (int i = 1; i < draft_n; i++) { + if (row_tops[i - 1] != drafts[i]) break; + commit_drafts++; + } + if (commit_drafts == draft_n) { + ok = metal_graph_dspark_refresh_verified_rows(&s->graph, + &e->mtp_model, + &e->mtp_weights, + s->dspark_draft_base_real + 1u, + (uint32_t)start, + (uint32_t)draft_n); + if (ok) ok = metal_graph_read_spec_logits_row(&s->graph, + (uint32_t)(draft_n - 1), + row_logits); + if (ok) { + memcpy(s->logits, row_logits, (size_t)DS4_N_VOCAB * sizeof(s->logits[0])); + for (int i = 0; i < draft_n && n_accept < accepted_cap; i++) { + accepted[n_accept++] = drafts[i]; + if (drafts[i] == eos_token) break; + } + s->checkpoint_valid = true; + s->mtp_draft_valid = false; + DS4_DSPARK_KEEP_ACCEPTED(draft_n); + if (mtp_timing) { + fprintf(stderr, + "ds4: dspark timing drafted=%d committed=%d snapshot=%.3f ms verify=%.3f ms total=%.3f ms\n", + draft_n, draft_n, + (snapshot_done - snapshot_t0) * 1000.0, + (verify_done - snapshot_done) * 1000.0, + (now_sec() - mtp_t0) * 1000.0); + } + spec_frontier_free(&frontier); + free(row_logits); + free(row_tops); + free(b2_target_logits); free(b2_verify_logits); + return n_accept; + } + } + } + + /* Partial commit: restore frontier, replay accepted tokens one-by-one. 
*/ + s->checkpoint.len = start; + ok = have_frontier && spec_frontier_restore(&frontier, s); + int replayed = 0; + for (; ok && replayed < commit_drafts; replayed++) { + ok = metal_graph_eval_token_raw_swa(&s->graph, + &e->model, + &e->weights, + drafts[replayed], + (uint32_t)(start + replayed), + row_logits); + if (ok) { + token_vec_push(&s->checkpoint, drafts[replayed]); + ok = metal_graph_dspark_refresh_current_row(&s->graph, + &e->mtp_model, + &e->mtp_weights, + s->dspark_draft_base_real + 1u + (uint32_t)replayed, + (uint32_t)(start + replayed)); + } + } + if (ok) { + memcpy(s->logits, row_logits, (size_t)DS4_N_VOCAB * sizeof(s->logits[0])); + for (int i = 0; i < replayed && n_accept < accepted_cap; i++) { + accepted[n_accept++] = drafts[i]; + if (drafts[i] == eos_token) break; + } + s->checkpoint_valid = true; + s->mtp_draft_valid = false; + DS4_DSPARK_KEEP_ACCEPTED(replayed); + if (mtp_timing) { + fprintf(stderr, + "ds4: dspark timing drafted=%d committed=%d snapshot=%.3f ms verify=%.3f ms replay=%.3f ms total=%.3f ms\n", + draft_n, + replayed, + (snapshot_done - snapshot_t0) * 1000.0, + (verify_done - snapshot_done) * 1000.0, + (now_sec() - verify_done) * 1000.0, + (now_sec() - mtp_t0) * 1000.0); + } + spec_frontier_free(&frontier); + free(row_logits); + free(row_tops); + free(b2_target_logits); free(b2_verify_logits); + return n_accept; + } + } + s->checkpoint.len = start; + if (have_frontier) (void)spec_frontier_restore(&frontier, s); + snprintf(err, errlen, "DSpark verifier failed"); + s->checkpoint_valid = false; + DS4_DSPARK_KEEP_ACCEPTED(0); + spec_frontier_free(&frontier); + free(row_logits); + free(row_tops); + free(b2_target_logits); free(b2_verify_logits); + return -1; +#undef DS4_DSPARK_KEEP_ACCEPTED + } + int drafts[16]; int draft_n = 1; drafts[0] = s->mtp_draft_token; @@ -27769,6 +30110,7 @@ void ds4_session_invalidate(ds4_session *s) { s->checkpoint_valid = false; s->checkpoint.len = 0; s->mtp_draft_valid = false; + s->dspark_draft_count = 0; } 
void ds4_session_rewind(ds4_session *s, int pos) { @@ -27776,6 +30118,7 @@ void ds4_session_rewind(ds4_session *s, int pos) { if (pos > s->checkpoint.len) pos = s->checkpoint.len; s->checkpoint.len = pos; s->mtp_draft_valid = false; + s->dspark_draft_count = 0; } int ds4_session_pos(ds4_session *s) { diff --git a/ds4.h b/ds4.h index 9d040c92b..4ec3ad6cb 100644 --- a/ds4.h +++ b/ds4.h @@ -56,6 +56,32 @@ typedef struct { #define DS4_DEFAULT_TOP_P 1.0f #define DS4_DEFAULT_MIN_P 0.05f + +typedef enum { + DS4_MTP_DRAFT_NONE = 0, + DS4_MTP_DRAFT_LEGACY, + DS4_MTP_DRAFT_DSPARK, + DS4_MTP_DRAFT_DSPARK_NONSEQ, +} ds4_mtp_draft_kind; + +typedef struct { + uint32_t n_mtp_layers; + uint32_t block_size; + uint32_t noise_token_id; + uint32_t markov_rank; + uint32_t target_layer_ids[3]; +} ds4_dspark_config; + +void ds4_dspark_config_init_defaults(ds4_dspark_config *cfg); +const char *ds4_mtp_draft_kind_name(ds4_mtp_draft_kind kind); +/* Classify draft GGUF layout from presence markers (unit-testable, no model load). 
*/ +ds4_mtp_draft_kind ds4_mtp_draft_kind_guess(bool has_e_proj, bool has_main_proj, bool has_markov_w1); +ds4_mtp_draft_kind ds4_mtp_draft_kind_guess_ex(bool has_e_proj, + bool has_main_proj, + bool has_markov_w1, + bool markov_rank_set, + uint32_t markov_rank); + typedef struct ds4_engine ds4_engine; typedef struct ds4_session ds4_session; @@ -186,6 +212,14 @@ int ds4_engine_collect_imatrix(ds4_engine *e, int ctx_size, int max_prompts, int max_tokens); +int ds4_engine_collect_dspark_target_cache(ds4_engine *e, + const char *dataset_path, + const char *output_dir, + const char *target_model_name_or_path, + const char *chat_template, + int ctx_size, + int max_prompts, + int max_tokens); void ds4_engine_dump_tokens(ds4_engine *e, const ds4_tokens *tokens); int ds4_dump_text_tokenization(const char *model_path, const char *text, FILE *fp); int ds4_engine_head_test(ds4_engine *e, const ds4_tokens *prompt); @@ -273,7 +307,13 @@ int ds4_session_ctx(ds4_session *s); int ds4_session_prefill_cap(ds4_session *s); int ds4_engine_routed_quant_bits(ds4_engine *e); bool ds4_engine_has_output_head(ds4_engine *e); +/* True when speculative decode has a real proposer and target verifier. 
*/ +bool ds4_mtp_speculative_draft_ready(ds4_mtp_draft_kind kind); +bool ds4_mtp_draft_runtime_supported(ds4_backend backend, + ds4_mtp_draft_kind kind); bool ds4_engine_has_mtp(ds4_engine *e); +ds4_mtp_draft_kind ds4_engine_mtp_draft_kind(ds4_engine *e); + int ds4_engine_mtp_draft_tokens(ds4_engine *e); const ds4_tokens *ds4_session_tokens(ds4_session *s); diff --git a/ds4_cli.c b/ds4_cli.c index 4ad2240e8..61de77021 100644 --- a/ds4_cli.c +++ b/ds4_cli.c @@ -43,6 +43,12 @@ typedef struct { const char *imatrix_output_path; int imatrix_max_prompts; int imatrix_max_tokens; + const char *dspark_target_cache_dataset_path; + const char *dspark_target_cache_output_dir; + const char *dspark_target_cache_target_model; + const char *dspark_target_cache_chat_template; + int dspark_target_cache_max_prompts; + int dspark_target_cache_max_tokens; ds4_think_mode think_mode; bool head_test; bool first_token_test; @@ -1562,6 +1568,18 @@ static cli_config parse_options(int argc, char **argv) { c.gen.imatrix_max_prompts = parse_int(need_arg(&i, argc, argv, arg), arg); } else if (!strcmp(arg, "--imatrix-max-tokens")) { c.gen.imatrix_max_tokens = parse_int(need_arg(&i, argc, argv, arg), arg); + } else if (!strcmp(arg, "--dspark-target-cache-dataset")) { + c.gen.dspark_target_cache_dataset_path = need_arg(&i, argc, argv, arg); + } else if (!strcmp(arg, "--dspark-target-cache-out")) { + c.gen.dspark_target_cache_output_dir = need_arg(&i, argc, argv, arg); + } else if (!strcmp(arg, "--dspark-target-cache-target-model")) { + c.gen.dspark_target_cache_target_model = need_arg(&i, argc, argv, arg); + } else if (!strcmp(arg, "--dspark-target-cache-chat-template")) { + c.gen.dspark_target_cache_chat_template = need_arg(&i, argc, argv, arg); + } else if (!strcmp(arg, "--dspark-target-cache-max-prompts")) { + c.gen.dspark_target_cache_max_prompts = parse_int(need_arg(&i, argc, argv, arg), arg); + } else if (!strcmp(arg, "--dspark-target-cache-max-tokens")) { + 
c.gen.dspark_target_cache_max_tokens = parse_int(need_arg(&i, argc, argv, arg), arg); } else if (!strcmp(arg, "--think")) { c.gen.think_mode = DS4_THINK_HIGH; } else if (!strcmp(arg, "--think-max")) { @@ -1621,6 +1639,24 @@ static cli_config parse_options(int argc, char **argv) { fprintf(stderr, "ds4: --imatrix-dataset requires --imatrix-out\n"); exit(2); } + if (c.gen.dspark_target_cache_output_dir && !c.gen.dspark_target_cache_dataset_path) { + fprintf(stderr, "ds4: --dspark-target-cache-out requires --dspark-target-cache-dataset\n"); + exit(2); + } + if (c.gen.dspark_target_cache_dataset_path && !c.gen.dspark_target_cache_output_dir) { + fprintf(stderr, "ds4: --dspark-target-cache-dataset requires --dspark-target-cache-out\n"); + exit(2); + } + if (c.gen.dspark_target_cache_output_dir && c.gen.prompt) { + fprintf(stderr, "ds4: --dspark-target-cache-out does not use -p/--prompt-file\n"); + exit(2); + } + if (c.gen.dspark_target_cache_output_dir && + (!c.gen.dspark_target_cache_target_model || + !c.gen.dspark_target_cache_target_model[0])) { + fprintf(stderr, "ds4: --dspark-target-cache-out requires --dspark-target-cache-target-model\n"); + exit(2); + } if (c.gen.perplexity_file_path && c.gen.prompt) { fprintf(stderr, "ds4: --perplexity-file does not use -p/--prompt-file\n"); exit(2); @@ -1693,6 +1729,15 @@ int main(int argc, char **argv) { cfg.gen.ctx_size, cfg.gen.imatrix_max_prompts, cfg.gen.imatrix_max_tokens); + } else if (cfg.gen.dspark_target_cache_output_dir) { + rc = ds4_engine_collect_dspark_target_cache(engine, + cfg.gen.dspark_target_cache_dataset_path, + cfg.gen.dspark_target_cache_output_dir, + cfg.gen.dspark_target_cache_target_model, + cfg.gen.dspark_target_cache_chat_template, + cfg.gen.ctx_size, + cfg.gen.dspark_target_cache_max_prompts, + cfg.gen.dspark_target_cache_max_tokens); } else if (cfg.gen.perplexity_file_path) { rc = run_perplexity_file(engine, &cfg); } else if (cfg.gen.prompt == NULL) { diff --git a/ds4_cuda.cu b/ds4_cuda.cu index 
188b341ad..688507a44 100644 --- a/ds4_cuda.cu +++ b/ds4_cuda.cu @@ -8917,6 +8917,25 @@ extern "C" int ds4_gpu_attention_decode_raw_batch_heads_tensor( n_head, head_dim); } +extern "C" int ds4_gpu_attention_decode_raw_batch_heads_noncausal_tensor( + ds4_gpu_tensor *heads, + const void *model_map, + uint64_t model_size, + uint64_t sinks_offset, + const ds4_gpu_tensor *q, + const ds4_gpu_tensor *raw_kv, + uint32_t n_tokens, + uint32_t n_raw, + uint32_t raw_cap, + uint32_t raw_start, + uint32_t n_head, + uint32_t head_dim) { + (void)heads; (void)model_map; (void)model_size; (void)sinks_offset; + (void)q; (void)raw_kv; (void)n_tokens; (void)n_raw; (void)raw_cap; + (void)raw_start; (void)n_head; (void)head_dim; + return 0; +} + extern "C" int ds4_gpu_attention_decode_mixed_batch_heads_tensor( ds4_gpu_tensor *heads, const void *model_map, diff --git a/ds4_dspark_runtime.c b/ds4_dspark_runtime.c new file mode 100644 index 000000000..cf6c5434e --- /dev/null +++ b/ds4_dspark_runtime.c @@ -0,0 +1,41 @@ +#include "ds4_dspark_runtime.h" + +#include <string.h> + + +float ds4_dspark_bf16_to_f32(uint16_t h) { + uint32_t bits = (uint32_t)h << 16; + float f; + memcpy(&f, &bits, sizeof(f)); + return f; +} + + + +ds4_dspark_spec_gate ds4_dspark_speculative_gate(ds4_mtp_draft_kind kind, + bool mtp_ready, + int mtp_draft_tokens) { + if (!mtp_ready || mtp_draft_tokens <= 1) return DS4_DSPARK_SPEC_DISABLED; + if (kind == DS4_MTP_DRAFT_LEGACY) return DS4_DSPARK_SPEC_LEGACY_MTP; + if (kind == DS4_MTP_DRAFT_DSPARK) return DS4_DSPARK_SPEC_DSPARK_ENABLED; + if (kind == DS4_MTP_DRAFT_DSPARK_NONSEQ) return DS4_DSPARK_SPEC_DSPARK_NONSEQ_NOT_READY; + return DS4_DSPARK_SPEC_DISABLED; +} + +const char *ds4_dspark_spec_gate_reason(ds4_dspark_spec_gate gate) { + switch (gate) { + case DS4_DSPARK_SPEC_LEGACY_MTP: + return "legacy MTP draft path (DSpark block draft not engaged)"; + case DS4_DSPARK_SPEC_DSPARK_ENABLED: + return "DSpark block speculative decode enabled"; + case DS4_DSPARK_SPEC_DSPARK_NOT_READY: + 
return "DSpark draft graph has not been validated on real DSpark GGUF weights; " + "speculative decode stays off (no fake draft tokens)"; + case DS4_DSPARK_SPEC_DSPARK_NONSEQ_NOT_READY: + return "DSpark nonseq draft head has not been validated on real trained DSpark GGUF weights; " + "speculative decode stays off (no fake draft tokens)"; + case DS4_DSPARK_SPEC_DISABLED: + default: + return "speculative draft disabled"; + } +} \ No newline at end of file diff --git a/ds4_dspark_runtime.h b/ds4_dspark_runtime.h new file mode 100644 index 000000000..c70384b3e --- /dev/null +++ b/ds4_dspark_runtime.h @@ -0,0 +1,29 @@ +#ifndef DS4_DSPARK_RUNTIME_H +#define DS4_DSPARK_RUNTIME_H + +#include <stdbool.h> +#include <stdint.h> + +#include "ds4.h" + + +typedef enum { + DS4_DSPARK_SPEC_DISABLED = 0, + DS4_DSPARK_SPEC_LEGACY_MTP, + DS4_DSPARK_SPEC_DSPARK_ENABLED, + DS4_DSPARK_SPEC_DSPARK_NOT_READY, + DS4_DSPARK_SPEC_DSPARK_NONSEQ_NOT_READY, +} ds4_dspark_spec_gate; + + + +float ds4_dspark_bf16_to_f32(uint16_t h); + + +ds4_dspark_spec_gate ds4_dspark_speculative_gate(ds4_mtp_draft_kind kind, + bool mtp_ready, + int mtp_draft_tokens); + +const char *ds4_dspark_spec_gate_reason(ds4_dspark_spec_gate gate); + +#endif \ No newline at end of file diff --git a/ds4_gpu.h b/ds4_gpu.h index b58aca9bd..6651a2880 100644 --- a/ds4_gpu.h +++ b/ds4_gpu.h @@ -623,6 +623,22 @@ int ds4_gpu_attention_decode_raw_batch_heads_tensor( uint32_t n_head, uint32_t head_dim); +/* Non-causal variant (mask = all-attend): every query attends to every key in + * the gathered window. Used by the DSpark drafter's block attention. 
*/ +int ds4_gpu_attention_decode_raw_batch_heads_noncausal_tensor( + ds4_gpu_tensor *heads, + const void *model_map, + uint64_t model_size, + uint64_t sinks_offset, + const ds4_gpu_tensor *q, + const ds4_gpu_tensor *raw_kv, + uint32_t n_tokens, + uint32_t n_raw, + uint32_t raw_cap, + uint32_t raw_start, + uint32_t n_head, + uint32_t head_dim); + int ds4_gpu_attention_decode_mixed_batch_heads_tensor( ds4_gpu_tensor *heads, const void *model_map, diff --git a/ds4_help.c b/ds4_help.c index d32e088cf..aae4b24a5 100644 --- a/ds4_help.c +++ b/ds4_help.c @@ -170,11 +170,11 @@ static void print_model_runtime(FILE *fp, const help_colors *c, opt(fp, c, "--prefill-chunk N", "Metal graph prefill chunk size. Default: auto (PRO long prompts use 8192; others use 4096)."); if (full) { if (tool != DS4_HELP_BENCH) { - opt(fp, c, "--mtp FILE", "Optional MTP support GGUF used for draft-token probes."); + opt(fp, c, "--mtp FILE", "Optional speculative draft GGUF: legacy MTP or experimental converted DSpark/DeepSpec on Metal."); } if (tool == DS4_HELP_DS4 || tool == DS4_HELP_AGENT || tool == DS4_HELP_SERVER) { - opt(fp, c, "--mtp-draft N", "Maximum autoregressive MTP draft tokens. Default: 1"); - opt(fp, c, "--mtp-margin F", "Verifier confidence margin for fast MTP acceptance. Default: 3"); + opt(fp, c, "--mtp-draft N", "Maximum speculative draft tokens. Legacy default: 1; DSpark uses GGUF block size."); + opt(fp, c, "--mtp-margin F", "Verifier confidence margin for legacy fast MTP acceptance. 
Default: 3"); } opt(fp, c, "--quality", "Prefer exact kernels where faster approximate paths exist."); opt(fp, c, "--warm-weights", "Touch mapped tensor pages at startup to reduce first-use stalls."); @@ -254,6 +254,12 @@ static void print_cli_diagnostics(FILE *fp, const help_colors *c) { opt(fp, c, "--imatrix-out FILE", "Write llama-compatible routed-MoE imatrix .dat."); opt(fp, c, "--imatrix-max-prompts N", "Stop imatrix collection after N prompts."); opt(fp, c, "--imatrix-max-tokens N", "Stop imatrix collection after N prompt tokens."); + opt(fp, c, "--dspark-target-cache-dataset FILE", "Rendered prompt dataset for DeepSpec DSpark target-cache export."); + opt(fp, c, "--dspark-target-cache-out DIR", "Write DeepSpec DSpark target cache manifest/index/shard."); + opt(fp, c, "--dspark-target-cache-target-model HF_OR_PATH", "Required DeepSpec target model name/path stored in the target-cache manifest."); + opt(fp, c, "--dspark-target-cache-chat-template NAME", "DeepSpec chat template name stored in the target-cache manifest."); + opt(fp, c, "--dspark-target-cache-max-prompts N", "Stop target-cache export after N prompts."); + opt(fp, c, "--dspark-target-cache-max-tokens N", "Stop target-cache export after N prompt tokens."); opt(fp, c, "--head-test", "Run the output HC/logits head after the native slice."); opt(fp, c, "--first-token-test", "Run exact CPU whole-model pass for the first prompt token."); opt(fp, c, "--metal-graph-test", "Compare first GPU-resident graph stages with CPU."); diff --git a/ds4_metal.m b/ds4_metal.m index 7e3f8bd5c..c43762e0e 100644 --- a/ds4_metal.m +++ b/ds4_metal.m @@ -17050,6 +17050,13 @@ static void ds4_gpu_fill_raw_decode_batch_mask( } } +static void ds4_gpu_fill_raw_decode_batch_all_mask( + uint16_t *mask, + uint32_t n_tokens, + uint32_t n_raw) { + memset(mask, 0, (size_t)n_tokens * n_raw * sizeof(mask[0])); +} + static void ds4_gpu_fill_mixed_decode_batch_mask( uint16_t *mask, uint32_t n_tokens, @@ -18432,6 +18439,7 @@ static int 
ds4_gpu_encode_flash_attention_decode_raw_batch_heads( uint32_t raw_cap, uint32_t raw_start, uint32_t window, + bool noncausal, uint32_t n_head, uint32_t head_dim) { if (head_dim != 512 || n_head == 0 || n_tokens == 0 || @@ -18528,11 +18536,17 @@ static int ds4_gpu_encode_flash_attention_decode_raw_batch_heads( return 0; } - ds4_gpu_fill_raw_decode_batch_mask((uint16_t *)[mask_buffer contents], - n_tokens, - n_raw, - pos0, - window); + if (noncausal) { + ds4_gpu_fill_raw_decode_batch_all_mask((uint16_t *)[mask_buffer contents], + n_tokens, + n_raw); + } else { + ds4_gpu_fill_raw_decode_batch_mask((uint16_t *)[mask_buffer contents], + n_tokens, + n_raw, + pos0, + window); + } id pad_pipeline = nil; if (has_kvpad) { @@ -18693,6 +18707,7 @@ static int ds4_gpu_encode_flash_attention_decode_mixed_batch_heads( raw_cap, raw_start, window, + false, n_head, head_dim); } @@ -19052,6 +19067,7 @@ int ds4_gpu_attention_decode_raw_batch_heads_tensor( raw_cap, raw_start, window, + false, n_head, head_dim)) { return 0; @@ -19063,6 +19079,66 @@ int ds4_gpu_attention_decode_raw_batch_heads_tensor( return 1; } +int ds4_gpu_attention_decode_raw_batch_heads_noncausal_tensor( + ds4_gpu_tensor *heads, + const void *model_map, + uint64_t model_size, + uint64_t sinks_offset, + const ds4_gpu_tensor *q, + const ds4_gpu_tensor *raw_kv, + uint32_t n_tokens, + uint32_t n_raw, + uint32_t raw_cap, + uint32_t raw_start, + uint32_t n_head, + uint32_t head_dim) { + if (!g_initialized && !ds4_gpu_init()) return 0; + if (!heads || !q || !raw_kv || !model_map || n_tokens == 0 || + n_raw == 0 || raw_cap < n_raw || raw_start >= raw_cap) { + return 0; + } + + @autoreleasepool { + if (sinks_offset > model_size || (uint64_t)n_head * sizeof(float) > model_size - sinks_offset) { + fprintf(stderr, "ds4: Metal attention sinks range is outside the mapped model\n"); + return 0; + } + + uint64_t sinks_inner = 0; + id sinks_buf = ds4_gpu_wrap_model_range(model_map, model_size, + sinks_offset, + (uint64_t)n_head * 
sizeof(float), + &sinks_inner); + if (!sinks_buf) return 0; + + int owned = 0; + id cb = ds4_gpu_command_buffer(&owned); + if (!cb) return 0; + + if (!ds4_gpu_encode_flash_attention_decode_raw_batch_heads(cb, + heads, + sinks_buf, + (NSUInteger)sinks_inner, + q, + raw_kv, + n_tokens, + 0, + n_raw, + raw_cap, + raw_start, + 0, + true, + n_head, + head_dim)) { + return 0; + } + + if (!ds4_gpu_finish_command_buffer(cb, owned, "dspark noncausal batch attention heads")) return 0; + } + + return 1; +} + int ds4_gpu_attention_decode_mixed_batch_heads_tensor( ds4_gpu_tensor *heads, const void *model_map, diff --git a/gguf-tools/README.md b/gguf-tools/README.md index f692a86d1..1636f4f4f 100644 --- a/gguf-tools/README.md +++ b/gguf-tools/README.md @@ -13,6 +13,9 @@ The important pieces are: importance with `ds4`. - `quality-testing/`: prompts and scripts used to compare local GGUF variants against official DeepSeek V4 Flash continuations. +- `deepspec/ds4_deepspec.py`: validates DS4 target-cache exports against the + DeepSpec v2 manifest/index/shard contract and emits the DS4-side non-Markov + DeepSpec config scaffold before external training. ## Build @@ -108,6 +111,29 @@ gguf-tools/deepseek4-quantize \ `--compare-tensor` regenerates a single tensor and byte-compares it against the template or `--compare-gguf`. `--threads N` controls routed-expert workers. +## Generate A DSpark/DeepSpec Draft GGUF + +Official DeepSeek-V4-Flash DSpark/DeepSpec Markov draft weights are stored in +separate Hugging Face safetensor shards under the `mtp.*` namespace. 
Convert +those shards into a DS4 auxiliary MTP GGUF with `--dspark-only`; the main Flash +template supplies tokenizer metadata, tensor order, and GGUF layout: + +```sh +gguf-tools/deepseek4-quantize \ + --hf gguf/dspark-hf \ + --template gguf/ds4flash.gguf \ + --out gguf/deepseek4.dspark.gguf \ + --dspark-only +``` + +The converter detects the official Markov layout from `mtp.0.main_proj.weight` +plus `mtp.2.markov_head.markov_w1.weight`, stores the rank-256 Markov weights +as F16, emits `deepseek4.dspark.*` metadata, and accepts the model +repository root `config.json` as a fallback when `inference/config.json` is not +present. Use `--dry-run` before writing and `--self-test-dspark-map` after +changing tensor mapping rules. + + ## When No Imatrix Is Given `iq2_xxs` requires an importance vector. If `--imatrix` is not provided and diff --git a/gguf-tools/deepseek4-quantize.c b/gguf-tools/deepseek4-quantize.c index 3955b4352..c32053a8e 100644 --- a/gguf-tools/deepseek4-quantize.c +++ b/gguf-tools/deepseek4-quantize.c @@ -36,6 +36,8 @@ #include #include #include +#include +#include #if defined(_WIN32) #error "deepseek4-quantize.c currently targets POSIX systems" @@ -47,6 +49,13 @@ #define DS4_KV_QUANTIZE_IMATRIX_N_CHUNKS "quantize.imatrix.chunks_count" #define DS4_GGUF_DEFAULT_ALIGNMENT 32 +#define DS4_KV_DSPARK_N_MTP_LAYERS "deepseek4.dspark.n_mtp_layers" +#define DS4_KV_DSPARK_BLOCK_SIZE "deepseek4.dspark.block_size" +#define DS4_KV_DSPARK_NOISE_TOKEN_ID "deepseek4.dspark.noise_token_id" +#define DS4_KV_DSPARK_MARKOV_RANK "deepseek4.dspark.markov_rank" +#define DS4_KV_DSPARK_TARGET_LAYER_ID "deepseek4.dspark.target_layer_ids" +#define DS4_DSPARK_TARGET_LAYER_COUNT 3 + typedef enum { GGUF_TYPE_UINT8 = 0, GGUF_TYPE_INT8 = 1, @@ -142,6 +151,24 @@ static char *read_file(const char *path, size_t *len_out) { return buf; } +static char *read_optional_file(const char *path, size_t *len_out) { + FILE *fp = fopen(path, "rb"); + if (!fp) { + if (errno == ENOENT) return NULL; 
+ die_errno("open", path); + } + if (fseeko(fp, 0, SEEK_END) != 0) die_errno("seek", path); + off_t n = ftello(fp); + if (n < 0) die_errno("tell", path); + if (fseeko(fp, 0, SEEK_SET) != 0) die_errno("seek", path); + char *buf = xmalloc((size_t)n + 1); + if (n && fread(buf, 1, (size_t)n, fp) != (size_t)n) die_errno("read", path); + buf[n] = '\0'; + fclose(fp); + if (len_out) *len_out = (size_t)n; + return buf; +} + static uint64_t read_u64_le_fp(FILE *fp, const char *what) { uint8_t b[8]; if (fread(b, 1, sizeof(b), fp) != sizeof(b)) { @@ -874,24 +901,28 @@ typedef enum { EXP_NONE, EXP_W1, EXP_W2, EXP_W3 } expert_part; typedef struct { bool is_expert; + bool is_mtp; int layer; expert_part part; } expert_tensor; -static expert_tensor parse_expert_tensor(const char *name) { - expert_tensor e = {0}; +static bool parse_expert_tensor_as(const char *name, const char *fmt, bool is_mtp, expert_tensor *out) { int layer = -1; char kind[16]; int rest = 0; - if (sscanf(name, "blk.%d.ffn_%15[^_]_exps.weight%n", &layer, kind, &rest) == 2 - && rest == (int)strlen(name)) - { - if (strcmp(kind, "gate") == 0 || strcmp(kind, "down") == 0 || strcmp(kind, "up") == 0) { - e.is_expert = true; - e.layer = layer; - e.part = strcmp(kind, "gate") == 0 ? EXP_W1 : strcmp(kind, "down") == 0 ? EXP_W2 : EXP_W3; - } - } + if (sscanf(name, fmt, &layer, kind, &rest) != 2 || rest != (int)strlen(name)) return false; + if (strcmp(kind, "gate") != 0 && strcmp(kind, "down") != 0 && strcmp(kind, "up") != 0) return false; + out->is_expert = true; + out->is_mtp = is_mtp; + out->layer = layer; + out->part = strcmp(kind, "gate") == 0 ? EXP_W1 : strcmp(kind, "down") == 0 ? 
EXP_W2 : EXP_W3; + return true; +} + +static expert_tensor parse_expert_tensor(const char *name) { + expert_tensor e = {0}; + if (parse_expert_tensor_as(name, "blk.%d.ffn_%15[^_]_exps.weight%n", false, &e)) return e; + if (parse_expert_tensor_as(name, "mtp.%d.ffn_%15[^_]_exps.weight%n", true, &e)) return e; return e; } @@ -905,6 +936,16 @@ static const char *expert_part_name(expert_part p) { return ""; } +static void expert_hf_prefix(char *buf, size_t cap, + const expert_tensor *e, int xid, + const char *wid) { + if (e->is_mtp) { + snprintf(buf, cap, "mtp.%d.ffn.experts.%d.%s", e->layer, xid, wid); + } else { + snprintf(buf, cap, "layers.%d.ffn.experts.%d.%s", e->layer, xid, wid); + } +} + typedef struct { const char *gguf; const char *hf; @@ -950,34 +991,203 @@ static const name_map layer_map[] = { { "ffn_up_shexp.weight", "ffn.shared_experts.w3.weight" }, { "ffn_down_shexp.weight", "ffn.shared_experts.w2.weight" }, { "ffn_gate_inp.weight", "ffn.gate.weight" }, + { "ffn_gate_exps.weight", "ffn.experts.*.w1.weight" }, + { "ffn_up_exps.weight", "ffn.experts.*.w3.weight" }, + { "ffn_down_exps.weight", "ffn.experts.*.w2.weight" }, { "exp_probs_b.bias", "ffn.gate.bias" }, { "ffn_gate_tid2eid.weight", "ffn.gate.tid2eid" }, }; -static char *hf_name_for_regular(const char *gguf_name) { - for (size_t i = 0; i < sizeof(top_map) / sizeof(top_map[0]); i++) { - if (strcmp(gguf_name, top_map[i].gguf) == 0) return xstrdup(top_map[i].hf); - } + +static const name_map dspark_mtp_map[] = { + { "main_proj.weight", "main_proj.weight" }, + { "main_norm.weight", "main_norm.weight" }, + { "norm.weight", "norm.weight" }, + { "markov_head.markov_w1.weight", "markov_head.markov_w1.weight" }, + { "markov_head.markov_w2.weight", "markov_head.markov_w2.weight" }, + { "confidence_head.proj.weight", "confidence_head.proj.weight" }, + { "hc_head_base.weight", "hc_head_base" }, + { "hc_head_fn.weight", "hc_head_fn" }, + { "hc_head_scale.weight", "hc_head_scale" }, +}; + +static char 
*hf_name_for_mapped_layer( + const char *gguf_name, + const char *gguf_prefix, + const char *hf_prefix, + const name_map *extra_map, + size_t extra_map_len) { int layer = -1; - const char *p = gguf_name; - if (sscanf(p, "blk.%d.", &layer) != 1) { - fprintf(stderr, "error: cannot map GGUF tensor to HF tensor: %s\n", gguf_name); - exit(1); - } - const char *rest = strchr(p + 4, '.'); + char scan_fmt[32]; + snprintf(scan_fmt, sizeof(scan_fmt), "%s.%%d.", gguf_prefix); + if (sscanf(gguf_name, scan_fmt, &layer) != 1) return NULL; + + const char *rest = strchr(gguf_name + strlen(gguf_prefix) + 1, '.'); if (!rest) die("bad layer tensor name"); rest++; + + for (size_t i = 0; i < extra_map_len; i++) { + if (strcmp(rest, extra_map[i].gguf) == 0) { + char buf[512]; + snprintf(buf, sizeof(buf), "%s.%d.%s", hf_prefix, layer, extra_map[i].hf); + return xstrdup(buf); + } + } for (size_t i = 0; i < sizeof(layer_map) / sizeof(layer_map[0]); i++) { if (strcmp(rest, layer_map[i].gguf) == 0) { char buf[512]; - snprintf(buf, sizeof(buf), "layers.%d.%s", layer, layer_map[i].hf); + snprintf(buf, sizeof(buf), "%s.%d.%s", hf_prefix, layer, layer_map[i].hf); return xstrdup(buf); } } + return NULL; +} + +static char *hf_name_for_regular(const char *gguf_name) { + for (size_t i = 0; i < sizeof(top_map) / sizeof(top_map[0]); i++) { + if (strcmp(gguf_name, top_map[i].gguf) == 0) return xstrdup(top_map[i].hf); + } + + char *hf_name = hf_name_for_mapped_layer(gguf_name, "blk", "layers", NULL, 0); + if (hf_name) return hf_name; + + hf_name = hf_name_for_mapped_layer(gguf_name, "mtp", "mtp", + dspark_mtp_map, + sizeof(dspark_mtp_map) / sizeof(dspark_mtp_map[0])); + if (hf_name) return hf_name; + fprintf(stderr, "error: cannot map GGUF tensor to HF tensor: %s\n", gguf_name); exit(1); } +static void expect_hf_name(const char *gguf, const char *want) { + char *got = hf_name_for_regular(gguf); + if (strcmp(got, want) != 0) { + fprintf(stderr, "error: map %s -> %s, expected %s\n", gguf, got, want); + 
exit(1); + } + free(got); +} + +typedef struct { + uint32_t block_size; + uint32_t noise_token_id; + uint32_t markov_rank; + uint32_t n_mtp_layers; + uint32_t target_layer_ids[DS4_DSPARK_TARGET_LAYER_COUNT]; +} dspark_metadata; + +typedef enum { + DS4_DSPARK_HF_NONE = 0, + DS4_DSPARK_HF_MARKOV, + DS4_DSPARK_HF_NONSEQ, +} dspark_hf_layout; + +static const char *dspark_hf_layout_name(dspark_hf_layout layout) { + switch (layout) { + case DS4_DSPARK_HF_MARKOV: return "markov"; + case DS4_DSPARK_HF_NONSEQ: return "nonseq"; + case DS4_DSPARK_HF_NONE: + default: return "none"; + } +} + +static bool is_mtp_tensor_name(const char *name) { + return str_starts(name, "mtp."); +} + +static bool is_dspark_special_tensor(const char *name) { + return strstr(name, ".main_proj.weight") != NULL || + strstr(name, ".main_norm.weight") != NULL || + strstr(name, ".attn_norm.weight") != NULL || + strstr(name, ".attn_q_a_norm.weight") != NULL || + strstr(name, ".attn_kv_a_norm.weight") != NULL || + strstr(name, ".ffn_norm.weight") != NULL || + strstr(name, ".markov_head.markov_w1.weight") != NULL || + strstr(name, ".markov_head.markov_w2.weight") != NULL || + strstr(name, ".confidence_head.proj.weight") != NULL; +} + +static bool is_dspark_kv_key(const char *key) { + return strcmp(key, DS4_KV_DSPARK_N_MTP_LAYERS) == 0 || + strcmp(key, DS4_KV_DSPARK_BLOCK_SIZE) == 0 || + strcmp(key, DS4_KV_DSPARK_NOISE_TOKEN_ID) == 0 || + strcmp(key, DS4_KV_DSPARK_MARKOV_RANK) == 0 || + strncmp(key, DS4_KV_DSPARK_TARGET_LAYER_ID, strlen(DS4_KV_DSPARK_TARGET_LAYER_ID)) == 0; +} + +static dspark_hf_layout dspark_hf_layout_guess(bool has_main_proj, + bool has_markov_w1, + bool has_confidence_proj, + bool markov_rank_set, + uint32_t markov_rank) { + if (!has_main_proj) return DS4_DSPARK_HF_NONE; + if (has_markov_w1 && has_confidence_proj) return DS4_DSPARK_HF_MARKOV; + if (!has_markov_w1 && !has_confidence_proj && markov_rank_set && markov_rank == 0) { + return DS4_DSPARK_HF_NONSEQ; + } + return 
DS4_DSPARK_HF_NONE; +} + +static dspark_hf_layout db_dspark_hf_layout(const st_db *db, bool markov_rank_set, uint32_t markov_rank) { + return dspark_hf_layout_guess(db_has(db, "mtp.0.main_proj.weight"), + db_has(db, "mtp.2.markov_head.markov_w1.weight"), + db_has(db, "mtp.2.confidence_head.proj.weight"), + markov_rank_set, + markov_rank); +} + +static dspark_metadata dspark_metadata_defaults(void) { + dspark_metadata m = { + .block_size = 5, + .noise_token_id = 128799, + .markov_rank = 256, + .n_mtp_layers = 3, + .target_layer_ids = {40, 41, 42}, + }; + return m; +} + +static void dspark_metadata_apply_hf_config_path(dspark_metadata *m, const char *cfg_path, bool *markov_rank_set) { + size_t len = 0; + char *jtext = read_optional_file(cfg_path, &len); + if (!jtext) return; + json_doc d = json_parse_text(jtext, len); + int block = json_obj_get(&d, 0, "dspark_block_size"); + int noise = json_obj_get(&d, 0, "dspark_noise_token_id"); + int rank = json_obj_get(&d, 0, "dspark_markov_rank"); + int n_mtp = json_obj_get(&d, 0, "n_mtp_layers"); + int layers = json_obj_get(&d, 0, "dspark_target_layer_ids"); + if (block >= 0) m->block_size = (uint32_t)json_i64(&d, block); + if (noise >= 0) m->noise_token_id = (uint32_t)json_i64(&d, noise); + if (rank >= 0) { + m->markov_rank = (uint32_t)json_i64(&d, rank); + if (markov_rank_set) *markov_rank_set = true; + } + if (n_mtp >= 0) m->n_mtp_layers = (uint32_t)json_i64(&d, n_mtp); + if (layers >= 0 && d.v[layers].type == JT_ARRAY) { + int n = 0; + for (int i = layers + 1; i < d.len && d.v[i].parent == layers && n < DS4_DSPARK_TARGET_LAYER_COUNT;) { + m->target_layer_ids[n++] = (uint32_t)json_i64(&d, i); + i = json_skip(&d, i); + } + } + json_free(&d); + free(jtext); +} + +static dspark_metadata dspark_metadata_from_hf_config(const char *hf_dir, bool *markov_rank_set) { + if (markov_rank_set) *markov_rank_set = false; + dspark_metadata m = dspark_metadata_defaults(); + char *root_cfg_path = path_join(hf_dir, "config.json"); + 
dspark_metadata_apply_hf_config_path(&m, root_cfg_path, markov_rank_set); + free(root_cfg_path); + char *inference_cfg_path = path_join(hf_dir, "inference/config.json"); + dspark_metadata_apply_hf_config_path(&m, inference_cfg_path, markov_rank_set); + free(inference_cfg_path); + return m; +} + typedef struct { char *prefix; ds4q_type type; @@ -1000,14 +1210,27 @@ static bool is_attention_tensor(const char *name) { return strstr(name, ".attn") || strstr(name, "attn_") || strstr(name, ".indexer") || strstr(name, "indexer_"); } +static bool is_norm_tensor(const char *name) { + return strcmp(name, "output_norm.weight") == 0 || + strstr(name, "_norm.weight") != NULL || + strstr(name, ".norm.weight") != NULL; +} + static bool is_shared_expert(const char *name) { return strstr(name, "_shexp.") != NULL; } - static bool is_output_tensor(const char *name) { return str_starts(name, "output."); } +static bool is_loader_plain_f16_tensor(const char *name) { + return strcmp(name, "output_hc_fn.weight") == 0 || + strstr(name, ".hc_attn_fn.weight") != NULL || + strstr(name, ".hc_ffn_fn.weight") != NULL || + strstr(name, ".hc_head_fn.weight") != NULL || + strstr(name, ".ffn_gate_inp.weight") != NULL; +} + typedef struct { char *name; int n_dims; @@ -1041,6 +1264,19 @@ static ds4q_type policy_type(const quant_policy *p, const char *name, const tens tmpl->type != DS4Q_TYPE_BF16 && !ds4q_can_quantize(tmpl->type)) { return tmpl->type; } + if (is_mtp_tensor_name(name) && is_dspark_special_tensor(name)) { + if (strstr(name, ".confidence_head.proj.weight")) return DS4Q_TYPE_F32; + if (strstr(name, ".main_proj.weight")) return DS4Q_TYPE_Q8_0; + if (strstr(name, ".main_norm.weight") || strstr(name, ".attn_norm.weight") || + strstr(name, ".attn_q_a_norm.weight") || strstr(name, ".attn_kv_a_norm.weight") || + strstr(name, ".ffn_norm.weight")) return DS4Q_TYPE_F32; + if (strstr(name, ".markov_head.markov_w1.weight") || + strstr(name, ".markov_head.markov_w2.weight")) { + return tmpl->type == 
DS4Q_TYPE_F32 ? DS4Q_TYPE_F32 : DS4Q_TYPE_F16; + } + } + if (is_loader_plain_f16_tensor(name)) return DS4Q_TYPE_F16; + if (is_norm_tensor(name)) return DS4Q_TYPE_F32; if (tensor_n_dims(tmpl) <= 1) return tmpl->type; if (strcmp(name, "token_embd.weight") == 0 && p->embedding != DS4Q_TYPE_COUNT) return p->embedding; if (is_output_tensor(name) && p->output != DS4Q_TYPE_COUNT) return p->output; @@ -1051,6 +1287,148 @@ static ds4q_type policy_type(const quant_policy *p, const char *name, const tens return tmpl->type; } +static void expect_policy_type(const quant_policy *p, const char *name, ds4q_type tmpl_type, ds4q_type want) { + tensor_meta tmpl = { + .name = (char *)name, + .n_dims = 2, + .ne = {4096, 4096, 1, 1}, + .type = tmpl_type, + }; + ds4q_type got = policy_type(p, name, &tmpl); + if (got != want) { + fprintf(stderr, "error: policy %s -> %s, expected %s\n", + name, ds4q_type_name(got), ds4q_type_name(want)); + exit(1); + } +} + +static void self_test_dspark_only_args(void); +static ds4q_type dspark_template_for_name(const char *name, ds4q_type hf_type); + +static void expect_dspark_template_type(const char *name, ds4q_type hf_type, ds4q_type want) { + ds4q_type got = dspark_template_for_name(name, hf_type); + if (got != want) { + fprintf(stderr, "error: DSpark template %s -> %s, expected %s\n", + name, ds4q_type_name(got), ds4q_type_name(want)); + exit(1); + } +} + + +static void self_test_dspark_map(void) { + expect_hf_name("mtp.0.hc_attn_base.weight", "mtp.0.hc_attn_base"); + expect_hf_name("mtp.0.main_proj.weight", "mtp.0.main_proj.weight"); + expect_hf_name("mtp.2.markov_head.markov_w1.weight", "mtp.2.markov_head.markov_w1.weight"); + expect_hf_name("mtp.2.confidence_head.proj.weight", "mtp.2.confidence_head.proj.weight"); + expert_tensor routed = parse_expert_tensor("mtp.2.ffn_down_exps.weight"); + if (!routed.is_expert || !routed.is_mtp || routed.layer != 2 || routed.part != EXP_W2) { + die("bad DSpark MTP routed expert parse"); + } + char eprefix[256]; 
+ expert_hf_prefix(eprefix, sizeof(eprefix), &routed, 7, expert_part_name(routed.part)); + if (strcmp(eprefix, "mtp.2.ffn.experts.7.w2") != 0) { + die("bad DSpark MTP expert HF prefix"); + } + quant_policy pol = {0}; + pol.dense = DS4Q_TYPE_Q4_K; + expect_policy_type(&pol, "mtp.0.main_proj.weight", DS4Q_TYPE_BF16, DS4Q_TYPE_Q8_0); + expect_policy_type(&pol, "mtp.2.markov_head.markov_w1.weight", DS4Q_TYPE_BF16, DS4Q_TYPE_F16); + expect_policy_type(&pol, "mtp.2.confidence_head.proj.weight", DS4Q_TYPE_F32, DS4Q_TYPE_F32); + expect_policy_type(&pol, "mtp.2.hc_head_fn.weight", DS4Q_TYPE_BF16, DS4Q_TYPE_F16); + expect_policy_type(&pol, "mtp.0.hc_attn_fn.weight", DS4Q_TYPE_BF16, DS4Q_TYPE_F16); + expect_policy_type(&pol, "mtp.0.ffn_gate_inp.weight", DS4Q_TYPE_BF16, DS4Q_TYPE_F16); + expect_policy_type(&pol, "blk.0.hc_ffn_fn.weight", DS4Q_TYPE_BF16, DS4Q_TYPE_F16); + expect_policy_type(&pol, "output_hc_fn.weight", DS4Q_TYPE_BF16, DS4Q_TYPE_F16); + expect_policy_type(&pol, "blk.0.attn_norm.weight", DS4Q_TYPE_BF16, DS4Q_TYPE_F32); + expect_policy_type(&pol, "blk.0.attn_q_a_norm.weight", DS4Q_TYPE_BF16, DS4Q_TYPE_F32); + expect_policy_type(&pol, "blk.0.attn_kv_a_norm.weight", DS4Q_TYPE_BF16, DS4Q_TYPE_F32); + expect_policy_type(&pol, "blk.0.ffn_norm.weight", DS4Q_TYPE_BF16, DS4Q_TYPE_F32); + pol.dense = DS4Q_TYPE_COUNT; + expect_policy_type(&pol, "mtp.0.main_norm.weight", DS4Q_TYPE_BF16, DS4Q_TYPE_F32); + expect_policy_type(&pol, "mtp.0.attn_norm.weight", DS4Q_TYPE_BF16, DS4Q_TYPE_F32); + expect_policy_type(&pol, "mtp.0.ffn_norm.weight", DS4Q_TYPE_BF16, DS4Q_TYPE_F32); + expect_dspark_template_type("mtp.0.hc_attn_base.weight", DS4Q_TYPE_BF16, DS4Q_TYPE_F32); + expect_dspark_template_type("mtp.0.hc_attn_scale.weight", DS4Q_TYPE_BF16, DS4Q_TYPE_F32); + expect_dspark_template_type("mtp.0.attn_sinks.weight", DS4Q_TYPE_BF16, DS4Q_TYPE_F32); + expect_dspark_template_type("mtp.0.attn_norm.weight", DS4Q_TYPE_BF16, DS4Q_TYPE_F32); + 
expect_dspark_template_type("mtp.0.attn_q_a_norm.weight", DS4Q_TYPE_BF16, DS4Q_TYPE_F32); + expect_dspark_template_type("mtp.0.attn_kv_a_norm.weight", DS4Q_TYPE_BF16, DS4Q_TYPE_F32); + expect_dspark_template_type("mtp.0.hc_ffn_base.weight", DS4Q_TYPE_BF16, DS4Q_TYPE_F32); + expect_dspark_template_type("mtp.0.hc_ffn_scale.weight", DS4Q_TYPE_BF16, DS4Q_TYPE_F32); + expect_dspark_template_type("mtp.0.ffn_norm.weight", DS4Q_TYPE_BF16, DS4Q_TYPE_F32); + expect_dspark_template_type("mtp.0.exp_probs_b.bias", DS4Q_TYPE_BF16, DS4Q_TYPE_F32); + expect_dspark_template_type("mtp.2.main_norm.weight", DS4Q_TYPE_BF16, DS4Q_TYPE_F32); + expect_dspark_template_type("mtp.2.norm.weight", DS4Q_TYPE_BF16, DS4Q_TYPE_F32); + expect_dspark_template_type("mtp.0.ffn_gate_inp.weight", DS4Q_TYPE_BF16, DS4Q_TYPE_F16); + expect_dspark_template_type("mtp.2.hc_head_base.weight", DS4Q_TYPE_BF16, DS4Q_TYPE_F32); + expect_dspark_template_type("mtp.2.hc_head_scale.weight", DS4Q_TYPE_BF16, DS4Q_TYPE_F32); + expect_dspark_template_type("mtp.2.confidence_head.proj.weight", DS4Q_TYPE_BF16, DS4Q_TYPE_F32); + if (dspark_hf_layout_guess(true, true, true, false, 0) != DS4_DSPARK_HF_MARKOV) { + die("official DSpark HF layout not detected"); + } + if (dspark_hf_layout_guess(true, false, false, true, 0) != DS4_DSPARK_HF_NONSEQ) { + die("nonseq DSpark HF layout not detected"); + } + if (dspark_hf_layout_guess(true, false, false, false, 0) != DS4_DSPARK_HF_NONE) { + die("main-proj-only DSpark layout detected without markov_rank=0 metadata"); + } + char tmpdir[] = "/tmp/ds4q-config-XXXXXX"; + char *dir = mkdtemp(tmpdir); + if (!dir) die_errno("mkdtemp", tmpdir); + char *cfg_path = path_join(dir, "config.json"); + FILE *cfp = fopen(cfg_path, "wb"); + if (!cfp) die_errno("create config", cfg_path); + fputs("{\"dspark_block_size\":7,\"dspark_noise_token_id\":9,\"dspark_markov_rank\":0," + "\"n_mtp_layers\":3,\"dspark_target_layer_ids\":[5,6,7]}", cfp); + if (fclose(cfp) != 0) die_errno("close config", cfg_path); + 
bool rank_set = false; + dspark_metadata fm = dspark_metadata_from_hf_config(dir, &rank_set); + if (!rank_set || fm.block_size != 7 || fm.noise_token_id != 9 || fm.markov_rank != 0 || + fm.n_mtp_layers != 3 || fm.target_layer_ids[0] != 5 || fm.target_layer_ids[2] != 7) { + die("bad DSpark root config metadata parse"); + } + unlink(cfg_path); + free(cfg_path); + rmdir(dir); + char tmpdir_inference[] = "/tmp/ds4q-config-merge-XXXXXX"; + char *dir_inference = mkdtemp(tmpdir_inference); + if (!dir_inference) die_errno("mkdtemp", tmpdir_inference); + char *root_cfg_path = path_join(dir_inference, "config.json"); + FILE *root_cfp = fopen(root_cfg_path, "wb"); + if (!root_cfp) die_errno("create root config", root_cfg_path); + fputs("{\"num_nextn_predict_layers\":1}", root_cfp); + if (fclose(root_cfp) != 0) die_errno("close root config", root_cfg_path); + char *inf_dir = path_join(dir_inference, "inference"); + if (mkdir(inf_dir, 0700) != 0) die_errno("mkdir", inf_dir); + char *inf_cfg_path = path_join(inf_dir, "config.json"); + FILE *inf_cfp = fopen(inf_cfg_path, "wb"); + if (!inf_cfp) die_errno("create inference config", inf_cfg_path); + fputs("{\"dspark_block_size\":8,\"dspark_noise_token_id\":11,\"dspark_markov_rank\":0," + "\"n_mtp_layers\":3,\"dspark_target_layer_ids\":[40,41,42]}", inf_cfp); + if (fclose(inf_cfp) != 0) die_errno("close inference config", inf_cfg_path); + rank_set = false; + fm = dspark_metadata_from_hf_config(dir_inference, &rank_set); + if (!rank_set || fm.block_size != 8 || fm.noise_token_id != 11 || fm.markov_rank != 0 || + fm.n_mtp_layers != 3 || fm.target_layer_ids[0] != 40 || fm.target_layer_ids[2] != 42) { + die("bad DSpark inference config metadata merge"); + } + unlink(inf_cfg_path); + unlink(root_cfg_path); + rmdir(inf_dir); + rmdir(dir_inference); + free(inf_cfg_path); + free(inf_dir); + free(root_cfg_path); + dspark_metadata dm = dspark_metadata_defaults(); + if (dm.block_size != 5 || dm.noise_token_id != 128799 || dm.markov_rank != 256 
|| + dm.n_mtp_layers != 3 || dm.target_layer_ids[0] != 40) { + die("bad DSpark metadata defaults"); + } + self_test_dspark_only_args(); + puts("dspark_map: OK"); +} + + static ds4q_type parse_type(const char *raw) { char wanted[64]; size_t n = 0; @@ -1150,18 +1528,23 @@ static size_t tensor_nbytes(ds4q_type type, const int64_t *ne, int n_dims) { return nbytes; } +static bool reversed_shape_matches(const st_info *info, const tensor_meta *tmpl, int nd) { + if (info->n_dims != nd) return false; + for (int i = 0; i < nd; i++) { + if (tmpl->ne[i] != info->shape[nd - 1 - i]) return false; + } + return true; +} + static void check_reversed_shape(const char *gguf_name, const st_info *info, const tensor_meta *tmpl) { - int nd = tensor_n_dims(tmpl); - if (info->n_dims != nd) { + if (reversed_shape_matches(info, tmpl, tmpl->n_dims)) return; + if (reversed_shape_matches(info, tmpl, tensor_n_dims(tmpl))) return; + if (info->n_dims != tmpl->n_dims && info->n_dims != tensor_n_dims(tmpl)) { fprintf(stderr, "error: rank mismatch for %s\n", gguf_name); exit(1); } - for (int i = 0; i < nd; i++) { - if (tmpl->ne[i] != info->shape[nd - 1 - i]) { - fprintf(stderr, "error: shape mismatch for %s\n", gguf_name); - exit(1); - } - } + fprintf(stderr, "error: shape mismatch for %s\n", gguf_name); + exit(1); } static byte_buf generate_regular(st_db *db, const char *gguf_name, const tensor_meta *tmpl, @@ -1223,7 +1606,7 @@ typedef struct { static void generate_one_expert(expert_job *j, int xid) { char prefix[256]; - snprintf(prefix, sizeof(prefix), "layers.%d.ffn.experts.%d.%s", j->expert.layer, xid, j->wid); + expert_hf_prefix(prefix, sizeof(prefix), &j->expert, xid, j->wid); char weight_name[320]; char scale_name[320]; snprintf(weight_name, sizeof(weight_name), "%s.weight", prefix); @@ -1339,6 +1722,8 @@ typedef struct { size_t data_offset; size_t tensor_bytes; size_t alignment; + bool write_dspark; + dspark_metadata dspark; } output_context; static size_t gguf_scalar_size(uint32_t type) { @@ 
-1455,6 +1840,62 @@ static void write_imatrix_kvs(FILE *fp, const imatrix_store *im) { } } + +static size_t gguf_kv_scalar_size(uint32_t type) { + return 4 + gguf_scalar_size(type); +} + + +static size_t gguf_kv_u32_size(const char *key) { + return gguf_string_size(key) + gguf_kv_scalar_size(GGUF_TYPE_UINT32); +} + +static uint64_t extra_dspark_kv_count(bool enabled) { + if (!enabled) return 0; + return 4 + DS4_DSPARK_TARGET_LAYER_COUNT; +} + +static size_t extra_dspark_kv_size(bool enabled) { + if (!enabled) return 0; + size_t n = 0; + n += gguf_kv_u32_size(DS4_KV_DSPARK_N_MTP_LAYERS); + n += gguf_kv_u32_size(DS4_KV_DSPARK_BLOCK_SIZE); + n += gguf_kv_u32_size(DS4_KV_DSPARK_NOISE_TOKEN_ID); + n += gguf_kv_u32_size(DS4_KV_DSPARK_MARKOV_RANK); + for (int i = 0; i < DS4_DSPARK_TARGET_LAYER_COUNT; i++) { + char key[64]; + snprintf(key, sizeof(key), "%s.%d", DS4_KV_DSPARK_TARGET_LAYER_ID, i); + n += gguf_kv_u32_size(key); + } + return n; +} + +static void write_dspark_kvs(FILE *fp, const dspark_metadata *m) { + write_gguf_string(fp, DS4_KV_DSPARK_N_MTP_LAYERS); + write_u32(fp, GGUF_TYPE_UINT32); + write_u32(fp, m->n_mtp_layers); + + write_gguf_string(fp, DS4_KV_DSPARK_BLOCK_SIZE); + write_u32(fp, GGUF_TYPE_UINT32); + write_u32(fp, m->block_size); + + write_gguf_string(fp, DS4_KV_DSPARK_NOISE_TOKEN_ID); + write_u32(fp, GGUF_TYPE_UINT32); + write_u32(fp, m->noise_token_id); + + write_gguf_string(fp, DS4_KV_DSPARK_MARKOV_RANK); + write_u32(fp, GGUF_TYPE_UINT32); + write_u32(fp, m->markov_rank); + + for (int i = 0; i < DS4_DSPARK_TARGET_LAYER_COUNT; i++) { + char key[64]; + snprintf(key, sizeof(key), "%s.%d", DS4_KV_DSPARK_TARGET_LAYER_ID, i); + write_gguf_string(fp, key); + write_u32(fp, GGUF_TYPE_UINT32); + write_u32(fp, m->target_layer_ids[i]); + } +} + static gguf_file load_gguf_metadata(const char *path) { gguf_file g = {0}; g.path = xstrdup(path); @@ -1499,7 +1940,7 @@ static gguf_file load_gguf_metadata(const char *path) { * otherwise the output can contain duplicate 
GGUF metadata with stale * and new values. */ - if (!is_imatrix_kv_key(key)) { + if (!is_imatrix_kv_key(key) && !is_dspark_kv_key(key)) { kv_keep[n_kv_keep++] = (byte_span){ .start = (size_t)(rec_start - kv_start), .end = (size_t)(rec_end - kv_start), @@ -1549,6 +1990,149 @@ static gguf_file load_gguf_metadata(const char *path) { return g; } +static void gguf_replace_tensors_start(gguf_file *g) { + for (uint64_t i = 0; i < g->n_tensors; i++) free(g->tensors[i].name); + free(g->tensors); + g->tensors = NULL; + g->n_tensors = 0; + g->data_offset = 0; + hmap_free(&g->tensor_map); +} + +static void gguf_add_tensor_meta(gguf_file *g, const char *name, int n_dims, const int64_t *ne, ds4q_type type) { + g->tensors = xrealloc(g->tensors, (size_t)(g->n_tensors + 1) * sizeof(g->tensors[0])); + tensor_meta *t = &g->tensors[g->n_tensors++]; + memset(t, 0, sizeof(*t)); + t->name = xstrdup(name); + t->n_dims = n_dims; + for (int i = 0; i < n_dims; i++) t->ne[i] = ne[i]; + t->type = type; + t->size = tensor_nbytes(type, t->ne, t->n_dims); +} + +static ds4q_type template_type_for_hf_dtype(const char *dtype) { + if (strcmp(dtype, "F32") == 0) return DS4Q_TYPE_F32; + if (strcmp(dtype, "BF16") == 0) return DS4Q_TYPE_BF16; + if (strcmp(dtype, "F8_E4M3") == 0) return DS4Q_TYPE_F16; + if (strcmp(dtype, "I8") == 0) return DS4Q_TYPE_Q4_K; + if (strcmp(dtype, "I64") == 0) return DS4Q_TYPE_I32; + fprintf(stderr, "error: unsupported HF dtype for DSpark template: %s\n", dtype); + exit(1); +} + +static bool is_dspark_required_stage_tensor(const char *rest) { + return strcmp(rest, "hc_attn_fn.weight") == 0 || + strcmp(rest, "hc_attn_scale.weight") == 0 || + strcmp(rest, "hc_attn_base.weight") == 0 || + strcmp(rest, "attn_norm.weight") == 0 || + strcmp(rest, "attn_q_a.weight") == 0 || + strcmp(rest, "attn_q_a_norm.weight") == 0 || + strcmp(rest, "attn_q_b.weight") == 0 || + strcmp(rest, "attn_kv.weight") == 0 || + strcmp(rest, "attn_kv_a_norm.weight") == 0 || + strcmp(rest, "attn_sinks.weight") 
== 0 || + strcmp(rest, "attn_output_a.weight") == 0 || + strcmp(rest, "attn_output_b.weight") == 0 || + strcmp(rest, "hc_ffn_fn.weight") == 0 || + strcmp(rest, "hc_ffn_scale.weight") == 0 || + strcmp(rest, "hc_ffn_base.weight") == 0 || + strcmp(rest, "ffn_norm.weight") == 0 || + strcmp(rest, "ffn_gate_inp.weight") == 0 || + strcmp(rest, "exp_probs_b.bias") == 0 || + strcmp(rest, "ffn_gate_shexp.weight") == 0 || + strcmp(rest, "ffn_up_shexp.weight") == 0 || + strcmp(rest, "ffn_down_shexp.weight") == 0; +} + +static bool is_dspark_routed_stage_tensor(const char *rest) { + return strcmp(rest, "ffn_gate_exps.weight") == 0 || + strcmp(rest, "ffn_up_exps.weight") == 0 || + strcmp(rest, "ffn_down_exps.weight") == 0; +} + +static bool is_dspark_loader_f32_tensor(const char *name) { + return strstr(name, ".main_norm.weight") || + (strstr(name, ".norm.weight") && str_starts(name, "mtp.")) || + strstr(name, ".attn_norm.weight") || + strstr(name, ".attn_q_a_norm.weight") || + strstr(name, ".attn_kv_a_norm.weight") || + strstr(name, ".hc_attn_scale.weight") || + strstr(name, ".hc_attn_base.weight") || + strstr(name, ".attn_sinks.weight") || + strstr(name, ".hc_ffn_scale.weight") || + strstr(name, ".hc_ffn_base.weight") || + strstr(name, ".ffn_norm.weight") || + strstr(name, ".exp_probs_b.bias") || + strstr(name, ".hc_head_base.weight") || + strstr(name, ".hc_head_scale.weight") || + strstr(name, ".confidence_head.proj.weight"); +} + +static ds4q_type dspark_template_for_name(const char *name, ds4q_type hf_type) { + if (is_dspark_loader_f32_tensor(name)) return DS4Q_TYPE_F32; + if (strstr(name, ".markov_head.markov_w1.weight") || + strstr(name, ".markov_head.markov_w2.weight")) return DS4Q_TYPE_F16; + if (strstr(name, ".hc_head_fn.weight") || + strstr(name, ".hc_attn_fn.weight") || + strstr(name, ".hc_ffn_fn.weight") || + strstr(name, ".ffn_gate_inp.weight")) return DS4Q_TYPE_F16; + if (is_attention_projection(name) || is_shared_expert(name)) return DS4Q_TYPE_Q8_0; + if 
(parse_expert_tensor(name).is_expert) return DS4Q_TYPE_Q4_K; + return hf_type; +} + +static void gguf_add_regular_from_hf(gguf_file *g, st_db *db, const char *gguf_name) { + char *hf_name = hf_name_for_regular(gguf_name); + tensor_entry *te = db_tensor(db, hf_name, NULL); + int nd = te->info.n_dims; + int64_t ne[DS4Q_MAX_DIMS] = {0}; + for (int i = 0; i < nd; i++) ne[i] = te->info.shape[nd - 1 - i]; + ds4q_type hf_type = template_type_for_hf_dtype(te->info.dtype); + gguf_add_tensor_meta(g, gguf_name, nd, ne, dspark_template_for_name(gguf_name, hf_type)); + free(hf_name); +} + +static void gguf_add_expert_from_hf(gguf_file *g, st_db *db, const char *gguf_name, int n_experts) { + expert_tensor e = parse_expert_tensor(gguf_name); + if (!e.is_expert) die("internal error: expected routed expert tensor"); + char prefix[256]; + expert_hf_prefix(prefix, sizeof(prefix), &e, 0, expert_part_name(e.part)); + char weight_name[320]; + snprintf(weight_name, sizeof(weight_name), "%s.weight", prefix); + tensor_entry *te = db_tensor(db, weight_name, NULL); + if (te->info.n_dims != 2) die("bad DSpark routed expert rank"); + int64_t ne[3] = { te->info.shape[1] * 2, te->info.shape[0], n_experts }; + gguf_add_tensor_meta(g, gguf_name, 3, ne, DS4Q_TYPE_Q4_K); +} + +static void gguf_add_dspark_stage(gguf_file *g, st_db *db, uint32_t stage, int n_experts) { + char name[256]; + for (size_t i = 0; i < sizeof(layer_map) / sizeof(layer_map[0]); i++) { + const char *rest = layer_map[i].gguf; + if (!is_dspark_required_stage_tensor(rest) && !is_dspark_routed_stage_tensor(rest)) continue; + snprintf(name, sizeof(name), "mtp.%u.%s", stage, rest); + if (is_dspark_routed_stage_tensor(rest)) gguf_add_expert_from_hf(g, db, name, n_experts); + else gguf_add_regular_from_hf(g, db, name); + } +} + +static void gguf_use_dspark_mtp_template(gguf_file *g, st_db *db, int n_experts, dspark_hf_layout layout) { + if (layout == DS4_DSPARK_HF_NONE) die("--dspark-only requires DSpark HF tensors"); + 
gguf_replace_tensors_start(g); + gguf_add_regular_from_hf(g, db, "mtp.0.main_proj.weight"); + gguf_add_regular_from_hf(g, db, "mtp.0.main_norm.weight"); + for (uint32_t s = 0; s < DS4_DSPARK_TARGET_LAYER_COUNT; s++) gguf_add_dspark_stage(g, db, s, n_experts); + gguf_add_regular_from_hf(g, db, "mtp.2.norm.weight"); + gguf_add_regular_from_hf(g, db, "mtp.2.hc_head_base.weight"); + gguf_add_regular_from_hf(g, db, "mtp.2.hc_head_fn.weight"); + gguf_add_regular_from_hf(g, db, "mtp.2.hc_head_scale.weight"); + if (layout == DS4_DSPARK_HF_MARKOV) { + gguf_add_regular_from_hf(g, db, "mtp.2.markov_head.markov_w1.weight"); + gguf_add_regular_from_hf(g, db, "mtp.2.markov_head.markov_w2.weight"); + gguf_add_regular_from_hf(g, db, "mtp.2.confidence_head.proj.weight"); + } +} + static byte_buf read_gguf_tensor_data(const gguf_file *g, const char *path, const char *name) { int idx = hmap_get(&g->tensor_map, name); if (idx < 0) { @@ -1574,11 +2158,15 @@ static uint64_t fnv1a64_bytes(const uint8_t *data, size_t n) { return h; } -static output_context build_output_context(const gguf_file *tmpl, const quant_policy *policy, const imatrix_store *im) { +static output_context build_output_context(const gguf_file *tmpl, const quant_policy *policy, + const imatrix_store *im, bool write_dspark, + const dspark_metadata *dspark) { output_context out = {0}; out.n_tensors = tmpl->n_tensors; - out.n_kv_extra = extra_imatrix_kv_count(im); + out.n_kv_extra = extra_imatrix_kv_count(im) + extra_dspark_kv_count(write_dspark); out.alignment = tmpl->alignment; + out.write_dspark = write_dspark; + if (write_dspark && dspark) out.dspark = *dspark; out.tensors = xcalloc((size_t)out.n_tensors, sizeof(out.tensors[0])); size_t tensor_info = 0; size_t off = 0; @@ -1598,7 +2186,8 @@ static output_context build_output_context(const gguf_file *tmpl, const quant_po tensor_info += gguf_string_size(dst->name) + 4 + (size_t)dst->n_dims * 8 + 4 + 8; } out.tensor_bytes = off; - out.meta_size = 4 + 4 + 8 + 8 + 
tmpl->kv_raw_len + extra_imatrix_kv_size(im) + tensor_info; + out.meta_size = 4 + 4 + 8 + 8 + tmpl->kv_raw_len + extra_imatrix_kv_size(im) + + extra_dspark_kv_size(write_dspark) + tensor_info; out.data_offset = ds4q_pad(out.meta_size, tmpl->alignment); return out; } @@ -1623,6 +2212,7 @@ static void write_full_gguf(st_db *db, const gguf_file *tmpl, const output_conte write_u64(fp, tmpl->n_kv + out_ctx->n_kv_extra); if (fwrite(tmpl->kv_raw, 1, tmpl->kv_raw_len, fp) != tmpl->kv_raw_len) die("write GGUF KV failed"); write_imatrix_kvs(fp, imatrix); + if (out_ctx->write_dspark) write_dspark_kvs(fp, &out_ctx->dspark); for (uint64_t i = 0; i < out_ctx->n_tensors; i++) { const tensor_meta *t = &out_ctx->tensors[i]; write_gguf_string(fp, t->name); @@ -1646,10 +2236,9 @@ static void write_full_gguf(st_db *db, const gguf_file *tmpl, const output_conte fprintf(stderr, "error: generated size mismatch for %s: got %zu expected %zu\n", dst->name, data.size, expected); exit(1); } - if (fwrite(data.data, 1, data.size, fp) != data.size) die_errno("write tensor", out_path); - size_t padded = ds4q_pad(data.size, out_ctx->alignment); + if (fwrite(data.data, 1, data.size, fp) != data.size) die("write tensor data failed"); + const size_t padded = ds4q_pad(data.size, out_ctx->alignment); write_padding(fp, padded - data.size); - fprintf(stderr, " generated %.2f MiB\n", (double)data.size / 1048576.0); free(data.data); } fclose(fp); @@ -1691,6 +2280,8 @@ typedef struct { bool dry_run; bool overwrite; bool imatrix_strict; + bool dspark_only; + bool self_test_dspark_map; } params; static void usage(const char *argv0) { @@ -1704,6 +2295,8 @@ static void usage(const char *argv0) { printf(" --compare-tensor NAME regenerate one tensor, byte-compare, and exit\n"); printf(" --overwrite replace --out if it already exists\n"); printf(" --dry-run print output plan without reading HF tensor data\n"); + printf(" --self-test-dspark-map validate DSpark HF map, policy, and metadata defaults\n"); + printf(" 
--dspark-only replace template tensors with official DSpark MTP tensors\n"); printf(" --imatrix FILE legacy .dat imatrix from ds4 --imatrix-out\n"); printf(" --imatrix-strict fail if a quantized tensor has no matching imatrix vector\n"); printf(" --experts TYPE set routed w1/w2/w3 expert tensors to TYPE\n"); @@ -1762,6 +2355,10 @@ static params parse_args(int argc, char **argv) { p.compare_tensor = need_value(argc, argv, &i, arg); } else if (strcmp(arg, "--overwrite") == 0) { p.overwrite = true; + } else if (strcmp(arg, "--self-test-dspark-map") == 0) { + p.self_test_dspark_map = true; + } else if (strcmp(arg, "--dspark-only") == 0) { + p.dspark_only = true; } else if (strcmp(arg, "--dry-run") == 0) { p.dry_run = true; } else if (strcmp(arg, "--imatrix") == 0) { @@ -1805,6 +2402,7 @@ static params parse_args(int argc, char **argv) { exit(1); } } + if (p.self_test_dspark_map) return p; if (!p.hf_dir) die("--hf is required"); if (!p.template_gguf) die("--template is required"); if (!p.dry_run && !p.compare_tensor && !p.out_gguf) die("--out is required unless --dry-run or --compare-tensor is used"); @@ -1813,6 +2411,18 @@ static params parse_args(int argc, char **argv) { return p; } +static void self_test_dspark_only_args(void) { + char *argv[] = { + "deepseek4-quantize", + "--self-test-dspark-map", + "--dspark-only", + }; + params p = parse_args((int)(sizeof(argv) / sizeof(argv[0])), argv); + if (!p.self_test_dspark_map || !p.dspark_only) { + die("bad --dspark-only self-test parsing"); + } +} + static void free_gguf_file(gguf_file *g) { free(g->path); free(g->kv_raw); @@ -1866,6 +2476,10 @@ static void compare_one_tensor(st_db *db, const gguf_file *tmpl, const output_co int main(int argc, char **argv) { params p = parse_args(argc, argv); + if (p.self_test_dspark_map) { + self_test_dspark_map(); + return 0; + } imatrix_store imatrix = {0}; if (p.imatrix_file) imatrix_load(&imatrix, p.imatrix_file, p.imatrix_strict); @@ -1881,12 +2495,31 @@ int main(int argc, char 
**argv) { } else { fprintf(stderr, "using %d routed experts from --n-experts\n", p.n_experts); } - output_context out_ctx = build_output_context(&tmpl, &p.policy, &imatrix); - print_plan(&tmpl, &out_ctx); - if (p.dry_run) return 0; st_db db; + bool write_dspark = false; + dspark_metadata dspark_meta = dspark_metadata_defaults(); + bool markov_rank_set = false; + dspark_meta = dspark_metadata_from_hf_config(p.hf_dir, &markov_rank_set); db_open(&db, p.hf_dir); + dspark_hf_layout dspark_layout = db_dspark_hf_layout(&db, markov_rank_set, dspark_meta.markov_rank); + if (dspark_layout != DS4_DSPARK_HF_NONE) { + write_dspark = true; + fprintf(stderr, "DSpark HF %s layout detected; writing deepseek4.dspark.* metadata\n", + dspark_hf_layout_name(dspark_layout)); + } + if (p.dspark_only) { + gguf_use_dspark_mtp_template(&tmpl, &db, p.n_experts, dspark_layout); + write_dspark = true; + } + output_context out_ctx = build_output_context(&tmpl, &p.policy, &imatrix, write_dspark, &dspark_meta); + print_plan(&tmpl, &out_ctx); + if (p.dry_run) { + db_close(&db); + free_gguf_file(&tmpl); + free(out_ctx.tensors); + return 0; + } if (p.compare_tensor) { compare_one_tensor(&db, &tmpl, &out_ctx, &p, &imatrix); db_close(&db); diff --git a/gguf-tools/deepspec/ds4_deepspec.py b/gguf-tools/deepspec/ds4_deepspec.py new file mode 100755 index 000000000..b76f85a73 --- /dev/null +++ b/gguf-tools/deepspec/ds4_deepspec.py @@ -0,0 +1,505 @@ +#!/usr/bin/env python3 +"""DS4 helpers for DeepSpec target-cache interoperability.""" + +from __future__ import annotations + +import argparse +import json +import struct +import sys +import tempfile +import textwrap +from pathlib import Path + +INDEX_RECORD_STRUCT = struct.Struct(" None: + if not condition: + raise CacheValidationError(message) + + +def _read_json(path: Path) -> dict: + try: + with path.open("r", encoding="utf-8") as fp: + data = json.load(fp) + except OSError as exc: + raise CacheValidationError(f"cannot read {path}: {exc}") from exc + 
except json.JSONDecodeError as exc: + raise CacheValidationError(f"invalid JSON in {path}: {exc}") from exc + _require(isinstance(data, dict), f"{path} is not a JSON object") + return data + + +def _required_int(manifest: dict, key: str) -> int: + value = manifest.get(key) + _require(isinstance(value, int) and value >= 0, f"manifest.{key} must be a non-negative integer") + return value + + +def _validate_manifest(manifest: dict, + expected_target_model: str | None, + expected_chat_template: str | None) -> tuple[int, list[int], int, list[dict]]: + _require(manifest.get("version") == TARGET_CACHE_VERSION, + f"manifest.version must be {TARGET_CACHE_VERSION}") + if "format" in manifest: + _require(manifest["format"] == "deepspec-target-cache", + "manifest.format must be deepspec-target-cache") + _require(manifest.get("hidden_dtype") == EXPECTED_HIDDEN_DTYPE, + f"manifest.hidden_dtype must be {EXPECTED_HIDDEN_DTYPE}") + _require(manifest.get("token_dtype") == EXPECTED_TOKEN_DTYPE, + f"manifest.token_dtype must be {EXPECTED_TOKEN_DTYPE}") + _require(manifest.get("mask_dtype") == EXPECTED_MASK_DTYPE, + f"manifest.mask_dtype must be {EXPECTED_MASK_DTYPE}") + _require(manifest.get("index_record_size") == INDEX_RECORD_STRUCT.size, + f"manifest.index_record_size must be {INDEX_RECORD_STRUCT.size}") + + hidden_size = _required_int(manifest, "hidden_size") + _require(hidden_size > 0, "manifest.hidden_size must be positive") + num_samples = _required_int(manifest, "num_samples") + num_shards = _required_int(manifest, "num_shards") + + layers = manifest.get("target_layer_ids") + _require(isinstance(layers, list) and len(layers) > 0, + "manifest.target_layer_ids must be a non-empty list") + _require(all(isinstance(layer, int) and layer >= 0 for layer in layers), + "manifest.target_layer_ids must contain non-negative integers") + _require(len(set(layers)) == len(layers), "manifest.target_layer_ids must not contain duplicates") + _require(layers == sorted(layers), 
"manifest.target_layer_ids must be sorted in capture order") + + target_hidden_layers = manifest.get("target_hidden_layers") + if target_hidden_layers is not None: + _require(target_hidden_layers == len(layers), + "manifest.target_hidden_layers must match target_layer_ids length") + + if expected_target_model is not None: + _require(manifest.get("target_model_name_or_path") == expected_target_model, + "manifest.target_model_name_or_path does not match expected target model") + + if expected_chat_template is not None: + convention = manifest.get("input_convention") + _require(isinstance(convention, dict), "manifest.input_convention must be an object") + _require(convention.get("chat_template") == expected_chat_template, + "manifest.input_convention.chat_template does not match expected template") + + shards = manifest.get("shards") + _require(isinstance(shards, list), "manifest.shards must be a list") + _require(len(shards) == num_shards, "manifest.num_shards must match shards length") + if num_samples > 0: + _require(num_shards > 0, "manifest with samples must contain at least one shard") + return hidden_size, layers, num_samples, shards + + +def _load_shard_map(cache_dir: Path, shards: list[dict]) -> dict[int, Path]: + shard_map: dict[int, Path] = {} + for entry in shards: + _require(isinstance(entry, dict), "manifest.shards entries must be objects") + shard_id = entry.get("shard_id") + file_name = entry.get("file_name") + _require(isinstance(shard_id, int) and shard_id >= 0, "shard_id must be a non-negative integer") + _require(isinstance(file_name, str) and file_name, "shard file_name must be a non-empty string") + _require(shard_id not in shard_map, f"duplicate shard_id {shard_id}") + path = cache_dir / file_name + _require(path.is_file(), f"missing shard file {path}") + shard_map[shard_id] = path + return shard_map + + +def _intervals_for_record(seq_len: int, + hidden_size: int, + num_layers: int, + offsets: tuple[int, int, int, int, int]) -> list[tuple[str, 
int, int]]: + input_ids_offset, attention_mask_offset, loss_mask_offset, target_hidden_offset, target_last_offset = offsets + target_hidden_bytes = seq_len * num_layers * hidden_size * 2 + target_last_bytes = seq_len * hidden_size * 2 + return [ + ("input_ids", input_ids_offset, seq_len * 4), + ("attention_mask", attention_mask_offset, seq_len), + ("loss_mask", loss_mask_offset, seq_len), + ("target_hidden_states", target_hidden_offset, target_hidden_bytes), + ("target_last_hidden_states", target_last_offset, target_last_bytes), + ] + + +def _validate_record(cache_dir: Path, + record_index: int, + record: tuple[int, int, int, int, int, int, int, int], + shard_map: dict[int, Path], + hidden_size: int, + num_layers: int) -> None: + sample_id, shard_id, seq_len, input_ids_offset, attention_mask_offset, loss_mask_offset, target_hidden_offset, target_last_offset = record + _require(sample_id == record_index, + f"record {record_index} sample_id is {sample_id}, expected {record_index}") + _require(seq_len > 0, f"record {record_index} seq_len must be positive") + _require(shard_id in shard_map, f"record {record_index} references unknown shard_id {shard_id}") + shard = shard_map[shard_id] + shard_size = shard.stat().st_size + intervals = _intervals_for_record(seq_len, + hidden_size, + num_layers, + (input_ids_offset, + attention_mask_offset, + loss_mask_offset, + target_hidden_offset, + target_last_offset)) + sorted_intervals = sorted(intervals, key=lambda item: item[1]) + for name, offset, size in sorted_intervals: + _require(offset >= 0, f"record {record_index} {name} offset must be non-negative") + _require(size > 0, f"record {record_index} {name} size must be positive") + _require(offset + size <= shard_size, + f"record {record_index} {name} extends beyond shard {shard.relative_to(cache_dir)}") + for (_, prev_offset, prev_size), (name, offset, _) in zip(sorted_intervals, sorted_intervals[1:]): + _require(prev_offset + prev_size <= offset, + f"record {record_index} 
{name} overlaps previous tensor payload") + + +def validate_target_cache(cache_dir: Path, + expected_target_model: str | None = None, + expected_chat_template: str | None = None) -> dict: + cache_dir = cache_dir.resolve() + _require(cache_dir.is_dir(), f"cache directory does not exist: {cache_dir}") + manifest = _read_json(cache_dir / "manifest.json") + hidden_size, layers, num_samples, shards = _validate_manifest(manifest, + expected_target_model, + expected_chat_template) + shard_map = _load_shard_map(cache_dir, shards) + index_path = cache_dir / "samples.idx" + _require(index_path.is_file(), f"missing index file {index_path}") + index_size = index_path.stat().st_size + _require(index_size == num_samples * INDEX_RECORD_STRUCT.size, + "samples.idx size must equal num_samples * index_record_size") + with index_path.open("rb") as fp: + for record_index in range(num_samples): + raw = fp.read(INDEX_RECORD_STRUCT.size) + _require(len(raw) == INDEX_RECORD_STRUCT.size, + f"short samples.idx record {record_index}") + _validate_record(cache_dir, + record_index, + INDEX_RECORD_STRUCT.unpack(raw), + shard_map, + hidden_size, + len(layers)) + return { + "cache_dir": str(cache_dir), + "num_samples": num_samples, + "num_shards": len(shards), + "hidden_size": hidden_size, + "target_layer_ids": layers, + "index_record_size": INDEX_RECORD_STRUCT.size, + } + +def render_nonseq_config(target_cache_path: str | None = None, + target_model_name_or_path: str = DEFAULT_TARGET_MODEL, + chat_template: str = DEFAULT_CHAT_TEMPLATE, + target_layer_ids: list[int] | None = None, + max_train_steps: int | None = None, + global_batch_size: int = 512, + local_batch_size: int = 1) -> str: + """Return a DeepSpec config for a DeepSeek-V4 non-Markov DSpark pilot.""" + if target_layer_ids is None: + target_layer_ids = DEFAULT_TARGET_LAYER_IDS + _require(len(target_layer_ids) > 0, "target_layer_ids must not be empty") + return textwrap.dedent(f"""\ + # Generated by ds4_deepspec.py for DS4 DeepSpec 
training. + import os + + try: + from deepspec.trainer import DeepSeekV4DSparkTrainer + except ImportError as exc: + raise RuntimeError( + "DS4 DeepSeek-V4 DSpark training needs a DeepSpec checkout/fork " + "that provides DeepSeekV4DSparkTrainer; upstream DeepSpec main " + "currently ships Qwen3/Gemma trainers only." + ) from exc + + BASE_TB_DIR = os.path.expanduser("~/tensorboard") + BASE_CKPT_DIR = os.path.expanduser("~/checkpoints") + + seed = 42 + project_name = "deepspec" + exp_name = "dspark_block5_deepseek_v4_flash_nonseq" + + model = dict( + target_model_name_or_path={target_model_name_or_path!r}, + block_size={DEFAULT_DSPARK_BLOCK_SIZE}, + num_draft_layers={len(target_layer_ids)}, + target_layer_ids={target_layer_ids!r}, + mask_token_id={DEFAULT_MASK_TOKEN_ID}, + num_anchors=512, + markov_rank=0, + markov_head_type="vanilla", + confidence_head_alpha=0.0, + confidence_head_with_markov=False, + ) + + train = dict( + trainer_cls=DeepSeekV4DSparkTrainer, + lr=6.0e-4, + warmup_ratio=0.04, + weight_decay=0.0, + precision="bf16", + local_batch_size={local_batch_size}, + global_batch_size={global_batch_size}, + num_train_epochs=10, + max_train_steps={max_train_steps!r}, + max_grad_norm=1.0, + sharding_strategy="no_shard", + torch_compile=False, + loss_decay_gamma=None, + ce_loss_alpha=1.0, + l1_loss_alpha=0.0, + ) + + logging = dict( + logging_steps=10, + checkpointing_steps=3000, + ) + + data = dict( + target_cache_path={target_cache_path!r}, + chat_template={chat_template!r}, + max_length=4096, + num_workers=4, + ) + + def finalize_cfg(cfg): + logging_cfg = dict(cfg["logging"]) + project = str(cfg["project_name"]) + exp = str(cfg["exp_name"]) + logging_cfg["checkpoint_dir"] = os.path.join(BASE_CKPT_DIR, project, exp) + logging_cfg["tensorboard_dir"] = os.path.join(BASE_TB_DIR, project, exp) + cfg["logging"] = logging_cfg + return cfg + """) + + +def _target_cache_config_defaults(target_cache_path: str, + target_model_name_or_path: str | None, + chat_template: 
str | None) -> tuple[str, str, list[int]]: + cache_dir = Path(target_cache_path) + manifest = _read_json(cache_dir / "manifest.json") + + manifest_target = manifest.get("target_model_name_or_path") + if target_model_name_or_path is None: + _require(isinstance(manifest_target, str) and manifest_target, + "manifest.target_model_name_or_path is required to emit a config without --target-model") + target_model_name_or_path = manifest_target + elif manifest_target is not None: + _require(isinstance(manifest_target, str) and manifest_target, + "manifest.target_model_name_or_path must be a non-empty string when present") + _require(manifest_target == target_model_name_or_path, + "manifest.target_model_name_or_path does not match expected target model") + + convention = manifest.get("input_convention") + manifest_template = None + if convention is not None: + _require(isinstance(convention, dict), "manifest.input_convention must be an object") + manifest_template = convention.get("chat_template") + if manifest_template is not None: + _require(isinstance(manifest_template, str) and manifest_template, + "manifest.input_convention.chat_template must be a non-empty string when present") + if chat_template is None: + _require(isinstance(manifest_template, str) and manifest_template, + "manifest.input_convention.chat_template is required to emit a config without --chat-template") + chat_template = manifest_template + elif manifest_template is not None: + _require(manifest_template == chat_template, + "manifest.input_convention.chat_template does not match expected template") + + _, target_layer_ids, _, _ = _validate_manifest(manifest, None, None) + return target_model_name_or_path, chat_template, target_layer_ids + + +def write_nonseq_config(path: Path, + target_cache_path: str | None = None, + target_model_name_or_path: str | None = None, + chat_template: str | None = None, + max_train_steps: int | None = None, + global_batch_size: int = 512, + local_batch_size: int = 1, + 
overwrite: bool = False) -> dict: + if path.exists() and not overwrite: + raise CacheValidationError(f"refusing to overwrite existing config: {path}") + _require(target_cache_path is not None and target_cache_path != "", + "--target-cache is required with --emit-nonseq-config") + if max_train_steps is not None: + _require(max_train_steps > 0, "--max-train-steps must be positive") + _require(global_batch_size > 0, "--global-batch-size must be positive") + _require(local_batch_size > 0, "--local-batch-size must be positive") + target_model_name_or_path, chat_template, target_layer_ids = _target_cache_config_defaults( + target_cache_path, + target_model_name_or_path, + chat_template) + config = render_nonseq_config(target_cache_path, + target_model_name_or_path, + chat_template, + target_layer_ids, + max_train_steps, + global_batch_size, + local_batch_size) + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(config, encoding="utf-8") + return { + "config": str(path), + "target_model_name_or_path": target_model_name_or_path, + "chat_template": chat_template, + "target_cache_path": target_cache_path, + "markov_rank": 0, + } + + +def _write_self_test_cache(cache_dir: Path, + target_model_name_or_path: str = DEFAULT_TARGET_MODEL, + chat_template: str = DEFAULT_CHAT_TEMPLATE, + include_optional_config: bool = True) -> None: + hidden_size = 4 + layers = [1, 2, 3] + seq_len = 2 + shard = cache_dir / "shard-00000.bin" + index = cache_dir / "samples.idx" + manifest = cache_dir / "manifest.json" + cache_dir.mkdir(parents=True, exist_ok=True) + offsets: list[int] = [] + payloads = [ + struct.pack(" dict: + with tempfile.TemporaryDirectory(prefix="ds4-deepspec-cache-") as tmp: + cache_dir = Path(tmp) / "cache" + config_path = Path(tmp) / "dspark_v4_nonseq.py" + self_test_target_model = "local/self-test-target" + self_test_chat_template = "self_test_template" + _write_self_test_cache(cache_dir, + target_model_name_or_path=self_test_target_model, + 
chat_template=self_test_chat_template) + cache_result = validate_target_cache(cache_dir, + expected_target_model=self_test_target_model, + expected_chat_template=self_test_chat_template) + config_result = write_nonseq_config(config_path, + target_cache_path=str(cache_dir), + max_train_steps=1) + config_text = config_path.read_text(encoding="utf-8") + compile(config_text, str(config_path), "exec") + _require(f"target_model_name_or_path={self_test_target_model!r}" in config_text, + "emitted config must inherit target model from cache manifest") + _require(f"chat_template={self_test_chat_template!r}" in config_text, + "emitted config must inherit chat template from cache manifest") + _require("block_size=5" in config_text, "emitted config must use DeepSeek-V4 DSpark block_size=5") + _require("num_draft_layers=3" in config_text, "emitted config must use the three DSpark MTP layers") + _require("target_layer_ids=[1, 2, 3]" in config_text, + "emitted config must inherit target layers from cache manifest") + optional_cache_dir = Path(tmp) / "optional-cache" + optional_config_path = Path(tmp) / "optional_nonseq.py" + explicit_target_model = "explicit/target" + explicit_chat_template = "explicit_template" + _write_self_test_cache(optional_cache_dir, include_optional_config=False) + optional_config = write_nonseq_config(optional_config_path, + target_cache_path=str(optional_cache_dir), + target_model_name_or_path=explicit_target_model, + chat_template=explicit_chat_template, + max_train_steps=1) + optional_text = optional_config_path.read_text(encoding="utf-8") + compile(optional_text, str(optional_config_path), "exec") + _require(optional_config["target_model_name_or_path"] == explicit_target_model, + "explicit target model must be accepted when optional manifest target is absent") + _require(optional_config["chat_template"] == explicit_chat_template, + "explicit chat template must be accepted when optional manifest template is absent") + 
_require(f"target_model_name_or_path={explicit_target_model!r}" in optional_text, + "explicit target model must be emitted when optional manifest target is absent") + _require(f"chat_template={explicit_chat_template!r}" in optional_text, + "explicit chat template must be emitted when optional manifest template is absent") + cache_result["nonseq_config"] = config_result + return cache_result + + +def _parse_args(argv: list[str]) -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Validate DS4 DeepSpec target-cache artifacts.") + parser.add_argument("cache_dir", nargs="?", help="Directory containing manifest.json, samples.idx, and shard files.") + parser.add_argument("--target-model", help="Expected manifest target_model_name_or_path, or emitted config target model.") + parser.add_argument("--chat-template", help="Expected manifest input_convention.chat_template, or emitted config chat template.") + parser.add_argument("--self-test", action="store_true", help="Run the built-in synthetic cache/config compatibility smoke.") + parser.add_argument("--emit-nonseq-config", metavar="FILE", help="Write a DeepSeek-V4 non-Markov DSpark DeepSpec config.") + parser.add_argument("--target-cache", help="target_cache_path value for --emit-nonseq-config.") + parser.add_argument("--max-train-steps", type=int, help="Optional train.max_train_steps value for the emitted config.") + parser.add_argument("--global-batch-size", type=int, default=512, help="Emitted train.global_batch_size. Default: 512.") + parser.add_argument("--local-batch-size", type=int, default=1, help="Emitted train.local_batch_size. 
Default: 1.") + parser.add_argument("--overwrite", action="store_true", help="Allow --emit-nonseq-config to replace FILE.") + return parser.parse_args(argv) + + +def main(argv: list[str]) -> int: + args = _parse_args(argv) + try: + if args.emit_nonseq_config: + result = write_nonseq_config(Path(args.emit_nonseq_config), + target_cache_path=args.target_cache, + target_model_name_or_path=args.target_model, + chat_template=args.chat_template, + max_train_steps=args.max_train_steps, + global_batch_size=args.global_batch_size, + local_batch_size=args.local_batch_size, + overwrite=args.overwrite) + elif args.self_test: + result = self_test() + else: + _require(args.cache_dir is not None, "cache_dir is required unless --self-test or --emit-nonseq-config is used") + result = validate_target_cache(Path(args.cache_dir), + expected_target_model=args.target_model, + expected_chat_template=args.chat_template) + except CacheValidationError as exc: + print(f"ds4-deepspec: {exc}", file=sys.stderr) + return 1 + json.dump(result, sys.stdout, indent=2, sort_keys=True) + sys.stdout.write("\n") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main(sys.argv[1:])) diff --git a/rocm/ds4_rocm_attention_launch.cuh b/rocm/ds4_rocm_attention_launch.cuh index b9b43d958..0691db2e8 100644 --- a/rocm/ds4_rocm_attention_launch.cuh +++ b/rocm/ds4_rocm_attention_launch.cuh @@ -324,6 +324,25 @@ extern "C" int ds4_gpu_attention_decode_raw_batch_heads_tensor( n_head, head_dim); } +extern "C" int ds4_gpu_attention_decode_raw_batch_heads_noncausal_tensor( + ds4_gpu_tensor *heads, + const void *model_map, + uint64_t model_size, + uint64_t sinks_offset, + const ds4_gpu_tensor *q, + const ds4_gpu_tensor *raw_kv, + uint32_t n_tokens, + uint32_t n_raw, + uint32_t raw_cap, + uint32_t raw_start, + uint32_t n_head, + uint32_t head_dim) { + (void)heads; (void)model_map; (void)model_size; (void)sinks_offset; + (void)q; (void)raw_kv; (void)n_tokens; (void)n_raw; (void)raw_cap; + (void)raw_start; 
(void)n_head; (void)head_dim; + return 0; +} + extern "C" int ds4_gpu_attention_decode_mixed_batch_heads_tensor( ds4_gpu_tensor *heads, const void *model_map, diff --git a/tests/ds4_test.c b/tests/ds4_test.c index ea1e52487..eea2db8ea 100644 --- a/tests/ds4_test.c +++ b/tests/ds4_test.c @@ -1,9 +1,11 @@ #define DS4_SERVER_TEST #define DS4_SERVER_TEST_NO_MAIN #include "../ds4_server.c" +#include "../ds4_dspark_runtime.h" #ifndef DS4_NO_GPU #include "../ds4_gpu.h" #include +#include static ds4_engine *test_engine_fast; static ds4_engine *test_engine_quality; @@ -85,11 +87,24 @@ static void test_restore_canonical_streaming_prefill( saved.batch_selected_addr); } +static ds4_backend test_backend(void) { +#ifdef __APPLE__ + return DS4_BACKEND_METAL; +#else + return DS4_BACKEND_CUDA; +#endif +} + + static ds4_engine *test_open_engine(bool quality) { ds4_engine *engine = NULL; - /* DS4_TEST_MTP loads the MTP head on the fast engine so the speculative - * verify regression can reuse it; draft=4 hits the multi-row verify path. */ - const char *mtp = getenv("DS4_TEST_MTP"); + /* DS4_TEST_MTP loads the legacy MTP head on the fast engine so the speculative + * verify regression can reuse it; draft=4 hits the multi-row verify path. + * DS4_TEST_DSPARK loads an official DSpark draft GGUF and lets metadata choose + * the block size. */ + const char *dspark = getenv("DS4_TEST_DSPARK"); + const char *mtp = (dspark && dspark[0]) ? dspark : getenv("DS4_TEST_MTP"); + const bool use_mtp = mtp && mtp[0] && !quality; ds4_engine_options opt = { .model_path = test_model_path(), #ifdef __APPLE__ @@ -106,8 +121,8 @@ static ds4_engine *test_open_engine(bool quality) { test_env_gib("DS4_TEST_SSD_STREAMING_CACHE_GB"), .ssd_streaming_preload_experts = test_env_u32("DS4_TEST_SSD_STREAMING_PRELOAD_EXPERTS"), - .mtp_path = (mtp && mtp[0] && !quality) ? mtp : NULL, - .mtp_draft_tokens = (mtp && mtp[0] && !quality) ? 4 : 0, + .mtp_path = use_mtp ? 
mtp : NULL, + .mtp_draft_tokens = use_mtp && !(dspark && dspark[0]) ? 4 : 0, }; TEST_ASSERT(ds4_engine_open(&engine, &opt) == 0); return engine; @@ -2174,8 +2189,368 @@ static void test_mtp_verify_depth(void) { free(spec); ds4_tokens_free(&prompt); } + +static void test_dspark_speculative_block(void) { + const char *dspark = getenv("DS4_TEST_DSPARK"); + if (!dspark || !dspark[0]) { + fprintf(stderr, "ds4-test: dspark-speculative-block skipped (set DS4_TEST_DSPARK to a DSpark GGUF)\n"); + return; + } + + ds4_engine *engine = test_get_engine(false); + const ds4_mtp_draft_kind draft_kind = ds4_engine_mtp_draft_kind(engine); + TEST_ASSERT(draft_kind == DS4_MTP_DRAFT_DSPARK); + if (!ds4_mtp_draft_runtime_supported(test_backend(), draft_kind)) { + fprintf(stderr, "ds4-test: dspark-speculative-block skipped (backend does not support DSpark runtime)\n"); + return; + } + TEST_ASSERT(ds4_engine_has_mtp(engine)); + TEST_ASSERT(ds4_engine_mtp_draft_tokens(engine) == 5); + + ds4_tokens prompt = {0}; + ds4_chat_begin(engine, &prompt); + ds4_chat_append_message(engine, &prompt, "user", test_mtp_copy_prompt()); + ds4_chat_append_assistant_prefix(engine, &prompt, DS4_THINK_NONE); + TEST_ASSERT(prompt.len > 0); + + int *spec = malloc((size_t)TEST_MTP_MAXGEN * sizeof(*spec)); + TEST_ASSERT(spec != NULL); + if (spec && prompt.len > 0) { + int nspec = 0, max_chunk = 0; + const bool ok_spec = test_mtp_capture_speculative(engine, &prompt, 96, + spec, &nspec, &max_chunk); + TEST_ASSERT(ok_spec); + TEST_ASSERT(max_chunk > 1); + + float worst_gap = 0.0f; + int worst_at = -1; + const bool ok_check = test_mtp_worst_argmax_gap(engine, &prompt, spec, nspec, + &worst_gap, &worst_at); + TEST_ASSERT(ok_check); + fprintf(stderr, "ds4-test: dspark-speculative-block nspec=%d max_chunk=%d worst_argmax_gap=%.3f at=%d\n", + nspec, max_chunk, worst_gap, worst_at); + TEST_ASSERT(worst_gap <= 2.0f); + } + + free(spec); + ds4_tokens_free(&prompt); +} + + #endif +static void test_dspark_binder_helpers(void) 
{ + ds4_dspark_config cfg; + ds4_dspark_config_init_defaults(&cfg); + TEST_ASSERT(cfg.n_mtp_layers == 3); + TEST_ASSERT(cfg.block_size == 5); + TEST_ASSERT(cfg.noise_token_id == 128799u); + TEST_ASSERT(cfg.markov_rank == 256); + TEST_ASSERT(cfg.target_layer_ids[0] == 40); + TEST_ASSERT(cfg.target_layer_ids[1] == 41); + TEST_ASSERT(cfg.target_layer_ids[2] == 42); + + TEST_ASSERT(ds4_mtp_draft_kind_guess(false, false, false) == DS4_MTP_DRAFT_NONE); + TEST_ASSERT(ds4_mtp_draft_kind_guess(true, false, false) == DS4_MTP_DRAFT_LEGACY); + TEST_ASSERT(ds4_mtp_draft_kind_guess(false, true, true) == DS4_MTP_DRAFT_DSPARK); + TEST_ASSERT(ds4_mtp_draft_kind_guess_ex(false, true, false, true, 0) == + DS4_MTP_DRAFT_DSPARK_NONSEQ); + TEST_ASSERT(ds4_mtp_draft_kind_guess_ex(false, true, false, false, 0) == + DS4_MTP_DRAFT_NONE); + TEST_ASSERT(ds4_mtp_draft_kind_guess_ex(false, true, false, true, 256) == + DS4_MTP_DRAFT_NONE); + TEST_ASSERT(!strcmp(ds4_mtp_draft_kind_name(DS4_MTP_DRAFT_DSPARK_NONSEQ), + "dspark-nonseq")); + TEST_ASSERT(!strcmp(ds4_mtp_draft_kind_name(DS4_MTP_DRAFT_DSPARK), "dspark")); + TEST_ASSERT(!strcmp(ds4_mtp_draft_kind_name(DS4_MTP_DRAFT_LEGACY), "legacy-mtp")); +} + +static void test_dspark_markov_bf16_helpers(void) { + TEST_ASSERT(fabsf(ds4_dspark_bf16_to_f32(0x3fc0u) - 1.5f) < 0.001f); + TEST_ASSERT(fabsf(ds4_dspark_bf16_to_f32(0xbe80u) + 0.25f) < 0.001f); +} + + +static void test_dspark_runtime_helpers(void) { + ds4_dspark_config cfg; + ds4_dspark_config_init_defaults(&cfg); + TEST_ASSERT(ds4_dspark_speculative_gate(DS4_MTP_DRAFT_LEGACY, true, 4) == + DS4_DSPARK_SPEC_LEGACY_MTP); + TEST_ASSERT(ds4_dspark_speculative_gate(DS4_MTP_DRAFT_DSPARK, true, 5) == + DS4_DSPARK_SPEC_DSPARK_ENABLED); + TEST_ASSERT(ds4_dspark_speculative_gate(DS4_MTP_DRAFT_DSPARK_NONSEQ, true, 5) == + DS4_DSPARK_SPEC_DSPARK_NONSEQ_NOT_READY); + TEST_ASSERT(ds4_dspark_speculative_gate(DS4_MTP_DRAFT_DSPARK, true, 1) == + DS4_DSPARK_SPEC_DISABLED); + 
TEST_ASSERT(strstr(ds4_dspark_spec_gate_reason(DS4_DSPARK_SPEC_DSPARK_ENABLED), + "enabled") != NULL); + TEST_ASSERT(strstr(ds4_dspark_spec_gate_reason(DS4_DSPARK_SPEC_DSPARK_NONSEQ_NOT_READY), + "nonseq") != NULL); + TEST_ASSERT(ds4_mtp_speculative_draft_ready(DS4_MTP_DRAFT_LEGACY)); + TEST_ASSERT(!ds4_mtp_speculative_draft_ready(DS4_MTP_DRAFT_NONE)); + TEST_ASSERT(ds4_mtp_speculative_draft_ready(DS4_MTP_DRAFT_DSPARK)); + TEST_ASSERT(!ds4_mtp_speculative_draft_ready(DS4_MTP_DRAFT_DSPARK_NONSEQ)); + TEST_ASSERT(ds4_mtp_draft_runtime_supported(DS4_BACKEND_METAL, + DS4_MTP_DRAFT_DSPARK)); + TEST_ASSERT(!ds4_mtp_draft_runtime_supported(DS4_BACKEND_CUDA, + DS4_MTP_DRAFT_DSPARK)); + TEST_ASSERT(!ds4_mtp_draft_runtime_supported(DS4_BACKEND_CUDA, + DS4_MTP_DRAFT_DSPARK_NONSEQ)); + TEST_ASSERT(!ds4_mtp_draft_runtime_supported(DS4_BACKEND_CPU, + DS4_MTP_DRAFT_LEGACY)); + TEST_ASSERT(strstr(ds4_dspark_spec_gate_reason(DS4_DSPARK_SPEC_DSPARK_NONSEQ_NOT_READY), + "nonseq") != NULL); + TEST_ASSERT(strstr(ds4_dspark_spec_gate_reason(DS4_DSPARK_SPEC_DSPARK_NONSEQ_NOT_READY), + "not been validated") != NULL); + TEST_ASSERT(ds4_engine_has_mtp(NULL) == false); +} + +static uint32_t test_le32(const unsigned char *p) { + return (uint32_t)p[0] | + ((uint32_t)p[1] << 8) | + ((uint32_t)p[2] << 16) | + ((uint32_t)p[3] << 24); +} + +static uint64_t test_le64(const unsigned char *p) { + return (uint64_t)p[0] | + ((uint64_t)p[1] << 8) | + ((uint64_t)p[2] << 16) | + ((uint64_t)p[3] << 24) | + ((uint64_t)p[4] << 32) | + ((uint64_t)p[5] << 40) | + ((uint64_t)p[6] << 48) | + ((uint64_t)p[7] << 56); +} + +static bool test_file_size(const char *path, uint64_t *size_out) { + struct stat st; + if (stat(path, &st) != 0 || st.st_size < 0) return false; + *size_out = (uint64_t)st.st_size; + return true; +} +static bool test_bf16_region_nonzero_finite(const char *path, + uint64_t offset, + uint64_t bytes) { + if (!path || bytes == 0 || (bytes & 1u) != 0) return false; + FILE *fp = fopen(path, "rb"); + 
if (!fp) return false; + if (fseeko(fp, (off_t)offset, SEEK_SET) != 0) { + fclose(fp); + return false; + } + unsigned char buf[4096]; + uint64_t remaining = bytes; + uint64_t values = 0; + uint64_t nonzero = 0; + while (remaining > 0) { + size_t chunk = remaining < sizeof(buf) ? (size_t)remaining : sizeof(buf); + if ((chunk & 1u) != 0) chunk--; + if (chunk == 0 || fread(buf, 1, chunk, fp) != chunk) { + fclose(fp); + return false; + } + for (size_t i = 0; i < chunk; i += 2) { + uint16_t u = (uint16_t)buf[i] | ((uint16_t)buf[i + 1] << 8); + if ((u & 0x7f80u) == 0x7f80u) { + fclose(fp); + return false; + } + if (u != 0) nonzero++; + values++; + } + remaining -= chunk; + } + return fclose(fp) == 0 && values == bytes / 2 && nonzero > 0; +} + + +static bool test_write_dspark_target_cache_dataset(const char *path) { + FILE *fp = fopen(path, "wb"); + if (!fp) return false; + const bool ok = fputs("===== DS4_IMATRIX_PROMPT 0 =====\n" + "Explain target cache export in one short sentence.\n", + fp) >= 0; + return fclose(fp) == 0 && ok; +} + +static int test_run_dspark_target_cache_cli(const char *dataset_path, + const char *output_dir) { + pid_t pid = fork(); + if (pid < 0) return -1; + if (pid == 0) { + execl("./ds4", "./ds4", + "-m", test_model_path(), + "--metal", + "--dspark-target-cache-dataset", dataset_path, + "--dspark-target-cache-out", output_dir, + "--dspark-target-cache-target-model", "deepseek-ai/DeepSeek-V4-Flash", + "--dspark-target-cache-chat-template", "deepseek_v4_rendered", + "--dspark-target-cache-max-prompts", "1", + "--dspark-target-cache-max-tokens", "8", + "--ctx", "128", + (char *)NULL); + _exit(127); + } + int status = 0; + while (waitpid(pid, &status, 0) < 0) { + if (errno != EINTR) return -1; + } + if (!WIFEXITED(status)) return -1; + return WEXITSTATUS(status); +} + +static int test_run_dspark_target_cache_cli_missing_target_model(const char *dataset_path, + const char *output_dir) { + pid_t pid = fork(); + if (pid < 0) return -1; + if (pid == 0) 
{ + execl("./ds4", "./ds4", + "-m", test_model_path(), + "--metal", + "--dspark-target-cache-dataset", dataset_path, + "--dspark-target-cache-out", output_dir, + "--dspark-target-cache-chat-template", "deepseek_v4_rendered", + "--dspark-target-cache-max-prompts", "1", + "--dspark-target-cache-max-tokens", "8", + "--ctx", "128", + (char *)NULL); + _exit(127); + } + int status = 0; + while (waitpid(pid, &status, 0) < 0) { + if (errno != EINTR) return -1; + } + if (!WIFEXITED(status)) return -1; + return WEXITSTATUS(status); +} +static bool test_json_u64_field(const char *json, const char *key, uint64_t *out) { + const char *p = strstr(json, key); + if (!p) return false; + p += strlen(key); + while (*p == ' ' || *p == '\t') p++; + char *end = NULL; + unsigned long long v = strtoull(p, &end, 10); + if (end == p) return false; + *out = (uint64_t)v; + return true; +} + + +static void test_dspark_target_cache_export(void) { + char root_template[PATH_MAX]; + snprintf(root_template, sizeof(root_template), "%s", + "/tmp/ds4-target-cache-test-XXXXXX"); + char *root = mkdtemp(root_template); + TEST_ASSERT(root != NULL); + if (!root) return; + + char dataset_path[PATH_MAX]; + char output_dir[PATH_MAX]; + char missing_target_output_dir[PATH_MAX]; + char manifest_path[PATH_MAX]; + char lock_path[PATH_MAX]; + char index_path[PATH_MAX]; + char shard_path[PATH_MAX]; + TEST_ASSERT(snprintf(dataset_path, sizeof(dataset_path), "%s/prompts.txt", root) < + (int)sizeof(dataset_path)); + TEST_ASSERT(snprintf(output_dir, sizeof(output_dir), "%s/cache", root) < + (int)sizeof(output_dir)); + TEST_ASSERT(snprintf(missing_target_output_dir, sizeof(missing_target_output_dir), + "%s/missing-target-cache", root) < + (int)sizeof(missing_target_output_dir)); + TEST_ASSERT(snprintf(manifest_path, sizeof(manifest_path), "%s/manifest.json", + output_dir) < (int)sizeof(manifest_path)); + TEST_ASSERT(snprintf(index_path, sizeof(index_path), "%s/samples.idx", output_dir) < + (int)sizeof(index_path)); + 
TEST_ASSERT(snprintf(shard_path, sizeof(shard_path), "%s/shard-00000.bin", + output_dir) < (int)sizeof(shard_path)); + TEST_ASSERT(snprintf(lock_path, sizeof(lock_path), "%s/ds4.lock", root) < + (int)sizeof(lock_path)); + TEST_ASSERT(setenv("DS4_LOCK_FILE", lock_path, 1) == 0); + TEST_ASSERT(test_write_dspark_target_cache_dataset(dataset_path)); + const int missing_target_rc = + test_run_dspark_target_cache_cli_missing_target_model(dataset_path, + missing_target_output_dir); + TEST_ASSERT(missing_target_rc != 0); + + const int rc = test_run_dspark_target_cache_cli(dataset_path, output_dir); + TEST_ASSERT(rc == 0); + if (rc != 0) return; + + char *manifest = test_read_file(manifest_path); + TEST_ASSERT(manifest != NULL); + if (!manifest) return; + uint64_t hidden_size = 0; + uint64_t target_hidden_layers = 0; + TEST_ASSERT(strstr(manifest, "\"version\": 2") != NULL); + TEST_ASSERT(strstr(manifest, "\"format\": \"deepspec-target-cache\"") != NULL); + TEST_ASSERT(strstr(manifest, "\"producer\": \"ds4\"") != NULL); + TEST_ASSERT(strstr(manifest, "\"target_model_name_or_path\": \"deepseek-ai/DeepSeek-V4-Flash\"") != NULL); + TEST_ASSERT(strstr(manifest, "\"source_gguf_path\": \"") != NULL); + TEST_ASSERT(strstr(manifest, "\"chat_template\": \"deepseek_v4_rendered\"") != NULL); + TEST_ASSERT(strstr(manifest, "\"target_layer_ids\": [40, 41, 42]") != NULL); + TEST_ASSERT(strstr(manifest, "\"hidden_dtype\": \"bfloat16\"") != NULL); + TEST_ASSERT(strstr(manifest, "\"token_dtype\": \"int32\"") != NULL); + TEST_ASSERT(strstr(manifest, "\"mask_dtype\": \"uint8\"") != NULL); + TEST_ASSERT(strstr(manifest, "\"index_record_size\": 56") != NULL); + TEST_ASSERT(test_json_u64_field(manifest, "\"target_hidden_layers\": ", + &target_hidden_layers)); + TEST_ASSERT(target_hidden_layers == 3); + TEST_ASSERT(strstr(manifest, "\"sample_split_marker\": \"===== DS4_IMATRIX_PROMPT\"") != NULL); + TEST_ASSERT(strstr(manifest, "\"shard-00000.bin\"") != NULL); + 
TEST_ASSERT(test_json_u64_field(manifest, "\"hidden_size\": ", &hidden_size)); + TEST_ASSERT(hidden_size > 0); + free(manifest); + + uint64_t index_size = 0; + uint64_t shard_size = 0; + TEST_ASSERT(test_file_size(index_path, &index_size)); + TEST_ASSERT(index_size == 56); + TEST_ASSERT(test_file_size(shard_path, &shard_size)); + TEST_ASSERT(shard_size > 0); + if (index_size != 56 || shard_size == 0) return; + + FILE *idx = fopen(index_path, "rb"); + TEST_ASSERT(idx != NULL); + if (!idx) return; + unsigned char rec[56]; + TEST_ASSERT(fread(rec, 1, sizeof(rec), idx) == sizeof(rec)); + TEST_ASSERT(fclose(idx) == 0); + + const uint64_t sample_id = test_le64(rec + 0); + const uint32_t shard_id = test_le32(rec + 8); + const uint32_t seq_len = test_le32(rec + 12); + const uint64_t input_ids_offset = test_le64(rec + 16); + const uint64_t attention_mask_offset = test_le64(rec + 24); + const uint64_t loss_mask_offset = test_le64(rec + 32); + const uint64_t target_hidden_states_offset = test_le64(rec + 40); + const uint64_t target_last_hidden_states_offset = test_le64(rec + 48); + + TEST_ASSERT(sample_id == 0); + TEST_ASSERT(seq_len > 0 && seq_len <= 8); + TEST_ASSERT(shard_id == 0); + TEST_ASSERT(input_ids_offset == 0); + TEST_ASSERT(attention_mask_offset == (uint64_t)seq_len * sizeof(int32_t)); + TEST_ASSERT(loss_mask_offset == attention_mask_offset + seq_len); + TEST_ASSERT(target_hidden_states_offset == loss_mask_offset + seq_len); + const uint64_t target_hidden_bytes = + (uint64_t)seq_len * target_hidden_layers * hidden_size * sizeof(uint16_t); + TEST_ASSERT(target_last_hidden_states_offset == + target_hidden_states_offset + target_hidden_bytes); + TEST_ASSERT(test_bf16_region_nonzero_finite(shard_path, + target_hidden_states_offset, + target_hidden_bytes)); + const uint64_t target_last_hidden_bytes = + (uint64_t)seq_len * hidden_size * sizeof(uint16_t); + TEST_ASSERT(shard_size == target_last_hidden_states_offset + target_last_hidden_bytes); + 
TEST_ASSERT(test_bf16_region_nonzero_finite(shard_path, + target_last_hidden_states_offset, + target_last_hidden_bytes)); +} + + + static void test_server_unit_group(void) { ds4_server_unit_tests_run(); } @@ -2202,18 +2577,31 @@ static const ds4_test_entry test_entries[] = { {"--metal-tensor-equivalence", "metal-tensor-equivalence", "fast/quality Metal prompt-logit and greedy equivalence", test_metal_mpp_equivalence}, {"--streaming-decode-prefill-correctness", "streaming-decode-prefill-correctness", "streaming decode-style cold prefill drift and repeatability", test_streaming_decode_prefill_correctness}, {"--mtp-verify-depth", "mtp-verify-depth", "MTP speculative verify commits autoregressive-identical tokens at draft depth > 2", test_mtp_verify_depth}, + {"--dspark-speculative-block", "dspark-speculative-block", "DSpark block drafts commit only target-verified tokens", test_dspark_speculative_block}, #endif + {"--dspark-binder", "dspark-binder", "DSpark draft kind/config defaults without GGUF", test_dspark_binder_helpers}, + {"--dspark-markov-bf16", "dspark-markov-bf16", "DSpark Markov BF16 tensor decoding", test_dspark_markov_bf16_helpers}, + {"--dspark-runtime", "dspark-runtime", "DSpark capture plan and speculative gate helpers", test_dspark_runtime_helpers}, + {"--server", "server", "server parser/rendering/cache unit tests", test_server_unit_group}, }; +static const ds4_test_entry manual_test_entries[] = { + {"--dspark-target-cache-export", "dspark-target-cache-export", "DeepSpec target-cache exporter smoke", test_dspark_target_cache_export}, +}; + static void test_print_help(const char *prog) { printf("Usage: %s [--all | TEST...]\n\n", prog); puts("Tests:"); puts(" --all"); - puts(" Run every test. This is the default, ordered from slower to faster."); + puts(" Run every default test. 
This is the default, ordered from slower to faster."); for (size_t i = 0; i < sizeof(test_entries) / sizeof(test_entries[0]); i++) { printf(" %-20s %s\n", test_entries[i].flag, test_entries[i].desc); } + puts("\nManual tests:"); + for (size_t i = 0; i < sizeof(manual_test_entries) / sizeof(manual_test_entries[0]); i++) { + printf(" %-20s %s\n", manual_test_entries[i].flag, manual_test_entries[i].desc); + } puts(" --list"); puts(" Print test names only."); #ifndef DS4_NO_GPU @@ -2247,6 +2635,13 @@ static const ds4_test_entry *test_find_entry(const char *arg) { return NULL; } +static const ds4_test_entry *test_find_manual_entry(const char *arg) { + for (size_t i = 0; i < sizeof(manual_test_entries) / sizeof(manual_test_entries[0]); i++) { + if (!strcmp(arg, manual_test_entries[i].flag)) return &manual_test_entries[i]; + } + return NULL; +} + static void test_run_entry(const ds4_test_entry *entry) { int before = test_failures; fprintf(stderr, "%s:\n", entry->name); @@ -2262,6 +2657,7 @@ static void test_run_entry(const ds4_test_entry *entry) { int main(int argc, char **argv) { bool run_all = argc == 1; bool selected[sizeof(test_entries) / sizeof(test_entries[0])] = {0}; + bool selected_manual[sizeof(manual_test_entries) / sizeof(manual_test_entries[0])] = {0}; for (int i = 1; i < argc; i++) { if (!strcmp(argv[i], "--all")) { @@ -2270,18 +2666,27 @@ int main(int argc, char **argv) { for (size_t j = 0; j < sizeof(test_entries) / sizeof(test_entries[0]); j++) { puts(test_entries[j].flag); } + for (size_t j = 0; j < sizeof(manual_test_entries) / sizeof(manual_test_entries[0]); j++) { + puts(manual_test_entries[j].flag); + } return 0; } else if (!strcmp(argv[i], "-h") || !strcmp(argv[i], "--help")) { test_print_help(argv[0]); return 0; } else { const ds4_test_entry *entry = test_find_entry(argv[i]); - if (!entry) { - fprintf(stderr, "ds4-test: unknown test switch: %s\n", argv[i]); - test_print_help(argv[0]); - return 2; + if (entry) { + selected[(size_t)(entry - 
test_entries)] = true; + continue; } - selected[(size_t)(entry - test_entries)] = true; + entry = test_find_manual_entry(argv[i]); + if (entry) { + selected_manual[(size_t)(entry - manual_test_entries)] = true; + continue; + } + fprintf(stderr, "ds4-test: unknown test switch: %s\n", argv[i]); + test_print_help(argv[0]); + return 2; } } @@ -2293,6 +2698,9 @@ int main(int argc, char **argv) { for (size_t i = 0; i < sizeof(test_entries) / sizeof(test_entries[0]); i++) { if (selected[i]) test_run_entry(&test_entries[i]); } + for (size_t i = 0; i < sizeof(manual_test_entries) / sizeof(manual_test_entries[0]); i++) { + if (selected_manual[i]) test_run_entry(&manual_test_entries[i]); + } } #ifndef DS4_NO_GPU