From cf3e62b5d3951207acd7a04726b8910d15cd18d7 Mon Sep 17 00:00:00 2001 From: Vincenzo Ingrosso Date: Sun, 28 Jun 2026 21:14:43 +0200 Subject: [PATCH 1/2] GLM/PRO: generalize CUDA DSA indexer top_k 512->1024 DeepSeek-V4-PRO uses indexer top_k=1024 (Flash=512). The CUDA indexed attention path was hardcoded for 512, so PRO emitted garbage on --cuda: - ds4_gpu_attention_indexed_mixed_batch_heads_tensor() hard-rejected top_k>512 (`if (top_k > 512u) return 0;`), so the whole indexed attention no-oped for PRO. - attention_indexed_mixed_kernel / *_online_kernel capped candidate rows at 512 via comp_rows[512]/scores[768] and comp_count clamps, truncating PRO's 1024 selected compressed rows -> wrong sparse attn. - ds4_gpu_indexer_topk_tensor() gated its fast parallel top-k kernels behind `top_k == 512u`, forcing PRO onto the O(n_comp*top_k) single-thread fallback. Fix (Flash-safe; 512 keeps its exact prior path): - widen comp_rows[512]->[1024], scores[768]->[1280], caps 512->1024 in the single + online indexed kernels (the existing parallel topk kernels are already generic in top_k <= SORT_N). - raise the dispatch wall to top_k <= 1024u and force the streaming online heads8 kernel for top_k>512 (its candidates stream, so no shared-mem blowup), bypassing the 512-wide bitonic sort path. - relax the 5 indexer_topk selection gates from `== 512u` to `<= 1024u`. Validated on 2x H200 (atlas01), PRO IQ2XXS: - before: control-char garbage on every prompt. - after: coherent output ("The capital of France is" -> "Paris.", think mode reasons then answers). - parallel top-k vs forced fallback: gen 1.69 vs 1.33 t/s (+27%). - Flash IQ2XXS regression byte-stable: prefill 97.3 / gen 40.9 t/s. PRO single-GPU is SSD-streaming bound (432 GiB GGUF > 287 GiB VRAM); this fixes correctness + the GPU-side selection cost. Throughput is gated by NVMe expert paging, not this path. Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01N1GeZuRmy2sERszdL3oRXJ --- ds4_cuda.cu | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/ds4_cuda.cu b/ds4_cuda.cu index 188b341ad..f10d27495 100644 --- a/ds4_cuda.cu +++ b/ds4_cuda.cu @@ -4824,9 +4824,9 @@ __global__ static void attention_indexed_mixed_kernel( if (visible_comp > n_comp) visible_comp = n_comp; } const float *qh = q + ((uint64_t)t * n_head + h) * head_dim; - __shared__ float scores[768]; + __shared__ float scores[1280]; __shared__ uint32_t raw_rows[256]; - __shared__ uint32_t comp_rows[512]; + __shared__ uint32_t comp_rows[1024]; __shared__ float partial[256]; __shared__ float max_s; __shared__ float denom; @@ -4863,12 +4863,12 @@ __global__ static void attention_indexed_mixed_kernel( int32_t c = topk[(uint64_t)t * top_k + i]; if (c >= 0 && (uint32_t)c < visible_comp) { uint32_t slot = atomicAdd(&comp_count, 1u); - if (slot < 512u) comp_rows[slot] = (uint32_t)c; + if (slot < 1024u) comp_rows[slot] = (uint32_t)c; } } __syncthreads(); if (threadIdx.x == 0) { - if (comp_count > 512u) comp_count = 512u; + if (comp_count > 1024u) comp_count = 1024u; } __syncthreads(); uint32_t n_score = raw_count + comp_count; @@ -5197,7 +5197,7 @@ __global__ static void attention_indexed_mixed_heads8_online_kernel( __syncthreads(); uint32_t comp_count = top_k < visible_comp ? top_k : visible_comp; - if (comp_count > 512u) comp_count = 512u; + if (comp_count > 1024u) comp_count = 1024u; const uint32_t n_score = raw_count + comp_count; const float scale = rsqrtf((float)head_dim); const float4 *q4 = valid_head @@ -7484,21 +7484,21 @@ extern "C" int ds4_gpu_indexer_topk_tensor( selected->bytes < (uint64_t)n_tokens * top_k * sizeof(uint32_t)) { return 0; } - if (top_k == 512u && n_comp <= 1024u && + if (top_k <= 1024u && n_comp <= 1024u && getenv("DS4_CUDA_NO_TOPK1024") == NULL) { indexer_topk_1024_kernel<<>>((uint32_t *)selected->ptr, (const float *)scores->ptr, n_comp, n_tokens, top_k); return cuda_ok(cudaGetLastError(), "indexer topk 1024 launch"); } - if (top_k == 512u && n_comp <= 2048u && + if (top_k <= 1024u && n_comp <= 2048u && getenv("DS4_CUDA_NO_TOPK2048") == NULL) { indexer_topk_pow2_kernel<2048><<>>((uint32_t *)selected->ptr, (const float *)scores->ptr, n_comp, n_tokens, top_k); return cuda_ok(cudaGetLastError(), "indexer topk 2048 launch"); } - if (top_k == 512u && n_comp <= 4096u && + if (top_k <= 1024u && n_comp <= 4096u && getenv("DS4_CUDA_NO_TOPK2048") == NULL) { if (n_comp == 4096u) { using TopkCubSort = cub::BlockRadixSort; @@ -7528,7 +7528,7 @@ extern "C" int ds4_gpu_indexer_topk_tensor( n_comp, n_tokens, top_k); return cuda_ok(cudaGetLastError(), "indexer topk 4096 launch"); } - if (top_k == 512u && n_comp <= 8192u && + if (top_k <= 1024u && n_comp <= 8192u && getenv("DS4_CUDA_NO_TOPK2048") == NULL && getenv("DS4_CUDA_NO_TOPK8192") == NULL) { if (n_comp > 4096u) { @@ -7559,7 +7559,7 @@ extern "C" int ds4_gpu_indexer_topk_tensor( n_comp, n_tokens, top_k); return cuda_ok(cudaGetLastError(), "indexer topk 8192 launch"); } - if (top_k == 512u && getenv("DS4_CUDA_NO_TOPK2048") == NULL && + if (top_k <= 1024u && getenv("DS4_CUDA_NO_TOPK2048") == NULL && getenv("DS4_CUDA_NO_TOPK_CHUNKED") == NULL) { const uint32_t chunk_n = 4096u; const uint32_t n_chunks = (n_comp + chunk_n - 1u) / chunk_n; @@ -8979,7 +8979,7 @@ extern "C" int ds4_gpu_attention_indexed_mixed_batch_heads_tensor( topk->bytes < (uint64_t)n_tokens * top_k * sizeof(int32_t)) { return 0; } - if (top_k > 512u) return 0; + if (top_k > 1024u) return 0; const float *sinks = (const float *)cuda_model_range_ptr( model_map, sinks_offset, (uint64_t)n_head * sizeof(float), "attn_sinks"); if (!sinks) return 0; @@ -8993,9 +8993,9 @@ extern "C" int ds4_gpu_attention_indexed_mixed_batch_heads_tensor( if (!cuda_ok(cudaGetLastError(), "indexed attention topk sort launch")) return 0; topk_ptr = sorted; } - if (n_tokens > 1 && head_dim == 512 && top_k <= 512u && + if (n_tokens > 1 && head_dim == 512 && top_k <= 1024u && getenv("DS4_CUDA_NO_INDEXED_HEADS8") == NULL) { - if (getenv("DS4_CUDA_INDEXED_TWOPASS") == NULL) { + if (getenv("DS4_CUDA_INDEXED_TWOPASS") == NULL || top_k > 512u) { dim3 grid(n_tokens, (n_head + 15u) / 16u, 1); attention_indexed_mixed_heads8_online_kernel<8, 16><<>>((float *)heads->ptr, sinks, From b6917a710152e59bdc70965b9fe840d0926fa4b8 Mon Sep 17 00:00:00 2001 From: Vincenzo Ingrosso Date: Sun, 28 Jun 2026 23:39:24 +0200 Subject: [PATCH 2/2] PRO: enable decode LUT iq2_xxs gate kernel for expert_in_dim>4096 The fast n_tokens==1 MoE gate path (moe_gate_up_mid_decode_lut_qwarp32_kernel, DS4_CUDA_MOE_NO_DECODE_LUT_GATE to disable) staged the quantized activation into a fixed __shared__ sxq[16] and was gated `xq_blocks <= 16u`. PRO's experts have expert_in_dim=7168 -> xq_blocks=28 > 16, so PRO fell off this path onto the slower global-memory gate, making iq2_xxs gateup the dominant decode-MoE cost. Widen the staging buffer to sxq[32] and the guard/dispatch to `xq_blocks <= 32u` so PRO (28 blocks) also stages x in shared memory. 32 q8_K blocks (~9 KiB) plus the iq2 grid/signs stays well under the per-SM shared-memory limit. Logically inert for Flash/GLM (xq_blocks<=16 run the identical staged path). Validated on 2x H200, PRO IQ2XXS, --ssd-streaming: - MoE gateup 1.03 ms -> 0.179 ms/layer (5.7x). - decode gen 1.74 -> 1.84 t/s (+5.7%). - output unchanged ("The capital of France is" -> "Paris." with LUT on and off). Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01N1GeZuRmy2sERszdL3oRXJ --- ds4_cuda.cu | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/ds4_cuda.cu b/ds4_cuda.cu index f10d27495..2e40ed18b 100644 --- a/ds4_cuda.cu +++ b/ds4_cuda.cu @@ -10469,10 +10469,13 @@ __global__ static void moe_gate_up_mid_decode_lut_qwarp32_kernel( if (expert_i < 0) expert_i = 0; uint32_t expert = (uint32_t)expert_i; const cuda_block_q8_K *xqb = xq + (uint64_t)tok * xq_blocks; - __shared__ cuda_block_q8_K sxq[16]; + /* Stage the quantized activation in shared mem; sized 32 to also cover PRO + * (expert_in_dim 7168 -> xq_blocks 28), not just <=4096-wide Flash/GLM (<=16). + * 32 blocks (~9 KiB) + grid/signs stays well under the per-SM shared limit. */ + __shared__ cuda_block_q8_K sxq[32]; __shared__ uint64_t s_iq2_grid[256]; __shared__ uint8_t s_iq2_signs[128]; - if (xq_blocks <= 16u) { + if (xq_blocks <= 32u) { for (uint32_t i = threadIdx.x; i < xq_blocks; i += blockDim.x) sxq[i] = xqb[i]; for (uint32_t i = threadIdx.x; i < 256u; i += blockDim.x) s_iq2_grid[i] = cuda_iq2xxs_grid[i]; for (uint32_t i = threadIdx.x; i < 128u; i += blockDim.x) s_iq2_signs[i] = cuda_ksigns_iq2xs[i]; @@ -12336,7 +12339,7 @@ static int routed_moe_launch( n_tokens >= 128u && getenv("DS4_CUDA_MOE_NO_DOWN_TILE16") == NULL && (use_atomic_down || q4k_path); const uint32_t use_decode_lut_gate = - !q4k_path && n_tokens == 1u && xq_blocks <= 16u && + !q4k_path && n_tokens == 1u && xq_blocks <= 32u && getenv("DS4_CUDA_MOE_NO_DECODE_LUT_GATE") == NULL; const uint32_t gate_row_span = getenv("DS4_CUDA_MOE_GATE_ROW512") != NULL ? 512u :