From cf3e62b5d3951207acd7a04726b8910d15cd18d7 Mon Sep 17 00:00:00 2001
From: Vincenzo Ingrosso <vincenzo@ingrosso.net>
Date: Sun, 28 Jun 2026 21:14:43 +0200
Subject: [PATCH 1/2] GLM/PRO: generalize CUDA DSA indexer top_k 512->1024

DeepSeek-V4-PRO uses indexer top_k=1024 (Flash=512). The CUDA indexed
attention path was hardcoded for 512, so PRO emitted garbage on --cuda:

  - ds4_gpu_attention_indexed_mixed_batch_heads_tensor() hard-rejected
    top_k>512 (`if (top_k > 512u) return 0;`), so the whole indexed
    attention no-oped for PRO.
  - attention_indexed_mixed_kernel / *_online_kernel capped candidate
    rows at 512 via comp_rows[512]/scores[768] and comp_count clamps,
    truncating PRO's 1024 selected compressed rows -> wrong sparse attn.
  - ds4_gpu_indexer_topk_tensor() gated its fast parallel top-k kernels
    behind `top_k == 512u`, forcing PRO onto the O(n_comp*top_k)
    single-thread fallback.

Fix (Flash-safe; 512 keeps its exact prior path):
  - widen comp_rows[512]->[1024], scores[768]->[1280], caps 512->1024
    in the single + online indexed kernels (the existing parallel topk
    kernels are already generic in top_k <= SORT_N).
  - raise the dispatch wall to top_k <= 1024u and force the streaming
    online heads8 kernel for top_k>512 (its candidates stream, so no
    shared-mem blowup), bypassing the 512-wide bitonic sort path.
  - relax the 5 indexer_topk selection gates from `== 512u` to
    `<= 1024u`.

Validated on 2x H200 (atlas01), PRO IQ2XXS:
  - before: control-char garbage on every prompt.
  - after: coherent output ("The capital of France is" -> "Paris.",
    think mode reasons then answers).
  - parallel top-k vs forced fallback: gen 1.69 vs 1.33 t/s (+27%).
  - Flash IQ2XXS regression byte-stable: prefill 97.3 / gen 40.9 t/s.

PRO single-GPU is SSD-streaming bound (432 GiB GGUF > 287 GiB VRAM);
this fixes correctness + the GPU-side selection cost. Throughput is
gated by NVMe expert paging, not this path.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_01N1GeZuRmy2sERszdL3oRXJ
---
 ds4_cuda.cu | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/ds4_cuda.cu b/ds4_cuda.cu
index 188b341ad..f10d27495 100644
--- a/ds4_cuda.cu
+++ b/ds4_cuda.cu
@@ -4824,9 +4824,9 @@ __global__ static void attention_indexed_mixed_kernel(
         if (visible_comp > n_comp) visible_comp = n_comp;
     }
     const float *qh = q + ((uint64_t)t * n_head + h) * head_dim;
-    __shared__ float scores[768];
+    __shared__ float scores[1280];
     __shared__ uint32_t raw_rows[256];
-    __shared__ uint32_t comp_rows[512];
+    __shared__ uint32_t comp_rows[1024];
     __shared__ float partial[256];
     __shared__ float max_s;
     __shared__ float denom;
@@ -4863,12 +4863,12 @@ __global__ static void attention_indexed_mixed_kernel(
         int32_t c = topk[(uint64_t)t * top_k + i];
         if (c >= 0 && (uint32_t)c < visible_comp) {
             uint32_t slot = atomicAdd(&comp_count, 1u);
-            if (slot < 512u) comp_rows[slot] = (uint32_t)c;
+            if (slot < 1024u) comp_rows[slot] = (uint32_t)c;
         }
     }
     __syncthreads();
     if (threadIdx.x == 0) {
-        if (comp_count > 512u) comp_count = 512u;
+        if (comp_count > 1024u) comp_count = 1024u;
     }
     __syncthreads();
     uint32_t n_score = raw_count + comp_count;
@@ -5197,7 +5197,7 @@ __global__ static void attention_indexed_mixed_heads8_online_kernel(
     __syncthreads();
 
     uint32_t comp_count = top_k < visible_comp ? top_k : visible_comp;
-    if (comp_count > 512u) comp_count = 512u;
+    if (comp_count > 1024u) comp_count = 1024u;
     const uint32_t n_score = raw_count + comp_count;
     const float scale = rsqrtf((float)head_dim);
     const float4 *q4 = valid_head
@@ -7484,21 +7484,21 @@ extern "C" int ds4_gpu_indexer_topk_tensor(
         selected->bytes < (uint64_t)n_tokens * top_k * sizeof(uint32_t)) {
         return 0;
     }
-    if (top_k == 512u && n_comp <= 1024u &&
+    if (top_k <= 1024u && n_comp <= 1024u &&
         getenv("DS4_CUDA_NO_TOPK1024") == NULL) {
         indexer_topk_1024_kernel<<<n_tokens, 1024>>>((uint32_t *)selected->ptr,
                                                      (const float *)scores->ptr,
                                                      n_comp, n_tokens, top_k);
         return cuda_ok(cudaGetLastError(), "indexer topk 1024 launch");
     }
-    if (top_k == 512u && n_comp <= 2048u &&
+    if (top_k <= 1024u && n_comp <= 2048u &&
         getenv("DS4_CUDA_NO_TOPK2048") == NULL) {
         indexer_topk_pow2_kernel<2048><<<n_tokens, 1024>>>((uint32_t *)selected->ptr,
                                                            (const float *)scores->ptr,
                                                            n_comp, n_tokens, top_k);
         return cuda_ok(cudaGetLastError(), "indexer topk 2048 launch");
     }
-    if (top_k == 512u && n_comp <= 4096u &&
+    if (top_k <= 1024u && n_comp <= 4096u &&
         getenv("DS4_CUDA_NO_TOPK2048") == NULL) {
         if (n_comp == 4096u) {
             using TopkCubSort = cub::BlockRadixSort<uint64_t, 512, 16>;
@@ -7528,7 +7528,7 @@ extern "C" int ds4_gpu_indexer_topk_tensor(
                                                            n_comp, n_tokens, top_k);
         return cuda_ok(cudaGetLastError(), "indexer topk 4096 launch");
     }
-    if (top_k == 512u && n_comp <= 8192u &&
+    if (top_k <= 1024u && n_comp <= 8192u &&
         getenv("DS4_CUDA_NO_TOPK2048") == NULL &&
         getenv("DS4_CUDA_NO_TOPK8192") == NULL) {
         if (n_comp > 4096u) {
@@ -7559,7 +7559,7 @@ extern "C" int ds4_gpu_indexer_topk_tensor(
                                                                n_comp, n_tokens, top_k);
         return cuda_ok(cudaGetLastError(), "indexer topk 8192 launch");
     }
-    if (top_k == 512u && getenv("DS4_CUDA_NO_TOPK2048") == NULL &&
+    if (top_k <= 1024u && getenv("DS4_CUDA_NO_TOPK2048") == NULL &&
         getenv("DS4_CUDA_NO_TOPK_CHUNKED") == NULL) {
         const uint32_t chunk_n = 4096u;
         const uint32_t n_chunks = (n_comp + chunk_n - 1u) / chunk_n;
@@ -8979,7 +8979,7 @@ extern "C" int ds4_gpu_attention_indexed_mixed_batch_heads_tensor(
         topk->bytes < (uint64_t)n_tokens * top_k * sizeof(int32_t)) {
         return 0;
     }
-    if (top_k > 512u) return 0;
+    if (top_k > 1024u) return 0;
     const float *sinks = (const float *)cuda_model_range_ptr(
             model_map, sinks_offset, (uint64_t)n_head * sizeof(float), "attn_sinks");
     if (!sinks) return 0;
@@ -8993,9 +8993,9 @@ extern "C" int ds4_gpu_attention_indexed_mixed_batch_heads_tensor(
         if (!cuda_ok(cudaGetLastError(), "indexed attention topk sort launch")) return 0;
         topk_ptr = sorted;
     }
-    if (n_tokens > 1 && head_dim == 512 && top_k <= 512u &&
+    if (n_tokens > 1 && head_dim == 512 && top_k <= 1024u &&
         getenv("DS4_CUDA_NO_INDEXED_HEADS8") == NULL) {
-        if (getenv("DS4_CUDA_INDEXED_TWOPASS") == NULL) {
+        if (getenv("DS4_CUDA_INDEXED_TWOPASS") == NULL || top_k > 512u) {
             dim3 grid(n_tokens, (n_head + 15u) / 16u, 1);
             attention_indexed_mixed_heads8_online_kernel<8, 16><<<grid, 512>>>((float *)heads->ptr,
                                                                                sinks,

From b6917a710152e59bdc70965b9fe840d0926fa4b8 Mon Sep 17 00:00:00 2001
From: Vincenzo Ingrosso <vincenzo@ingrosso.net>
Date: Sun, 28 Jun 2026 23:39:24 +0200
Subject: [PATCH 2/2] PRO: enable decode LUT iq2_xxs gate kernel for
 expert_in_dim>4096

The fast n_tokens==1 MoE gate path (moe_gate_up_mid_decode_lut_qwarp32_kernel,
DS4_CUDA_MOE_NO_DECODE_LUT_GATE to disable) staged the quantized activation into
a fixed __shared__ sxq[16] and was gated `xq_blocks <= 16u`. PRO's experts have
expert_in_dim=7168 -> xq_blocks=28 > 16, so PRO fell off this path onto the
slower global-memory gate, making iq2_xxs gateup the dominant decode-MoE cost.

Widen the staging buffer to sxq[32] and the guard/dispatch to `xq_blocks <= 32u`
so PRO (28 blocks) also stages x in shared memory. 32 q8_K blocks (~9 KiB) plus
the iq2 grid/signs stays well under the per-SM shared-memory limit.

Logically inert for Flash/GLM (xq_blocks<=16 run the identical staged path).
Validated on 2x H200, PRO IQ2XXS, --ssd-streaming:
  - MoE gateup 1.03 ms -> 0.179 ms/layer (5.7x).
  - decode gen 1.74 -> 1.84 t/s (+5.7%).
  - output unchanged ("The capital of France is" -> "Paris." with LUT on and off).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_01N1GeZuRmy2sERszdL3oRXJ
---
 ds4_cuda.cu | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/ds4_cuda.cu b/ds4_cuda.cu
index f10d27495..2e40ed18b 100644
--- a/ds4_cuda.cu
+++ b/ds4_cuda.cu
@@ -10469,10 +10469,13 @@ __global__ static void moe_gate_up_mid_decode_lut_qwarp32_kernel(
     if (expert_i < 0) expert_i = 0;
     uint32_t expert = (uint32_t)expert_i;
     const cuda_block_q8_K *xqb = xq + (uint64_t)tok * xq_blocks;
-    __shared__ cuda_block_q8_K sxq[16];
+    /* Stage the quantized activation in shared mem; sized 32 to also cover PRO
+     * (expert_in_dim 7168 -> xq_blocks 28), not just <=4096-wide Flash/GLM (<=16).
+     * 32 blocks (~9 KiB) + grid/signs stays well under the per-SM shared limit. */
+    __shared__ cuda_block_q8_K sxq[32];
     __shared__ uint64_t s_iq2_grid[256];
     __shared__ uint8_t s_iq2_signs[128];
-    if (xq_blocks <= 16u) {
+    if (xq_blocks <= 32u) {
         for (uint32_t i = threadIdx.x; i < xq_blocks; i += blockDim.x) sxq[i] = xqb[i];
         for (uint32_t i = threadIdx.x; i < 256u; i += blockDim.x) s_iq2_grid[i] = cuda_iq2xxs_grid[i];
         for (uint32_t i = threadIdx.x; i < 128u; i += blockDim.x) s_iq2_signs[i] = cuda_ksigns_iq2xs[i];
@@ -12336,7 +12339,7 @@ static int routed_moe_launch(
             n_tokens >= 128u && getenv("DS4_CUDA_MOE_NO_DOWN_TILE16") == NULL &&
             (use_atomic_down || q4k_path);
         const uint32_t use_decode_lut_gate =
-            !q4k_path && n_tokens == 1u && xq_blocks <= 16u &&
+            !q4k_path && n_tokens == 1u && xq_blocks <= 32u &&
             getenv("DS4_CUDA_MOE_NO_DECODE_LUT_GATE") == NULL;
         const uint32_t gate_row_span =
             getenv("DS4_CUDA_MOE_GATE_ROW512") != NULL ? 512u :