Luce-Org · davide221 · May 28, 2026 · May 28, 2026
diff --git a/README.md b/README.md
@@ -129,7 +129,7 @@ harness/clients/run_codex.sh
 
 ## Run the Server
 
-Default: Qwen 3.6-27B Q4_K_M target + Lucebox Q8_0 DFlash drafter on RTX 3090. DDTree budget=22, TQ3_0 KV cache, sliding FA window 2048. OpenAI-compatible HTTP on `:8000`.
+Default: Qwen 3.6-27B Q4_K_M target + Lucebox Q4_K_M DFlash drafter on RTX 3090. DDTree budget=22, TQ3_0 KV cache, sliding FA window 2048. OpenAI-compatible HTTP on `:8000`.
 
 ```bash
 # build (CUDA 12+, CMake 3.18+)
@@ -139,12 +139,12 @@ cmake --build server/build --target dflash_server -j
 
 # default weights (~18 GB)
 hf download unsloth/Qwen3.6-27B-GGUF Qwen3.6-27B-Q4_K_M.gguf --local-dir server/models/
-hf download Lucebox/Qwen3.6-27B-DFlash-GGUF dflash-draft-3.6-q8_0.gguf --local-dir server/models/draft/
+hf download Lucebox/Qwen3.6-27B-DFlash-GGUF dflash-draft-3.6-q4_k_m.gguf --local-dir server/models/draft/
 
 # run (TQ3_0 KV auto-enabled; set =0 to disable)
 DFLASH27B_KV_TQ3=1 \
 ./server/build/dflash_server server/models/Qwen3.6-27B-Q4_K_M.gguf \
-  --draft server/models/draft/dflash-draft-3.6-q8_0.gguf \
+  --draft server/models/draft/dflash-draft-3.6-q4_k_m.gguf \
   --ddtree --ddtree-budget 22 --fa-window 2048 --port 8000
 ```
 

diff --git a/docs/specs/openapi-props.yaml b/docs/specs/openapi-props.yaml
@@ -104,7 +104,7 @@ paths:
                       lifetime_hits: 0
                     model:
                       arch: "qwen35"
-                      draft_path: "/.../dflash-draft-3.6-q8_0.gguf"
+                      draft_path: "/.../dflash-draft-3.6-q4_k_m.gguf"
                       tokenizer_id: null
                     model_alias: "dflash"
                     model_card:
@@ -451,7 +451,7 @@ components:
           description: |
             Filesystem path of the loaded speculative-decode draft
             GGUF; `null` when no draft is loaded.
-          example: "/.../dflash-draft-3.6-q8_0.gguf"
+          example: "/.../dflash-draft-3.6-q4_k_m.gguf"
         tokenizer_id:
           type: ["string", "null"]
           description: |

diff --git a/docs/specs/props-endpoint.md b/docs/specs/props-endpoint.md
@@ -570,7 +570,7 @@ version increments.
   },
   "model": {
     "arch":         "qwen35",
-    "draft_path":   "/.../dflash-draft-3.6-q8_0.gguf",
+    "draft_path":   "/.../dflash-draft-3.6-q4_k_m.gguf",
     "tokenizer_id": "qwen3"
   },
   "model_alias": "dflash",

diff --git a/harness/README.md b/harness/README.md
@@ -53,7 +53,7 @@ paths and profile:
 ```bash
 DFLASH_SERVER_BIN=server/build/dflash_server \
 TARGET=server/models/Qwen3.6-27B-Q4_K_M.gguf \
-DRAFT=server/models/draft/dflash-draft-3.6-q8_0.gguf \
+DRAFT=server/models/draft/dflash-draft-3.6-q4_k_m.gguf \
 MODEL_ID=luce-dflash \
 MAX_CTX=32768 MAX_TOKENS=512 \
 BUDGET=22 VERIFY_MODE=ddtree FA_WINDOW=2048 \
@@ -64,7 +64,7 @@ To test an already-running native server:
 
 ```bash
 server/build/dflash_server server/models/Qwen3.6-27B-Q4_K_M.gguf \
-  --draft server/models/draft/dflash-draft-3.6-q8_0.gguf \
+  --draft server/models/draft/dflash-draft-3.6-q4_k_m.gguf \
   --host 127.0.0.1 --port 18080 \
   --max-ctx 32768 --max-tokens 512 \
   --fa-window 2048 \

diff --git a/harness/benchmarks/run_lucebox_vs_llamacpp.sh b/harness/benchmarks/run_lucebox_vs_llamacpp.sh
@@ -8,7 +8,7 @@ STAMP="${STAMP:-generation-baseline-$(date +%Y%m%d-%H%M%S)}"
 LOG_DIR="$RUN_DIR/$STAMP"
 
 TARGET="${TARGET:-$REPO_DIR/server/models/Qwen3.6-27B-Q4_K_M.gguf}"
-DRAFT="${DRAFT:-$REPO_DIR/server/models/draft/dflash-draft-3.6-q8_0.gguf}"
+DRAFT="${DRAFT:-$REPO_DIR/server/models/draft/dflash-draft-3.6-q4_k_m.gguf}"
 LLAMA_SERVER_BIN="${LLAMA_SERVER_BIN:-$REPO_DIR/server/deps/llama.cpp/build/bin/llama-server}"
 
 HOST="${HOST:-127.0.0.1}"

diff --git a/harness/clients/common.sh b/harness/clients/common.sh
@@ -9,7 +9,7 @@ CLIENT_WORK_DIR="${CLIENT_WORK_DIR:-/workspace/lucebox-harness-work}"
 RUN_DIR="${RUN_DIR:-/workspace/lucebox-client-harness-runs}"
 
 TARGET="${TARGET:-$REPO_DIR/server/models/Qwen3.6-27B-Q4_K_M.gguf}"
-DRAFT="${DRAFT:-$REPO_DIR/server/models/draft/dflash-draft-3.6-q8_0.gguf}"
+DRAFT="${DRAFT:-$REPO_DIR/server/models/draft/dflash-draft-3.6-q4_k_m.gguf}"
 MODEL_SERVER="${MODEL_SERVER:-lucebox}"
 DFLASH_SERVER_BIN="${DFLASH_SERVER_BIN:-$REPO_DIR/server/build/dflash_server}"
 LLAMA_SERVER_BIN="${LLAMA_SERVER_BIN:-/workspace/llama-cpp-server-build/bin/llama-server}"

diff --git a/server/CMakeLists.txt b/server/CMakeLists.txt
@@ -428,7 +428,10 @@ elseif(DFLASH27B_GPU_BACKEND STREQUAL "cuda")
         target_compile_definitions(dflash_common PRIVATE
             DFLASH27B_HAVE_CUDA_SCALAR_FLASHPREFILL=1
             DFLASH27B_HAVE_PASCAL_FLASHPREFILL=1)
-        # Scalar Pascal kernels only target sm_60-69.
+        # Best-effort narrowing for CMake versions/toolchains that honor
+        # per-source CUDA_ARCHITECTURES; flashprefill_scalar.cu also keeps
+        # its kernels visible in newer arch passes so default fatbinary builds
+        # do not end up with host launchers but missing device symbols.
         set_source_files_properties(src/flashprefill_scalar.cu PROPERTIES
             CUDA_ARCHITECTURES "60;61;62")
     endif()

diff --git a/server/DEVELOPER.md b/server/DEVELOPER.md
@@ -92,8 +92,8 @@ Download models before running the server:
 # Target model (Q4_K_M quantized Qwen3.6-27B)
 hf download <repo-id> --local-dir server/models/
 
-# Draft model (1.84 GB default Qwen3.6 GGUF draft)
-hf download Lucebox/Qwen3.6-27B-DFlash-GGUF dflash-draft-3.6-q8_0.gguf --local-dir server/models/draft/
+# Draft model (0.98 GB default Qwen3.6 GGUF draft)
+hf download Lucebox/Qwen3.6-27B-DFlash-GGUF dflash-draft-3.6-q4_k_m.gguf --local-dir server/models/draft/
 ```
 
 Expected layout:
@@ -102,7 +102,7 @@ Expected layout:
 server/models/
 ├── Qwen3.6-27B-Q4_K_M.gguf          # --target (GGUF)
 └── draft/
-    └── dflash-draft-3.6-q8_0.gguf     # --draft  (GGUF)
+    └── dflash-draft-3.6-q4_k_m.gguf   # --draft  (GGUF)
 ```
 
 The target path can also be set via the `DFLASH_TARGET` environment variable.
@@ -159,12 +159,12 @@ cd server/build
 
 # Numerics tests
 ./test_vs_oracle --target ../models/Qwen3.6-27B-Q4_K_M.gguf \
-                 --draft ../models/draft/dflash-draft-3.6-q8_0.gguf
+                 --draft ../models/draft/dflash-draft-3.6-q4_k_m.gguf
 
 # Smoke tests
 ./smoke_load_target --target ../models/Qwen3.6-27B-Q4_K_M.gguf
-./smoke_load_draft --draft ../models/draft/dflash-draft-3.6-q8_0.gguf
-./smoke_draft_graph --draft ../models/draft/dflash-draft-3.6-q8_0.gguf
+./smoke_load_draft --draft ../models/draft/dflash-draft-3.6-q4_k_m.gguf
+./smoke_draft_graph --draft ../models/draft/dflash-draft-3.6-q4_k_m.gguf
 ```
 
 ### Integration tests (require running server)
@@ -200,7 +200,7 @@ server/
 │   └── Block-Sparse-Attention/ # BSA kernels (submodule)
 ├── models/                     # Model files (not in git)
 │   ├── Qwen3.6-27B-Q4_K_M.gguf
-│   └── draft/dflash-draft-3.6-q8_0.gguf
+│   └── draft/dflash-draft-3.6-q4_k_m.gguf
 ├── scripts/
 │   ├── run.py                  # CLI text generation
 │   ├── test_server_prefix_cache.py    # Integration test (--url or auto-spawn)

diff --git a/server/README.md b/server/README.md
@@ -10,7 +10,7 @@
 
 <p align="center">
   <strong>GGUF DFlash speculative decoding for Qwen3.5/Qwen3.6 27B.</strong><br/>
-  C++/CUDA runtime on top of ggml. Default path: Qwen3.6-27B Q4_K_M target + Lucebox Q8_0 GGUF DFlash draft.<br/>
+  C++/CUDA runtime on top of ggml. Default path: Qwen3.6-27B Q4_K_M target + Lucebox Q4_K_M GGUF DFlash draft.<br/>
   Qwen3.5 reference: 129.5 tok/s mean on HumanEval (3.43x vs AR); best demo run: 207.6 tok/s vs 38.0 tok/s AR (5.46x).<br/><br/>
   <a href="https://lucebox.com/blog/dflash27b">Blog post</a> · <a href="RESULTS.md">Benchmarks</a> · <a href="https://discord.gg/yHfswqZmJQ">Discord</a> · <a href="https://lucebox.com">lucebox.com</a>
 </p>
@@ -135,7 +135,7 @@ Qwen3.6-27B is the default integration path. It uses the same `qwen35` target ar
 hf download unsloth/Qwen3.6-27B-GGUF Qwen3.6-27B-Q4_K_M.gguf --local-dir models/
 
 # 2. matched 3.6 draft (GGUF, used by default by scripts/run.py and dflash_server)
-hf download Lucebox/Qwen3.6-27B-DFlash-GGUF dflash-draft-3.6-q8_0.gguf --local-dir models/draft/
+hf download Lucebox/Qwen3.6-27B-DFlash-GGUF dflash-draft-3.6-q4_k_m.gguf --local-dir models/draft/
 
 # 3. bench
 DFLASH_TARGET=models/Qwen3.6-27B-Q4_K_M.gguf python3 scripts/bench_he.py --n-gen 128
@@ -162,7 +162,7 @@ Run it directly:
 
 ```bash
 ./build/dflash_server models/Qwen3.6-27B-Q4_K_M.gguf \
-  --draft models/draft/dflash-draft-3.6-q8_0.gguf \
+  --draft models/draft/dflash-draft-3.6-q4_k_m.gguf \
   --host 127.0.0.1 --port 18080 \
   --max-ctx 32768 --max-tokens 512 \
   --fa-window 2048 \
@@ -326,12 +326,12 @@ cd lucebox-hub/dflash
 cmake -B build -S . -DCMAKE_BUILD_TYPE=Release -DCMAKE_CUDA_ARCHITECTURES=86
 cmake --build build --target test_dflash -j
 
-# Fetch models: ~16 GB target + 1.84 GB Lucebox Q8_0 GGUF DFlash draft.
+# Fetch models: ~16 GB target + 0.98 GB Lucebox Q4_K_M GGUF DFlash draft.
 # Quickstart pins to Qwen3.6-27B (latest release). For Qwen3.5-27B swap in
 # unsloth/Qwen3.5-27B-GGUF + z-lab/Qwen3.5-27B-DFlash; arch is identical so
 # no rebuild is needed.
 hf download unsloth/Qwen3.6-27B-GGUF Qwen3.6-27B-Q4_K_M.gguf --local-dir models/
-hf download Lucebox/Qwen3.6-27B-DFlash-GGUF dflash-draft-3.6-q8_0.gguf --local-dir models/draft/
+hf download Lucebox/Qwen3.6-27B-DFlash-GGUF dflash-draft-3.6-q4_k_m.gguf --local-dir models/draft/
 
 # Streaming one-shot generate (run.py defaults to models/Qwen3.6-27B-Q4_K_M.gguf;
 # override with --target or DFLASH_TARGET=... env var).
@@ -342,7 +342,7 @@ python3 examples/chat.py
 
 # OpenAI-compatible HTTP server (drop-in for Open WebUI / LM Studio / Cline).
 ./build/dflash_server models/Qwen3.6-27B-Q4_K_M.gguf \
-  --draft models/draft/dflash-draft-3.6-q8_0.gguf --port 8000
+  --draft models/draft/dflash-draft-3.6-q4_k_m.gguf --port 8000
 
 # Reproduce paper numbers
 python3 scripts/bench_llm.py                                 # HE + GSM8K + Math500
@@ -353,7 +353,7 @@ python3 scripts/bench_he.py --n-gen 256 --ddtree-budget 22   # minimal HE bench
 ```bash
 DFLASH27B_KV_TQ3=1 DFLASH27B_PREFILL_UBATCH=16 \
   build/test_dflash models/Qwen3.6-27B-Q4_K_M.gguf \
-  models/draft/dflash-draft-3.6-q8_0.gguf /tmp/long_prompt.bin 64 /tmp/out.bin \
+  models/draft/dflash-draft-3.6-q4_k_m.gguf /tmp/long_prompt.bin 64 /tmp/out.bin \
   --fast-rollback --ddtree --ddtree-budget=16 --max-ctx=4096   # align_up(prompt + n_gen + 64, 256); raise up to 262144 for long prompts
 ```
 

diff --git a/server/scripts/bench_agent.py b/server/scripts/bench_agent.py
@@ -41,7 +41,7 @@
     "DFLASH_TARGET",
     str(ROOT / "models" / "Qwen3.6-27B-Q4_K_M.gguf"),
 )
-_LOCAL_DRAFT_FILE = ROOT / "models" / "draft" / "dflash-draft-3.6-q8_0.gguf"
+_LOCAL_DRAFT_FILE = ROOT / "models" / "draft" / "dflash-draft-3.6-q4_k_m.gguf"
 _LOCAL_DRAFT_ROOT = ROOT / "models" / "draft"
 DRAFT = None
 TEST_DFLASH = os.environ.get("DFLASH_BIN", str(ROOT / "build" / f"test_dflash{BIN_SUFFIX}"))

diff --git a/server/scripts/bench_he.py b/server/scripts/bench_he.py
@@ -26,7 +26,7 @@
     "DFLASH_TARGET",
     str(ROOT / "models" / "Qwen3.6-27B-Q4_K_M.gguf"),
 )
-_LOCAL_DRAFT_FILE = ROOT / "models" / "draft" / "dflash-draft-3.6-q8_0.gguf"
+_LOCAL_DRAFT_FILE = ROOT / "models" / "draft" / "dflash-draft-3.6-q4_k_m.gguf"
 _LOCAL_DRAFT_ROOT = ROOT / "models" / "draft"
 DRAFT = None
 TEST_DFLASH = os.environ.get(

diff --git a/server/scripts/bench_llm.py b/server/scripts/bench_llm.py
@@ -25,7 +25,7 @@
     "DFLASH_TARGET",
     str(ROOT / "models" / "Qwen3.6-27B-Q4_K_M.gguf"),
 )
-_LOCAL_DRAFT_FILE = ROOT / "models" / "draft" / "dflash-draft-3.6-q8_0.gguf"
+_LOCAL_DRAFT_FILE = ROOT / "models" / "draft" / "dflash-draft-3.6-q4_k_m.gguf"
 _LOCAL_DRAFT_ROOT = ROOT / "models" / "draft"
 DRAFT = None
 TEST_DFLASH = os.environ.get("DFLASH_BIN", str(ROOT / "build" / f"test_dflash{BIN_SUFFIX}"))

diff --git a/server/src/flashprefill_scalar.cu b/server/src/flashprefill_scalar.cu
@@ -4,16 +4,16 @@
 //   - F16 (half) instead of __nv_bfloat16 — Pascal has no BF16 hardware
 //   - Scalar F16×F16→F32 math instead of WMMA tensor cores
 //   - Cooperative shared-memory loads instead of cp.async (Pascal has no async copy)
-//   - __shfl / __shfl_down instead of __shfl_sync / __shfl_down_sync
-//   - membar.gl instead of fence.acq_rel.gpu
+//   - Uses full-warp _sync vote/shuffle intrinsics, which CUDA 12 accepts for
+//     Pascal and newer targets.
 //
 // Dispatched from flashprefill.cpp when DFLASH27B_HAVE_PASCAL_FLASHPREFILL is set.
 // The drafter's persistent buffers must be GGML_TYPE_F16.
 //
-// Guarded to compile only for sm_60-69 so Pascal-specific intrinsics
-// (__shfl without _sync, etc.) don't affect sm_70+ codepaths.
-
-#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 600 && __CUDA_ARCH__ < 700)
+// CMake's CUDA_ARCHITECTURES is a target property in many supported CMake
+// versions, so this file may still be compiled during an sm_70+ pass in a
+// default multi-arch build. Keep the kernel definitions visible for every pass
+// and use warp intrinsics that compile cleanly across those targets.
 
 #include <cstdint>
 #include <cuda_runtime.h>
@@ -22,14 +22,22 @@
 namespace dflash::common {
 namespace flashprefill {
 
+__device__ inline float scalar_shfl_xor(float v, int lane_mask) {
+    return __shfl_xor_sync(0xffffffffu, v, lane_mask);
+}
+
+__device__ inline unsigned scalar_ballot(bool pred) {
+    return __ballot_sync(0xffffffffu, pred);
+}
+
 // =============================================================================
 // Kernel 1: compute_mean_vector (F16, scalar)
 // =============================================================================
 // Each block of (D_HEAD threads, 1 K-head, 1 batch) reduces one K-block
 // along the sequence dim, computing the mean per dim.
 
 template <int BLOCK, int D_HEAD>
-__global__ void compute_mean_vector_kernel_f16(
+__global__ void compute_mean_vector_kernel_f16_pascal(
     const half * __restrict__ K,
     half       * __restrict__ mean_K,
     int batch, int seq_len, int n_kv_heads,
@@ -73,7 +81,7 @@ extern "C" void launch_compute_mean_vector_f16_pascal(
     dim3 grid(n_k_blocks, batch * n_kv_heads, 1);
     dim3 block(head_dim, 1, 1);
     if (head_dim == 128 && block_size == 128) {
-        compute_mean_vector_kernel_f16<128, 128><<<grid, block, 0, stream>>>(
+        compute_mean_vector_kernel_f16_pascal<128, 128><<<grid, block, 0, stream>>>(
             (const half *)K, (half *)mean_K,
             batch, seq_len, n_kv_heads,
             s_K_b, s_K_n, s_K_h, s_K_d,
@@ -87,7 +95,7 @@ extern "C" void launch_compute_mean_vector_f16_pascal(
 // Per (q_block, k_block), compute the attention score via Q · mean_K^T.
 
 template <int BLOCK, int D_HEAD, int N_BLOCKS_TILE>
-__global__ void compute_block_score_kernel_f16(
+__global__ void compute_block_score_kernel_f16_pascal(
     const half * __restrict__ Q,
     const half * __restrict__ mean_K,
     float sm_scale,
@@ -174,7 +182,7 @@ extern "C" void launch_compute_block_score_f16_pascal(
     dim3 block(block_size, 1, 1);
     size_t smem = block_size * sizeof(float);
     if (head_dim == 128 && block_size == 128) {
-        compute_block_score_kernel_f16<128, 128, 1><<<grid, block, smem, stream>>>(
+        compute_block_score_kernel_f16_pascal<128, 128, 1><<<grid, block, smem, stream>>>(
             (const half *)Q, (const half *)mean_K, sm_scale,
             (float *)score, (float *)score_max,
             batch, n_q_heads, n_k_heads, M, M,
@@ -192,7 +200,7 @@ extern "C" void launch_compute_block_score_f16_pascal(
 // Scalar F16×F16→F32 math, shared-memory tiled, no WMMA.
 
 template <int Q_TILE, int K_TILE, int BLOCK, int D_HEAD>
-__global__ void sparse_flash_forward_kernel_f16(
+__global__ void sparse_flash_forward_kernel_f16_pascal(
     const half * __restrict__ Q,
     const half * __restrict__ K,
     const half * __restrict__ V,
@@ -404,7 +412,7 @@ extern "C" void launch_sparse_flash_forward_f16_pascal(
                            + sizeof(half) * (K_TILE * D_HEAD)     // KV tile
                            + sizeof(half) * (Q_TILE * K_TILE)     // P tile
                            + sizeof(float) * (2 * Q_TILE);        // row_m + row_l
-        sparse_flash_forward_kernel_f16<Q_TILE, K_TILE, BLOCK, D_HEAD><<<grid, block, smem_bytes, stream>>>(
+        sparse_flash_forward_kernel_f16_pascal<Q_TILE, K_TILE, BLOCK, D_HEAD><<<grid, block, smem_bytes, stream>>>(
             (const half *)Q, (const half *)K,
             (const half *)V, (half *)O,
             block_index, counts, scale,
@@ -453,10 +461,9 @@ __global__ void block_select_kernel_pascal(
         float v = valid ? sp[(size_t)n * s_n] : NEG_INF;
         local_max = fmaxf(local_max, v);
     }
-    // Pascal: __shfl_xor (no _sync suffix)
     #pragma unroll
     for (int off = 16; off > 0; off >>= 1)
-        local_max = fmaxf(local_max, __shfl_xor(local_max, off));
+        local_max = fmaxf(local_max, scalar_shfl_xor(local_max, off));
     const float max_score = local_max;
     const float thresh = max_score * alpha;
 
@@ -473,7 +480,7 @@ __global__ void block_select_kernel_pascal(
                 || ((m - n) < window)
                 || (v >= thresh);
         }
-        unsigned mask = __ballot(keep);  // Pascal: __ballot (no _sync suffix)
+        unsigned mask = scalar_ballot(keep);
         int rank = __popc(mask & ((1u << lane) - 1u));
         if (keep) {
             idxp[(size_t)(total + rank) * idx_s_n] = (int32_t)n;
@@ -513,5 +520,3 @@ extern "C" void launch_block_select_pascal(
 
 } // namespace flashprefill
 } // namespace dflash::common
-
-#endif // !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 600 && __CUDA_ARCH__ < 700)