diff --git a/README.md b/README.md index ad9edd1cc..fe64ca5bf 100644 --- a/README.md +++ b/README.md @@ -129,7 +129,7 @@ harness/clients/run_codex.sh ## Run the Server -Default: Qwen 3.6-27B Q4_K_M target + Lucebox Q8_0 DFlash drafter on RTX 3090. DDTree budget=22, TQ3_0 KV cache, sliding FA window 2048. OpenAI-compatible HTTP on `:8000`. +Default: Qwen 3.6-27B Q4_K_M target + Lucebox Q4_K_M DFlash drafter on RTX 3090. DDTree budget=22, TQ3_0 KV cache, sliding FA window 2048. OpenAI-compatible HTTP on `:8000`. ```bash # build (CUDA 12+, CMake 3.18+) @@ -139,12 +139,12 @@ cmake --build server/build --target dflash_server -j # default weights (~18 GB) hf download unsloth/Qwen3.6-27B-GGUF Qwen3.6-27B-Q4_K_M.gguf --local-dir server/models/ -hf download Lucebox/Qwen3.6-27B-DFlash-GGUF dflash-draft-3.6-q8_0.gguf --local-dir server/models/draft/ +hf download Lucebox/Qwen3.6-27B-DFlash-GGUF dflash-draft-3.6-q4_k_m.gguf --local-dir server/models/draft/ # run (TQ3_0 KV auto-enabled; set =0 to disable) DFLASH27B_KV_TQ3=1 \ ./server/build/dflash_server server/models/Qwen3.6-27B-Q4_K_M.gguf \ - --draft server/models/draft/dflash-draft-3.6-q8_0.gguf \ + --draft server/models/draft/dflash-draft-3.6-q4_k_m.gguf \ --ddtree --ddtree-budget 22 --fa-window 2048 --port 8000 ``` diff --git a/docs/specs/openapi-props.yaml b/docs/specs/openapi-props.yaml index 71e3c64c4..c2976265f 100644 --- a/docs/specs/openapi-props.yaml +++ b/docs/specs/openapi-props.yaml @@ -104,7 +104,7 @@ paths: lifetime_hits: 0 model: arch: "qwen35" - draft_path: "/.../dflash-draft-3.6-q8_0.gguf" + draft_path: "/.../dflash-draft-3.6-q4_k_m.gguf" tokenizer_id: null model_alias: "dflash" model_card: @@ -451,7 +451,7 @@ components: description: | Filesystem path of the loaded speculative-decode draft GGUF; `null` when no draft is loaded. - example: "/.../dflash-draft-3.6-q8_0.gguf" + example: "/.../dflash-draft-3.6-q4_k_m.gguf" tokenizer_id: type: ["string", "null"] description: | diff --git a/docs/specs/props-endpoint.md b/docs/specs/props-endpoint.md index b71a4fb73..e4238df3d 100644 --- a/docs/specs/props-endpoint.md +++ b/docs/specs/props-endpoint.md @@ -570,7 +570,7 @@ version increments. }, "model": { "arch": "qwen35", - "draft_path": "/.../dflash-draft-3.6-q8_0.gguf", + "draft_path": "/.../dflash-draft-3.6-q4_k_m.gguf", "tokenizer_id": "qwen3" }, "model_alias": "dflash", diff --git a/harness/README.md b/harness/README.md index cd337a3a4..e32730837 100644 --- a/harness/README.md +++ b/harness/README.md @@ -53,7 +53,7 @@ paths and profile: ```bash DFLASH_SERVER_BIN=server/build/dflash_server \ TARGET=server/models/Qwen3.6-27B-Q4_K_M.gguf \ -DRAFT=server/models/draft/dflash-draft-3.6-q8_0.gguf \ +DRAFT=server/models/draft/dflash-draft-3.6-q4_k_m.gguf \ MODEL_ID=luce-dflash \ MAX_CTX=32768 MAX_TOKENS=512 \ BUDGET=22 VERIFY_MODE=ddtree FA_WINDOW=2048 \ @@ -64,7 +64,7 @@ To test an already-running native server: ```bash server/build/dflash_server server/models/Qwen3.6-27B-Q4_K_M.gguf \ - --draft server/models/draft/dflash-draft-3.6-q8_0.gguf \ + --draft server/models/draft/dflash-draft-3.6-q4_k_m.gguf \ --host 127.0.0.1 --port 18080 \ --max-ctx 32768 --max-tokens 512 \ --fa-window 2048 \ diff --git a/harness/benchmarks/run_lucebox_vs_llamacpp.sh b/harness/benchmarks/run_lucebox_vs_llamacpp.sh index b94b2d47f..810f8edfc 100755 --- a/harness/benchmarks/run_lucebox_vs_llamacpp.sh +++ b/harness/benchmarks/run_lucebox_vs_llamacpp.sh @@ -8,7 +8,7 @@ STAMP="${STAMP:-generation-baseline-$(date +%Y%m%d-%H%M%S)}" LOG_DIR="$RUN_DIR/$STAMP" TARGET="${TARGET:-$REPO_DIR/server/models/Qwen3.6-27B-Q4_K_M.gguf}" -DRAFT="${DRAFT:-$REPO_DIR/server/models/draft/dflash-draft-3.6-q8_0.gguf}" +DRAFT="${DRAFT:-$REPO_DIR/server/models/draft/dflash-draft-3.6-q4_k_m.gguf}" LLAMA_SERVER_BIN="${LLAMA_SERVER_BIN:-$REPO_DIR/server/deps/llama.cpp/build/bin/llama-server}" HOST="${HOST:-127.0.0.1}" diff --git a/harness/clients/common.sh b/harness/clients/common.sh index 19d741a70..d7f7db73c 100755 --- a/harness/clients/common.sh +++ b/harness/clients/common.sh @@ -9,7 +9,7 @@ CLIENT_WORK_DIR="${CLIENT_WORK_DIR:-/workspace/lucebox-harness-work}" RUN_DIR="${RUN_DIR:-/workspace/lucebox-client-harness-runs}" TARGET="${TARGET:-$REPO_DIR/server/models/Qwen3.6-27B-Q4_K_M.gguf}" -DRAFT="${DRAFT:-$REPO_DIR/server/models/draft/dflash-draft-3.6-q8_0.gguf}" +DRAFT="${DRAFT:-$REPO_DIR/server/models/draft/dflash-draft-3.6-q4_k_m.gguf}" MODEL_SERVER="${MODEL_SERVER:-lucebox}" DFLASH_SERVER_BIN="${DFLASH_SERVER_BIN:-$REPO_DIR/server/build/dflash_server}" LLAMA_SERVER_BIN="${LLAMA_SERVER_BIN:-/workspace/llama-cpp-server-build/bin/llama-server}" diff --git a/server/CMakeLists.txt b/server/CMakeLists.txt index 345ee8aee..71298ff6a 100644 --- a/server/CMakeLists.txt +++ b/server/CMakeLists.txt @@ -428,7 +428,10 @@ elseif(DFLASH27B_GPU_BACKEND STREQUAL "cuda") target_compile_definitions(dflash_common PRIVATE DFLASH27B_HAVE_CUDA_SCALAR_FLASHPREFILL=1 DFLASH27B_HAVE_PASCAL_FLASHPREFILL=1) - # Scalar Pascal kernels only target sm_60-69. + # Best-effort narrowing for CMake versions/toolchains that honor + # per-source CUDA_ARCHITECTURES; flashprefill_scalar.cu also keeps + # its kernels visible in newer arch passes so default fatbinary builds + # do not end up with host launchers but missing device symbols. set_source_files_properties(src/flashprefill_scalar.cu PROPERTIES CUDA_ARCHITECTURES "60;61;62") endif() diff --git a/server/DEVELOPER.md b/server/DEVELOPER.md index 6252a13d7..3a2a4b2ef 100644 --- a/server/DEVELOPER.md +++ b/server/DEVELOPER.md @@ -92,8 +92,8 @@ Download models before running the server: # Target model (Q4_K_M quantized Qwen3.6-27B) hf download --local-dir server/models/ -# Draft model (1.84 GB default Qwen3.6 GGUF draft) -hf download Lucebox/Qwen3.6-27B-DFlash-GGUF dflash-draft-3.6-q8_0.gguf --local-dir server/models/draft/ +# Draft model (0.98 GB default Qwen3.6 GGUF draft) +hf download Lucebox/Qwen3.6-27B-DFlash-GGUF dflash-draft-3.6-q4_k_m.gguf --local-dir server/models/draft/ ``` Expected layout: @@ -102,7 +102,7 @@ Expected layout: server/models/ ├── Qwen3.6-27B-Q4_K_M.gguf # --target (GGUF) └── draft/ - └── dflash-draft-3.6-q8_0.gguf # --draft (GGUF) + └── dflash-draft-3.6-q4_k_m.gguf # --draft (GGUF) ``` The target path can also be set via the `DFLASH_TARGET` environment variable. @@ -159,12 +159,12 @@ cd server/build # Numerics tests ./test_vs_oracle --target ../models/Qwen3.6-27B-Q4_K_M.gguf \ - --draft ../models/draft/dflash-draft-3.6-q8_0.gguf + --draft ../models/draft/dflash-draft-3.6-q4_k_m.gguf # Smoke tests ./smoke_load_target --target ../models/Qwen3.6-27B-Q4_K_M.gguf -./smoke_load_draft --draft ../models/draft/dflash-draft-3.6-q8_0.gguf -./smoke_draft_graph --draft ../models/draft/dflash-draft-3.6-q8_0.gguf +./smoke_load_draft --draft ../models/draft/dflash-draft-3.6-q4_k_m.gguf +./smoke_draft_graph --draft ../models/draft/dflash-draft-3.6-q4_k_m.gguf ``` ### Integration tests (require running server) @@ -200,7 +200,7 @@ server/ │ └── Block-Sparse-Attention/ # BSA kernels (submodule) ├── models/ # Model files (not in git) │ ├── Qwen3.6-27B-Q4_K_M.gguf -│ └── draft/dflash-draft-3.6-q8_0.gguf +│ └── draft/dflash-draft-3.6-q4_k_m.gguf ├── scripts/ │ ├── run.py # CLI text generation │ ├── test_server_prefix_cache.py # Integration test (--url or auto-spawn) diff --git a/server/README.md b/server/README.md index a23c4d05d..c66703e80 100644 --- a/server/README.md +++ b/server/README.md @@ -10,7 +10,7 @@

GGUF DFlash speculative decoding for Qwen3.5/Qwen3.6 27B.
- C++/CUDA runtime on top of ggml. Default path: Qwen3.6-27B Q4_K_M target + Lucebox Q8_0 GGUF DFlash draft.
+ C++/CUDA runtime on top of ggml. Default path: Qwen3.6-27B Q4_K_M target + Lucebox Q4_K_M GGUF DFlash draft.
Qwen3.5 reference: 129.5 tok/s mean on HumanEval (3.43x vs AR); best demo run: 207.6 tok/s vs 38.0 tok/s AR (5.46x).

Blog post · Benchmarks · Discord · lucebox.com

@@ -135,7 +135,7 @@ Qwen3.6-27B is the default integration path. It uses the same `qwen35` target ar hf download unsloth/Qwen3.6-27B-GGUF Qwen3.6-27B-Q4_K_M.gguf --local-dir models/ # 2. matched 3.6 draft (GGUF, used by default by scripts/run.py and dflash_server) -hf download Lucebox/Qwen3.6-27B-DFlash-GGUF dflash-draft-3.6-q8_0.gguf --local-dir models/draft/ +hf download Lucebox/Qwen3.6-27B-DFlash-GGUF dflash-draft-3.6-q4_k_m.gguf --local-dir models/draft/ # 3. bench DFLASH_TARGET=models/Qwen3.6-27B-Q4_K_M.gguf python3 scripts/bench_he.py --n-gen 128 @@ -162,7 +162,7 @@ Run it directly: ```bash ./build/dflash_server models/Qwen3.6-27B-Q4_K_M.gguf \ - --draft models/draft/dflash-draft-3.6-q8_0.gguf \ + --draft models/draft/dflash-draft-3.6-q4_k_m.gguf \ --host 127.0.0.1 --port 18080 \ --max-ctx 32768 --max-tokens 512 \ --fa-window 2048 \ @@ -326,12 +326,12 @@ cd lucebox-hub/dflash cmake -B build -S . -DCMAKE_BUILD_TYPE=Release -DCMAKE_CUDA_ARCHITECTURES=86 cmake --build build --target test_dflash -j -# Fetch models: ~16 GB target + 1.84 GB Lucebox Q8_0 GGUF DFlash draft. +# Fetch models: ~16 GB target + 0.98 GB Lucebox Q4_K_M GGUF DFlash draft. # Quickstart pins to Qwen3.6-27B (latest release). For Qwen3.5-27B swap in # unsloth/Qwen3.5-27B-GGUF + z-lab/Qwen3.5-27B-DFlash; arch is identical so # no rebuild is needed. hf download unsloth/Qwen3.6-27B-GGUF Qwen3.6-27B-Q4_K_M.gguf --local-dir models/ -hf download Lucebox/Qwen3.6-27B-DFlash-GGUF dflash-draft-3.6-q8_0.gguf --local-dir models/draft/ +hf download Lucebox/Qwen3.6-27B-DFlash-GGUF dflash-draft-3.6-q4_k_m.gguf --local-dir models/draft/ # Streaming one-shot generate (run.py defaults to models/Qwen3.6-27B-Q4_K_M.gguf; # override with --target or DFLASH_TARGET=... env var). @@ -342,7 +342,7 @@ python3 examples/chat.py # OpenAI-compatible HTTP server (drop-in for Open WebUI / LM Studio / Cline). ./build/dflash_server models/Qwen3.6-27B-Q4_K_M.gguf \ - --draft models/draft/dflash-draft-3.6-q8_0.gguf --port 8000 + --draft models/draft/dflash-draft-3.6-q4_k_m.gguf --port 8000 # Reproduce paper numbers python3 scripts/bench_llm.py # HE + GSM8K + Math500 @@ -353,7 +353,7 @@ python3 scripts/bench_he.py --n-gen 256 --ddtree-budget 22 # minimal HE bench ```bash DFLASH27B_KV_TQ3=1 DFLASH27B_PREFILL_UBATCH=16 \ build/test_dflash models/Qwen3.6-27B-Q4_K_M.gguf \ - models/draft/dflash-draft-3.6-q8_0.gguf /tmp/long_prompt.bin 64 /tmp/out.bin \ + models/draft/dflash-draft-3.6-q4_k_m.gguf /tmp/long_prompt.bin 64 /tmp/out.bin \ --fast-rollback --ddtree --ddtree-budget=16 --max-ctx=4096 # align_up(prompt + n_gen + 64, 256); raise up to 262144 for long prompts ``` diff --git a/server/scripts/bench_agent.py b/server/scripts/bench_agent.py index b8f84c41b..6953746c6 100644 --- a/server/scripts/bench_agent.py +++ b/server/scripts/bench_agent.py @@ -41,7 +41,7 @@ "DFLASH_TARGET", str(ROOT / "models" / "Qwen3.6-27B-Q4_K_M.gguf"), ) -_LOCAL_DRAFT_FILE = ROOT / "models" / "draft" / "dflash-draft-3.6-q8_0.gguf" +_LOCAL_DRAFT_FILE = ROOT / "models" / "draft" / "dflash-draft-3.6-q4_k_m.gguf" _LOCAL_DRAFT_ROOT = ROOT / "models" / "draft" DRAFT = None TEST_DFLASH = os.environ.get("DFLASH_BIN", str(ROOT / "build" / f"test_dflash{BIN_SUFFIX}")) diff --git a/server/scripts/bench_he.py b/server/scripts/bench_he.py index 57b6c93d4..a4da24f6d 100644 --- a/server/scripts/bench_he.py +++ b/server/scripts/bench_he.py @@ -26,7 +26,7 @@ "DFLASH_TARGET", str(ROOT / "models" / "Qwen3.6-27B-Q4_K_M.gguf"), ) -_LOCAL_DRAFT_FILE = ROOT / "models" / "draft" / "dflash-draft-3.6-q8_0.gguf" +_LOCAL_DRAFT_FILE = ROOT / "models" / "draft" / "dflash-draft-3.6-q4_k_m.gguf" _LOCAL_DRAFT_ROOT = ROOT / "models" / "draft" DRAFT = None TEST_DFLASH = os.environ.get( diff --git a/server/scripts/bench_llm.py b/server/scripts/bench_llm.py index 1d8c5a707..91ba498c5 100644 --- a/server/scripts/bench_llm.py +++ b/server/scripts/bench_llm.py @@ -25,7 +25,7 @@ "DFLASH_TARGET", str(ROOT / "models" / "Qwen3.6-27B-Q4_K_M.gguf"), ) -_LOCAL_DRAFT_FILE = ROOT / "models" / "draft" / "dflash-draft-3.6-q8_0.gguf" +_LOCAL_DRAFT_FILE = ROOT / "models" / "draft" / "dflash-draft-3.6-q4_k_m.gguf" _LOCAL_DRAFT_ROOT = ROOT / "models" / "draft" DRAFT = None TEST_DFLASH = os.environ.get("DFLASH_BIN", str(ROOT / "build" / f"test_dflash{BIN_SUFFIX}")) diff --git a/server/src/flashprefill_scalar.cu b/server/src/flashprefill_scalar.cu index 1d90fcbce..792701083 100644 --- a/server/src/flashprefill_scalar.cu +++ b/server/src/flashprefill_scalar.cu @@ -4,16 +4,16 @@ // - F16 (half) instead of __nv_bfloat16 — Pascal has no BF16 hardware // - Scalar F16×F16→F32 math instead of WMMA tensor cores // - Cooperative shared-memory loads instead of cp.async (Pascal has no async copy) -// - __shfl / __shfl_down instead of __shfl_sync / __shfl_down_sync -// - membar.gl instead of fence.acq_rel.gpu +// - Uses full-warp _sync vote/shuffle intrinsics, which CUDA 12 accepts for +// Pascal and newer targets. // // Dispatched from flashprefill.cpp when DFLASH27B_HAVE_PASCAL_FLASHPREFILL is set. // The drafter's persistent buffers must be GGML_TYPE_F16. // -// Guarded to compile only for sm_60-69 so Pascal-specific intrinsics -// (__shfl without _sync, etc.) don't affect sm_70+ codepaths. - -#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 600 && __CUDA_ARCH__ < 700) +// CMake's CUDA_ARCHITECTURES is a target property in many supported CMake +// versions, so this file may still be compiled during an sm_70+ pass in a +// default multi-arch build. Keep the kernel definitions visible for every pass +// and use warp intrinsics that compile cleanly across those targets. #include #include @@ -22,6 +22,14 @@ namespace dflash::common { namespace flashprefill { +__device__ inline float scalar_shfl_xor(float v, int lane_mask) { + return __shfl_xor_sync(0xffffffffu, v, lane_mask); +} + +__device__ inline unsigned scalar_ballot(bool pred) { + return __ballot_sync(0xffffffffu, pred); +} + // ============================================================================= // Kernel 1: compute_mean_vector (F16, scalar) // ============================================================================= @@ -29,7 +37,7 @@ namespace flashprefill { // along the sequence dim, computing the mean per dim. template -__global__ void compute_mean_vector_kernel_f16( +__global__ void compute_mean_vector_kernel_f16_pascal( const half * __restrict__ K, half * __restrict__ mean_K, int batch, int seq_len, int n_kv_heads, @@ -73,7 +81,7 @@ extern "C" void launch_compute_mean_vector_f16_pascal( dim3 grid(n_k_blocks, batch * n_kv_heads, 1); dim3 block(head_dim, 1, 1); if (head_dim == 128 && block_size == 128) { - compute_mean_vector_kernel_f16<128, 128><<>>( + compute_mean_vector_kernel_f16_pascal<128, 128><<>>( (const half *)K, (half *)mean_K, batch, seq_len, n_kv_heads, s_K_b, s_K_n, s_K_h, s_K_d, @@ -87,7 +95,7 @@ extern "C" void launch_compute_mean_vector_f16_pascal( // Per (q_block, k_block), compute the attention score via Q · mean_K^T. template -__global__ void compute_block_score_kernel_f16( +__global__ void compute_block_score_kernel_f16_pascal( const half * __restrict__ Q, const half * __restrict__ mean_K, float sm_scale, @@ -174,7 +182,7 @@ extern "C" void launch_compute_block_score_f16_pascal( dim3 block(block_size, 1, 1); size_t smem = block_size * sizeof(float); if (head_dim == 128 && block_size == 128) { - compute_block_score_kernel_f16<128, 128, 1><<>>( + compute_block_score_kernel_f16_pascal<128, 128, 1><<>>( (const half *)Q, (const half *)mean_K, sm_scale, (float *)score, (float *)score_max, batch, n_q_heads, n_k_heads, M, M, @@ -192,7 +200,7 @@ extern "C" void launch_compute_block_score_f16_pascal( // Scalar F16×F16→F32 math, shared-memory tiled, no WMMA. template -__global__ void sparse_flash_forward_kernel_f16( +__global__ void sparse_flash_forward_kernel_f16_pascal( const half * __restrict__ Q, const half * __restrict__ K, const half * __restrict__ V, @@ -404,7 +412,7 @@ extern "C" void launch_sparse_flash_forward_f16_pascal( + sizeof(half) * (K_TILE * D_HEAD) // KV tile + sizeof(half) * (Q_TILE * K_TILE) // P tile + sizeof(float) * (2 * Q_TILE); // row_m + row_l - sparse_flash_forward_kernel_f16<<>>( + sparse_flash_forward_kernel_f16_pascal<<>>( (const half *)Q, (const half *)K, (const half *)V, (half *)O, block_index, counts, scale, @@ -453,10 +461,9 @@ __global__ void block_select_kernel_pascal( float v = valid ? sp[(size_t)n * s_n] : NEG_INF; local_max = fmaxf(local_max, v); } - // Pascal: __shfl_xor (no _sync suffix) #pragma unroll for (int off = 16; off > 0; off >>= 1) - local_max = fmaxf(local_max, __shfl_xor(local_max, off)); + local_max = fmaxf(local_max, scalar_shfl_xor(local_max, off)); const float max_score = local_max; const float thresh = max_score * alpha; @@ -473,7 +480,7 @@ __global__ void block_select_kernel_pascal( || ((m - n) < window) || (v >= thresh); } - unsigned mask = __ballot(keep); // Pascal: __ballot (no _sync suffix) + unsigned mask = scalar_ballot(keep); int rank = __popc(mask & ((1u << lane) - 1u)); if (keep) { idxp[(size_t)(total + rank) * idx_s_n] = (int32_t)n; @@ -513,5 +520,3 @@ extern "C" void launch_block_select_pascal( } // namespace flashprefill } // namespace dflash::common - -#endif // !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 600 && __CUDA_ARCH__ < 700)