Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ harness/clients/run_codex.sh

## Run the Server

Default: Qwen 3.6-27B Q4_K_M target + Lucebox Q8_0 DFlash drafter on RTX 3090. DDTree budget=22, TQ3_0 KV cache, sliding FA window 2048. OpenAI-compatible HTTP on `:8000`.
Default: Qwen 3.6-27B Q4_K_M target + Lucebox Q4_K_M DFlash drafter on RTX 3090. DDTree budget=22, TQ3_0 KV cache, sliding FA window 2048. OpenAI-compatible HTTP on `:8000`.

```bash
# build (CUDA 12+, CMake 3.18+)
Expand All @@ -139,12 +139,12 @@ cmake --build server/build --target dflash_server -j

# default weights (~18 GB)
hf download unsloth/Qwen3.6-27B-GGUF Qwen3.6-27B-Q4_K_M.gguf --local-dir server/models/
hf download Lucebox/Qwen3.6-27B-DFlash-GGUF dflash-draft-3.6-q8_0.gguf --local-dir server/models/draft/
hf download Lucebox/Qwen3.6-27B-DFlash-GGUF dflash-draft-3.6-q4_k_m.gguf --local-dir server/models/draft/

# run (TQ3_0 KV auto-enabled; set =0 to disable)
DFLASH27B_KV_TQ3=1 \
./server/build/dflash_server server/models/Qwen3.6-27B-Q4_K_M.gguf \
--draft server/models/draft/dflash-draft-3.6-q8_0.gguf \
--draft server/models/draft/dflash-draft-3.6-q4_k_m.gguf \
--ddtree --ddtree-budget 22 --fa-window 2048 --port 8000
```

Expand Down
4 changes: 2 additions & 2 deletions docs/specs/openapi-props.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ paths:
lifetime_hits: 0
model:
arch: "qwen35"
draft_path: "/.../dflash-draft-3.6-q8_0.gguf"
draft_path: "/.../dflash-draft-3.6-q4_k_m.gguf"
tokenizer_id: null
model_alias: "dflash"
model_card:
Expand Down Expand Up @@ -451,7 +451,7 @@ components:
description: |
Filesystem path of the loaded speculative-decode draft
GGUF; `null` when no draft is loaded.
example: "/.../dflash-draft-3.6-q8_0.gguf"
example: "/.../dflash-draft-3.6-q4_k_m.gguf"
tokenizer_id:
type: ["string", "null"]
description: |
Expand Down
2 changes: 1 addition & 1 deletion docs/specs/props-endpoint.md
Original file line number Diff line number Diff line change
Expand Up @@ -570,7 +570,7 @@ version increments.
},
"model": {
"arch": "qwen35",
"draft_path": "/.../dflash-draft-3.6-q8_0.gguf",
"draft_path": "/.../dflash-draft-3.6-q4_k_m.gguf",
"tokenizer_id": "qwen3"
},
"model_alias": "dflash",
Expand Down
4 changes: 2 additions & 2 deletions harness/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ paths and profile:
```bash
DFLASH_SERVER_BIN=server/build/dflash_server \
TARGET=server/models/Qwen3.6-27B-Q4_K_M.gguf \
DRAFT=server/models/draft/dflash-draft-3.6-q8_0.gguf \
DRAFT=server/models/draft/dflash-draft-3.6-q4_k_m.gguf \
MODEL_ID=luce-dflash \
MAX_CTX=32768 MAX_TOKENS=512 \
BUDGET=22 VERIFY_MODE=ddtree FA_WINDOW=2048 \
Expand All @@ -64,7 +64,7 @@ To test an already-running native server:

```bash
server/build/dflash_server server/models/Qwen3.6-27B-Q4_K_M.gguf \
--draft server/models/draft/dflash-draft-3.6-q8_0.gguf \
--draft server/models/draft/dflash-draft-3.6-q4_k_m.gguf \
--host 127.0.0.1 --port 18080 \
--max-ctx 32768 --max-tokens 512 \
--fa-window 2048 \
Expand Down
2 changes: 1 addition & 1 deletion harness/benchmarks/run_lucebox_vs_llamacpp.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ STAMP="${STAMP:-generation-baseline-$(date +%Y%m%d-%H%M%S)}"
LOG_DIR="$RUN_DIR/$STAMP"

TARGET="${TARGET:-$REPO_DIR/server/models/Qwen3.6-27B-Q4_K_M.gguf}"
DRAFT="${DRAFT:-$REPO_DIR/server/models/draft/dflash-draft-3.6-q8_0.gguf}"
DRAFT="${DRAFT:-$REPO_DIR/server/models/draft/dflash-draft-3.6-q4_k_m.gguf}"
LLAMA_SERVER_BIN="${LLAMA_SERVER_BIN:-$REPO_DIR/server/deps/llama.cpp/build/bin/llama-server}"

HOST="${HOST:-127.0.0.1}"
Expand Down
2 changes: 1 addition & 1 deletion harness/clients/common.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ CLIENT_WORK_DIR="${CLIENT_WORK_DIR:-/workspace/lucebox-harness-work}"
RUN_DIR="${RUN_DIR:-/workspace/lucebox-client-harness-runs}"

TARGET="${TARGET:-$REPO_DIR/server/models/Qwen3.6-27B-Q4_K_M.gguf}"
DRAFT="${DRAFT:-$REPO_DIR/server/models/draft/dflash-draft-3.6-q8_0.gguf}"
DRAFT="${DRAFT:-$REPO_DIR/server/models/draft/dflash-draft-3.6-q4_k_m.gguf}"
MODEL_SERVER="${MODEL_SERVER:-lucebox}"
DFLASH_SERVER_BIN="${DFLASH_SERVER_BIN:-$REPO_DIR/server/build/dflash_server}"
LLAMA_SERVER_BIN="${LLAMA_SERVER_BIN:-/workspace/llama-cpp-server-build/bin/llama-server}"
Expand Down
5 changes: 4 additions & 1 deletion server/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -428,7 +428,10 @@ elseif(DFLASH27B_GPU_BACKEND STREQUAL "cuda")
target_compile_definitions(dflash_common PRIVATE
DFLASH27B_HAVE_CUDA_SCALAR_FLASHPREFILL=1
DFLASH27B_HAVE_PASCAL_FLASHPREFILL=1)
# Scalar Pascal kernels only target sm_60-69.
# Best-effort narrowing for CMake versions/toolchains that honor
# per-source CUDA_ARCHITECTURES; flashprefill_scalar.cu also keeps
# its kernels visible in newer arch passes so default fatbinary builds
# do not end up with host launchers but missing device symbols.
set_source_files_properties(src/flashprefill_scalar.cu PROPERTIES
CUDA_ARCHITECTURES "60;61;62")
endif()
Expand Down
14 changes: 7 additions & 7 deletions server/DEVELOPER.md
Original file line number Diff line number Diff line change
Expand Up @@ -92,8 +92,8 @@ Download models before running the server:
# Target model (Q4_K_M quantized Qwen3.6-27B)
hf download <repo-id> --local-dir server/models/

# Draft model (1.84 GB default Qwen3.6 GGUF draft)
hf download Lucebox/Qwen3.6-27B-DFlash-GGUF dflash-draft-3.6-q8_0.gguf --local-dir server/models/draft/
# Draft model (0.98 GB default Qwen3.6 GGUF draft)
hf download Lucebox/Qwen3.6-27B-DFlash-GGUF dflash-draft-3.6-q4_k_m.gguf --local-dir server/models/draft/
```

Expected layout:
Expand All @@ -102,7 +102,7 @@ Expected layout:
server/models/
├── Qwen3.6-27B-Q4_K_M.gguf # --target (GGUF)
└── draft/
└── dflash-draft-3.6-q8_0.gguf # --draft (GGUF)
└── dflash-draft-3.6-q4_k_m.gguf # --draft (GGUF)
```

The target path can also be set via the `DFLASH_TARGET` environment variable.
Expand Down Expand Up @@ -159,12 +159,12 @@ cd server/build

# Numerics tests
./test_vs_oracle --target ../models/Qwen3.6-27B-Q4_K_M.gguf \
--draft ../models/draft/dflash-draft-3.6-q8_0.gguf
--draft ../models/draft/dflash-draft-3.6-q4_k_m.gguf

# Smoke tests
./smoke_load_target --target ../models/Qwen3.6-27B-Q4_K_M.gguf
./smoke_load_draft --draft ../models/draft/dflash-draft-3.6-q8_0.gguf
./smoke_draft_graph --draft ../models/draft/dflash-draft-3.6-q8_0.gguf
./smoke_load_draft --draft ../models/draft/dflash-draft-3.6-q4_k_m.gguf
./smoke_draft_graph --draft ../models/draft/dflash-draft-3.6-q4_k_m.gguf
```

### Integration tests (require running server)
Expand Down Expand Up @@ -200,7 +200,7 @@ server/
│ └── Block-Sparse-Attention/ # BSA kernels (submodule)
├── models/ # Model files (not in git)
│ ├── Qwen3.6-27B-Q4_K_M.gguf
│ └── draft/dflash-draft-3.6-q8_0.gguf
│ └── draft/dflash-draft-3.6-q4_k_m.gguf
├── scripts/
│ ├── run.py # CLI text generation
│ ├── test_server_prefix_cache.py # Integration test (--url or auto-spawn)
Expand Down
14 changes: 7 additions & 7 deletions server/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

<p align="center">
<strong>GGUF DFlash speculative decoding for Qwen3.5/Qwen3.6 27B.</strong><br/>
C++/CUDA runtime on top of ggml. Default path: Qwen3.6-27B Q4_K_M target + Lucebox Q8_0 GGUF DFlash draft.<br/>
C++/CUDA runtime on top of ggml. Default path: Qwen3.6-27B Q4_K_M target + Lucebox Q4_K_M GGUF DFlash draft.<br/>
Qwen3.5 reference: 129.5 tok/s mean on HumanEval (3.43x vs AR); best demo run: 207.6 tok/s vs 38.0 tok/s AR (5.46x).<br/><br/>
<a href="https://lucebox.com/blog/dflash27b">Blog post</a> · <a href="RESULTS.md">Benchmarks</a> · <a href="https://discord.gg/yHfswqZmJQ">Discord</a> · <a href="https://lucebox.com">lucebox.com</a>
</p>
Expand Down Expand Up @@ -135,7 +135,7 @@ Qwen3.6-27B is the default integration path. It uses the same `qwen35` target ar
hf download unsloth/Qwen3.6-27B-GGUF Qwen3.6-27B-Q4_K_M.gguf --local-dir models/

# 2. matched 3.6 draft (GGUF, used by default by scripts/run.py and dflash_server)
hf download Lucebox/Qwen3.6-27B-DFlash-GGUF dflash-draft-3.6-q8_0.gguf --local-dir models/draft/
hf download Lucebox/Qwen3.6-27B-DFlash-GGUF dflash-draft-3.6-q4_k_m.gguf --local-dir models/draft/

# 3. bench
DFLASH_TARGET=models/Qwen3.6-27B-Q4_K_M.gguf python3 scripts/bench_he.py --n-gen 128
Expand All @@ -162,7 +162,7 @@ Run it directly:

```bash
./build/dflash_server models/Qwen3.6-27B-Q4_K_M.gguf \
--draft models/draft/dflash-draft-3.6-q8_0.gguf \
--draft models/draft/dflash-draft-3.6-q4_k_m.gguf \
--host 127.0.0.1 --port 18080 \
--max-ctx 32768 --max-tokens 512 \
--fa-window 2048 \
Expand Down Expand Up @@ -326,12 +326,12 @@ cd lucebox-hub/dflash
cmake -B build -S . -DCMAKE_BUILD_TYPE=Release -DCMAKE_CUDA_ARCHITECTURES=86
cmake --build build --target test_dflash -j

# Fetch models: ~16 GB target + 1.84 GB Lucebox Q8_0 GGUF DFlash draft.
# Fetch models: ~16 GB target + 0.98 GB Lucebox Q4_K_M GGUF DFlash draft.
# Quickstart pins to Qwen3.6-27B (latest release). For Qwen3.5-27B swap in
# unsloth/Qwen3.5-27B-GGUF + z-lab/Qwen3.5-27B-DFlash; arch is identical so
# no rebuild is needed.
hf download unsloth/Qwen3.6-27B-GGUF Qwen3.6-27B-Q4_K_M.gguf --local-dir models/
hf download Lucebox/Qwen3.6-27B-DFlash-GGUF dflash-draft-3.6-q8_0.gguf --local-dir models/draft/
hf download Lucebox/Qwen3.6-27B-DFlash-GGUF dflash-draft-3.6-q4_k_m.gguf --local-dir models/draft/

# Streaming one-shot generate (run.py defaults to models/Qwen3.6-27B-Q4_K_M.gguf;
# override with --target or DFLASH_TARGET=... env var).
Expand All @@ -342,7 +342,7 @@ python3 examples/chat.py

# OpenAI-compatible HTTP server (drop-in for Open WebUI / LM Studio / Cline).
./build/dflash_server models/Qwen3.6-27B-Q4_K_M.gguf \
--draft models/draft/dflash-draft-3.6-q8_0.gguf --port 8000
--draft models/draft/dflash-draft-3.6-q4_k_m.gguf --port 8000

# Reproduce paper numbers
python3 scripts/bench_llm.py # HE + GSM8K + Math500
Expand All @@ -353,7 +353,7 @@ python3 scripts/bench_he.py --n-gen 256 --ddtree-budget 22 # minimal HE bench
```bash
DFLASH27B_KV_TQ3=1 DFLASH27B_PREFILL_UBATCH=16 \
build/test_dflash models/Qwen3.6-27B-Q4_K_M.gguf \
models/draft/dflash-draft-3.6-q8_0.gguf /tmp/long_prompt.bin 64 /tmp/out.bin \
models/draft/dflash-draft-3.6-q4_k_m.gguf /tmp/long_prompt.bin 64 /tmp/out.bin \
--fast-rollback --ddtree --ddtree-budget=16 --max-ctx=4096 # align_up(prompt + n_gen + 64, 256); raise up to 262144 for long prompts
```

Expand Down
2 changes: 1 addition & 1 deletion server/scripts/bench_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@
"DFLASH_TARGET",
str(ROOT / "models" / "Qwen3.6-27B-Q4_K_M.gguf"),
)
_LOCAL_DRAFT_FILE = ROOT / "models" / "draft" / "dflash-draft-3.6-q8_0.gguf"
_LOCAL_DRAFT_FILE = ROOT / "models" / "draft" / "dflash-draft-3.6-q4_k_m.gguf"
_LOCAL_DRAFT_ROOT = ROOT / "models" / "draft"
DRAFT = None
TEST_DFLASH = os.environ.get("DFLASH_BIN", str(ROOT / "build" / f"test_dflash{BIN_SUFFIX}"))
Expand Down
2 changes: 1 addition & 1 deletion server/scripts/bench_he.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
"DFLASH_TARGET",
str(ROOT / "models" / "Qwen3.6-27B-Q4_K_M.gguf"),
)
_LOCAL_DRAFT_FILE = ROOT / "models" / "draft" / "dflash-draft-3.6-q8_0.gguf"
_LOCAL_DRAFT_FILE = ROOT / "models" / "draft" / "dflash-draft-3.6-q4_k_m.gguf"
_LOCAL_DRAFT_ROOT = ROOT / "models" / "draft"
DRAFT = None
TEST_DFLASH = os.environ.get(
Expand Down
2 changes: 1 addition & 1 deletion server/scripts/bench_llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
"DFLASH_TARGET",
str(ROOT / "models" / "Qwen3.6-27B-Q4_K_M.gguf"),
)
_LOCAL_DRAFT_FILE = ROOT / "models" / "draft" / "dflash-draft-3.6-q8_0.gguf"
_LOCAL_DRAFT_FILE = ROOT / "models" / "draft" / "dflash-draft-3.6-q4_k_m.gguf"
_LOCAL_DRAFT_ROOT = ROOT / "models" / "draft"
DRAFT = None
TEST_DFLASH = os.environ.get("DFLASH_BIN", str(ROOT / "build" / f"test_dflash{BIN_SUFFIX}"))
Expand Down
39 changes: 22 additions & 17 deletions server/src/flashprefill_scalar.cu
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,16 @@
// - F16 (half) instead of __nv_bfloat16 — Pascal has no BF16 hardware
// - Scalar F16×F16→F32 math instead of WMMA tensor cores
// - Cooperative shared-memory loads instead of cp.async (Pascal has no async copy)
// - __shfl / __shfl_down instead of __shfl_sync / __shfl_down_sync
// - membar.gl instead of fence.acq_rel.gpu
// - Uses full-warp _sync vote/shuffle intrinsics, which CUDA 12 accepts for
// Pascal and newer targets.
//
// Dispatched from flashprefill.cpp when DFLASH27B_HAVE_PASCAL_FLASHPREFILL is set.
// The drafter's persistent buffers must be GGML_TYPE_F16.
//
// Guarded to compile only for sm_60-69 so Pascal-specific intrinsics
// (__shfl without _sync, etc.) don't affect sm_70+ codepaths.

#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 600 && __CUDA_ARCH__ < 700)
// CMake's CUDA_ARCHITECTURES is a target property in many supported CMake
// versions, so this file may still be compiled during an sm_70+ pass in a
// default multi-arch build. Keep the kernel definitions visible for every pass
// and use warp intrinsics that compile cleanly across those targets.

#include <cstdint>
#include <cuda_runtime.h>
Expand All @@ -22,14 +22,22 @@
namespace dflash::common {
namespace flashprefill {

__device__ inline float scalar_shfl_xor(float v, int lane_mask) {
return __shfl_xor_sync(0xffffffffu, v, lane_mask);
}

__device__ inline unsigned scalar_ballot(bool pred) {
return __ballot_sync(0xffffffffu, pred);
}

// =============================================================================
// Kernel 1: compute_mean_vector (F16, scalar)
// =============================================================================
// Each block of (D_HEAD threads, 1 K-head, 1 batch) reduces one K-block
// along the sequence dim, computing the mean per dim.

template <int BLOCK, int D_HEAD>
__global__ void compute_mean_vector_kernel_f16(
__global__ void compute_mean_vector_kernel_f16_pascal(
const half * __restrict__ K,
half * __restrict__ mean_K,
int batch, int seq_len, int n_kv_heads,
Expand Down Expand Up @@ -73,7 +81,7 @@ extern "C" void launch_compute_mean_vector_f16_pascal(
dim3 grid(n_k_blocks, batch * n_kv_heads, 1);
dim3 block(head_dim, 1, 1);
if (head_dim == 128 && block_size == 128) {
compute_mean_vector_kernel_f16<128, 128><<<grid, block, 0, stream>>>(
compute_mean_vector_kernel_f16_pascal<128, 128><<<grid, block, 0, stream>>>(
(const half *)K, (half *)mean_K,
batch, seq_len, n_kv_heads,
s_K_b, s_K_n, s_K_h, s_K_d,
Expand All @@ -87,7 +95,7 @@ extern "C" void launch_compute_mean_vector_f16_pascal(
// Per (q_block, k_block), compute the attention score via Q · mean_K^T.

template <int BLOCK, int D_HEAD, int N_BLOCKS_TILE>
__global__ void compute_block_score_kernel_f16(
__global__ void compute_block_score_kernel_f16_pascal(
const half * __restrict__ Q,
const half * __restrict__ mean_K,
float sm_scale,
Expand Down Expand Up @@ -174,7 +182,7 @@ extern "C" void launch_compute_block_score_f16_pascal(
dim3 block(block_size, 1, 1);
size_t smem = block_size * sizeof(float);
if (head_dim == 128 && block_size == 128) {
compute_block_score_kernel_f16<128, 128, 1><<<grid, block, smem, stream>>>(
compute_block_score_kernel_f16_pascal<128, 128, 1><<<grid, block, smem, stream>>>(
(const half *)Q, (const half *)mean_K, sm_scale,
(float *)score, (float *)score_max,
batch, n_q_heads, n_k_heads, M, M,
Expand All @@ -192,7 +200,7 @@ extern "C" void launch_compute_block_score_f16_pascal(
// Scalar F16×F16→F32 math, shared-memory tiled, no WMMA.

template <int Q_TILE, int K_TILE, int BLOCK, int D_HEAD>
__global__ void sparse_flash_forward_kernel_f16(
__global__ void sparse_flash_forward_kernel_f16_pascal(
const half * __restrict__ Q,
const half * __restrict__ K,
const half * __restrict__ V,
Expand Down Expand Up @@ -404,7 +412,7 @@ extern "C" void launch_sparse_flash_forward_f16_pascal(
+ sizeof(half) * (K_TILE * D_HEAD) // KV tile
+ sizeof(half) * (Q_TILE * K_TILE) // P tile
+ sizeof(float) * (2 * Q_TILE); // row_m + row_l
sparse_flash_forward_kernel_f16<Q_TILE, K_TILE, BLOCK, D_HEAD><<<grid, block, smem_bytes, stream>>>(
sparse_flash_forward_kernel_f16_pascal<Q_TILE, K_TILE, BLOCK, D_HEAD><<<grid, block, smem_bytes, stream>>>(
(const half *)Q, (const half *)K,
(const half *)V, (half *)O,
block_index, counts, scale,
Expand Down Expand Up @@ -453,10 +461,9 @@ __global__ void block_select_kernel_pascal(
float v = valid ? sp[(size_t)n * s_n] : NEG_INF;
local_max = fmaxf(local_max, v);
}
// Pascal: __shfl_xor (no _sync suffix)
#pragma unroll
for (int off = 16; off > 0; off >>= 1)
local_max = fmaxf(local_max, __shfl_xor(local_max, off));
local_max = fmaxf(local_max, scalar_shfl_xor(local_max, off));
const float max_score = local_max;
const float thresh = max_score * alpha;

Expand All @@ -473,7 +480,7 @@ __global__ void block_select_kernel_pascal(
|| ((m - n) < window)
|| (v >= thresh);
}
unsigned mask = __ballot(keep); // Pascal: __ballot (no _sync suffix)
unsigned mask = scalar_ballot(keep);
int rank = __popc(mask & ((1u << lane) - 1u));
if (keep) {
idxp[(size_t)(total + rank) * idx_s_n] = (int32_t)n;
Expand Down Expand Up @@ -513,5 +520,3 @@ extern "C" void launch_block_select_pascal(

} // namespace flashprefill
} // namespace dflash::common

#endif // !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 600 && __CUDA_ARCH__ < 700)
Loading