From f71a6a7d1a3dd78d012425970dcc5e38c9df1453 Mon Sep 17 00:00:00 2001 From: Hermes Bot Date: Tue, 23 Jun 2026 18:54:32 -0400 Subject: [PATCH] feat(model-config): detailed, concept-explaining flag tooltips MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Expand each flag's HELP from a one-liner to a short explanation of the underlying concept + the practical tradeoff — e.g. RoPE/YaRN context extension, the KV cache and its quantization, Flash Attention, parallel slots, mmap on bind mounts, and MTP speculative decoding. ~266 chars avg. Served via GET /model-config descriptors; the dashboard already renders `help` as a tooltip, so no UI change is needed. Co-Authored-By: Claude Opus 4.8 (1M context) --- ops-controller/llamacpp_flags.py | 108 +++++++++++++++++++++++++------ 1 file changed, 88 insertions(+), 20 deletions(-) diff --git a/ops-controller/llamacpp_flags.py b/ops-controller/llamacpp_flags.py index ac4c0dd..64c9412 100644 --- a/ops-controller/llamacpp_flags.py +++ b/ops-controller/llamacpp_flags.py @@ -142,27 +142,95 @@ def defaults(): "LLAMACPP_KV_CACHE_TYPE_V": sorted(_KV_TYPES), } -# One-line explanations surfaced as tooltips in the dashboard flag UI. +# Tooltips for the dashboard flag UI — each explains the underlying concept, +# what the value controls, and the practical tradeoff (concise but educational). HELP = { - "LLAMACPP_MODEL": "The GGUF weights file llama.cpp loads as the chat model.", - "LLAMACPP_CTX_SIZE": "Context window in tokens. Stack-wide cap (Open WebUI, Cline, etc.); larger = more KV-cache VRAM.", - "LLAMACPP_GPU_LAYERS": "How many model layers to offload to the GPU. -1 = all on GPU.", - "LLAMACPP_ROPE_SCALING": "Method to stretch context beyond the model's native length. 'none' = native; 'yarn'/'linear' extend it.", - "LLAMACPP_ROPE_SCALE": "Context-extension factor used with rope scaling (e.g. 
2 = double the native length).", - "LLAMACPP_YARN_ORIG_CTX": "The model's native (pre-extension) context length, for YaRN math. 0 = unset.", - "LLAMACPP_OVERRIDE_KV": "Override a GGUF metadata key as key=type:value (e.g. raise the declared context_length). Empty = none.", - "LLAMACPP_FLASH_ATTN": "Flash Attention. 'auto' lets llama.cpp decide; 'on' forces it (required by quantized KV cache).", - "LLAMACPP_ENABLE_KV_CACHE_QUANTIZATION": "Quantize the KV cache to fit longer context in VRAM (1 = on).", - "LLAMACPP_KV_CACHE_TYPE_K": "KV-cache quantization for keys. q8_0 = best quality of the quantized set; smaller types save more VRAM.", - "LLAMACPP_KV_CACHE_TYPE_V": "KV-cache quantization for values. q8_0 = best quality; smaller types save more VRAM.", - "LLAMACPP_N_PREDICT": "Hard ceiling on tokens generated per request — a backstop against runaway generation.", - "LLAMACPP_REASONING_BUDGET": "Max tokens the model may spend inside <think> per response.", - "LLAMACPP_MMPROJ": "Vision projector (mmproj GGUF) that enables image input. Empty = text-only.", - "LLAMACPP_PARALLEL": "Number of concurrent request slots the server handles.", - "LLAMACPP_USE_MMAP": "Memory-map the model file. 0 = off (avoids stale page-cache on Docker bind mounts).", - "LLAMACPP_EXTRA_ARGS": "Raw llama-server flags appended verbatim — escape hatch for anything without a dedicated field.", - "MTP_ENABLED": "Multi-Token Prediction speculative decoding (~1.7× faster), using the model's built-in draft head.", - "MTP_N_MAX": "Max speculative draft tokens per step (1–6). Hardware-dependent; try a few values.", + "LLAMACPP_MODEL": + "The model weights llama.cpp loads. GGUF is the quantized on-disk format; the " + "filename usually encodes the model, size, and quant level (e.g. Q4_K_M ≈ 4-bit — " + "smaller and faster than Q6/Q8, with a little quality loss).
Changing this swaps " + "which model the stack-wide `local-chat` alias serves.", + "LLAMACPP_CTX_SIZE": + "The context window: the max tokens (prompt + reply) the model can attend to at once " + "(~0.75 words per token). This is the stack-wide cap every client sees (Open WebUI, " + "Cline, agents). Bigger = the model 'remembers' more, but the KV cache grows ~linearly " + "with it, costing VRAM and slowing prompt processing.", + "LLAMACPP_GPU_LAYERS": + "A transformer model is a stack of layers; this sets how many are offloaded to the GPU " + "(the rest run on the much slower CPU). -1 = put every layer on the GPU — fastest, but " + "needs enough VRAM for the whole model. Lower it only when a model doesn't fully fit.", + "LLAMACPP_ROPE_SCALING": + "Models encode token positions with RoPE (rotary position embeddings) and are trained " + "for a fixed 'native' context length. This stretches positions to run BEYOND that " + "length: 'none' = native only; 'linear' = naive interpolation; 'yarn' = a smarter scheme " + "that keeps quality much better. Only enable when you need more context than the model " + "was trained for.", + "LLAMACPP_ROPE_SCALE": + "The context-extension multiplier, used with rope scaling. e.g. 2 runs at 2× the native " + "length (256K→512K). Reaching further costs long-range accuracy, so use the smallest " + "factor that fits your need. Ignored when rope_scaling = none.", + "LLAMACPP_YARN_ORIG_CTX": + "Tells YaRN the model's native (pre-extension) context length so it scales correctly — " + "set it to the model's trained context (e.g. 262144). The effective window then ≈ this × " + "rope_scale. 0 = unset/auto.", + "LLAMACPP_OVERRIDE_KV": + "Force-override a value baked into the GGUF's metadata at load time, as key=type:value " + "(e.g. `qwen3.context_length=int:524288` to raise a declared limit). 
An expert escape " + "hatch — leave empty unless you know the exact metadata key; a wrong key/type can break " + "loading.", + "LLAMACPP_FLASH_ATTN": + "Flash Attention is a fused, memory-efficient attention kernel: same math/results, but " + "faster and far lower VRAM at long context. 'auto' lets llama.cpp choose for the " + "build/model; 'on' forces it (required when the KV cache is quantized); 'off' disables it.", + "LLAMACPP_ENABLE_KV_CACHE_QUANTIZATION": + "The KV cache stores keys/values for every past token so they aren't recomputed each " + "step — it's the dominant VRAM cost of long context. Turning this on stores it " + "compressed (quantized), roughly halving that memory so you can fit a bigger window, for " + "a small quality cost. Uses the KV-cache type settings below and generally needs Flash " + "Attention on.", + "LLAMACPP_KV_CACHE_TYPE_K": + "Numeric format for the KEYS half of the (quantized) KV cache. q8_0 (8-bit) is the " + "highest-quality quantized option and the safe default; q4_0/q5_* save more VRAM at some " + "accuracy cost; f16 = unquantized (largest). Only used when KV-cache quantization is on.", + "LLAMACPP_KV_CACHE_TYPE_V": + "Numeric format for the VALUES half of the (quantized) KV cache. Same scale as keys: " + "q8_0 = best quality, smaller types save more VRAM, f16 = unquantized. Only used when " + "KV-cache quantization is on.", + "LLAMACPP_N_PREDICT": + "Hard upper bound on tokens generated in one response. It's a safety backstop: if a " + "model gets stuck in a loop (e.g. never closing its reasoning), this force-stops it " + "instead of running forever. Set high enough not to truncate legitimate answers.", + "LLAMACPP_REASONING_BUDGET": + "For 'thinking' models that emit a hidden block before answering, this " + "caps tokens spent reasoning per response, so a runaway chain of thought can't consume " + "the whole budget. 
It relies on the model emitting a clean end-of-thinking token; " + "N_PREDICT is the unconditional backstop.", + "LLAMACPP_MMPROJ": + "Path to a multimodal projector (an mmproj GGUF) that lets the model accept images, not " + "just text — it projects vision features into the model's token space. Set it to enable " + "vision; leave empty for text-only (saves ~1 GB VRAM). Must match the model family.", + "LLAMACPP_PARALLEL": + "How many requests the server handles concurrently ('slots'). Each slot reserves its own " + "slice of the context/KV cache, so more slots = more throughput but a smaller window per " + "request. 1 = maximum context for a single request.", + "LLAMACPP_USE_MMAP": + "Memory-mapping loads the model lazily via the OS page cache instead of reading it all " + "into RAM up front. 0 = off here because Docker bind mounts (virtiofs/9p) don't reuse the " + "page cache across restarts, so mmap gives no benefit and can slow loads. Turn on only " + "with a native-filesystem model path.", + "LLAMACPP_EXTRA_ARGS": + "Raw flags passed straight to llama-server, appended after the managed ones — an escape " + "hatch for options without a dedicated field yet. Whitespace-split into argv (not " + "shell-evaluated). e.g. `--reasoning-format deepseek`.", + "MTP_ENABLED": + "Multi-Token Prediction = speculative decoding using the model's own built-in 'draft' " + "head: it cheaply guesses several next tokens, then the full model verifies them in one " + "pass and keeps the correct ones. Net effect ≈ 1.5–2× faster generation with identical " + "output. Only works on models shipped with MTP weights.", + "MTP_N_MAX": + "How many tokens MTP drafts ahead per step (speculation depth, 1–6). More draft tokens = " + "bigger speedups when the guesses are accepted, but wasted work when they're rejected — " + "the sweet spot is hardware/model-dependent, so try a few values.", }