diff --git a/.github/build.bat b/.github/build.bat
index a9df77f5..d2c755b3 100755
--- a/.github/build.bat
+++ b/.github/build.bat
@@ -60,6 +60,28 @@ REM was wired in as the launcher.
 if defined LAUNCH (
     echo build.bat: sccache --show-stats
     sccache --show-stats
+    REM KISS per-job cache summary in the GitHub Actions job summary (like upstream llama.cpp's
+    REM ccache-action table). Re-runs "sccache --show-stats" once per value and takes token 3 (the
+    REM count after the two-word label) of the top-level "Compile requests" and "Cache hits" lines;
+    REM the per-language "Cache hits (C/C++)" line has "(" after the label, so the digit-anchored
+    REM findstr regex skips it. CI only (GITHUB_STEP_SUMMARY set). Skipped if either number is missing
+    REM or requests is 0. Integer math, rounded to one decimal. NOTE(review): needs delayed expansion on.
+    if defined GITHUB_STEP_SUMMARY (
+        set "SCC_REQ="
+        set "SCC_HITS="
+        for /f "tokens=3" %%a in ('sccache --show-stats 2^>nul ^| findstr /r /c:"^Compile requests  *[0-9]"') do set "SCC_REQ=%%a"
+        for /f "tokens=3" %%a in ('sccache --show-stats 2^>nul ^| findstr /r /c:"^Cache hits  *[0-9]"') do set "SCC_HITS=%%a"
+        if defined SCC_REQ if defined SCC_HITS if !SCC_REQ! gtr 0 (
+            set /a SCC_RATE10=^(!SCC_HITS! * 1000 + !SCC_REQ! / 2^) / !SCC_REQ!
+            set /a SCC_WHOLE=!SCC_RATE10! / 10
+            set /a SCC_DEC=!SCC_RATE10! %% 10
+            >>"%GITHUB_STEP_SUMMARY%" echo ### sccache statistics
+            >>"%GITHUB_STEP_SUMMARY%" echo.
+            >>"%GITHUB_STEP_SUMMARY%" echo ^| Cache hits ^| Requests ^| Hit rate ^|
+            >>"%GITHUB_STEP_SUMMARY%" echo ^|------------^|----------^|----------^|
+            >>"%GITHUB_STEP_SUMMARY%" echo ^| !SCC_HITS! ^| !SCC_REQ! ^| !SCC_WHOLE!.!SCC_DEC!%% ^|
+        )
+    )
 )
 
 REM Propagate a build failure as a non-zero exit (a prior bug let a failed `cmake
diff --git a/.github/build.sh b/.github/build.sh
index 7a47ab65..6257904b 100755
--- a/.github/build.sh
+++ b/.github/build.sh
@@ -160,5 +160,26 @@ rm -f "$build_log"
 # crashing sccache (or the mid-build retry disabled it), re-invoking it here would just repeat
 # the crash output (harmless but noisy).
 if [ -n "$LAUNCH" ] && command -v sccache >/dev/null 2>&1; then
-  sccache --show-stats || true
+  sccache_stats="$(sccache --show-stats 2>/dev/null || true)"
+  printf '%s\n' "$sccache_stats"
+  # KISS per-job cache summary in the GitHub Actions job summary (like upstream llama.cpp's
+  # ccache-action table). Parse the text stats: the top-level "Compile requests" line is the
+  # total and the top-level "Cache hits" line is the hits (the per-language "Cache hits (C/C++)"
+  # line has "(" after the label, so the digit-anchored regex skips it). Only runs in CI
+  # (GITHUB_STEP_SUMMARY set); local runs are untouched. Best-effort — skips silently unless both
+  # values are integers and requests > 0; they reach awk via -v, never spliced into its program.
+  if [ -n "${GITHUB_STEP_SUMMARY:-}" ] && [ -n "$sccache_stats" ]; then
+    sccache_req="$(printf '%s\n' "$sccache_stats" | awk '/^Compile requests[[:space:]]+[0-9]/{print $NF; exit}')"
+    sccache_hits="$(printf '%s\n' "$sccache_stats" | awk '/^Cache hits[[:space:]]+[0-9]/{print $NF; exit}')"
+    if [ "$sccache_req" -gt 0 ] 2>/dev/null && [ "$sccache_hits" -ge 0 ] 2>/dev/null; then
+      sccache_rate="$(awk -v h="$sccache_hits" -v r="$sccache_req" 'BEGIN{printf "%.1f", (h/r)*100}')"
+      {
+        echo "### sccache statistics"
+        echo ""
+        echo "| Cache hits | Requests | Hit rate |"
+        echo "|------------|----------|----------|"
+        echo "| ${sccache_hits} | ${sccache_req} | ${sccache_rate}% |"
+      } >> "$GITHUB_STEP_SUMMARY"
+    fi
+  fi
 fi
diff --git a/.github/scripts/llama-next-version.sh b/.github/scripts/llama-next-version.sh
new file mode 100755
index 00000000..517d2e9b
--- /dev/null
+++ b/.github/scripts/llama-next-version.sh
@@ -0,0 +1,123 @@
+#!/usr/bin/env bash
+# SPDX-FileCopyrightText: 2026 Bernard Ladenthin <bernard.ladenthin@gmail.com>
+#
+# SPDX-License-Identifier: MIT
+#
+# Pick the NEXT llama.cpp tag to bump the pin to, one reviewable chunk at a time.
+#
+# The runbook this supports is docs/upgrade/llama-cpp-version-bump.md. Strategy:
+#   * TARGET   = the highest-numbered b<nnnn> RELEASE tag listed in the GitHub release atom feed
+#                (normally the topmost release on the releases page), or an explicit "b<nnnn>" passed as $1.
+#   * CURRENT  = the pinned tag in llama/CMakeLists.txt (GIT_TAG b<nnnn>).
+#   * If `git diff CURRENT..TARGET` is smaller than the threshold (default 100 KiB), bump straight
+#     to TARGET. Otherwise CHUNK: pick the largest intermediate b<nnnn> tag whose diff from CURRENT
+#     is still under the threshold, so each bump stays a small, reviewable patch. Re-run after each
+#     bump to walk the remaining chunks up to TARGET.
+#
+# This tool only READS (a cached mirror clone + the pin file); it never edits the repo. Apply the
+# bump by hand per the runbook. It prints the compare/.patch URLs for the chosen step.
+#
+# Env:
+#   LLAMA_BUMP_MAX_DIFF_KB   per-step diff-size threshold in KiB (default 100)
+#   LLAMA_BUMP_EXCLUDE_WEBUI if "1", size the diff EXCLUDING tools/ui (the auto-followed WebUI, which
+#                            does not need per-bump review); default 0 = the full diff you paste/review
+#   LLAMA_BUMP_CACHE         mirror-clone location (default ~/.cache/jllama-llamacpp-mirror)
+#
+# Network: needs read access to github.com (git clone/fetch + the release atom feed). No token.
+
+set -euo pipefail  # NB: with -e + pipefail a no-match grep inside $(...) aborts the script unless "|| true"-guarded
+
+THRESHOLD_KB="${LLAMA_BUMP_MAX_DIFF_KB:-100}"   # per-step limit in KiB
+THRESHOLD=$((THRESHOLD_KB * 1024))              # same limit in bytes (compared against wc -c output)
+EXCLUDE_WEBUI="${LLAMA_BUMP_EXCLUDE_WEBUI:-0}"  # "1" = size diffs without tools/ui
+REPO="ggml-org/llama.cpp"
+GIT_URL="https://github.com/${REPO}.git"
+CACHE="${LLAMA_BUMP_CACHE:-$HOME/.cache/jllama-llamacpp-mirror}"
+ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"  # two levels above this script's directory
+CMAKELISTS="$ROOT/llama/CMakeLists.txt"
+
+# --- current pinned tag number, e.g. "GIT_TAG b9866" -> 9866 (first match wins) ----------------
+cur="$(grep -oE 'GIT_TAG[[:space:]]+b[0-9]+' "$CMAKELISTS" | grep -oE '[0-9]+' | head -1 || true)"
+[ -n "$cur" ] || { echo "ERROR: could not read 'GIT_TAG b<nnnn>' from $CMAKELISTS" >&2; exit 1; }
+
+# --- cached blobless mirror of llama.cpp (clone once, then fetch tags) --------------------------
+if [ -d "$CACHE/.git" ]; then
+    git -C "$CACHE" fetch --quiet --tags --prune origin || true  # best-effort: a failed fetch keeps the cached tags
+else
+    echo "cloning ${REPO} (blobless) into $CACHE (one-time) ..." >&2
+    git clone --filter=blob:none --no-checkout --quiet "$GIT_URL" "$CACHE"
+fi
+
+# --- target: explicit "$1" (b<nnnn>) or the highest b<nnnn> tag in the release atom feed --------
+if [ "${1:-}" != "" ]; then
+    target="$(printf '%s' "$1" | grep -oE '[0-9]+' | head -1 || true)"  # "|| true": no digits must reach the ERROR below
+    [ -n "$target" ] || { echo "ERROR: '$1' is not a b<nnnn> tag" >&2; exit 1; }
+else
+    feed="$(curl -sSL --fail --retry 4 --retry-delay 2 "https://github.com/${REPO}/releases.atom" 2>/dev/null || true)"
+    [ -n "$feed" ] || { echo "ERROR: cannot fetch the releases feed (network/rate limit). Read the topmost release at https://github.com/${REPO}/releases and pass it: $0 b<nnnn>" >&2; exit 2; }
+    target="$(printf '%s' "$feed" | grep -oE 'releases/tag/b[0-9]+' | grep -oE '[0-9]+' | sort -un | tail -1 || true)"  # "|| true": no tags must reach exit 3 below
+    [ -n "$target" ] || { echo "ERROR: parsed no release tags from the feed." >&2; exit 3; }
+fi
+
+git -C "$CACHE" rev-parse -q --verify "b${cur}^{commit}"    >/dev/null 2>&1 || { echo "ERROR: b$cur is not a tag in the mirror" >&2; exit 3; }
+git -C "$CACHE" rev-parse -q --verify "b${target}^{commit}" >/dev/null 2>&1 || { echo "ERROR: b$target is not a tag in the mirror" >&2; exit 3; }
+
+# Print the byte count of `git diff b<$1> b<$2>` in the mirror; tools/ui is left out when EXCLUDE_WEBUI=1.
+diffsize() {
+    local -a pathspec=()
+    if [ "$EXCLUDE_WEBUI" = "1" ]; then
+        pathspec=(-- . ':(exclude)tools/ui')
+    fi
+    git -C "$CACHE" diff "b$1" "b$2" ${pathspec[@]+"${pathspec[@]}"} 2>/dev/null | wc -c
+}
+
+scope="full diff"  # wording for the threshold line; switched below when tools/ui is excluded
+[ "$EXCLUDE_WEBUI" = "1" ] && scope="diff excluding tools/ui"
+echo "current pin    : b$cur"
+echo "latest release : b$target"  # same label is printed when the target came from $1
+echo "threshold      : ${THRESHOLD_KB} KiB per step (${scope})"
+
+if [ "$cur" -ge "$target" ]; then  # numeric compare of tag numbers; also true for a target older than the pin
+    echo "=> up to date — no bump needed."
+    exit 0
+fi
+
+# --- choose next step: TARGET if it fits, else the largest intermediate tag under the threshold -
+if [ "$(diffsize "$cur" "$target")" -lt "$THRESHOLD" ]; then  # strictly smaller than the byte limit
+    next="$target"
+else
+    # existing b-tags strictly after cur, up to and including target, ascending (numeric, de-duplicated)
+    # shellcheck disable=SC2207
+    cands=($(git -C "$CACHE" tag -l 'b*' | grep -oE 'b[0-9]+' | grep -oE '[0-9]+' | sort -un \
+             | awk -v c="$cur" -v t="$target" '$1 > c && $1 <= t'))
+    # binary search over the candidate indices for the largest tag whose diff from cur is under the
+    # threshold (diff size grows monotonically enough with the tag number for this to be a safe heuristic)
+    lo=0; hi=$(( ${#cands[@]} - 1 )); best=""  # best stays empty if no probed candidate fits
+    while [ "$lo" -le "$hi" ]; do
+        mid=$(( (lo + hi) / 2 )); T="${cands[$mid]}"
+        if [ "$(diffsize "$cur" "$T")" -lt "$THRESHOLD" ]; then best="$T"; lo=$(( mid + 1 )); else hi=$(( mid - 1 )); fi
+    done
+    if [ -n "$best" ]; then
+        next="$best"
+    else
+        next="${cands[0]}"  # nothing fit: fall back to the first (lowest) tag after cur
+        echo "NOTE: even b$cur..b$next exceeds ${THRESHOLD_KB} KiB — a single-commit step this large is unavoidable." >&2
+    fi
+fi
+
+full=$(git -C "$CACHE" diff "b$cur" "b$next" | wc -c)  # bytes of the unfiltered diff, regardless of EXCLUDE_WEBUI
+noui=$(git -C "$CACHE" diff "b$cur" "b$next" -- . ':(exclude)tools/ui' | wc -c)  # bytes without tools/ui
+commits=$(git -C "$CACHE" rev-list --count "b$cur".."b$next")  # commits reachable from b$next but not from b$cur
+echo
+echo "next step      : b$cur -> b$next"
+echo "  diff size    : $((full / 1024)) KiB full  /  $((noui / 1024)) KiB excluding tools/ui (auto-followed WebUI)"
+echo "  commits      : $commits"
+if [ "$next" -eq "$target" ]; then  # the chosen step is the target itself
+    echo "  progress     : reaches the latest release — final chunk"
+else
+    echo "  progress     : intermediate chunk — re-run this script after the bump for the next one"
+fi
+echo "  review diff  : https://github.com/${REPO}/compare/b$cur...b$next"
+echo "  raw .patch   : https://github.com/${REPO}/compare/b$cur...b$next.patch"
+echo
+echo "Apply this bump per docs/upgrade/llama-cpp-version-bump.md (b$cur -> b$next)."  # the script itself edits nothing
diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index 4966652c..c8cc9b2d 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -437,6 +437,137 @@ jobs:
           name: Linux-aarch64-libraries
           path: ${{ github.workspace }}/llama/src/main/resources/net/ladenthin/llama/
 
+  build-linux-s390x:
+    name: Build and Test Linux s390x (big-endian, qemu)
+    needs: [startgate, build-webui]
+    # Cross-compile for IBM Z (s390x, BIG-ENDIAN) with the GCC cross toolchain, then run the full
+    # C++ unit suite under qemu-user — a real big-endian correctness gate for our helpers and
+    # serializers (esp. the little-endian WAV writer, JSON/token/embedding transforms). The BUILD
+    # is native speed (x86 cross-gcc); only the tiny test binary is emulated. s390x is a DEFAULT-jar
+    # CPU platform (like aarch64), so the artifact merges via the `*-libraries` glob (no classifier /
+    # pom profile). Model-backed Java tests are NOT run under emulation (a JVM + GGUF inference under
+    # qemu-user is slow/flaky); the C++ gate covers the actual byte-order risk since the Java<->JNI
+    # boundary uses host-native array copies. GGML_OPENMP=OFF avoids cross-libgomp issues (ggml uses
+    # its own std::thread pool). CMAKE_CROSSCOMPILING_EMULATOR makes ctest run the s390x exe via qemu;
+    # QEMU_LD_PREFIX lets the emulated binary find the s390x sysroot libs.
+    runs-on: ubuntu-latest
+    env:
+      QEMU_LD_PREFIX: /usr/s390x-linux-gnu  # presumably populated by the cross-toolchain packages installed below — TODO confirm
+      USE_CACHE: ${{ github.event_name != 'workflow_dispatch' || inputs.use_cache }}  # non-dispatch events: true; manual dispatch: the use_cache input decides
+      SCCACHE_WEBDAV_ENDPOINT: https://cache.depot.dev
+      SCCACHE_WEBDAV_TOKEN: ${{ secrets.DEPOT_TOKEN }}
+    steps:
+      - uses: actions/checkout@v7
+      - name: Download shared WebUI assets
+        uses: actions/download-artifact@v8
+        with:
+          name: webui-generated
+          path: ${{ github.workspace }}/llama/webui-generated/
+      - uses: actions/setup-java@v5
+        with:
+          distribution: 'temurin'
+          java-version: ${{ env.JAVA_VERSION }}
+      - name: Install s390x cross toolchain + qemu-user
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y gcc-s390x-linux-gnu g++-s390x-linux-gnu qemu-user-static
+      - name: Build libraries (cross-compile s390x)
+        shell: bash
+        run: |
+          mvn --no-transfer-progress -f llama/pom.xml compile
+          .github/build.sh "-DGGML_NATIVE=OFF -DGGML_OPENMP=OFF -DBUILD_TESTING=ON -DCMAKE_SYSTEM_NAME=Linux -DCMAKE_SYSTEM_PROCESSOR=s390x -DCMAKE_C_COMPILER=s390x-linux-gnu-gcc -DCMAKE_CXX_COMPILER=s390x-linux-gnu-g++ -DCMAKE_CROSSCOMPILING_EMULATOR=/usr/bin/qemu-s390x-static -DOS_NAME=Linux -DOS_ARCH=s390x"
+      - name: Run C++ unit tests under qemu-s390x (big-endian gate)  # the emulator path was baked in at configure time above
+        run: ctest --test-dir llama/build --output-on-failure
+      - name: Upload artifacts
+        uses: actions/upload-artifact@v7
+        with:
+          name: Linux-s390x-libraries  # "-libraries" suffix = default-jar artifact (see the job comment)
+          path: ${{ github.workspace }}/llama/src/main/resources/net/ladenthin/llama/
+
+  build-linux-x86_64-vulkan:
+    name: Build Linux x86_64 Vulkan
+    needs: [startgate, build-webui]
+    # Native ubuntu build (NOT dockcross) — the Vulkan SDK is trivial to apt-install here, and
+    # upstream llama.cpp builds its ubuntu-vulkan artifact the same way. GPU runtime libvulkan.so.1
+    # is supplied by the consumer's driver (nothing bundled). GitHub runners have NO GPU, so this
+    # is a BUILD-ONLY job (no -DBUILD_TESTING/ctest: a Vulkan-linked jllama_test errors enumerating
+    # devices on a GPU-less runner — same rationale as the Windows GPU jobs). GGML_NATIVE=OFF keeps
+    # the artifact portable across x86_64 CPU generations. Trade-off vs the manylinux CPU jar: the
+    # glibc floor rises to the ubuntu-latest baseline (same as the native aarch64 job). build.sh
+    # self-fetches sccache; the probe guards it (a miss just builds uncached).
+    runs-on: ubuntu-latest
+    env:
+      USE_CACHE: ${{ github.event_name != 'workflow_dispatch' || inputs.use_cache }}  # non-dispatch events: true; manual dispatch: the use_cache input decides
+      SCCACHE_WEBDAV_ENDPOINT: https://cache.depot.dev
+      SCCACHE_WEBDAV_TOKEN: ${{ secrets.DEPOT_TOKEN }}
+    steps:
+      - uses: actions/checkout@v7
+      - name: Download shared WebUI assets
+        uses: actions/download-artifact@v8
+        with:
+          name: webui-generated
+          path: ${{ github.workspace }}/llama/webui-generated/
+      - uses: actions/setup-java@v5
+        with:
+          distribution: 'temurin'
+          java-version: ${{ env.JAVA_VERSION }}
+      - name: Install Vulkan SDK (headers + loader + glslc shader compiler)
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y libvulkan-dev glslc glslang-tools spirv-headers
+      - name: Build libraries
+        shell: bash
+        run: |
+          mvn --no-transfer-progress -f llama/pom.xml compile
+          .github/build.sh "-DGGML_VULKAN=ON -DGGML_NATIVE=OFF -DOS_NAME=Linux -DOS_ARCH=x86_64"
+      - name: Upload artifacts
+        uses: actions/upload-artifact@v7
+        with:
+          name: Linux-x86_64-vulkan
+          path: ${{ github.workspace }}/llama/src/main/resources_linux_vulkan/net/ladenthin/llama/  # Vulkan-specific resource tree, not src/main/resources
+          if-no-files-found: error  # fail the job when the build left nothing to upload
+
+  build-linux-aarch64-vulkan:
+    name: Build Linux aarch64 Vulkan
+    needs: [startgate, build-webui]
+    # Native ARM64 Vulkan build on GitHub's free arm64 runner (same runner as the aarch64 CPU job).
+    # Build-only (GPU-less runner); GGML_NATIVE=OFF for portability across ARMv8 generations; GCC 14
+    # to match the aarch64 CPU job. Reuses the resources_linux_vulkan tree (arch subdir Linux/aarch64);
+    # the vulkan-linux-aarch64 Maven profile packages only that subtree.
+    runs-on: ubuntu-24.04-arm
+    env:
+      USE_CACHE: ${{ github.event_name != 'workflow_dispatch' || inputs.use_cache }}  # non-dispatch events: true; manual dispatch: the use_cache input decides
+      SCCACHE_WEBDAV_ENDPOINT: https://cache.depot.dev
+      SCCACHE_WEBDAV_TOKEN: ${{ secrets.DEPOT_TOKEN }}
+    steps:
+      - uses: actions/checkout@v7
+      - name: Download shared WebUI assets
+        uses: actions/download-artifact@v8
+        with:
+          name: webui-generated
+          path: ${{ github.workspace }}/llama/webui-generated/
+      - uses: actions/setup-java@v5
+        with:
+          distribution: 'temurin'
+          java-version: ${{ env.JAVA_VERSION }}
+      - name: Install toolchain (GCC 14) + Vulkan SDK  # also exports CC/CXX through GITHUB_ENV for the build step
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y gcc-14 g++-14 libvulkan-dev glslc glslang-tools spirv-headers
+          echo "CC=gcc-14" >> "$GITHUB_ENV"
+          echo "CXX=g++-14" >> "$GITHUB_ENV"
+      - name: Build libraries
+        shell: bash
+        run: |
+          mvn --no-transfer-progress -f llama/pom.xml compile
+          .github/build.sh "-DGGML_VULKAN=ON -DGGML_NATIVE=OFF -DOS_NAME=Linux -DOS_ARCH=aarch64"
+      - name: Upload artifacts
+        uses: actions/upload-artifact@v7
+        with:
+          name: Linux-aarch64-vulkan
+          path: ${{ github.workspace }}/llama/src/main/resources_linux_vulkan/net/ladenthin/llama/  # same upload path as the x86_64 Vulkan job
+          if-no-files-found: error  # fail the job when the build left nothing to upload
+
   crosscompile-android-aarch64:
     name: Cross-Compile Android aarch64
     needs: [startgate, build-webui]
@@ -788,6 +919,57 @@ jobs:
           name: Windows-x86-libraries
           path: ${{ github.workspace }}/llama/src/main/resources/net/ladenthin/llama/
 
+  build-windows-arm64:
+    name: Build and Test Windows 11 arm64 (Ninja Multi-Config, default)
+    needs: [startgate, build-webui]
+    # Native arm64 build on GitHub's free windows-11-arm runner. Goes into the DEFAULT JAR (no
+    # classifier): OSInfo maps a Windows-on-ARM JVM (os.arch=aarch64) to Windows/aarch64, the same
+    # path CMake emits here, and the `*-libraries` glob in the package/publish jobs merges it into
+    # src/main/resources. sccache is intentionally omitted (the existing install step pulls the
+    # x86_64 sccache zip; an arm64 build would need the aarch64 release — not worth the extra path
+    # for one CPU job, so build.bat just builds uncached when sccache is absent).
+    #
+    # Compiler: clang-cl, NOT MSVC cl.exe. ggml's ggml-cpu/CMakeLists.txt aborts with "MSVC is not
+    # supported for ARM, use clang" via `if (MSVC AND NOT CMAKE_C_COMPILER_ID STREQUAL "Clang")`.
+    # clang-cl (LLVM's MSVC-compatible driver) satisfies that guard (its compiler id is "Clang")
+    # while still leaving CMake's MSVC=TRUE, so our static /MT CRT block (CMAKE_MSVC_RUNTIME_LIBRARY
+    # in CMakeLists.txt) keeps applying and the generator stays Ninja Multi-Config. msvc-dev-cmd
+    # (arm64) supplies the MSVC headers/libs/linker AND the bundled clang-cl / lld-link under
+    # VC\Tools\Llvm\ARM64, so no separate LLVM install is needed.
+    #
+    # GGML_OPENMP=OFF: with clang-cl, ggml links LLVM's OpenMP (libomp.lib -> needs libomp140.aarch64.dll
+    # at runtime), which is NOT on PATH like MSVC's ambient vcomp140.dll on x64 — so gtest_discover_tests
+    # (and any consumer) failed to launch the binary with 0xc0000135 STATUS_DLL_NOT_FOUND. Turning OpenMP
+    # off makes ggml use its own std::thread threadpool, so the arm64 jllama.dll (and the test exe) are
+    # self-contained with no libomp dependency to ship. The x86_64/x86 jobs keep OpenMP (MSVC vcomp).
+    runs-on: windows-11-arm
+    steps:
+      - uses: actions/checkout@v7
+      - name: Download shared WebUI assets
+        uses: actions/download-artifact@v8
+        with:
+          name: webui-generated
+          path: ${{ github.workspace }}/llama/webui-generated/
+      - name: Set up MSVC developer environment (arm64)
+        uses: ilammy/msvc-dev-cmd@v1
+        with:
+          arch: arm64
+      - name: Build libraries
+        shell: cmd
+        # No mvn compile needed: the JNI header (jllama.h) is committed and the native build
+        # uses the bundled JNI headers in .github/include, and OS_NAME/OS_ARCH are passed
+        # explicitly (so the OSInfo-class OS-detection path is skipped) — same as the x86_64 job.
+        # clang-cl (see the job comment) is required: ggml refuses MSVC cl.exe on ARM.
+        run: |
+          .github\build.bat -G "Ninja Multi-Config" -DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl -DGGML_OPENMP=OFF -DOS_NAME=Windows -DOS_ARCH=aarch64 -DBUILD_TESTING=ON
+      - name: Run C++ unit tests  # NOTE(review): generator is Ninja Multi-Config — confirm ctest finds the tests without -C <config>
+        run: ctest --test-dir llama/build --output-on-failure
+      - name: Upload artifacts
+        uses: actions/upload-artifact@v7
+        with:
+          name: Windows-aarch64-libraries  # "-libraries" suffix = default-jar artifact (see the job comment)
+          path: ${{ github.workspace }}/llama/src/main/resources/net/ladenthin/llama/
+
   # ---------------------------------------------------------------------------
   # Windows GPU classifiers (x86_64 only) — CUDA, Vulkan, OpenCL.
   # All three use the same Ninja Multi-Config + MSVC + sccache toolchain as the
@@ -951,6 +1133,328 @@ jobs:
           path: ${{ github.workspace }}/llama/src/main/resources_windows_opencl/net/ladenthin/llama/
           if-no-files-found: error
 
+  # ---------------------------------------------------------------------------
+  # Additional GPU-backend classifiers (fail-loud, same wiring as the CUDA/Vulkan/
+  # OpenCL jobs): AMD ROCm/HIP, Intel SYCL (oneAPI), Windows-on-ARM OpenCL (Adreno),
+  # Intel OpenVINO. All BUILD-ONLY (GitHub runners have no AMD/Intel/Adreno GPU, and
+  # no ctest — a GPU-linked jllama_test can't enumerate a device). GPU runtime libs
+  # are NOT bundled — the consumer's driver/toolkit supplies them. CMakeLists.txt
+  # routes each backend to its own src/main/resources_* tree; the matching Maven
+  # profile turns it into a classifier JAR. Toolchain install steps are first-pass —
+  # if a vendor URL/version 404s in CI, adjust it (the failure is intentional signal).
+  # ---------------------------------------------------------------------------
+
+  build-linux-x86_64-rocm:
+    name: Build Linux x86_64 ROCm/HIP (AMD)
+    needs: [startgate, build-webui]
+    runs-on: ubuntu-latest  # NOTE(review): the apt source below hard-codes "noble" while this label floats — confirm they stay in sync
+    env:
+      USE_CACHE: ${{ github.event_name != 'workflow_dispatch' || inputs.use_cache }}  # non-dispatch events: true; manual dispatch: the use_cache input decides
+      SCCACHE_WEBDAV_ENDPOINT: https://cache.depot.dev
+      SCCACHE_WEBDAV_TOKEN: ${{ secrets.DEPOT_TOKEN }}
+    steps:
+      - uses: actions/checkout@v7
+      - name: Download shared WebUI assets
+        uses: actions/download-artifact@v8
+        with:
+          name: webui-generated
+          path: ${{ github.workspace }}/llama/webui-generated/
+      - uses: actions/setup-java@v5
+        with:
+          distribution: 'temurin'
+          java-version: ${{ env.JAVA_VERSION }}
+      - name: Install ROCm/HIP (AMD apt repo)  # pins repo version 6.3.4, then exports /opt/rocm/bin (PATH) and ROCM_PATH
+        run: |
+          sudo mkdir --parents --mode=0755 /etc/apt/keyrings
+          wget -qO- https://repo.radeon.com/rocm/rocm.gpg.key | gpg --dearmor | sudo tee /etc/apt/keyrings/rocm.gpg > /dev/null
+          echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/6.3.4 noble main" | sudo tee /etc/apt/sources.list.d/rocm.list
+          printf 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600\n' | sudo tee /etc/apt/preferences.d/rocm-pin-600
+          sudo apt-get update
+          sudo apt-get install -y rocm-hip-sdk rocblas-dev hipblas-dev
+          echo "/opt/rocm/bin" >> "$GITHUB_PATH"
+          echo "ROCM_PATH=/opt/rocm" >> "$GITHUB_ENV"
+      - name: Build libraries  # NOTE(review): passes -DAMDGPU_TARGETS here but the Windows ROCm job passes -DGPU_TARGETS — confirm both are honoured
+        shell: bash
+        run: |
+          mvn --no-transfer-progress -f llama/pom.xml compile
+          .github/build.sh "-DGGML_HIP=ON -DAMDGPU_TARGETS=gfx900;gfx906;gfx908;gfx90a;gfx1030;gfx1100;gfx1101;gfx1102 -DCMAKE_C_COMPILER=/opt/rocm/llvm/bin/clang -DCMAKE_CXX_COMPILER=/opt/rocm/llvm/bin/clang++ -DGGML_NATIVE=OFF -DOS_NAME=Linux -DOS_ARCH=x86_64"
+      - name: Upload artifacts
+        uses: actions/upload-artifact@v7
+        with:
+          name: Linux-x86_64-rocm
+          path: ${{ github.workspace }}/llama/src/main/resources_linux_rocm/net/ladenthin/llama/
+          if-no-files-found: error  # fail the job when the build left nothing to upload
+
+  build-windows-x86_64-rocm:
+    name: Build Windows x86_64 ROCm/HIP (AMD)
+    needs: [startgate, build-webui]
+    # windows-2022 (MSVC 14.4x), NOT windows-2025-vs2026 (VS 2026 / MSVC 14.51): ROCm 7.1's
+    # HIP clang headers (__clang_hip_cmath.h) cannot overload the __host__ __device__
+    # isgreater/isless/... that the very new MSVC <cmath> declares via _CLANG_BUILTIN2, so the
+    # device-code compile fails. Upstream llama.cpp builds win-hip on windows-2022 for the same
+    # reason (it drives the HIP SDK's own clang and relies on the older MSVC STL).
+    runs-on: windows-2022
+    steps:
+      - uses: actions/checkout@v7
+      - name: Download shared WebUI assets
+        uses: actions/download-artifact@v8
+        with:
+          name: webui-generated
+          path: ${{ github.workspace }}/llama/webui-generated/
+      - name: Set up MSVC developer environment (x64)
+        uses: ilammy/msvc-dev-cmd@v1
+        with:
+          arch: x64
+      - name: Install AMD HIP SDK for Windows
+        shell: pwsh
+        # Mirrors upstream llama.cpp's windows-hip release job: HIP SDK 26.Q1, then
+        # resolve HIP_PATH from the installed ROCm dir and point the compilers +
+        # CMAKE_PREFIX_PATH at it so ggml-hip's find_package(hip) resolves.
+        run: |
+          $url = "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-26.Q1-Win11-For-HIP.exe"
+          Invoke-WebRequest -Uri $url -OutFile "$env:RUNNER_TEMP\rocm-install.exe"
+          $proc = Start-Process "$env:RUNNER_TEMP\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -PassThru -Wait
+          if ($proc.ExitCode -ne 0) { Write-Error "HIP SDK install failed with exit code $($proc.ExitCode)"; exit 1 }
+          $hip = $(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | Split-Path | Split-Path)
+          "HIP_PATH=$hip" | Out-File -FilePath $env:GITHUB_ENV -Append
+          "$hip\bin" | Out-File -FilePath $env:GITHUB_PATH -Append
+      - name: Build libraries
+        shell: cmd  # %HIP_PATH% below is the value the install step appended to GITHUB_ENV
+        run: |
+          .github\build.bat -G "Ninja Multi-Config" -DGGML_HIP=ON -DGPU_TARGETS=gfx1030;gfx1100;gfx1101;gfx1102 -DCMAKE_PREFIX_PATH="%HIP_PATH%" -DCMAKE_C_COMPILER="%HIP_PATH%\bin\clang.exe" -DCMAKE_CXX_COMPILER="%HIP_PATH%\bin\clang++.exe" -DOS_NAME=Windows -DOS_ARCH=x86_64
+      - name: Upload artifacts
+        uses: actions/upload-artifact@v7
+        with:
+          name: Windows-x86_64-rocm
+          path: ${{ github.workspace }}/llama/src/main/resources_windows_rocm/net/ladenthin/llama/
+          if-no-files-found: error  # fail the job when the build left nothing to upload
+
+  build-linux-x86_64-sycl-fp16:
+    name: Build Linux x86_64 SYCL fp16 (Intel oneAPI)  # differs from the fp32 job by -DGGML_SYCL_F16=ON and the artifact/resource names
+    needs: [startgate, build-webui]
+    runs-on: ubuntu-latest
+    env:
+      USE_CACHE: ${{ github.event_name != 'workflow_dispatch' || inputs.use_cache }}  # non-dispatch events: true; manual dispatch: the use_cache input decides
+      SCCACHE_WEBDAV_ENDPOINT: https://cache.depot.dev
+      SCCACHE_WEBDAV_TOKEN: ${{ secrets.DEPOT_TOKEN }}
+    steps:
+      - uses: actions/checkout@v7
+      - name: Download shared WebUI assets
+        uses: actions/download-artifact@v8
+        with:
+          name: webui-generated
+          path: ${{ github.workspace }}/llama/webui-generated/
+      - uses: actions/setup-java@v5
+        with:
+          distribution: 'temurin'
+          java-version: ${{ env.JAVA_VERSION }}
+      - name: Install Intel oneAPI (DPC++ + MKL)  # adds Intel's apt repo and key, then installs the compiler and MKL dev packages
+        run: |
+          wget -qO- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | sudo tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null
+          echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | sudo tee /etc/apt/sources.list.d/oneAPI.list
+          sudo apt-get update
+          sudo apt-get install -y intel-oneapi-compiler-dpcpp-cpp intel-oneapi-mkl-devel
+      - name: Build libraries  # setvars.sh is sourced in this same step; presumably that is what puts icx/icpx on PATH — TODO confirm
+        shell: bash
+        run: |
+          source /opt/intel/oneapi/setvars.sh
+          mvn --no-transfer-progress -f llama/pom.xml compile
+          .github/build.sh "-DGGML_SYCL=ON -DGGML_SYCL_F16=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_NATIVE=OFF -DOS_NAME=Linux -DOS_ARCH=x86_64"
+      - name: Upload artifacts
+        uses: actions/upload-artifact@v7
+        with:
+          name: Linux-x86_64-sycl-fp16
+          path: ${{ github.workspace }}/llama/src/main/resources_linux_sycl_fp16/net/ladenthin/llama/
+          if-no-files-found: error  # fail the job when the build left nothing to upload
+
+  build-linux-x86_64-sycl-fp32:
+    name: Build Linux x86_64 SYCL fp32 (Intel oneAPI)  # same steps as the fp16 job, minus -DGGML_SYCL_F16=ON
+    needs: [startgate, build-webui]
+    runs-on: ubuntu-latest
+    env:
+      USE_CACHE: ${{ github.event_name != 'workflow_dispatch' || inputs.use_cache }}  # non-dispatch events: true; manual dispatch: the use_cache input decides
+      SCCACHE_WEBDAV_ENDPOINT: https://cache.depot.dev
+      SCCACHE_WEBDAV_TOKEN: ${{ secrets.DEPOT_TOKEN }}
+    steps:
+      - uses: actions/checkout@v7
+      - name: Download shared WebUI assets
+        uses: actions/download-artifact@v8
+        with:
+          name: webui-generated
+          path: ${{ github.workspace }}/llama/webui-generated/
+      - uses: actions/setup-java@v5
+        with:
+          distribution: 'temurin'
+          java-version: ${{ env.JAVA_VERSION }}
+      - name: Install Intel oneAPI (DPC++ + MKL)  # adds Intel's apt repo and key, then installs the compiler and MKL dev packages
+        run: |
+          wget -qO- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | sudo tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null
+          echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | sudo tee /etc/apt/sources.list.d/oneAPI.list
+          sudo apt-get update
+          sudo apt-get install -y intel-oneapi-compiler-dpcpp-cpp intel-oneapi-mkl-devel
+      - name: Build libraries  # setvars.sh is sourced in this same step; presumably that is what puts icx/icpx on PATH — TODO confirm
+        shell: bash
+        run: |
+          source /opt/intel/oneapi/setvars.sh
+          mvn --no-transfer-progress -f llama/pom.xml compile
+          .github/build.sh "-DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_NATIVE=OFF -DOS_NAME=Linux -DOS_ARCH=x86_64"
+      - name: Upload artifacts
+        uses: actions/upload-artifact@v7
+        with:
+          name: Linux-x86_64-sycl-fp32
+          path: ${{ github.workspace }}/llama/src/main/resources_linux_sycl_fp32/net/ladenthin/llama/
+          if-no-files-found: error  # fail the job when the build left nothing to upload
+
+  build-windows-x86_64-sycl:
+    name: Build Windows 2025 x86_64 SYCL (Intel oneAPI)
+    needs: [startgate, build-webui]
+    runs-on: windows-2025-vs2026  # no USE_CACHE/SCCACHE_* env is set on this job, unlike the Linux SYCL jobs
+    steps:
+      - uses: actions/checkout@v7
+      - name: Download shared WebUI assets
+        uses: actions/download-artifact@v8
+        with:
+          name: webui-generated
+          path: ${{ github.workspace }}/llama/webui-generated/
+      - name: Set up MSVC developer environment (x64)
+        uses: ilammy/msvc-dev-cmd@v1
+        with:
+          arch: x64
+      - name: Install Intel oneAPI (DPC++ + MKL + oneDNN + TBB)
+        shell: cmd
+        # Mirrors upstream llama.cpp's windows-sycl release job: download the offline installer,
+        # extract it, then run its bootstrapper with the DPC++/MKL/oneDNN/TBB components.
+        run: |
+          curl -fSL -o "%RUNNER_TEMP%\oneapi.exe" "https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe"
+          "%RUNNER_TEMP%\oneapi.exe" -s -x -f "%RUNNER_TEMP%\oneapi_extracted" --log "%RUNNER_TEMP%\extract.log"
+          "%RUNNER_TEMP%\oneapi_extracted\bootstrapper.exe" -s --action install --components=intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel --eula=accept -p=NEED_VS2022_INTEGRATION=0 --log-dir="%RUNNER_TEMP%"
+      - name: Build libraries  # setvars.bat is "call"ed in this same step so control returns and build.bat runs in that environment
+        shell: cmd
+        run: |
+          call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
+          .github\build.bat -G "Ninja Multi-Config" -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DOS_NAME=Windows -DOS_ARCH=x86_64
+      - name: Upload artifacts
+        uses: actions/upload-artifact@v7
+        with:
+          name: Windows-x86_64-sycl
+          path: ${{ github.workspace }}/llama/src/main/resources_windows_sycl/net/ladenthin/llama/
+          if-no-files-found: error  # fail the job when the build left nothing to upload
+
+  build-windows-arm64-opencl:
+    name: Build Windows 11 arm64 OpenCL (Adreno)
+    needs: [startgate, build-webui]  # presumably build-webui publishes the webui-generated artifact used below
+    # Windows-on-ARM OpenCL (Snapdragon X / Adreno). Same clang-cl + GGML_OPENMP=OFF
+    # toolchain as the arm64 CPU job (ggml refuses MSVC cl.exe on ARM). Reuses the
+    # resources_windows_opencl tree under Windows/aarch64; the opencl-windows-aarch64
+    # Maven profile packages only that subtree. build_opencl_windows.bat stages the
+    # OpenCL headers + ICD loader before delegating to build.bat.
+    runs-on: windows-11-arm
+    steps:
+      - uses: actions/checkout@v7
+      - name: Download shared WebUI assets
+        uses: actions/download-artifact@v8
+        with:
+          name: webui-generated
+          path: ${{ github.workspace }}/llama/webui-generated/
+      - name: Set up MSVC developer environment (arm64)
+        uses: ilammy/msvc-dev-cmd@v1
+        with:
+          arch: arm64
+      - name: Build libraries  # build-only: this job defines no test step
+        shell: cmd
+        run: |
+          .github\build_opencl_windows.bat -G "Ninja Multi-Config" -DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl -DGGML_OPENMP=OFF -DGGML_OPENCL=ON -DGGML_OPENCL_EMBED_KERNELS=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON -DOS_NAME=Windows -DOS_ARCH=aarch64
+      - name: Upload artifacts
+        uses: actions/upload-artifact@v7
+        with:
+          name: Windows-aarch64-opencl  # fetched by this name into resources_windows_opencl/ by the packaging steps
+          path: ${{ github.workspace }}/llama/src/main/resources_windows_opencl/net/ladenthin/llama/
+          if-no-files-found: error  # an empty upload fails the job instead of only warning
+
+  build-linux-x86_64-openvino:
+    name: Build Linux x86_64 OpenVINO (Intel)
+    needs: [startgate, build-webui]
+    runs-on: ubuntu-latest
+    env:
+      USE_CACHE: ${{ github.event_name != 'workflow_dispatch' || inputs.use_cache }}  # true for every event except a manual dispatch, where the use_cache input decides
+      SCCACHE_WEBDAV_ENDPOINT: https://cache.depot.dev
+      SCCACHE_WEBDAV_TOKEN: ${{ secrets.DEPOT_TOKEN }}
+    steps:
+      - uses: actions/checkout@v7
+      - name: Download shared WebUI assets
+        uses: actions/download-artifact@v8
+        with:
+          name: webui-generated
+          path: ${{ github.workspace }}/llama/webui-generated/
+      - uses: actions/setup-java@v5
+        with:
+          distribution: 'temurin'
+          java-version: ${{ env.JAVA_VERSION }}
+      - name: Install OpenCL dev + Intel OpenVINO 2026.2.1 (archive)  # no shell key: runs under the runner's default shell
+        run: |
+          # Intel's OpenVINO APT repo only publishes up to ~2025 (the /openvino/2026 path 404s), and
+          # 2025.x has the older ov::Allocator API that breaks ggml-openvino's template compile. So use
+          # the ARCHIVE for 2026.2.1 — exactly what upstream llama.cpp's linux-setup-openvino action does.
+          # OpenCL headers (incl. the C++ CL/cl2.hpp via opencl-clhpp-headers) come from Ubuntu's own repos.
+          sudo apt-get update
+          sudo apt-get install -y ocl-icd-opencl-dev opencl-headers opencl-clhpp-headers intel-opencl-icd
+          url="https://storage.openvinotoolkit.org/repositories/openvino/packages/2026.2.1/linux/openvino_toolkit_ubuntu24_2026.2.1.21919.ede283a88e3_x86_64.tgz"
+          sudo mkdir -p /opt/intel/openvino
+          curl -fSL "$url" | sudo tar -xz --strip-components=1 -C /opt/intel/openvino
+          echo "OpenVINO_DIR=/opt/intel/openvino/runtime/cmake" >> "$GITHUB_ENV"
+      - name: Build libraries  # OpenVINO_DIR comes from the GITHUB_ENV export in the install step
+        shell: bash
+        run: |
+          source /opt/intel/openvino/setupvars.sh || true
+          mvn --no-transfer-progress -f llama/pom.xml compile
+          .github/build.sh "-DGGML_OPENVINO=ON -DOpenVINO_DIR=$OpenVINO_DIR -DGGML_NATIVE=OFF -DOS_NAME=Linux -DOS_ARCH=x86_64"
+      - name: Upload artifacts
+        uses: actions/upload-artifact@v7
+        with:
+          name: Linux-x86_64-openvino  # fetched by this name into resources_linux_openvino/ by the packaging steps
+          path: ${{ github.workspace }}/llama/src/main/resources_linux_openvino/net/ladenthin/llama/
+          if-no-files-found: error  # an empty upload fails the job instead of only warning
+
+  build-windows-x86_64-openvino:
+    name: Build Windows 2025 x86_64 OpenVINO (Intel)
+    needs: [startgate, build-webui]
+    runs-on: windows-2025-vs2026
+    steps:
+      - uses: actions/checkout@v7
+      - name: Download shared WebUI assets
+        uses: actions/download-artifact@v8
+        with:
+          name: webui-generated
+          path: ${{ github.workspace }}/llama/webui-generated/
+      - name: Set up MSVC developer environment (x64)
+        uses: ilammy/msvc-dev-cmd@v1
+        with:
+          arch: x64
+      - name: Install OpenCL headers (vcpkg) + Intel OpenVINO 2026.2.1
+        shell: pwsh
+        # vcpkg's opencl port ships the full C++ headers incl. CL/cl2.hpp that OpenVINO's
+        # ocl_wrapper.hpp needs (the Khronos OpenCL-Headers dropped cl2.hpp) — same as upstream
+        # llama.cpp's windows-openvino job. OpenVINO 2026.2.1 matches ggml-openvino's target API.
+        run: |
+          C:\vcpkg\vcpkg install opencl:x64-windows
+          $url = "https://storage.openvinotoolkit.org/repositories/openvino/packages/2026.2.1/windows/openvino_toolkit_windows_2026.2.1.21919.ede283a88e3_x86_64.zip"
+          Invoke-WebRequest -Uri $url -OutFile "$env:RUNNER_TEMP\openvino.zip"
+          Expand-Archive -Path "$env:RUNNER_TEMP\openvino.zip" -DestinationPath "C:\openvino" -Force
+          # The archive extracts into a nested versioned folder; point OpenVINO_DIR at its runtime/cmake.
+          $root = (Get-ChildItem "C:\openvino" -Directory | Select-Object -First 1).FullName
+          "OpenVINO_DIR=$root\runtime\cmake" | Out-File -FilePath $env:GITHUB_ENV -Append
+      - name: Build libraries  # %OpenVINO_DIR% comes from the GITHUB_ENV line written by the install step
+        shell: cmd
+        # vcpkg toolchain file wires in the OpenCL (incl. cl2.hpp) that ggml-openvino needs.
+        run: |
+          .github\build.bat -G "Ninja Multi-Config" -DGGML_OPENVINO=ON -DOpenVINO_DIR="%OpenVINO_DIR%" -DCMAKE_TOOLCHAIN_FILE=C:\vcpkg\scripts\buildsystems\vcpkg.cmake -DOS_NAME=Windows -DOS_ARCH=x86_64
+      - name: Upload artifacts
+        uses: actions/upload-artifact@v7
+        with:
+          name: Windows-x86_64-openvino  # fetched by this name into resources_windows_openvino/ by the packaging steps
+          path: ${{ github.workspace }}/llama/src/main/resources_windows_openvino/net/ladenthin/llama/
+          if-no-files-found: error  # an empty upload fails the job instead of only warning
+
   # ---------------------------------------------------------------------------
   # CI-only jobs — no release artifact, purely for test coverage
   # ---------------------------------------------------------------------------
@@ -1521,15 +2025,27 @@ jobs:
     needs:
       - crosscompile-linux-x86_64-cuda
       - crosscompile-linux-aarch64
+      - build-linux-s390x
+      - build-linux-x86_64-vulkan
+      - build-linux-aarch64-vulkan
       - crosscompile-android-aarch64
       - crosscompile-android-aarch64-opencl
       - build-windows-x86_64
       - build-windows-x86
+      - build-windows-arm64
       - build-windows-x86_64-msvc
       - build-windows-x86-msvc
       - build-windows-x86_64-cuda
       - build-windows-x86_64-vulkan
       - build-windows-x86_64-opencl
+      - build-linux-x86_64-rocm
+      - build-windows-x86_64-rocm
+      - build-linux-x86_64-sycl-fp16
+      - build-linux-x86_64-sycl-fp32
+      - build-windows-x86_64-sycl
+      - build-windows-arm64-opencl
+      - build-linux-x86_64-openvino
+      - build-windows-x86_64-openvino
       - test-cpp-linux-x86_64
       - build-macos-arm64-metal-15
       - test-java-linux-x86_64
@@ -1550,6 +2066,16 @@ jobs:
         with:
           name: linux-libraries-cuda
           path: ${{ github.workspace }}/llama/src/main/resources_linux_cuda/net/ladenthin/llama/
+      # Linux Vulkan classifiers (x86_64 + aarch64) share one tree; the two Maven profiles
+      # split it by arch subdir into one single-arch classifier JAR each.
+      - uses: actions/download-artifact@v8
+        with:
+          name: Linux-x86_64-vulkan
+          path: ${{ github.workspace }}/llama/src/main/resources_linux_vulkan/net/ladenthin/llama/
+      - uses: actions/download-artifact@v8
+        with:
+          name: Linux-aarch64-vulkan
+          path: ${{ github.workspace }}/llama/src/main/resources_linux_vulkan/net/ladenthin/llama/
       - uses: actions/download-artifact@v8
         with:
           name: android-libraries-opencl
@@ -1577,6 +2103,38 @@ jobs:
         with:
           name: Windows-x86_64-opencl
           path: ${{ github.workspace }}/llama/src/main/resources_windows_opencl/net/ladenthin/llama/
+      - uses: actions/download-artifact@v8
+        with:
+          name: Linux-x86_64-rocm
+          path: ${{ github.workspace }}/llama/src/main/resources_linux_rocm/net/ladenthin/llama/
+      - uses: actions/download-artifact@v8
+        with:
+          name: Windows-x86_64-rocm
+          path: ${{ github.workspace }}/llama/src/main/resources_windows_rocm/net/ladenthin/llama/
+      - uses: actions/download-artifact@v8
+        with:
+          name: Linux-x86_64-sycl-fp16
+          path: ${{ github.workspace }}/llama/src/main/resources_linux_sycl_fp16/net/ladenthin/llama/
+      - uses: actions/download-artifact@v8
+        with:
+          name: Linux-x86_64-sycl-fp32
+          path: ${{ github.workspace }}/llama/src/main/resources_linux_sycl_fp32/net/ladenthin/llama/
+      - uses: actions/download-artifact@v8
+        with:
+          name: Windows-x86_64-sycl
+          path: ${{ github.workspace }}/llama/src/main/resources_windows_sycl/net/ladenthin/llama/
+      - uses: actions/download-artifact@v8
+        with:
+          name: Windows-aarch64-opencl
+          path: ${{ github.workspace }}/llama/src/main/resources_windows_opencl/net/ladenthin/llama/
+      - uses: actions/download-artifact@v8
+        with:
+          name: Linux-x86_64-openvino
+          path: ${{ github.workspace }}/llama/src/main/resources_linux_openvino/net/ladenthin/llama/
+      - uses: actions/download-artifact@v8
+        with:
+          name: Windows-x86_64-openvino
+          path: ${{ github.workspace }}/llama/src/main/resources_windows_openvino/net/ladenthin/llama/
       - uses: actions/setup-java@v5
         with:
           distribution: 'temurin'
@@ -1590,7 +2148,7 @@ jobs:
         # Windows classifier JARs: `windows-msvc` (MSVC-built CPU natives) plus the GPU
         # backends `cuda-windows` / `vulkan-windows` / `opencl-windows`. The default JAR's
         # Windows natives are the Ninja `*-libraries` merged into src/main/resources/ above.
-        run: mvn --batch-mode --no-transfer-progress -P release,cuda,opencl-android,windows-msvc,cuda-windows,vulkan-windows,opencl-windows,assembly -Dmaven.test.skip=true -Dgpg.skip=true package
+        run: mvn --batch-mode --no-transfer-progress -P release,cuda,vulkan-linux,vulkan-linux-aarch64,opencl-android,windows-msvc,cuda-windows,vulkan-windows,opencl-windows,rocm-linux,rocm-windows,sycl-fp16-linux,sycl-fp32-linux,sycl-windows,opencl-windows-aarch64,openvino-linux,openvino-windows,assembly -Dmaven.test.skip=true -Dgpg.skip=true package
       - name: Upload JARs
         uses: actions/upload-artifact@v7
         with:
@@ -1664,6 +2222,14 @@ jobs:
         with:
           name: linux-libraries-cuda
           path: ${{ github.workspace }}/llama/src/main/resources_linux_cuda/net/ladenthin/llama/
+      - uses: actions/download-artifact@v8
+        with:
+          name: Linux-x86_64-vulkan
+          path: ${{ github.workspace }}/llama/src/main/resources_linux_vulkan/net/ladenthin/llama/
+      - uses: actions/download-artifact@v8
+        with:
+          name: Linux-aarch64-vulkan
+          path: ${{ github.workspace }}/llama/src/main/resources_linux_vulkan/net/ladenthin/llama/
       - uses: actions/download-artifact@v8
         with:
           name: android-libraries-opencl
@@ -1688,6 +2254,38 @@ jobs:
         with:
           name: Windows-x86_64-opencl
           path: ${{ github.workspace }}/llama/src/main/resources_windows_opencl/net/ladenthin/llama/
+      - uses: actions/download-artifact@v8
+        with:
+          name: Linux-x86_64-rocm
+          path: ${{ github.workspace }}/llama/src/main/resources_linux_rocm/net/ladenthin/llama/
+      - uses: actions/download-artifact@v8
+        with:
+          name: Windows-x86_64-rocm
+          path: ${{ github.workspace }}/llama/src/main/resources_windows_rocm/net/ladenthin/llama/
+      - uses: actions/download-artifact@v8
+        with:
+          name: Linux-x86_64-sycl-fp16
+          path: ${{ github.workspace }}/llama/src/main/resources_linux_sycl_fp16/net/ladenthin/llama/
+      - uses: actions/download-artifact@v8
+        with:
+          name: Linux-x86_64-sycl-fp32
+          path: ${{ github.workspace }}/llama/src/main/resources_linux_sycl_fp32/net/ladenthin/llama/
+      - uses: actions/download-artifact@v8
+        with:
+          name: Windows-x86_64-sycl
+          path: ${{ github.workspace }}/llama/src/main/resources_windows_sycl/net/ladenthin/llama/
+      - uses: actions/download-artifact@v8
+        with:
+          name: Windows-aarch64-opencl
+          path: ${{ github.workspace }}/llama/src/main/resources_windows_opencl/net/ladenthin/llama/
+      - uses: actions/download-artifact@v8
+        with:
+          name: Linux-x86_64-openvino
+          path: ${{ github.workspace }}/llama/src/main/resources_linux_openvino/net/ladenthin/llama/
+      - uses: actions/download-artifact@v8
+        with:
+          name: Windows-x86_64-openvino
+          path: ${{ github.workspace }}/llama/src/main/resources_windows_openvino/net/ladenthin/llama/
       - name: Set up Maven Central Repository
         uses: actions/setup-java@v5
         with:
@@ -1712,7 +2310,7 @@ jobs:
       # :llama-langchain4j. The `release` profile (GPG + Central Publishing) is inherited
       # from the parent, so every module — including the parent pom — is signed.
       - name: Publish snapshot (reactor - parent + llama + llama-langchain4j)
-        run: mvn --batch-mode --no-transfer-progress -P release,cuda,opencl-android,windows-msvc,cuda-windows,vulkan-windows,opencl-windows -Dmaven.test.skip=true deploy
+        run: mvn --batch-mode --no-transfer-progress -P release,cuda,vulkan-linux,vulkan-linux-aarch64,opencl-android,windows-msvc,cuda-windows,vulkan-windows,opencl-windows,rocm-linux,rocm-windows,sycl-fp16-linux,sycl-fp32-linux,sycl-windows,opencl-windows-aarch64,openvino-linux,openvino-windows -Dmaven.test.skip=true deploy
         env:
           MAVEN_USERNAME: ${{ secrets.CENTRAL_USERNAME }}
           MAVEN_PASSWORD: ${{ secrets.CENTRAL_TOKEN }}
@@ -1774,6 +2372,14 @@ jobs:
         with:
           name: linux-libraries-cuda
           path: ${{ github.workspace }}/llama/src/main/resources_linux_cuda/net/ladenthin/llama/
+      - uses: actions/download-artifact@v8
+        with:
+          name: Linux-x86_64-vulkan
+          path: ${{ github.workspace }}/llama/src/main/resources_linux_vulkan/net/ladenthin/llama/
+      - uses: actions/download-artifact@v8
+        with:
+          name: Linux-aarch64-vulkan
+          path: ${{ github.workspace }}/llama/src/main/resources_linux_vulkan/net/ladenthin/llama/
       - uses: actions/download-artifact@v8
         with:
           name: android-libraries-opencl
@@ -1798,6 +2404,38 @@ jobs:
         with:
           name: Windows-x86_64-opencl
           path: ${{ github.workspace }}/llama/src/main/resources_windows_opencl/net/ladenthin/llama/
+      - uses: actions/download-artifact@v8
+        with:
+          name: Linux-x86_64-rocm
+          path: ${{ github.workspace }}/llama/src/main/resources_linux_rocm/net/ladenthin/llama/
+      - uses: actions/download-artifact@v8
+        with:
+          name: Windows-x86_64-rocm
+          path: ${{ github.workspace }}/llama/src/main/resources_windows_rocm/net/ladenthin/llama/
+      - uses: actions/download-artifact@v8
+        with:
+          name: Linux-x86_64-sycl-fp16
+          path: ${{ github.workspace }}/llama/src/main/resources_linux_sycl_fp16/net/ladenthin/llama/
+      - uses: actions/download-artifact@v8
+        with:
+          name: Linux-x86_64-sycl-fp32
+          path: ${{ github.workspace }}/llama/src/main/resources_linux_sycl_fp32/net/ladenthin/llama/
+      - uses: actions/download-artifact@v8
+        with:
+          name: Windows-x86_64-sycl
+          path: ${{ github.workspace }}/llama/src/main/resources_windows_sycl/net/ladenthin/llama/
+      - uses: actions/download-artifact@v8
+        with:
+          name: Windows-aarch64-opencl
+          path: ${{ github.workspace }}/llama/src/main/resources_windows_opencl/net/ladenthin/llama/
+      - uses: actions/download-artifact@v8
+        with:
+          name: Linux-x86_64-openvino
+          path: ${{ github.workspace }}/llama/src/main/resources_linux_openvino/net/ladenthin/llama/
+      - uses: actions/download-artifact@v8
+        with:
+          name: Windows-x86_64-openvino
+          path: ${{ github.workspace }}/llama/src/main/resources_windows_openvino/net/ladenthin/llama/
       - name: Set up Maven Central Repository
         uses: actions/setup-java@v5
         with:
@@ -1813,7 +2451,7 @@ jobs:
       # :llama-langchain4j. The `release` profile (GPG + Central Publishing) is inherited
       # from the parent, so every module — including the parent pom — is signed.
       - name: Publish release (reactor - parent + llama + llama-langchain4j)
-        run: mvn --batch-mode --no-transfer-progress -P release,cuda,opencl-android,windows-msvc,cuda-windows,vulkan-windows,opencl-windows -Dmaven.test.skip=true deploy
+        run: mvn --batch-mode --no-transfer-progress -P release,cuda,vulkan-linux,vulkan-linux-aarch64,opencl-android,windows-msvc,cuda-windows,vulkan-windows,opencl-windows,rocm-linux,rocm-windows,sycl-fp16-linux,sycl-fp32-linux,sycl-windows,opencl-windows-aarch64,openvino-linux,openvino-windows -Dmaven.test.skip=true deploy
         env:
           MAVEN_USERNAME: ${{ secrets.CENTRAL_USERNAME }}
           MAVEN_PASSWORD: ${{ secrets.CENTRAL_TOKEN }}
diff --git a/.gitignore b/.gitignore
index dfead6a8..d160476a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -39,13 +39,21 @@ replay_pid*
 
 models/*.gguf
 llama/src/main/cpp/net_ladenthin_llama_*.h
-llama/src/main/resources_cuda_linux/
+llama/src/main/resources_linux_cuda/
 # Per-classifier native trees, staged by CI before the matching Maven profile runs,
 # never committed (same policy as the default-tree native libs below).
+llama/src/main/resources_linux_vulkan/
 llama/src/main/resources_windows_msvc/
 llama/src/main/resources_windows_cuda/
 llama/src/main/resources_windows_vulkan/
 llama/src/main/resources_windows_opencl/
+llama/src/main/resources_linux_rocm/
+llama/src/main/resources_windows_rocm/
+llama/src/main/resources_linux_sycl_fp16/
+llama/src/main/resources_linux_sycl_fp32/
+llama/src/main/resources_windows_sycl/
+llama/src/main/resources_linux_openvino/
+llama/src/main/resources_windows_openvino/
 llama/src/main/resources/**/*.so
 llama/src/main/resources/**/*.dylib
 llama/src/main/resources/**/*.dll
diff --git a/CLAUDE.md b/CLAUDE.md
index e6da18c3..1c861a53 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -6,7 +6,7 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co
 
 Java bindings for [llama.cpp](https://github.com/ggerganov/llama.cpp) via JNI, providing a high-level API for LLM inference in Java. The Java layer communicates with a native C++ library through JNI.
 
-Current llama.cpp pinned version: **b9859**
+Current llama.cpp pinned version: **b9870**
 
 ## Upgrading CUDA Version
 
@@ -198,7 +198,8 @@ Wiring (mirrors the CUDA-Linux / OpenCL-Android classifier pattern):
 
 1. **`llama/CMakeLists.txt`** — the `if(GGML_CUDA) … elseif(GGML_VULKAN) … elseif(GGML_OPENCL) … else()`
    chain is **OS-aware**: CUDA → `resources_windows_cuda` on Windows (else `resources_linux_cuda`),
-   Vulkan → `resources_windows_vulkan`, OpenCL → `resources_windows_opencl` on Windows (else
+   Vulkan → `resources_windows_vulkan` on Windows (else `resources_linux_vulkan` — see "Linux Vulkan
+   classifiers" below), OpenCL → `resources_windows_opencl` on Windows (else
    `resources_android_opencl`). The default CPU build (both generators) still emits to the canonical
    `src/main/resources/.../Windows/{x86_64,x86}/`, so the Ninja-vs-MSVC split is purely a
    CI-artifact-name + pom-profile concern (no CMake change for it).
@@ -253,6 +254,95 @@ ctest --test-dir build --output-on-failure
 .github\build_opencl_windows.bat -G "Ninja Multi-Config" -DGGML_OPENCL=ON -DGGML_OPENCL_EMBED_KERNELS=ON -DOS_NAME=Windows -DOS_ARCH=x86_64
 ```
 
+## Linux Vulkan classifiers + Windows arm64 CPU
+
+Three additional artifacts extend the matrix toward upstream llama.cpp's release set. They follow
+the same classifier/resource-tree pattern as CUDA-Linux and Vulkan-Windows.
+
+**Linux Vulkan (`vulkan-linux-x86-64` + `vulkan-linux-aarch64`).** A vendor-neutral GPU jar for
+Linux (NVIDIA / AMD / Intel) with no CUDA toolkit — the intersection of the existing Vulkan-Windows
+and CUDA-Linux wiring. Four places:
+
+1. **`llama/CMakeLists.txt`** — the `elseif(GGML_VULKAN)` branch is now **OS-aware** (mirrors
+   `GGML_CUDA`): Windows → `resources_windows_vulkan`, else → `resources_linux_vulkan`
+   (`.../Linux/${OS_ARCH}/`). One tree holds both arches under `Linux/{x86_64,aarch64}`.
+2. **`.github/workflows/publish.yml`** — `build-linux-x86_64-vulkan` (native `ubuntu-latest`, **not**
+   dockcross — the Vulkan SDK is a trivial apt install and upstream builds ubuntu-vulkan the same way)
+   and `build-linux-aarch64-vulkan` (`ubuntu-24.04-arm` + GCC 14). Both `apt-get install libvulkan-dev
+   glslc glslang-tools`, build `-DGGML_VULKAN=ON -DGGML_NATIVE=OFF`, and are **build-only** (no
+   `ctest`: a Vulkan-linked `jllama_test` errors enumerating devices on a GPU-less runner — same as the
+   Windows GPU jobs). Artifacts `Linux-{x86_64,aarch64}-vulkan` → both downloaded into the **one**
+   `resources_linux_vulkan/` tree by `package`/`publish-*`. Glibc floor rises to the ubuntu baseline
+   (like the aarch64 CPU jar); acceptable for a GPU artifact.
+3. **`llama/pom.xml`** — profiles `vulkan-linux` (classifier `vulkan-linux-x86-64`) and
+   `vulkan-linux-aarch64` (classifier `vulkan-linux-aarch64`). Both read the shared
+   `resources_linux_vulkan` tree but the resource-copy `<includes>` is **arch-scoped**
+   (`net/ladenthin/llama/Linux/{x86_64,aarch64}/**`), so each classifier JAR carries only its own
+   arch (verified: each jar contains exactly one `libjllama.so`). Separate output dirs
+   `_linux_vulkan` / `_linux_vulkan_aarch64` avoid collision. Activated in CI via
+   `-P …,vulkan-linux,vulkan-linux-aarch64,…`.
+4. **`README.md`** — classifier table + dependency snippets.
+
+`src/main/resources_linux_vulkan/` is git-ignored (staged by CI, never committed). GPU runtime
+`libvulkan.so.1` is supplied by the consumer's driver — nothing is bundled (same policy as every GPU
+classifier).
+
+**Windows arm64 CPU (default JAR, no classifier).** `build-windows-arm64` runs natively on GitHub's
+free `windows-11-arm` runner (`ilammy/msvc-dev-cmd` `arch: arm64`, Ninja Multi-Config, `-DOS_ARCH=aarch64`,
+build + `ctest`). It emits to the **canonical** `resources/.../Windows/aarch64/` and uploads
+`Windows-aarch64-libraries`, which the `package`/`publish-*` `*-libraries` glob merges into the default
+tree — so it ships in the **default** JAR alongside Windows x86-64 / x86 (like those, it is not a
+classifier). No Java change was needed: `OSInfo` already maps a Windows-on-ARM JVM (`os.arch=aarch64`)
+to `Windows/aarch64` (it isn't in `archMapping`, so it falls through `translateArchNameToFolderName`).
+sccache is intentionally omitted (the shared install step pulls the x86_64 sccache zip; not worth an
+arm64 path for one CPU job — `build.bat` just builds uncached). **Compiler: `clang-cl`, not MSVC
+`cl.exe`.** ggml's `ggml-cpu/CMakeLists.txt` aborts with *"MSVC is not supported for ARM, use clang"*
+via `if (MSVC AND NOT CMAKE_C_COMPILER_ID STREQUAL "Clang")`; `clang-cl` (LLVM's MSVC-compatible driver)
+satisfies that guard (compiler id `"Clang"`) while keeping CMake's `MSVC=TRUE`, so the static `/MT` CRT
+block still applies and the generator stays Ninja Multi-Config. The job passes
+`-DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl`; `msvc-dev-cmd` supplies the MSVC
+headers/libs/linker **and the bundled clang-cl/lld-link** (`VC\Tools\Llvm\ARM64`), so no separate
+LLVM install is needed. It also passes **`-DGGML_OPENMP=OFF`**: with clang-cl, ggml links LLVM's
+OpenMP (`libomp.lib` → `libomp140.aarch64.dll` at runtime), which — unlike MSVC's ambient
+`vcomp140.dll` on x64 — is not on `PATH`, so the test exe (and any consumer) failed to launch with
+`0xc0000135` (`STATUS_DLL_NOT_FOUND`). Disabling OpenMP makes ggml use its own `std::thread`
+threadpool, leaving the arm64 `jllama.dll` self-contained (the x86_64/x86 jobs keep OpenMP via MSVC
+`vcomp`). (Upstream llama.cpp instead cross-compiles arm64 from an
+x64 runner with `vcvarsall amd64_arm64` + a `clang`/`clang++` toolchain file and no arm64 tests; the
+native-runner + `clang-cl` route here keeps the `/MT` CRT and lets `ctest` run on real ARM hardware.)
+
+## Additional GPU-backend classifiers (ROCm/HIP, SYCL, Win-arm64 OpenCL, OpenVINO)
+
+Eight further GPU classifiers extend the matrix toward upstream llama.cpp's full release set. They
+follow the **exact same 5-place wiring** as the CUDA/Vulkan classifiers (no special cases — KISS): a
+`CMakeLists.txt` backend branch, a `publish.yml` build job (in `package.needs`, **fail-loud** — a
+broken build reds the pipeline, same policy as every GPU job), a `pom.xml` classifier profile, a
+`README.md` row, and a git-ignored `resources_*` tree. All are **build-only** (GitHub runners have no
+matching GPU) and bundle **no** vendor runtime.
+
+| Classifier | GGML flag(s) | Job runner / toolchain | Tree |
+|---|---|---|---|
+| `rocm-linux-x86-64` | `GGML_HIP=ON -DAMDGPU_TARGETS=…` | `ubuntu-latest` + ROCm apt repo (`/opt/rocm/llvm/bin/clang`) | `resources_linux_rocm` |
+| `rocm-windows-x86-64` | `GGML_HIP=ON` | `windows-2025-vs2026` + AMD HIP SDK | `resources_windows_rocm` |
+| `sycl-fp16-linux-x86-64` | `GGML_SYCL=ON -DGGML_SYCL_F16=ON` (`icx`/`icpx`) | `ubuntu-latest` + Intel oneAPI apt | `resources_linux_sycl_fp16` |
+| `sycl-fp32-linux-x86-64` | `GGML_SYCL=ON` (`icx`/`icpx`) | `ubuntu-latest` + Intel oneAPI apt | `resources_linux_sycl_fp32` |
+| `sycl-windows-x86-64` | `GGML_SYCL=ON` (C: `cl`, C++: `icx`) | `windows-2025-vs2026` + oneAPI installer | `resources_windows_sycl` |
+| `opencl-windows-aarch64` | `GGML_OPENCL=ON …ADRENO_KERNELS=ON` (clang-cl, `GGML_OPENMP=OFF`) | `windows-11-arm` (arm64 CPU job's toolchain) | `resources_windows_opencl` (arch subdir `aarch64`) |
+| `openvino-linux-x86-64` | `GGML_OPENVINO=ON` | `ubuntu-latest` + OpenVINO archive | `resources_linux_openvino` |
+| `openvino-windows-x86-64` | `GGML_OPENVINO=ON` | `windows-2025-vs2026` + OpenVINO archive | `resources_windows_openvino` |
+
+Two routing notes mirror existing precedent: **Linux SYCL** ships two precision variants at the *same*
+arch, so `CMakeLists.txt` routes them to two *distinct* trees by `GGML_SYCL_F16` (fp16 vs fp32).
+**Windows OpenCL** now holds both `x86_64` (desktop ICD) and `aarch64` (Snapdragon/Adreno) in the one
+`resources_windows_opencl` tree, split by the `opencl-windows` / `opencl-windows-aarch64` profiles'
+arch-scoped `<includes>` — exactly like the `vulkan-linux` / `vulkan-linux-aarch64` split.
+
+The vendor toolchain install steps in `publish.yml` are **first-pass** (apt repos / vendor installers
+pinned to a specific version): if a URL/version 404s in CI, the job fails loud and the step is adjusted
+— the failure is intentional signal, not a regression to hide behind `continue-on-error`.
+`src/main/resources_{linux_rocm,windows_rocm,linux_sycl_fp16,linux_sycl_fp32,windows_sycl,linux_openvino,windows_openvino}/`
+are all git-ignored (staged by CI, never committed).
+
 ## WebUI (llama.cpp Svelte UI) embedding
 
 The llama.cpp WebUI is **built once in CI and shared to every native build**, then
@@ -286,7 +376,7 @@ needs no extra step here, `build-webui` re-reads the tag and rebuilds the matchi
 ships no UI):
 ```bash
 # needs node/npm + network; embed.cpp is plain C++17 (no npm)
-git clone --depth 1 --branch b9859 https://github.com/ggml-org/llama.cpp /tmp/lc
+git clone --depth 1 --branch b9870 https://github.com/ggml-org/llama.cpp /tmp/lc
 ( cd /tmp/lc/tools/ui && npm ci && npm run build \
   && ( cd dist && find . -type f -not -path './_gzip/*' \
        | while read -r f; do mkdir -p "_gzip/$(dirname "$f")"; gzip -9 -c "$f" > "_gzip/$f"; done ) \
@@ -314,13 +404,19 @@ jobs therefore set `BUILD_JOBS: 2` to bound peak memory.
 **`sccache` → Depot Cache — shared compiler cache.** When `USE_CACHE=true` **and** `sccache`
 plus a cache token are present, `build.sh` adds
 `-DCMAKE_C_COMPILER_LAUNCHER=sccache -DCMAKE_CXX_COMPILER_LAUNCHER=sccache` and prints
-`sccache --show-stats`. The cache lives in **Depot Cache** over sccache's **WebDAV** backend:
+`sccache --show-stats`. **Per-job cache summary:** when running in CI (`GITHUB_STEP_SUMMARY` set),
+`build.sh`/`build.bat` also parse those stats and append a small `### sccache statistics` table
+(`Cache hits | Requests | Hit rate`) to the job summary — the sccache/Depot analogue of upstream
+llama.cpp's `ccache-action` "CCache Statistics" table, per-job (GitHub does not merge job
+summaries). It is best-effort (skipped silently if the numbers can't be parsed) and only emitted
+when sccache was actually the launcher; local runs (no `GITHUB_STEP_SUMMARY`) are untouched. The
+cache lives in **Depot Cache** over sccache's **WebDAV** backend:
 
 - `SCCACHE_WEBDAV_ENDPOINT: https://cache.depot.dev`
 - `SCCACHE_WEBDAV_TOKEN: ${{ secrets.DEPOT_TOKEN }}` — a Depot **organization** token, stored
   as the repo secret **`DEPOT_TOKEN`**.
 
-Because `sccache` is **content-addressed** and llama.cpp is pinned (`GIT_TAG b9859`), the
+Because `sccache` is **content-addressed** and llama.cpp is pinned (`GIT_TAG b9870`), the
 ~280 upstream object files are byte-identical every run, so a warm cache recompiles only the
 *changed* files. Depot's cache is **shared across all branches** (unlike GitHub's
 per-branch `actions/cache`), so every branch builds incrementally; a `b<nnnn>` version bump
@@ -432,6 +528,8 @@ Current patches:
 | `0002-server-preserve-caller-load-progress-callback.patch` | Load-progress-callback regression introduced in llama.cpp **b9789**: `server_context::load_model` (`tools/server/server-context.cpp`) now **unconditionally** installs the server's own load-progress reporter on `params_base.load_progress_callback` immediately before `common_init_from_params`, clobbering any callback the embedding caller already set. libjllama's `LoadProgressCallback` feature wires `common_params.load_progress_callback` to a JNI trampoline *before* calling `load_model`, so the bump silently killed it — `LoadProgressCallbackTest` saw zero progress updates and the abort-on-`false` path never threw. The patch guards the assignment with `if (params_base.load_progress_callback == nullptr)`, so the server installs its own reporter **only when the caller hasn't** — a caller-supplied callback survives and fires during load. Standalone `llama-server` (no caller callback, so the field is null) is unaffected. Same JNI-vs-standalone divergence class as `0001`. |
 | `0003-pr22393-server-add-slot-prompt-similarity-getter-setter.patch` | **Upstream-PR carry** of [ggml-org/llama.cpp#22393](https://github.com/ggml-org/llama.cpp/pull/22393) ("server : add slot_prompt_similarity getter/setter") while it is still open upstream. Purely additive: adds `server_context::get_slot_prompt_similarity()` / `set_slot_prompt_similarity(float)` (`tools/server/server-context.{cpp,h}`) so an embedding/JNI caller can query and tune the slot-selection threshold at runtime without reloading the model. Verbatim copy of the PR — drop it once a pinned `b<nnnn>` includes the change. |
 | `0004-pr23116-server-per-request-reasoning-budget-tokens.patch` | **Upstream-PR carry** of [ggml-org/llama.cpp#23116](https://github.com/ggml-org/llama.cpp/pull/23116) ("server: honour per-request reasoning_budget_tokens in chat completions"), motivated by java-llama.cpp#140, while it is still open upstream. `oaicompat_chat_params_parse` (`tools/server/server-common.cpp`) only read the Anthropic `thinking_budget_tokens` alias and always wrote the server-level `reasoning_budget_message`, so a per-request `reasoning_budget_tokens` / `reasoning_budget_message` on a chat-completions request was ignored. The patch reads both overrides **before** the generic copy loop (precedence: `reasoning_budget_tokens` > `thinking_budget_tokens` alias > server default) and threads the per-request message through. Carries the upstream `tests/test-chat.cpp` additions verbatim so the patch is submittable as-is; like `0001`'s test/call-site flips they are **applied-but-not-compiled** here (`LLAMA_BUILD_TESTS` is OFF for the FetchContent subproject). Drop it once a pinned `b<nnnn>` includes the change. |
+| `0005-server-recurrent-near-prompt-end-checkpoints.patch` | **Multi-turn tool-calling perf fix for recurrent/hybrid models (e.g. Granite-4)**, upstream-submittable. In `server_context::update_slots` (`tools/server/server-context.cpp`) the near-prompt-end context checkpoints are gated by `checkpoint_min_step` (default 8192 tokens). An agentic conversation that appends only assistant/tool messages never produces a new user-message checkpoint (`is_user_start`/`is_last_user_message` match `COMMON_CHAT_ROLE_USER` only), so after turn 1 no new checkpoint is ever created and — because recurrent state can only roll back to a checkpoint — **every turn re-prefills the whole conversation tail** (measured on a synthetic granitehybrid model: prefilled tokens grew 901 → 1544 → 2187 → 2830 → 3473 over turns 2–6). The patch (1) exempts near-prompt-end checkpoints from the min-step spacing when the memory can only roll back via checkpoints (`ctx_tgt_seq_rm_type` is `FULL` or `RS` — SWA-only models are unaffected), and (2) skips creating a checkpoint whose position equals the newest one (the last-user-message checkpoint was re-created identically on every turn, flooding the 32-entry list). After the patch each turn restores the previous turn's near-end checkpoint and prefill is constant (~new-turn-sized; 647 tokens/turn in the same measurement, ≈5.4× less prefill at turn 6 and growing with conversation length). Validated output-identical (`temperature=0`) vs. unpatched. Complements — not duplicates — open upstream PRs #24035/#24899/#24891 (they fix checkpoint *invalidation/retention*; this fixes checkpoint *starvation*). Drop once upstream solves agentic checkpoint placement (e.g. a merged role-boundary checkpointing design, cf. #21885 / #22826 discussion). |
+| `0006-server-embed-native-server-jni.patch` | **Makes `server.cpp`'s `llama_server` embeddable in the JVM** so the `NativeServer` JNI bridge can run the full upstream HTTP server (WebUI included) inside `libjllama` — see "Two server modes" below. b9870 already exposes `int llama_server(int, char**)` (non-static; no `main` in the file), so the patch only adds embedded-mode support: (1) a `g_llama_server_embedded` flag + `llama_server_set_embedded()` / `llama_server_request_shutdown()` (declared in the committed `src/main/cpp/native_server_bridge.h`); (2) skips installing the process-wide SIGINT/SIGTERM handlers when embedded (they would hijack the JVM's); (3) in embedded mode parses the **forwarded** argv via `common_params_parse` instead of `common_params_parse_main` (whose `GetCommandLineW` recovery would pick up `java.exe`'s command line — the same Windows class of bug `0001` fixes). `llama_server_request_shutdown()` mirrors the SIGTERM path (invokes the installed `shutdown_handler` → `ctx_server.terminate()` unblocks `start_loop()`), giving JNI an out-of-band stop since `ctx_server` is loop-local. Applies **after `0001`** (which flips this call site to `common_params_parse_main`), so its context is the post-`0001` tree; regenerate against `0001`+source on a bump. Only touches `tools/server/server.cpp`. |
 
 ## OuteTTS build-time extraction (`cmake/generate-tts-upstream.cmake`)
 
@@ -470,6 +568,13 @@ re-verify the generator the same way you re-verify `patches/`.
 
 ## Upgrading/Downgrading llama.cpp Version
 
+**Runbook (documentation root):** [`docs/upgrade/llama-cpp-version-bump.md`](docs/upgrade/llama-cpp-version-bump.md)
+covers the full bump process end-to-end — picking the target (topmost GitHub release, via the atom
+feed), **chunking by `git diff` byte-size** (bump straight to the target when the diff is < 100 KiB,
+else step through the largest intermediate tag still under the threshold), the
+`.github/scripts/llama-next-version.sh` helper that computes the next reviewable step, and the
+edit/verify/commit loop below. Use it for any non-trivial bump; the steps here are the mechanical core.
+
 To change the llama.cpp version, update the following **three** files (and re-verify `patches/`):
 
 1. **llama/CMakeLists.txt** — the `GIT_TAG` line for llama.cpp: `GIT_TAG        b8831`
@@ -834,7 +939,7 @@ If the local check passes (`BUILD SUCCESS`), the `mvn package` job in
 - `LlamaLoader` — Extracts the platform-specific native library from the JAR to a temp directory, or finds it on `java.library.path`.
 - `OSInfo` — Detects OS and architecture for library resolution.
 - **`server` package — OpenAI-compatible HTTP endpoint (a single implementation).**
-  - `server.OpenAiCompatServer` — built only on the JDK's `com.sun.net.httpserver` (no new dependency), both embeddable and the fat-jar `Main-Class`. Serves `POST /v1/chat/completions` (streaming via SSE + non-streaming), `POST /v1/completions`, `POST /v1/embeddings`, `POST /v1/rerank`, `POST /infill`, `GET /v1/models` and `GET /health` (every route is also reachable without the `/v1` prefix), so editors that speak the OpenAI protocol (e.g. VS Code Copilot "Custom Endpoint", Cline, Roo Code, Continue) can drive a local model. Streaming chat uses the native OAI chunk path (`LlamaModel.streamChatCompletion` → `requestChatCompletionStream` / `receiveChatCompletionChunk` + the C++ `wrap_stream_chunk` helper), preserving `delta.tool_calls`; completions/embeddings/infill forward verbatim to the matching `LlamaModel.handle*`; rerank reshapes `handleRerank` into the OAI `results`/`data` shape. The chat mapper forwards `stream_options` and `response_format` and defaults `cache_prompt=true`; a CORS `Filter` answers `OPTIONS` preflights; `OpenAiSseFormatter.ensureUsageCachedTokens` guarantees `usage.prompt_tokens_details.cached_tokens` on the streamed usage chunk (Copilot crash fix, microsoft/vscode #273482). **Agentic tool-calling is the primary target**; a C++ guard (`test_server.cpp`) pins `tool_calls.function.arguments` as a JSON string (llama.cpp #20198).
+  - `server.OpenAiCompatServer` — built only on the JDK's `com.sun.net.httpserver` (no new dependency), embeddable and runnable via `java -cp <jar> net.ladenthin.llama.server.OpenAiCompatServer …` or `java -jar <jar> --jllama-openai-compat …` (the fat-jar `Main-Class` is now the `ServerLauncher` dispatcher, which runs `NativeServer` by default — see "Two server modes"). Serves `POST /v1/chat/completions` (streaming via SSE + non-streaming), `POST /v1/completions`, `POST /v1/embeddings`, `POST /v1/rerank`, `POST /infill`, `GET /v1/models` and `GET /health` (every route is also reachable without the `/v1` prefix), so editors that speak the OpenAI protocol (e.g. VS Code Copilot "Custom Endpoint", Cline, Roo Code, Continue) can drive a local model. Streaming chat uses the native OAI chunk path (`LlamaModel.streamChatCompletion` → `requestChatCompletionStream` / `receiveChatCompletionChunk` + the C++ `wrap_stream_chunk` helper), preserving `delta.tool_calls`; completions/embeddings/infill forward verbatim to the matching `LlamaModel.handle*`; rerank reshapes `handleRerank` into the OAI `results`/`data` shape. The chat mapper forwards `stream_options` and `response_format` and defaults `cache_prompt=true`; a CORS `Filter` answers `OPTIONS` preflights; `OpenAiSseFormatter.ensureUsageCachedTokens` guarantees `usage.prompt_tokens_details.cached_tokens` on the streamed usage chunk (Copilot crash fix, microsoft/vscode #273482). **Agentic tool-calling is the primary target**; a C++ guard (`test_server.cpp`) pins `tool_calls.function.arguments` as a JSON string (llama.cpp #20198).
   - **Alternative protocol surfaces** (pure translation over the OpenAI chat core — no second inference path; each reconstructs streamed tool calls via `ToolCallDeltaAccumulator`): **Ollama-native** (`GET /api/version`, `/api/tags`, `POST /api/show`, `/api/chat` with NDJSON streaming, `/api/generate` prompt-completion/FIM — `OllamaApiSupport`; `/api/show` advertises tools/insert/vision capabilities + context length for Copilot's Ollama provider), **Anthropic Messages** (`POST /v1/messages`, SSE event stream — `AnthropicApiSupport` + `AnthropicStreamTranslator`), and **OpenAI Responses** (`POST /v1/responses`, SSE event stream — `ResponsesApiSupport` + `ResponsesStreamTranslator`). The llama.cpp-native `GET /props` (context length + `modalities`) is served via `OpenAiSseFormatter.propsJson` for autocomplete clients that size their context from it.
   - Supporting classes: `OpenAiServerConfig` (builder; optional bearer auth; binds `127.0.0.1`; `corsAllowOrigin`; `supportsVision`), `OpenAiServerCli` (testable CLI arg parser → `ModelParameters` + `OpenAiServerConfig`; flags incl. `--mmproj`/`--embedding`/`--reranking`), `OpenAiRequestMapper` (OAI chat request → `InferenceParameters`), `OpenAiSseFormatter` (SSE/models/error JSON + usage normalization), `OaiRerankSupport` (pure rerank request/response shaping), and the model-free test seam `OpenAiBackend`/`ChunkSink` + `LlamaModelBackend`. The streaming envelope is parsed by `json.ChatStreamChunkParser`.
   - The `server` package is a dedicated top layer in the ArchUnit `layeredArchitecture` rule (the only layer allowed to access the root `Api`); `noInternalJdkImports` carries an explicit exception for the supported `com.sun.net.httpserver` (the exported `jdk.httpserver` module, which `module-info.java` `requires`). See README "OpenAI-compatible HTTP server".
@@ -845,7 +950,14 @@ If the local check passes (`BUILD SUCCESS`), the `mvn package` job in
 - `json_helpers.hpp` — Pure JSON transformation helpers (no JNI, no llama state). Independently unit-testable.
 - `jni_helpers.hpp` — JNI bridge helpers (handle management + server orchestration). Includes `json_helpers.hpp`.
 - Uses `nlohmann/json` for JSON deserialization of parameters.
-- The upstream server library (`server-context.cpp`, `server-queue.cpp`, `server-task.cpp`, `server-schema.cpp`, `server-models.cpp`, and — since b9829 — `server-stream.cpp`) is compiled directly into `jllama` via CMake — there is no hand-ported `server.hpp` fork. **`server-stream.cpp` is mandatory, not optional:** it defines the resumable-streaming SSE replay buffer (`g_stream_sessions`, `stream_session_attach_pipe`, `stream_aware_should_stop`, `stream_conv_id_from_headers`, the `stream_pipe_*` types) that `server-context.cpp` / `server-http.cpp` / `server-models.cpp` now `#include "server-stream.h"` and call, so omitting it fails the link with undefined references. It is platform-neutral (threads + std mutex/condvar, no `subprocess.h`/`posix_spawn_*`), so it builds on Android too and sits outside the `server-models.cpp` Android guard. `jllama` wires its own JNI routes and never calls `g_stream_sessions.start_gc()` (only the excluded standalone `server.cpp` `main()` does), so its GC thread stays dormant. **Phase 2:** the upstream HTTP transport (`tools/server/server-http.cpp`) and its `cpp-httplib` backend (`vendor/cpp-httplib/httplib.cpp`) are now compiled into `jllama` too, so the OpenAI-compatible server can be driven natively from JNI *inside* `libjllama` — no separate `llama-server` executable (a JNI shared library loads anywhere a JVM runs, which a standalone binary does not). `server-http.cpp` does `#include "ui.h"` (the WebUI asset table that `tools/ui`/`llama-ui` normally generates); since the Svelte WebUI is not shipped, `src/main/cpp/webui_stub/ui.h` supplies the upstream **empty-asset** interface and leaves `LLAMA_UI_HAS_ASSETS` undefined (all static-asset-serving blocks compile out). `<cpp-httplib/httplib.h>` already resolves via `llama-common`'s `vendor/` include dir (same nlohmann/json 3.12.0 as the FetchContent copy). No SSL: `CPPHTTPLIB_OPENSSL_SUPPORT` is left undefined (plain-HTTP; bind localhost / front with a TLS proxy). 
Only `server.cpp` (the standalone `main()` + route wiring) remains excluded — wiring the routes to JNI is the next step.
+- The upstream server library (`server-context.cpp`, `server-queue.cpp`, `server-task.cpp`, `server-schema.cpp`, `server-models.cpp`, and — since b9829 — `server-stream.cpp`) is compiled directly into `jllama` via CMake — there is no hand-ported `server.hpp` fork. **`server-stream.cpp` is mandatory, not optional:** it defines the resumable-streaming SSE replay buffer (`g_stream_sessions`, `stream_session_attach_pipe`, `stream_aware_should_stop`, `stream_conv_id_from_headers`, the `stream_pipe_*` types) that `server-context.cpp` / `server-http.cpp` / `server-models.cpp` now `#include "server-stream.h"` and call, so omitting it fails the link with undefined references. It is platform-neutral (threads + std mutex/condvar, no `subprocess.h`/`posix_spawn_*`), so it builds on Android too and sits outside the `server-models.cpp` Android guard. `jllama`'s own JNI routes never call `g_stream_sessions.start_gc()` (only the standalone entry point in `server.cpp` does — `llama_server()` as of b9870, reached via `NativeServer`), so outside `NativeServer` the GC thread stays dormant. **Phase 2:** the upstream HTTP transport (`tools/server/server-http.cpp`) and its `cpp-httplib` backend (`vendor/cpp-httplib/httplib.cpp`) are now compiled into `jllama` too, so the OpenAI-compatible server can be driven natively from JNI *inside* `libjllama` — no separate `llama-server` executable (a JNI shared library loads anywhere a JVM runs, which a standalone binary does not). `server-http.cpp` does `#include "ui.h"` (the WebUI asset table that `tools/ui`/`llama-ui` normally generates); when the Svelte WebUI assets are not generated (e.g. local `cmake` builds), `src/main/cpp/webui_stub/ui.h` supplies the upstream **empty-asset** interface and leaves `LLAMA_UI_HAS_ASSETS` undefined (all static-asset-serving blocks compile out). `<cpp-httplib/httplib.h>` already resolves via `llama-common`'s `vendor/` include dir (same nlohmann/json 3.12.0 as the FetchContent copy). No SSL: `CPPHTTPLIB_OPENSSL_SUPPORT` is left undefined (plain-HTTP; bind localhost / front with a TLS proxy). 
**`server.cpp` is now compiled in too** (on non-Android — it and `server-tools.cpp` pull in `subprocess.h`/`posix_spawn_*`, so they share `server-models.cpp`'s Android guard): b9870 exposes its entry as `int llama_server(int, char**)` (no `main` in the file), and `patches/0006` makes it embeddable (no process signal handlers, forwarded-argv parse, out-of-band shutdown). The `NativeServer` JNI bridge (`src/main/cpp/native_server.cpp`) calls `llama_server` on a worker thread, so the **full** upstream server — WebUI and all — runs inside `libjllama`. See "Two server modes" below.
+
+### Two server modes (`OpenAiCompatServer` vs `NativeServer`)
+
+The library exposes **two** ways to serve a model over HTTP, on two different transports. The fat jar's `Main-Class` is `server.ServerLauncher`, a tiny dispatcher: it runs `OpenAiCompatServer` when `--jllama-openai-compat` is present (that marker is stripped, the rest forwarded) and the default `NativeServer` otherwise. Both mains are also runnable directly by class name via `java -cp`. The two modes:
+
+1. **`server.OpenAiCompatServer` (Java transport).** OpenAI/Ollama/Anthropic-compatible JSON API on the JDK's `com.sun.net.httpserver`, driving the compiled server *core* over JNI. Embeddable, no extra dependency, and it can share/reuse a `LlamaModel`. It serves **no** static assets — its `/` route is a 404, so **no WebUI**. It has its own `main` (run via `java -cp <jar> net.ladenthin.llama.server.OpenAiCompatServer …`); its CLI (`OpenAiServerCli`) maps a curated flag subset (`-m/-c/-b/-ub/-ngl/-t/-tb/-ctk/-ctv/--jinja/--chat-template-kwargs/--host/--port/--parallel/--mmproj/--api-key/--embedding/--reranking`).
+2. **`server.NativeServer` (native transport) — the default fat-jar server (when `--jllama-openai-compat` is absent).** Runs the **full upstream `llama_server`** (via `patches/0006` + `native_server.cpp`) inside `libjllama`, forwarding the raw llama-server argv verbatim — so **every** llama-server flag works and the **embedded WebUI is served** (when the assets are compiled in; CI's released jars have them, local `cmake` builds use the empty-asset stub). It is an **independent lifecycle** (loads its own model from the argv, like `llama-server.exe`; owns the process's llama backend + stderr logging while running), **single-instance per process** (upstream keeps shutdown state in file-scope globals), and **not available on Android** (the `subprocess.h` guard). Reusing an already-loaded `LlamaModel`'s context is a documented TODO. `libjllama` loading anywhere a JVM runs is what makes this "no separate `llama-server.exe`" possible.
 
 ### Native Helper Architecture
 
@@ -955,6 +1067,24 @@ Wiring (mirrors the macOS native jobs, not the dockcross jobs):
 - Branch protection: if a required check pinned the old name "Cross-Compile Linux aarch64 (LTS)",
   repoint it to "Build and Test Linux aarch64".
 
+### Linux s390x: big-endian cross-build + qemu test gate
+
+`build-linux-s390x` extends the default JAR to **IBM Z (s390x, big-endian)** — the one target whose
+byte order differs from every other platform. It **cross-compiles** with the GCC s390x toolchain
+(`g++-s390x-linux-gnu`, native x86 speed — no emulated build) and then runs the **full C++ unit suite
+under `qemu-user`** (`CMAKE_CROSSCOMPILING_EMULATOR=/usr/bin/qemu-s390x-static`, `QEMU_LD_PREFIX=/usr/s390x-linux-gnu`).
+That `ctest` run is a **real big-endian correctness gate** for the byte-order-sensitive surface — the
+little-endian WAV writer (`tts_wav.hpp`), the JSON/token/embedding transforms, and the JNI helpers —
+which is where an endian bug in *our* code could hide. Model-backed **Java** tests are deliberately
+**not** run under emulation (a JVM + GGUF inference under `qemu-user` is slow and flaky); the Java↔JNI
+boundary uses host-native array copies (endian-transparent), so the C++ gate covers the actual risk.
+`-DGGML_OPENMP=OFF` sidesteps cross-libgomp issues (ggml uses its own `std::thread` pool). s390x is a
+CPU platform like aarch64, so it ships in the **default** JAR (`Linux-s390x-libraries` merges via the
+`*-libraries` glob; `OSInfo` maps `os.arch=s390x` → `Linux/s390x`) — no classifier, no pom profile.
+The job is **fail-loud** and listed in `package.needs` like every other build. (Upstream llama.cpp already supports s390x
+— it ships `ubuntu-s390x` with GGUF big-endian handling — so the native inference path is upstream's
+concern; this job validates only *our* layer's endian-safety.)
+
 ## Testing
 
 ### Java tests
@@ -1017,17 +1147,17 @@ ctest --test-dir build --output-on-failure -R "ResultsToJson"
 | File | Tests | Scope |
 |------|-------|-------|
 | `src/test/cpp/test_utils.cpp` | 156 | Upstream helpers: `server_tokens`, `server_grammar_trigger`, `gen_tool_call_id`, `json_value`, `json_get_nested_values`, UTF-8 helpers, `format_response_rerank`, `format_embeddings_response_oaicompat`, `oaicompat_completion_params_parse`, `oaicompat_chat_params_parse`, `are_lora_equal`, `strip_flag_from_argv`, `token_piece_value`, `json_is_array_and_contains_numbers`, `format_oai_sse`, `format_oai_resp_sse`, `format_anthropic_sse` |
-| `src/test/cpp/test_server.cpp` | 194 | Upstream result types: `result_timings`, `task_params::to_json()` (incl. `dry_sequence_breakers`, `preserved_tokens`, `timings_per_token`), `completion_token_output`, `server_task_result_cmpl_partial` (non-oaicompat + `to_json_oaicompat` + logprobs + `to_json_oaicompat_chat` + `to_json_anthropic` + dispatcher), `server_task_result_cmpl_final` (non-oaicompat + `to_json_oaicompat` + `to_json_oaicompat_chat` + `to_json_oaicompat_chat_stream` + `to_json_anthropic` + `to_json_anthropic_stream` + tool_calls + dispatcher), `server_task_result_embd`, `server_task_result_rerank`, `server_task_result_metrics`, `server_task_result_slot_save_load`, `server_task_result_slot_erase`, `server_task_result_apply_lora`, `server_task_result_error`, `format_error_response`, `server_task::need_sampling()`, `server_task::n_tokens()`, `server_schema::eval_llama_cmpl_schema()` (parsing pipeline + grammar routing + error paths + per-request `dry_*` field round-trips), `response_fields` projection |
+| `src/test/cpp/test_server.cpp` | 197 | Upstream result types: `result_timings`, `task_params::to_json()` (incl. `dry_sequence_breakers`, `preserved_tokens`, `timings_per_token`), `completion_token_output`, `server_task_result_cmpl_partial` (non-oaicompat + `to_json_oaicompat` + logprobs + `to_json_oaicompat_chat` + `to_json_anthropic` + dispatcher), `server_task_result_cmpl_final` (non-oaicompat + `to_json_oaicompat` + `to_json_oaicompat_chat` + `to_json_oaicompat_chat_stream` + `to_json_anthropic` + `to_json_anthropic_stream` + tool_calls + dispatcher), `server_task_result_embd`, `server_task_result_rerank`, `server_task_result_metrics`, `server_task_result_slot_save_load`, `server_task_result_slot_erase`, `server_task_result_apply_lora`, `server_task_result_error`, `format_error_response`, `server_task::need_sampling()`, `server_task::n_tokens()`, `server_schema::eval_llama_cmpl_schema()` (parsing pipeline + grammar routing + error paths + per-request `dry_*` and `sse_ping_interval` field round-trips incl. hard-limit + server-default inheritance), `response_fields` projection |
 | `src/test/cpp/test_json_helpers.cpp` | 47 | All functions in `json_helpers.hpp`: `get_result_error_message`, `results_to_json`, `rerank_results_to_json`, `parse_encoding_format`, `extract_embedding_prompt`, `is_infill_request`, `parse_slot_prompt_similarity`, `parse_positive_int_config`, `wrap_stream_chunk` |
 | `src/test/cpp/test_log_helpers.cpp` | 13 | All functions in `log_helpers.hpp`: `log_level_name`, `format_log_as_json` |
 | `src/test/cpp/test_jni_helpers.cpp` | 47 | All functions in `jni_helpers.hpp` using a zero-filled `JNINativeInterface_` mock |
 | `src/test/cpp/test_tts_wav.cpp` | 2 | The in-memory WAV writer `pcm_to_wav16_bytes` in `tts_wav.hpp` (WAV header/payload + little-endian clamping). The OuteTTS DSP it pairs with is derived from upstream `tts.cpp` and covered end-to-end by the Java `TtsIntegrationTest`, not unit-tested here. |
 
-**Current total: 459 tests (all passing).**
+**Current total: 462 tests (all passing).**
 
 #### Upstream source location (in CMake build tree)
 
-llama.cpp is fetched via CMake FetchContent, pinned to `GIT_TAG b9859`.
+llama.cpp is fetched via CMake FetchContent, pinned to `GIT_TAG b9870`.
 
 **GoogleTest** is a separate `BUILD_TESTING`-only FetchContent (`GIT_TAG v1.17.0`), used solely
 by the `jllama_test` C++ unit-test binary — not by the shipped library, and not coupled to the
diff --git a/README.md b/README.md
index ec6f5e75..772dd18f 100644
--- a/README.md
+++ b/README.md
@@ -7,7 +7,7 @@
 **Build:**  
 ![Java 8+](https://img.shields.io/badge/Java-8%2B-informational)  
 ![Platform](https://img.shields.io/badge/Platform-Linux%20%7C%20macOS%20%7C%20Windows%20%7C%20Android-lightgrey)  
-[![llama.cpp b9859](https://img.shields.io/badge/llama.cpp-%23b9859-informational)](https://github.com/ggml-org/llama.cpp/releases/tag/b9859)  
+[![llama.cpp b9870](https://img.shields.io/badge/llama.cpp-%23b9870-informational)](https://github.com/ggml-org/llama.cpp/releases/tag/b9870)  
 [![JPMS](https://img.shields.io/badge/JPMS-modular%20JAR-25A162)](https://openjdk.org/projects/jigsaw/)  
 ![JUnit](https://img.shields.io/badge/tested%20with-JUnit6-25A162)  
 [![JSpecify](https://img.shields.io/badge/JSpecify-1.0.0%20%40NullMarked-25A162)](https://jspecify.dev)  
@@ -107,7 +107,7 @@ Inference of Meta's LLaMA model (and others) in pure C/C++.
 - **Infilling** (fill-in-the-middle) for code models.
 - **Tokenize / detokenize** and **JSON-schema → grammar** conversion.
 - **Raw JSON endpoint handlers** mirroring the upstream llama.cpp HTTP server (`/completions`, `/v1/completions`, `/embeddings`, `/infill`, `/tokenize`, `/detokenize`).
-- **Runnable OpenAI-compatible HTTP server** (`OpenAiCompatServer`, the fat-jar `Main-Class`, streaming SSE, zero extra dependency): `java -jar …-jar-with-dependencies.jar --model model.gguf --port 8080`.
+- **Two runnable HTTP server modes, one fat-jar entry.** The fat jar's `Main-Class` is `ServerLauncher`, which dispatches on the `--jllama-openai-compat` flag. Without it, `java -jar …-jar-with-dependencies.jar -m model.gguf --port 8080` runs the full upstream llama.cpp server (embedded **WebUI**, every llama-server flag forwarded) hosted inside `libjllama` over JNI — no separate `llama-server.exe`. With it, `java -jar … --jllama-openai-compat --model model.gguf --port 8080` runs the Java-transport, zero-extra-dependency **OpenAI-compatible** server (`OpenAiCompatServer`, streaming SSE) instead. Both are also runnable directly by class name via `java -cp … net.ladenthin.llama.server.{NativeServer,OpenAiCompatServer}`.
 - **Model metadata** access (`getModelMeta()`) and **server management** (metrics, slot save/restore, runtime thread reconfiguration).
 - Pre-built native binaries for Linux (x86-64, aarch64), macOS (x86-64, arm64), and Windows (x86-64, x86); CUDA, Metal, and Vulkan supported via local build.
 
@@ -164,20 +164,40 @@ If any of these match your platform, you can include the Maven dependency and ge
 
 The Maven coordinate `net.ladenthin:llama` publishes one default JAR (CPU-only;
 its Windows natives are built with the Ninja Multi-Config + MSVC toolchain) plus
-optional JARs selected via a Maven `<classifier>`: three Windows GPU builds
-(CUDA / Vulkan / OpenCL), the Linux CUDA and Android OpenCL builds, and an
-alternate-toolchain MSVC Windows CPU build. Pick at most one GPU/accelerator
-classifier — those are mutually exclusive — and optionally a CPU Windows build.
+optional JARs selected via a Maven `<classifier>`: NVIDIA CUDA (Linux / Windows),
+Vulkan (Linux x86-64 / aarch64, Windows), AMD ROCm/HIP (Linux / Windows), Intel
+SYCL (Linux fp16 / fp32, Windows) and OpenVINO (Linux / Windows) GPU builds, OpenCL
+(Android Adreno, Windows x86-64 / Snapdragon-arm64), and an alternate-toolchain MSVC
+Windows CPU build. Pick at most one GPU/accelerator classifier — those are mutually
+exclusive — and optionally a CPU Windows build.
 
 | Classifier | Backend | Target platform | Runtime requirement |
 |---|---|---|---|
-| _(none)_ | CPU | Linux x86-64 / aarch64, macOS x86-64 / aarch64, Windows x86-64 / x86 (Ninja Multi-Config + MSVC), Android aarch64 (CPU) | A JDK 8+ JVM. **Linux `aarch64` additionally requires glibc ≥ 2.39** (e.g. Ubuntu 24.04+, Debian 13+) — it is built natively on `ubuntu-24.04-arm`, matching upstream llama.cpp's own ARM binaries; older-glibc ARM hosts (Ubuntu 22.04, Debian 12, RHEL 8/9, Amazon Linux 2023) are not supported. Linux x86-64 keeps a glibc 2.17 floor (manylinux2014). |
+| _(none)_ | CPU | Linux x86-64 / aarch64 / s390x, macOS x86-64 / aarch64, Windows x86-64 / x86 / aarch64 (Ninja Multi-Config + MSVC), Android aarch64 (CPU) | A JDK 8+ JVM. **Linux `aarch64` additionally requires glibc ≥ 2.39** (e.g. Ubuntu 24.04+, Debian 13+) — it is built natively on `ubuntu-24.04-arm`, matching upstream llama.cpp's own ARM binaries; older-glibc ARM hosts (Ubuntu 22.04, Debian 12, RHEL 8/9, Amazon Linux 2023) are not supported. Linux x86-64 keeps a glibc 2.17 floor (manylinux2014). **Windows `aarch64`** (Windows on ARM — Snapdragon X / Surface) is built natively on `windows-11-arm` and ships in the default JAR alongside the x86-64 / x86 natives. |
 | `msvc-windows` | CPU (MSVC / Visual Studio generator) | Windows x86-64 and x86 | None beyond a JDK 8+ JVM. Same CPU backend as the default JAR's Windows natives, but compiled with the Visual Studio generator instead of `Ninja Multi-Config`. Both use the same MSVC toolchain (static `/MT` CRT), so they are functionally equivalent — provided as an alternate-toolchain option. |
 | `cuda13-windows-x86-64` | CUDA 13 | Windows x86-64 with NVIDIA GPU | NVIDIA driver + CUDA 13 Toolkit installed on the host (`cudart64_13.dll`, `cublas64_13.dll`, `cublasLt64_13.dll` resolvable on `PATH`). The runtime libraries are **not bundled** in the JAR; native-library load fails with `UnsatisfiedLinkError` if they are absent. No CPU fallback. |
 | `vulkan-windows-x86-64` | Vulkan | Windows x86-64 with a Vulkan 1.2+ GPU (NVIDIA / AMD / Intel) | A Vulkan runtime (`vulkan-1.dll`), which current GPU drivers install. No Vulkan SDK is needed at runtime. The most portable Windows GPU option (vendor-independent). |
 | `opencl-windows-x86-64` | OpenCL | Windows x86-64 with an OpenCL 2.0+ GPU | A vendor OpenCL ICD (`OpenCL.dll`, installed by the GPU driver). **Note:** the GGML OpenCL backend is Adreno-tuned; on desktop GPUs CUDA or Vulkan are better supported. |
 | `cuda13-linux-x86-64` | CUDA 13 | Linux x86-64 with NVIDIA GPU | NVIDIA driver + CUDA 13 runtime libraries (`libcudart.so.13`, `libcublas.so.13`) installed on the host. The shared library is dynamically linked against them and will fail to `dlopen` if they are absent — there is no automatic fallback to CPU. |
+| `vulkan-linux-x86-64` | Vulkan | Linux x86-64 with a Vulkan 1.2+ GPU (NVIDIA / AMD / Intel) | A Vulkan runtime (`libvulkan.so.1`), which current GPU drivers install. No Vulkan SDK is needed at runtime. The most portable Linux GPU option (vendor-independent, no CUDA toolkit). Built natively on `ubuntu-latest`, so it shares the aarch64 build's higher glibc floor (≈ 2.39). |
+| `vulkan-linux-aarch64` | Vulkan | Linux aarch64 with a Vulkan 1.2+ GPU | A Vulkan runtime (`libvulkan.so.1`) from the device/driver. glibc ≥ 2.39 (built on `ubuntu-24.04-arm`). |
 | `opencl-android-aarch64` | OpenCL (Adreno) | Android aarch64 with Qualcomm Adreno GPU | A device-supplied OpenCL ICD (`libOpenCL.so`). Devices without an ICD (e.g. most non-Snapdragon Android hardware) must use the default CPU JAR. |
+| `rocm-linux-x86-64` | ROCm / HIP | Linux x86-64 with AMD GPU | An installed AMD ROCm runtime (`libamdhip64.so`, `librocblas.so`, `libhipblas.so`) on the host. Not bundled; native load fails without it. No CPU fallback. |
+| `rocm-windows-x86-64` | ROCm / HIP | Windows x86-64 with AMD GPU | The AMD HIP SDK runtime DLLs (`amdhip64.dll`, `rocblas.dll`, `hipblas.dll`) on `PATH`. Not bundled. No CPU fallback. |
+| `sycl-fp16-linux-x86-64` | SYCL (Intel oneAPI, fp16) | Linux x86-64 with Intel GPU (Arc / iGPU) | An installed Intel oneAPI / Level-Zero runtime. fp16 accumulation (faster, slightly lower precision). Not bundled. |
+| `sycl-fp32-linux-x86-64` | SYCL (Intel oneAPI, fp32) | Linux x86-64 with Intel GPU (Arc / iGPU) | An installed Intel oneAPI / Level-Zero runtime. fp32 accumulation (higher precision). Not bundled. |
+| `sycl-windows-x86-64` | SYCL (Intel oneAPI) | Windows x86-64 with Intel GPU (Arc / iGPU) | The Intel oneAPI / Level-Zero runtime DLLs on `PATH`. Not bundled. |
+| `opencl-windows-aarch64` | OpenCL (Adreno) | Windows-on-ARM aarch64 (Snapdragon X) with Adreno GPU | A device-supplied OpenCL ICD (`OpenCL.dll`, from the Adreno driver). Not bundled. |
+| `openvino-linux-x86-64` | OpenVINO | Linux x86-64 (Intel GPU / NPU / CPU) | An installed Intel OpenVINO runtime. Not bundled. |
+| `openvino-windows-x86-64` | OpenVINO | Windows x86-64 (Intel GPU / NPU / CPU) | The Intel OpenVINO runtime DLLs on `PATH`. Not bundled. |
+
+> [!NOTE]
+> The AMD (`rocm-*`), Intel SYCL (`sycl-*`), Windows-on-ARM OpenCL
+> (`opencl-windows-aarch64`) and Intel OpenVINO (`openvino-*`) classifiers are
+> newly added GPU backends. Like the other GPU classifiers they are validated
+> **build-only** in CI (GitHub runners have no matching GPU), so end-to-end
+> inference is verified locally / on self-hosted hardware. As with every GPU JAR,
+> the vendor runtime is supplied by the consumer's driver/toolkit and is not bundled.
 
 ```xml
 <!-- CPU (default) -->
@@ -219,6 +239,22 @@ classifier — those are mutually exclusive — and optionally a CPU Windows bui
     <classifier>vulkan-windows-x86-64</classifier>
 </dependency>
 
+<!-- Vulkan on Linux x86-64 (NVIDIA/AMD/Intel; libvulkan.so.1 from the driver) -->
+<dependency>
+    <groupId>net.ladenthin</groupId>
+    <artifactId>llama</artifactId>
+    <version>5.0.4</version>
+    <classifier>vulkan-linux-x86-64</classifier>
+</dependency>
+
+<!-- Vulkan on Linux aarch64 (libvulkan.so.1 from the device/driver) -->
+<dependency>
+    <groupId>net.ladenthin</groupId>
+    <artifactId>llama</artifactId>
+    <version>5.0.4</version>
+    <classifier>vulkan-linux-aarch64</classifier>
+</dependency>
+
 <!-- OpenCL on Windows x86-64 (requires a driver-provided OpenCL ICD) -->
 <dependency>
     <groupId>net.ladenthin</groupId>
@@ -234,6 +270,70 @@ classifier — those are mutually exclusive — and optionally a CPU Windows bui
     <version>5.0.4</version>
     <classifier>msvc-windows</classifier>
 </dependency>
+
+<!-- ROCm/HIP on Linux x86-64 (requires an AMD ROCm runtime on the host) -->
+<dependency>
+    <groupId>net.ladenthin</groupId>
+    <artifactId>llama</artifactId>
+    <version>5.0.4</version>
+    <classifier>rocm-linux-x86-64</classifier>
+</dependency>
+
+<!-- ROCm/HIP on Windows x86-64 (requires the AMD HIP SDK runtime on the host) -->
+<dependency>
+    <groupId>net.ladenthin</groupId>
+    <artifactId>llama</artifactId>
+    <version>5.0.4</version>
+    <classifier>rocm-windows-x86-64</classifier>
+</dependency>
+
+<!-- SYCL (Intel oneAPI, fp16) on Linux x86-64 (requires the oneAPI/Level-Zero runtime) -->
+<dependency>
+    <groupId>net.ladenthin</groupId>
+    <artifactId>llama</artifactId>
+    <version>5.0.4</version>
+    <classifier>sycl-fp16-linux-x86-64</classifier>
+</dependency>
+
+<!-- SYCL (Intel oneAPI, fp32) on Linux x86-64 (requires the oneAPI/Level-Zero runtime) -->
+<dependency>
+    <groupId>net.ladenthin</groupId>
+    <artifactId>llama</artifactId>
+    <version>5.0.4</version>
+    <classifier>sycl-fp32-linux-x86-64</classifier>
+</dependency>
+
+<!-- SYCL (Intel oneAPI) on Windows x86-64 (requires the oneAPI/Level-Zero runtime) -->
+<dependency>
+    <groupId>net.ladenthin</groupId>
+    <artifactId>llama</artifactId>
+    <version>5.0.4</version>
+    <classifier>sycl-windows-x86-64</classifier>
+</dependency>
+
+<!-- OpenCL/Adreno on Windows-on-ARM aarch64 (Snapdragon X; device-provided OpenCL ICD) -->
+<dependency>
+    <groupId>net.ladenthin</groupId>
+    <artifactId>llama</artifactId>
+    <version>5.0.4</version>
+    <classifier>opencl-windows-aarch64</classifier>
+</dependency>
+
+<!-- OpenVINO on Linux x86-64 (requires the Intel OpenVINO runtime on the host) -->
+<dependency>
+    <groupId>net.ladenthin</groupId>
+    <artifactId>llama</artifactId>
+    <version>5.0.4</version>
+    <classifier>openvino-linux-x86-64</classifier>
+</dependency>
+
+<!-- OpenVINO on Windows x86-64 (requires the Intel OpenVINO runtime on the host) -->
+<dependency>
+    <groupId>net.ladenthin</groupId>
+    <artifactId>llama</artifactId>
+    <version>5.0.4</version>
+    <classifier>openvino-windows-x86-64</classifier>
+</dependency>
 ```
 
 > [!IMPORTANT]
@@ -591,7 +691,9 @@ array alone at `GET /slots`. OpenAI responses preserve
 
 `net.ladenthin.llama.server.OpenAiCompatServer` turns a loaded model into a local
 OpenAI-compatible HTTP endpoint using only the JDK's built-in `com.sun.net.httpserver` — no extra
-dependency and no separate server process. It is both embeddable and the fat-jar `Main-Class`. It
+dependency and no separate server process. It is embeddable, and runnable via
+`java -cp <jar> net.ladenthin.llama.server.OpenAiCompatServer …` (the fat jar's `Main-Class` is the
+`ServerLauncher` dispatcher, which runs `NativeServer` by default — see "Native server with the built-in WebUI" below). It
 serves:
 
 | Method &amp; path | Backed by |
@@ -646,23 +748,27 @@ try (LlamaModel model = new LlamaModel(modelParams);
 }
 ```
 
-…or run it standalone. The fat jar built by the `assembly` profile (`mvn -P assembly package`) is
-runnable (its `Main-Class` is `net.ladenthin.llama.server.OpenAiCompatServer`); the plain library jar
-works too via `-cp`:
+…or run it standalone. The fat jar's `Main-Class` is the `ServerLauncher` dispatcher, so either add
+`--jllama-openai-compat` to select this Java server (the launcher strips that flag and forwards the rest)
+or name the class explicitly via `-cp`:
 
 ```bash
-# fat jar (bundles the native lib + Java deps)
-java -jar target/llama-<version>-jar-with-dependencies.jar \
+# fat jar (bundles the native lib + Java deps) — select the Java server with --jllama-openai-compat
+java -jar target/llama-<version>-jar-with-dependencies.jar --jllama-openai-compat \
     --model models/Qwen3-0.6B-Q4_K_M.gguf --host 0.0.0.0 --port 8080 --n-gpu-layers 99
 
-# or the plain jar
+# or name the class explicitly (fat jar or plain library jar)
 java -cp target/llama-<version>.jar net.ladenthin.llama.server.OpenAiCompatServer \
   --model models/model.gguf --port 8080 --model-id local-model
 ```
 
 Run with `--help` for the full option list (`-m/--model`, `--host`, `-p/--port`, `-c/--ctx-size`,
-`-ngl/--n-gpu-layers`, `-t/--threads`, `--parallel`, `--model-id`, `--api-key`, `--mmproj`,
-`--embedding`, `--reranking`).
+`-b/--batch-size`, `-ub/--ubatch-size`, `-ngl/--n-gpu-layers`, `-t/--threads`, `-tb/--threads-batch`,
+`-ctk/--cache-type-k`, `-ctv/--cache-type-v`, `--jinja`, `--chat-template-kwargs`, `--parallel`,
+`--model-id`, `--api-key`, `--mmproj`, `--embedding`, `--reranking`). The tuning flags mirror
+llama.cpp's server, so an invocation like
+`--jinja --chat-template-kwargs '{"reasoning_effort":"low"}' -ctk q8_0 -ctv q8_0 -b 4096 -ub 2048`
+works directly.
 
 Verify with curl (streaming chat):
 
@@ -706,6 +812,45 @@ tool calling depends on the model's own tool-calling quality. Pass `--api-key` (
 `OpenAiServerConfig.apiKey(...)`) to require an `Authorization: Bearer` token; the server binds to
 `127.0.0.1` by default.
 
+### Native server with the built-in WebUI (`NativeServer`)
+
+`OpenAiCompatServer` above is a JSON **API** server (its `/` is a 404 — no web page). If you want
+the **full upstream llama.cpp server, including its bundled Svelte WebUI**, use
+`net.ladenthin.llama.server.NativeServer`. It runs the real `llama_server` inside `libjllama` over
+JNI — no separate `llama-server.exe` — and **forwards the raw llama-server arguments verbatim**, so
+every flag works exactly as it does for the standalone binary. The fat jar runs it **by default**
+(when `--jllama-openai-compat` is absent), forwarding its args to the native server (pass `--help` for the
+full llama-server option list):
+
+```bash
+java -jar target/llama-<version>-jar-with-dependencies.jar \
+    -m models/model.gguf --host 127.0.0.1 --port 8080 -c 65536 --jinja
+# then open http://127.0.0.1:8080/ for the WebUI
+```
+
+Or embed it:
+
+```java
+try (NativeServer server = new NativeServer(
+        "-m", "gpt-oss-20b-UD-Q4_K_XL.gguf",
+        "--host", "127.0.0.1", "--port", "8080",
+        "-c", "65536", "-b", "4096", "-ub", "2048",
+        "--jinja", "-ngl", "0", "-t", "8", "-tb", "16",
+        "-ctk", "q8_0", "-ctv", "q8_0",
+        "--chat-template-kwargs", "{\"reasoning_effort\":\"low\"}",
+        "--parallel", "1").start()) {
+    // Open http://127.0.0.1:8080/ in a browser for the WebUI; the OpenAI API is at /v1/... too.
+    Thread.currentThread().join();
+}
+```
+
+Differences from `OpenAiCompatServer`: it **loads its own model** from the arguments (an independent
+lifecycle, like `llama-server.exe`, not a shared `LlamaModel`), it is **single-instance per
+process**, it serves the **WebUI** (in released jars — local `cmake` builds ship the empty-asset
+stub, so no UI there), and it is **not available on Android** (the upstream server needs
+`posix_spawn`). To detect readiness, poll `GET /health`. There is no SSL support (plain HTTP
+only), so bind to localhost or put a TLS proxy in front.
+
 ### LangChain4j integration
 
 A separate artifact, **`net.ladenthin:llama-langchain4j`**, adapts a `LlamaModel` to
diff --git a/TODO.md b/TODO.md
index 66de274f..e6390529 100644
--- a/TODO.md
+++ b/TODO.md
@@ -13,6 +13,36 @@ cross-cutting initiative.
 
 ## Open — jllama-specific
 
+### NativeServer — reuse an already-loaded `LlamaModel` (open, enhancement)
+
+`net.ladenthin.llama.server.NativeServer` (the native-transport server mode that runs the full
+upstream `llama_server` — WebUI included — inside `libjllama` over JNI) currently loads its **own**
+model from the forwarded argv, exactly like running `llama-server.exe`. This is the "independent
+lifecycle" v1: simple, and every llama-server flag is forwarded verbatim.
+
+**Enhancement:** let `NativeServer` optionally attach to an **already-loaded** `LlamaModel`'s
+`server_context` instead of loading a second copy of the weights (saves the RAM/VRAM and load time
+of a duplicate model when a caller already has a `LlamaModel` open). Feasibility notes from the
+initial investigation:
+
+- The upstream HTTP transport (`server_http_context`) and the route bundle
+  (`server_routes routes(params, ctx_server)`) only need a reference to a `server_context`. A
+  `LlamaModel` already owns and drives one (`jllama_context` in `jni_helpers.hpp`), and its JNI
+  methods already post tasks to that context's queue — so a second driver (the HTTP routes) posting
+  to the same queue is plausible; the queue is the synchronization point.
+- The real work is **lifecycle/ownership**: today `llama_server()` owns the whole flow (parse →
+  backend init → `ctx_server.load_model` → `start_loop` on its own thread → cleanup). Reuse would
+  need a *different* entry that skips model loading and the `start_loop`/backend ownership (the
+  existing `LlamaModel` worker already runs the loop), registers the HTTP routes against the shared
+  `server_context`, and starts only `server_http_context`. That is a separate, smaller C++ entry
+  point (not `llama_server`), plus reconciling params (the loaded model's params vs. server params)
+  and ensuring only one thread drives `update_slots`.
+- Logging: `llama_server` calls `common_init()` which routes llama.cpp logging to stderr/file; a
+  reuse path must not clobber the JNI log callback a `LlamaModel` consumer may rely on.
+
+Until then, run `NativeServer` standalone (it owns the process's llama backend + logging while
+running), or use the Java-transport `OpenAiCompatServer` when sharing a `LlamaModel`.
+
 ### PIT gate not hermetic — `value.ContentPart.audioFile(Path)` (open)
 
 The PIT mutation gate reaches 100% **only when the audio test fixture is present**. Without it the
diff --git a/docs/history/llama-cpp-breaking-changes.md b/docs/history/llama-cpp-breaking-changes.md
index 6c6885af..cf745f86 100644
--- a/docs/history/llama-cpp-breaking-changes.md
+++ b/docs/history/llama-cpp-breaking-changes.md
@@ -412,3 +412,14 @@ Used during `llama.cpp` version bumps: when upgrading, scan this file from the r
 | b9842–b9859 | `common/arg.cpp` + `common/http.h` + `tools/server/server-{http,models}.cpp` + `tools/server/server-cors-proxy.h` | **IPv6 URL handling + hf-split primary fix**, all inside upstream-compiled TUs the project already builds. (1) `common/http.h` gains a `common_http_format_host()` helper that brackets an IPv6 literal host (`[::1]`) per RFC 3986, and `common_http_parse_url` now splits the authority so a bracketed IPv6 literal keeps its inner colons; `server-http.cpp` (listening-address string), `server-models.cpp` (proxy `Host` header) and `server-cors-proxy.h` (proxy log) each `#include "http.h"` and route the host through it. `server-http.cpp`/`server-models.cpp`/`server-cors-proxy.h` are already compiled into `jllama`; the project binds none of these symbols and passes host/port as plain params, so behaviour is unchanged for localhost binds. (2) `common/arg.cpp` `common_models_handler_apply` now threads a `primary` hf-split file (the `00001-of` part) through the `add_tasks` lambda instead of assuming index 0 — internal to the `--hf`/`--hf-repo-v`/`--spec-draft-hf` download planner, which the project never calls (`grep -rn "common_models_handler\|common_http_format_host" src/main/cpp src/test/cpp` → zero matches). No project source changes required. |
 | b9842–b9859 | `ggml/src/ggml-cpu/` + `ggml/src/ggml-cuda/` + `ggml/src/ggml-opencl/` + `ggml/src/ggml-vulkan/` + `ggml/src/ggml-webgpu/` + `ggml/src/ggml-hexagon/` + `ggml/src/ggml-backend.cpp` + `src/models/qwen3next.cpp` + `tools/ui/**` | Backend-internal only, no API surface visible to `jllama.cpp`. CPU adds an AVX2/AVX `ggml_vec_dot_nvfp4_q8_0` + a UE4M3 lookup table (`kvalues_mxfp4` renamed to shared `kvalues_fp4`); CUDA adds head-dim-512 flash-attention MMA/tile instances, a strided `get_rows_back` grid-clamp fix (new `test-backend-ops` case for row count > 65535), a gfx900 MMQ gate, and drops the CPU→CUDA async-copy path (scheduler now copies inputs synchronously); OpenCL adds full Q1_0 mul_mat/mul_mv + a `GGML_OPENCL_USE_ADRENO_BIN_KERNELS` prebuilt-binary-kernel loader (OFF by default; affects only the `opencl-*` classifiers); Vulkan rolls the mul_mm BK loop on Asahi/Honeykrisp; WebGPU adds NVFP4 support; Hexagon reworks HVX/HMX flash-attention (new `flash-attn-ops.h`/`hmx-fa-kernels.h`, MUL_MAT_ADD fusion). `qwen3next.cpp` records `t_layer_inp[il]` for MTP. All internal to upstream-compiled `libllama`/`ggml`/backends; the WebUI **auto-follows** the pinned `GIT_TAG` (the `build-webui` CI job rebuilds it), so its edits (PWA navigate-fallback, chat-store foreign-conversation guards) need no manual step. No project source changes required. |
 | b9842–b9859 | upstream verification (sandbox) | All four patches (`0001`–`0004`) re-verified to **apply cleanly** against b9859 via `git apply --check` over the actual b9859 sources fetched from `raw.githubusercontent.com` (github.com git-clone is blocked in this sandbox, so a full `FetchContent` build could not run — exit 0 for `common/arg.{cpp,h}`, `tests/test-arg-parser.cpp`, `tools/server/server-context.{cpp,h}`, `server-common.cpp`, `tests/test-chat.cpp`). The only patch-target file that changed in this range is `common/arg.cpp`, whose b9859 edit is in `common_models_handler_apply` (~L496) — disjoint from patch 0001's `make_utf8_argv`/`common_params_parse` hunks (~L931/L971) and the ~34 standalone-main flips (unchanged in this range), so patch 0001 still applies. Patches 0002/0003/0004 target files untouched in b9842→b9859, so their hunks are byte-identical to b9842. OuteTTS generator anchors hold (`tools/tts/tts.cpp` unchanged). Full build + `ctest` (target 459/459) to be confirmed by the CI pipeline. |
+| b9859–b9862 | `include/llama.h` + `src/llama-model-loader.cpp` + `src/llama-model.{cpp,h}` + `tools/server/server-context.{cpp,h}` + `tools/cli/cli.cpp` | **New feature (additive C API), no break.** Upstream promoted the previously-`static` `llama_model_ftype_name(llama_ftype)` (in `llama-model-loader.cpp`) to a **public** `LLAMA_API const char * llama_ftype_name(enum llama_ftype)` and added `LLAMA_API enum llama_ftype llama_model_ftype(const llama_model *)` (backed by a new `llama_model::ftype()` / `impl::ftype` cached from `ml.ftype` at `load_hparams`). `server_context::get_meta()` now fills a **new `std::string model_ftype`** field on `server_context_meta` (`server-context.h`) and `server_routes::get_model_info()` emits a `"ftype"` key — so the **NativeServer** mode's model-info/`/props` surface gains the quant type automatically (WebUI + `llama-server` clients). `cli.cpp` prints an `ftype :` line. **All inside upstream-compiled `libllama`/server TUs the project already links** — the project binds none of the new symbols (`grep` → only a *comment* mentions `server_context_meta` in `jllama.cpp`; nothing constructs it, and adding a trailing field is source-additive). No project source changes required for the bump itself. **Follow-up (done):** the quant type is now also surfaced through the Java layer — `getModelMetaJson` emits `"ftype"` (from `server_context_meta::model_ftype`), `ModelMeta.getFtype()` / `LlamaModel.getModelFtype()` expose it, and the Java `OpenAiCompatServer` advertises it as `data[].ftype` in `GET /v1/models` (threaded through `OpenAiServerConfig.modelFtype`, mirroring how `supportsVision` is threaded), matching the upstream `get_model_info()` key. |
+| b9859–b9862 | `ggml/src/ggml-cuda/gated_delta_net.{cu,cuh}` + `ggml/src/ggml-cuda/ggml-cuda.cu` + `vendor/cpp-httplib/httplib.{cpp,h}` (v0.48.0→v0.49.0) | Backend/vendor-internal only, no API surface visible to `jllama.cpp`. (1) **CUDA gated-delta-net perf**: a fused `gated_delta_net → cpy` path (`ggml_cuda_op_gated_delta_net_fused_cache` + `ggml_cuda_try_gdn_cache_fusion`) lets the kernel scatter recurrent-state snapshots straight into the rollback cache and skip the follow-up strided copy (a decode win for gated-delta / hybrid-recurrent models, e.g. Qwen3-Next); plus a `ggml_cuda_is_view_or_noop` refactor. Affects only the `cuda13-*` classifiers. (2) **cpp-httplib bumped to v0.49.0** (the vendored copy inside llama.cpp, compiled into `jllama` via `server-http.cpp`): locale-independent ASCII classifiers (`is_ascii_digit/alpha/alnum` replacing `std::isdigit`/`isalnum`), a new additive `MultipartFormDataWriter` + `is_valid_multipart_boundary`, multipart field-name/filename escaping (WHATWG), an unsigned base64 accumulator (UB fix), a `ThreadPool` `idle_timeout_sec` ctor param (defaulted — backward-compatible), a `perform_websocket_handshake` `is_ssl` arg (internal), and a `path_encode_`-gated query-normalization skip. All internal to the compiled TU; the project binds no httplib symbol directly (it uses the upstream `server-http.cpp` transport). No project source changes required. |
+| b9859–b9862 | upstream verification (sandbox) | All **six** patches (`0001`–`0006`) re-verified against b9862. The b9859→b9862 diff touches only two patch-target files — `tools/server/server-context.cpp` and `server-context.h` (the `model_ftype`/`get_meta`/`get_model_info` additions at ~L3989/~L5121 and the new struct field at ~L50). Patches **0002** (load-progress guard, ~L1152), **0003** (slot-prompt-similarity getter/setter, ~L3965 + `server_context` struct ~L106) and **0005** (near-prompt-end checkpoints, `update_slots` ~L3560) were **applied in sequence** against the actual b9862 `server-context.{cpp,h}` fetched from `raw.githubusercontent.com` — all three applied cleanly (their regions are disjoint from and far from the b9862 additions). Patches **0001** (`common/arg.{cpp,h}`, `test-arg-parser.cpp`, ~34 standalone mains), **0004** (`server-common.cpp`, `test-chat.cpp`) and **0006** (`server.cpp`) target files **not present** in the b9859→b9862 changed-file list, so their hunks are byte-identical to b9859 and apply unchanged. OuteTTS generator anchors hold (`tools/tts/tts.cpp` unchanged in this range). Full build + `ctest` (target 459/459) to be confirmed by the CI pipeline. |
+| b9862–b9864 | `tools/server/server-context.cpp` + `server-schema.cpp` + `server-task.h` + `tools/server/README.md` + `tools/ui/**` | **New feature (additive), no break.** Adds a **per-request `sse_ping_interval`** to the completion API: `task_params` gains `int32_t sse_ping_interval = 30` (`server-task.h`), `make_llama_cmpl_schema` exposes it as a `field_num` with hard limits `[-1, INT32_MAX]` and `eval_llama_cmpl_schema` seeds it from `params_base.sse_ping_interval` (`server-schema.cpp`), and `handle_completions_impl` (`server-context.cpp`, ~L4089) captures the per-task value (instead of the server-level `params.sse_ping_interval`) into the SSE `next` lambda so a request can override the server `--sse-ping-interval` (`-1` disables pings). All inside upstream-compiled server TUs the project already links; the project binds no new symbol. **NativeServer** mode gets it for free (full `llama_server`). The rest of the diff is the **Svelte WebUI** (`tools/ui/**`: MCP server recommendations dialog, a bearer-token Authorization field, migration of the MCP default-enabled key into settings config, `STREAM_VISIBILITY_KICK_MS` 1000→3000, + Vitest units) — the WebUI **auto-follows** the pinned `GIT_TAG` (the `build-webui` CI job rebuilds it), so no manual step. No project source changes required for the bump itself. **Follow-up (done):** `InferenceParameters.withSsePingInterval(int)` now emits the `sse_ping_interval` key (it flows through the OAI-compat completion path via `eval_llama_cmpl_schema`), covered by a Java wither test + three C++ schema round-trip guards (round-trip, `-1` disables, below-hard-limit throws, absent inherits the server default). The same follow-up **audited the completion schema for other already-parseable-but-unexposed fields** and added the plain-scalar wins as withers: `withXtcProbability`/`withXtcThreshold` (XTC sampler), `withNDiscard`, `withNIndent`, `withTMaxPredictMs`, `withPostSamplingProbs`, `withTimingsPerToken`, `withReturnTokens`. (`t_max_prompt_ms` was deliberately skipped — it is commented out `// TODO: implement` in b9864's `make_llama_cmpl_schema`, so it is not parseable.) Remaining schema fields left unexposed on purpose: OAI aliases already covered (`max_tokens`/`max_completion_tokens` → `n_predict`), OAI/server-internal or array-shaped/advanced knobs (`n`/`n_cmpl`, `logprobs`, `echo`, `verbose`, `include_usage`, `return_progress`, `response_fields`, `lora`, `grammar_lazy`/`grammar_triggers`/`preserved_tokens`, `chat_format`, `parse_tool_calls`, `reasoning_control`, `backend_sampling`, `adaptive_*`). |
+| b9862–b9864 | upstream verification (sandbox) | All **six** patches (`0001`–`0006`) re-verified against b9864. The b9862→b9864 diff touches exactly one patch-target file — `tools/server/server-context.cpp` — and only in `handle_completions_impl` (~L4089), far below every patched region (0002 load-progress guard ~L1152, 0005 near-prompt-end checkpoints ~L3560, 0003 slot-prompt-similarity getter/setter ~L3965). Patches **0002/0003/0005** were **applied in sequence** against the actual b9864 `server-context.{cpp,h}` fetched from `raw.githubusercontent.com` — all clean. `server-context.h` is unchanged in this range (so 0003's `.h` hunk is byte-identical); `server-schema.cpp`/`server-task.h` are **not** patch targets. Patches **0001** (`common/arg.*`, `test-arg-parser.cpp`, ~34 mains), **0004** (`server-common.cpp`, `test-chat.cpp`) and **0006** (`server.cpp`) target files **not** in the changed-file list, so they apply unchanged. Confirmed end-to-end by a clean `cmake` configure: b9864 fetched and **all six patches applied via the fail-loud `PATCH_COMMAND`** (exit 0; 0005's `is_ckpt_only_rollback` marker present), OuteTTS generator anchors held (`tools/tts/tts.cpp` unchanged). Full build + `ctest` (target 459/459) to be confirmed by the CI pipeline. |
+| b9864–b9866 | `ggml/src/ggml-cuda/topk-moe.cu` + `tests/test-backend-ops.cpp` + `tools/ui/**` | Backend/WebUI-only, no API surface. (1) **CUDA topk-moe** gains a `case 288` instantiation (`topk_moe_cuda<288>`) and `ggml_cuda_should_use_topk_moe` now also accepts `n_expert == 288` (the non-power-of-2 expert count of **StepFun 3.7**) — a device-side kernel add, internal to `ggml-cuda`, affecting only the `cuda13-*` classifiers (a StepFun-3.7 MoE GGUF now uses the fused topk-moe path on CUDA instead of the generic fallback). (2) `test-backend-ops.cpp` adds the matching `test_topk_moe({288,22,1,1}, …)` case — **not built here** (`LLAMA_BUILD_TESTS` OFF for the FetchContent subproject). (3) **WebUI** (`tools/ui/**`): a `config-type-normalization-v1` migration coercing legacy string-encoded booleans in persisted config back to real booleans (the strict server schema now rejects `"true"`/`"false"` strings), and a thinking-enabled default flip to `true` — the WebUI **auto-follows** the pinned `GIT_TAG` (the `build-webui` CI job rebuilds it), so no manual step. No project source changes required. |
+| b9864–b9866 | upstream verification (sandbox) | All **six** patches (`0001`–`0006`) re-verified against b9866. The b9864→b9866 diff touches **no** patch-target file (`common/arg.*`, `tools/server/server-context.{cpp,h}`, `server-common.cpp`, `server-schema.cpp`, `server-task.h`, `server.cpp`, `test-arg-parser.cpp`, `test-chat.cpp`, the ~34 standalone mains) and **no** OuteTTS generator anchor (`tools/tts/tts.cpp` unchanged) — the only edits are `ggml-cuda/topk-moe.cu`, `tests/test-backend-ops.cpp` and `tools/ui/**` — so every patch hunk/offset is byte-identical to b9864. Confirmed end-to-end by a clean `cmake` configure: b9866 fetched and **all six patches applied via the fail-loud `PATCH_COMMAND`** (exit 0; 0005's `is_ckpt_only_rollback` marker present), OuteTTS generator anchors held. Full build + `ctest` (target 462/462) to be confirmed by the CI pipeline. |
+| b9866–b9867 | `common/speculative.cpp` | Internal-only, no API surface. A tweak to the **DFlash** block-diffusion speculative draft path (`common_speculative_impl_draft_dflash`, from the b9829–b9839 DFlash feature): (1) the block-size clamp now also clamps `params.n_min` (not just `n_max`) to `block_size - 1` and logs both; (2) the per-step draft sampler's `top_k` goes `1 → 10`; (3) drafting now **stops early** when the top candidate's probability drops below `params.p_min` (upstream b9867 title "spec: support spec-draft-p-min in DFlash"), and a step that produced fewer than `params.n_min` tokens is discarded (`result.clear()`). All three use **already-existing** `common_speculative_params` fields (`n_min`/`n_max`/`p_min`) — no struct/header/API change (`common/speculative.h` untouched). Entirely inside upstream-compiled `common`; the project binds no `common_speculative_*` symbol and exposes no `--spec-*` inference param, so it flows through `libllama` unchanged. No project source changes required. |
+| b9866–b9867 | upstream verification (sandbox) | All **six** patches (`0001`–`0006`) re-verified against b9867. The b9866→b9867 diff touches **no** patch-target file (`common/arg.*`, `tools/server/server-context.{cpp,h}`, `server-common.cpp`, `server-schema.cpp`, `server-task.h`, `server.cpp`, `test-arg-parser.cpp`, `test-chat.cpp`, the ~34 standalone mains) and **no** OuteTTS generator anchor (`tools/tts/tts.cpp` unchanged) — the only edit is `common/speculative.cpp` — so every patch hunk/offset is byte-identical to b9866. Confirmed end-to-end by a clean `cmake` configure: b9867 fetched and **all six patches applied via the fail-loud `PATCH_COMMAND`** (exit 0; 0005's `is_ckpt_only_rollback` and 0006's `g_llama_server_embedded` markers present), OuteTTS generator anchors held. First bump driven by `.github/scripts/llama-next-version.sh` (b9866→b9867, 2 KiB single-commit final chunk). Full build + `ctest` (target 462/462) to be confirmed by the CI pipeline. |
+| b9867–b9870 | `common/chat.cpp` + `models/templates/stepfun-ai-Step-3.5-Flash.jinja` (removed) + `tests/test-chat*.cpp` | Internal-only, no API surface. Adds a **StepFun** message-content whitespace workaround (issue #24181): `common_chat_templates_apply_jinja` detects a StepFun template (`src.find("You have access to the following functions in JSONSchema format")`) and, before rendering, trims leading/trailing whitespace from each `common_chat_msg`'s `content`/`reasoning_content` and its `"text"` `content_parts` via a new `static` `workaround::trim_all_content(...)` — otherwise leftover whitespace drove the model into reasoning loops. Uses only existing `common_chat_msg` fields; `common/chat.h` is untouched (no struct/API change). The removed `stepfun-ai-Step-3.5-Flash.jinja` embedded template and the `test-chat*.cpp` additions are **not built here** (`LLAMA_BUILD_TESTS` OFF for the FetchContent subproject). All inside upstream-compiled `common`, flowing through the embedded server / `LlamaModel` chat path automatically. No project source changes required. |
+| b9867–b9870 | upstream verification (sandbox) | All **six** patches (`0001`–`0006`) re-verified against b9870. The b9867→b9870 diff touches **no** patch-target file (`common/arg.*`, `tools/server/server-context.{cpp,h}`, `server-common.cpp`, `server-schema.cpp`, `server-task.h`, `server.cpp`, `test-arg-parser.cpp`, the ~34 standalone mains) and **no** OuteTTS generator anchor (`tools/tts/tts.cpp` unchanged) — the only source edit is `common/chat.cpp` (a StepFun whitespace workaround), plus `tools/ui/**` (WebUI, auto-followed) and `tests/test-chat*.cpp` (not built) — so every patch hunk/offset is byte-identical to b9867. **Note:** patch `0004` also targets `tests/test-chat.cpp`, which b9870 edits, but `0004`'s hunks add the reasoning-budget cases in a disjoint region (verified clean by the configure below). Confirmed end-to-end by a clean `cmake` configure: b9870 fetched and **all six patches applied via the fail-loud `PATCH_COMMAND`** (exit 0; 0005's `is_ckpt_only_rollback` and 0006's `g_llama_server_embedded` markers present, b9870's `trim_all_content` present), OuteTTS generator anchors held. Full build + `ctest` (target 462/462) to be confirmed by the CI pipeline. |
diff --git a/docs/upgrade/llama-cpp-version-bump.md b/docs/upgrade/llama-cpp-version-bump.md
new file mode 100644
index 00000000..c12c4d57
--- /dev/null
+++ b/docs/upgrade/llama-cpp-version-bump.md
@@ -0,0 +1,138 @@
+<!--
+SPDX-FileCopyrightText: 2026 Bernard Ladenthin <bernard.ladenthin@gmail.com>
+
+SPDX-License-Identifier: MIT
+-->
+
+# llama.cpp version-bump runbook
+
+This is the **documentation root** for bumping the pinned llama.cpp version. It links the
+mechanical edit steps in [`../../CLAUDE.md`](../../CLAUDE.md#upgradingdowngrading-llamacpp-version)
+together with a repeatable **target-selection + chunking** strategy so a bump never lands an
+unreviewably large diff in one step.
+
+The current pin lives in `llama/CMakeLists.txt` as `GIT_TAG b<nnnn>`. llama.cpp tags **every**
+master commit as `b<nnnn>`, but only a subset of those tags gets a GitHub *Release*.
+
+---
+
+## TL;DR
+
+```bash
+# From the repo root. Prints the next reviewable step (b<cur> -> b<next>) and its compare/.patch URLs.
+.github/scripts/llama-next-version.sh                 # target = latest RELEASE (atom feed)
+.github/scripts/llama-next-version.sh b9900           # target = an explicit tag
+```
+
+Then apply the printed `b<cur> -> b<next>` step per [§ Applying a bump](#applying-a-bump) and re-run
+the script to walk the next chunk, until it prints **"reaches the latest release — final chunk"**.
+
+---
+
+## 1. Pick the target (topmost release)
+
+The **target candidate is the topmost release** on
+<https://github.com/ggml-org/llama.cpp/releases>. Read it from the release **atom feed**, which is
+reachable from restricted sandboxes where the ggml-org REST API is blocked:
+
+```
+https://github.com/ggml-org/llama.cpp/releases.atom
+```
+
+The first `<entry>`'s `releases/tag/b<nnnn>` is the latest release. `llama-next-version.sh` does this
+for you; if the feed is rate-limited (repeated unauthenticated fetches can return empty), open the
+releases page in a browser and pass the tag explicitly: `llama-next-version.sh b<nnnn>`.
+
+> **Why releases, not just the newest `b<nnnn>` tag:** releases are the versions upstream deems
+> shippable; an arbitrary master commit tag may be mid-refactor. Intermediate **chunk** steps
+> (below) are allowed to land on non-release tags — they are transient waypoints, not the target.
+
+## 2. Chunk by diff **byte-size**, not commit count
+
+The step size is governed by the **size of `git diff` between the pinned tag and the target**, not by
+how many commits separate them:
+
+- If `git diff b<cur> b<target>` is **< 100 KiB**, bump straight to the target in one step.
+- If it is **≥ 100 KiB**, pick an **intermediate** `b<nnnn>` tag whose diff from the current pin is the
+  largest still **under** the threshold, bump to that first, then repeat. Each step stays a small,
+  reviewable patch.
+
+The threshold is a knob (`LLAMA_BUMP_MAX_DIFF_KB`, default `100`). Choosing the intermediate tag is a
+heuristic: diff size grows monotonically enough with tag number for the helper to binary-search the tags safely.
+
+> **`tools/ui` (the WebUI) dominates the full diff** and is *auto-followed* — CI rebuilds the matching
+> Svelte UI from the pinned `GIT_TAG`, so it needs no per-bump source review. To size the diff on the
+> code you actually review, set `LLAMA_BUMP_EXCLUDE_WEBUI=1` (the helper prints both figures regardless).
+
+### The helper: `.github/scripts/llama-next-version.sh`
+
+It only **reads** — a cached blobless mirror clone of llama.cpp plus `llama/CMakeLists.txt`; it never
+edits the repo. It prints the chosen `b<cur> -> b<next>` step, its full and WebUI-excluded diff size,
+the commit count, and the `compare` / `.patch` URLs. Environment:
+
+| Var | Default | Meaning |
+|---|---|---|
+| `LLAMA_BUMP_MAX_DIFF_KB` | `100` | Per-step diff-size threshold, in KiB. |
+| `LLAMA_BUMP_EXCLUDE_WEBUI` | `0` | `1` = size the diff **excluding** `tools/ui`. |
+| `LLAMA_BUMP_CACHE` | `~/.cache/jllama-llamacpp-mirror` | Mirror-clone location (cloned once, then fetched). |
+
+Worked example — pin `b9859`, latest release `b9866` (full diff 133 KiB ≥ 100 KiB, so it chunks):
+
+```
+$ .github/scripts/llama-next-version.sh b9866
+current pin    : b9859
+latest release : b9866
+threshold      : 100 KiB per step (full diff)
+
+next step      : b9859 -> b9862
+  diff size    : 45 KiB full  /  ...  KiB excluding tools/ui (auto-followed WebUI)
+  commits      : 3
+  progress     : intermediate chunk — re-run this script after the bump for the next one
+  review diff  : https://github.com/ggml-org/llama.cpp/compare/b9859...b9862
+  raw .patch   : https://github.com/ggml-org/llama.cpp/compare/b9859...b9862.patch
+```
+
+## 3. Review the chunk's diff
+
+Fetch the printed `compare/...patch` URL (or open the `compare` page). Walk it against the
+**priority-ordered API-compatibility review list** in
+[`../../CLAUDE.md`](../../CLAUDE.md#files-to-check-for-api-compatibility) — the 8 header rows that have
+historically caused breaks (`common.h`, `chat.h`, `speculative.h`, `mtmd.h`, `llama-cpp.h`, `arg.h`,
+`llama.h`, `download.h`), plus the project `CMakeLists.txt` for renamed link targets. Note any new
+API surface worth wiring through the Java layer (e.g. a new completion param or model-metadata getter).
+
+---
+
+## Applying a bump
+
+Once you have the `b<cur> -> b<next>` step, apply it exactly as
+[`CLAUDE.md § Upgrading/Downgrading`](../../CLAUDE.md#upgradingdowngrading-llamacpp-version) describes.
+Concretely:
+
+1. **Edit the pin — three files:**
+   - `llama/CMakeLists.txt` — the `GIT_TAG b<cur>` line **and** the `-DLLAMA_TAG=b<cur>` used by the
+     WebUI/TTS extraction (both must move together).
+   - `README.md` — the llama.cpp badge and link (version appears twice).
+   - `CLAUDE.md` — the "Current llama.cpp pinned version" line (and any build-example `b<nnnn>`).
+2. **Re-verify `patches/`** — a clean configure re-runs the fail-loud `PATCH_COMMAND`, so every patch
+   `0001`–`0006` must still apply. Use a **fresh** build dir (a stale one re-applies over an
+   already-patched tree and reports a false "does not apply"):
+   ```bash
+   cd llama && mvn -q compile          # generates the OSInfo class CMake's OS-detection needs
+   rm -rf build && cmake -B build       # fail-loud: aborts here if any patch no longer applies
+   ```
+   If a patch no longer applies, refresh its diff against the new source and recommit it.
+3. **Append the history rows** — add a pair of rows to
+   [`../history/llama-cpp-breaking-changes.md`](../history/llama-cpp-breaking-changes.md) covering the
+   `b<cur> -> b<next>` range (what broke / what was new; "no source change" is a valid row).
+4. **Commit + push** from the repo root, on the working branch (do not open a new PR if one already tracks the branch):
+   ```bash
+   git add llama/CMakeLists.txt README.md CLAUDE.md docs/history/llama-cpp-breaking-changes.md
+   git commit -m "Upgrade llama.cpp from b<cur> to b<next>"
+   git push -u origin <your-branch>
+   ```
+5. **Re-run the helper** for the next chunk. Repeat until it reports the **final chunk** (target
+   reached).
+
+CI builds every native classifier from the new pin; the full model-backed Java + C++ suites gate the
+result. A build failure at the configure step almost always means a patch needs refreshing (step 2).
diff --git a/llama/CMakeLists.txt b/llama/CMakeLists.txt
index 4c480f81..523d4b3b 100644
--- a/llama/CMakeLists.txt
+++ b/llama/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.15)
+cmake_minimum_required(VERSION 3.22)
 
 project(jllama CXX)
 
@@ -8,7 +8,14 @@ project(jllama CXX)
 # Must be set before any FetchContent_MakeAvailable() so that llama.cpp and all
 # other subprojects inherit the same CRT choice (mixing /MT and /MD in a single
 # link is a linker error).
-if(MSVC)
+#
+# EXCEPTION: the Intel oneAPI SYCL and OpenVINO backends must use the DYNAMIC /MD
+# runtime — `icx -fsycl` rejects /MT outright ("invalid argument 'MT' not allowed
+# with '-fsycl'") and the OpenVINO import libraries are built /MD (mixing would be a
+# link error). Those classifiers already require the vendor runtime on the host, so
+# the self-contained-DLL rationale does not apply to them; the CPU + CUDA/Vulkan/OpenCL
+# classifiers keep /MT.
+if(MSVC AND NOT GGML_SYCL AND NOT GGML_OPENVINO)
     set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreaded$<$<CONFIG:Debug>:Debug>" CACHE STRING "" FORCE)
 endif()
 
@@ -143,7 +150,7 @@ set(LLAMA_BUILD_APP OFF CACHE BOOL "" FORCE)
 FetchContent_Declare(
 	llama.cpp
 	GIT_REPOSITORY https://github.com/ggerganov/llama.cpp.git
-	GIT_TAG        b9859
+	GIT_TAG        b9870
 	PATCH_COMMAND  ${CMAKE_COMMAND}
 		-DPATCH_DIR=${CMAKE_CURRENT_SOURCE_DIR}/patches
 		-DLLAMA_SRC=<SOURCE_DIR>
@@ -166,7 +173,7 @@ execute_process(
     COMMAND ${CMAKE_COMMAND}
         -DTTS_SRC=${llama.cpp_SOURCE_DIR}/tools/tts/tts.cpp
         -DOUT_CPP=${JLLAMA_TTS_GEN_CPP}
-        -DLLAMA_TAG=b9859
+        -DLLAMA_TAG=b9870
         -P ${CMAKE_CURRENT_SOURCE_DIR}/cmake/generate-tts-upstream.cmake
     RESULT_VARIABLE JLLAMA_TTS_GEN_RESULT
 )
@@ -247,10 +254,18 @@ endif()
 # under its own Maven classifier, so it must land in a backend-specific resource
 # root (the default CPU tree stays src/main/resources/). The GPU branches are
 # OS-aware because the same GGML flag is used on more than one platform:
-#   - GGML_CUDA   -> Linux (resources_linux_cuda)   AND Windows (resources_windows_cuda)
-#   - GGML_OPENCL -> Android (resources_android_opencl) AND Windows (resources_windows_opencl)
-#   - GGML_VULKAN -> Windows only (resources_windows_vulkan)
-# The classifier->tree mapping is mirrored by the matching Maven profile in pom.xml.
+#   - GGML_CUDA     -> Linux (resources_linux_cuda)   AND Windows (resources_windows_cuda)
+#   - GGML_OPENCL   -> Android (resources_android_opencl) AND Windows (resources_windows_opencl)
+#   - GGML_VULKAN   -> Windows (resources_windows_vulkan) AND Linux (resources_linux_vulkan)
+#   - GGML_HIP      -> Linux (resources_linux_rocm)    AND Windows (resources_windows_rocm)   [AMD ROCm/HIP]
+#   - GGML_SYCL     -> Windows (resources_windows_sycl) AND Linux (fp16/fp32 split, see below) [Intel oneAPI]
+#   - GGML_OPENVINO -> Linux (resources_linux_openvino) AND Windows (resources_windows_openvino) [Intel OpenVINO]
+# The classifier->tree mapping is mirrored by the matching Maven profile in pom.xml. The Linux
+# Vulkan tree holds both x86_64 and aarch64 under Linux/${OS_ARCH}; two Maven profiles
+# (vulkan-linux / vulkan-linux-aarch64) split it into one single-arch classifier JAR each. The
+# Windows OpenCL tree likewise holds both x86_64 (desktop ICD) and aarch64 (Snapdragon/Adreno),
+# split by the opencl-windows / opencl-windows-aarch64 profiles. Linux SYCL ships two precision
+# variants at the SAME arch, so it is routed to two distinct trees by GGML_SYCL_F16 (fp16 vs fp32).
 if(GGML_CUDA)
     if(OS_NAME STREQUAL "Windows")
         set(JLLAMA_DIR ${CMAKE_SOURCE_DIR}/src/main/resources_windows_cuda/net/ladenthin/llama/${OS_NAME}/${OS_ARCH})
@@ -260,8 +275,13 @@ if(GGML_CUDA)
         message(STATUS "GPU (CUDA Linux) build - Installing files to ${JLLAMA_DIR}")
     endif()
 elseif(GGML_VULKAN)
-    set(JLLAMA_DIR ${CMAKE_SOURCE_DIR}/src/main/resources_windows_vulkan/net/ladenthin/llama/${OS_NAME}/${OS_ARCH})
-    message(STATUS "GPU (Vulkan) build - Installing files to ${JLLAMA_DIR}")
+    if(OS_NAME STREQUAL "Windows")
+        set(JLLAMA_DIR ${CMAKE_SOURCE_DIR}/src/main/resources_windows_vulkan/net/ladenthin/llama/${OS_NAME}/${OS_ARCH})
+        message(STATUS "GPU (Vulkan Windows) build - Installing files to ${JLLAMA_DIR}")
+    else()
+        set(JLLAMA_DIR ${CMAKE_SOURCE_DIR}/src/main/resources_linux_vulkan/net/ladenthin/llama/${OS_NAME}/${OS_ARCH})
+        message(STATUS "GPU (Vulkan Linux) build - Installing files to ${JLLAMA_DIR}")
+    endif()
 elseif(GGML_OPENCL)
     if(OS_NAME STREQUAL "Windows")
         set(JLLAMA_DIR ${CMAKE_SOURCE_DIR}/src/main/resources_windows_opencl/net/ladenthin/llama/${OS_NAME}/${OS_ARCH})
@@ -270,6 +290,33 @@ elseif(GGML_OPENCL)
         set(JLLAMA_DIR ${CMAKE_SOURCE_DIR}/src/main/resources_android_opencl/net/ladenthin/llama/${OS_NAME}/${OS_ARCH})
         message(STATUS "GPU (OpenCL Android) build - Installing files to ${JLLAMA_DIR}")
     endif()
+elseif(GGML_HIP)
+    if(OS_NAME STREQUAL "Windows")
+        set(JLLAMA_DIR ${CMAKE_SOURCE_DIR}/src/main/resources_windows_rocm/net/ladenthin/llama/${OS_NAME}/${OS_ARCH})
+        message(STATUS "GPU (ROCm/HIP Windows) build - Installing files to ${JLLAMA_DIR}")
+    else()
+        set(JLLAMA_DIR ${CMAKE_SOURCE_DIR}/src/main/resources_linux_rocm/net/ladenthin/llama/${OS_NAME}/${OS_ARCH})
+        message(STATUS "GPU (ROCm/HIP Linux) build - Installing files to ${JLLAMA_DIR}")
+    endif()
+elseif(GGML_SYCL)
+    if(OS_NAME STREQUAL "Windows")
+        set(JLLAMA_DIR ${CMAKE_SOURCE_DIR}/src/main/resources_windows_sycl/net/ladenthin/llama/${OS_NAME}/${OS_ARCH})
+        message(STATUS "GPU (SYCL Windows) build - Installing files to ${JLLAMA_DIR}")
+    elseif(GGML_SYCL_F16)
+        set(JLLAMA_DIR ${CMAKE_SOURCE_DIR}/src/main/resources_linux_sycl_fp16/net/ladenthin/llama/${OS_NAME}/${OS_ARCH})
+        message(STATUS "GPU (SYCL Linux fp16) build - Installing files to ${JLLAMA_DIR}")
+    else()
+        set(JLLAMA_DIR ${CMAKE_SOURCE_DIR}/src/main/resources_linux_sycl_fp32/net/ladenthin/llama/${OS_NAME}/${OS_ARCH})
+        message(STATUS "GPU (SYCL Linux fp32) build - Installing files to ${JLLAMA_DIR}")
+    endif()
+elseif(GGML_OPENVINO)
+    if(OS_NAME STREQUAL "Windows")
+        set(JLLAMA_DIR ${CMAKE_SOURCE_DIR}/src/main/resources_windows_openvino/net/ladenthin/llama/${OS_NAME}/${OS_ARCH})
+        message(STATUS "GPU (OpenVINO Windows) build - Installing files to ${JLLAMA_DIR}")
+    else()
+        set(JLLAMA_DIR ${CMAKE_SOURCE_DIR}/src/main/resources_linux_openvino/net/ladenthin/llama/${OS_NAME}/${OS_ARCH})
+        message(STATUS "GPU (OpenVINO Linux) build - Installing files to ${JLLAMA_DIR}")
+    endif()
 else()
     set(JLLAMA_DIR ${CMAKE_SOURCE_DIR}/src/main/resources/net/ladenthin/llama/${OS_NAME}/${OS_ARCH})
     message(STATUS "CPU build - Installing files to ${JLLAMA_DIR}")
@@ -355,6 +402,23 @@ if(NOT ANDROID_ABI AND NOT OS_NAME MATCHES "Android")
     )
 endif()
 
+# Native-server mode (net.ladenthin.llama.server.NativeServer): compile the standalone server
+# entry point (server.cpp's `llama_server`, made embeddable by patches/0006) and its tools helper
+# (server-tools.cpp); jllama's JNI bridge (native_server.cpp) then calls llama_server on a worker
+# thread. This runs the *full* upstream HTTP server — WebUI included, every llama-server flag
+# forwarded — inside libjllama, with no separate llama-server executable. server.cpp and
+# server-tools.cpp both pull in vendor/sheredom/subprocess.h (posix_spawn_*), so they share the
+# non-Android guard used for server-models.cpp above; native_server.cpp links against llama_server
+# and is guarded too. On Android the NativeServer native methods are simply absent (its JNI calls
+# throw UnsatisfiedLinkError) — use OpenAiCompatServer there.
+if(NOT ANDROID_ABI AND NOT OS_NAME MATCHES "Android")
+    target_sources(jllama PRIVATE
+        ${llama.cpp_SOURCE_DIR}/tools/server/server-tools.cpp
+        ${llama.cpp_SOURCE_DIR}/tools/server/server.cpp
+        ${CMAKE_SOURCE_DIR}/src/main/cpp/native_server.cpp
+    )
+endif()
+
 # Phase 2: also compile the upstream HTTP transport (server-http.cpp) and its
 # cpp-httplib backend directly into jllama, so the OpenAI-compatible server can be
 # driven natively from JNI — shipped inside libjllama, with no separate
diff --git a/llama/patches/0005-server-recurrent-near-prompt-end-checkpoints.patch b/llama/patches/0005-server-recurrent-near-prompt-end-checkpoints.patch
new file mode 100644
index 00000000..59f729ff
--- /dev/null
+++ b/llama/patches/0005-server-recurrent-near-prompt-end-checkpoints.patch
@@ -0,0 +1,39 @@
+diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
+index 39aa20b..d3d5978 100644
+--- a/tools/server/server-context.cpp
++++ b/tools/server/server-context.cpp
+@@ -3560,8 +3560,32 @@ private:
+                     // do not checkpoint after mtmd chunks
+                     do_checkpoint = do_checkpoint && !has_mtmd;
+ 
+-                    // no need to create checkpoints that are too close together, unless it's the last user message
+-                    do_checkpoint = do_checkpoint && (slot.prompt.checkpoints.empty() || is_last_user_message || n_tokens_start > slot.prompt.checkpoints.back().n_tokens + params_base.checkpoint_min_step);
++                    // recurrent (and hybrid) models cannot partially roll back their state, so the only way to
++                    // avoid re-processing an entire multi-turn conversation on the next request is a checkpoint
++                    // near the end of the current prompt. without this, a conversation that appends only
++                    // assistant/tool messages (agentic tool-calling) re-processes the whole tail every turn,
++                    // because no new user-message checkpoint is ever created and the min-step spacing blocks
++                    // the near-prompt-end ones. exempt those models' near-end checkpoints from the spacing.
++                    const bool is_ckpt_only_rollback =
++                            ctx_tgt_seq_rm_type == COMMON_CONTEXT_SEQ_RM_TYPE_FULL ||
++                            ctx_tgt_seq_rm_type == COMMON_CONTEXT_SEQ_RM_TYPE_RS;
++
++                    // don't create checkpoints too close together, unless it's the last user message or a
++                    // near-prompt-end checkpoint for a checkpoint-only-rollback model (leading empty() guards
++                    // the checkpoints.back() access via short-circuit)
++                    const bool checkpoint_well_spaced =
++                            slot.prompt.checkpoints.empty() ||
++                            is_last_user_message ||
++                            (near_prompt_end && is_ckpt_only_rollback) ||
++                            n_tokens_start > slot.prompt.checkpoints.back().n_tokens + params_base.checkpoint_min_step;
++
++                    // and never duplicate the newest checkpoint's position (else the last-user-message
++                    // checkpoint is re-created every turn, flooding the list until useful entries are evicted)
++                    const bool checkpoint_not_duplicate =
++                            slot.prompt.checkpoints.empty() ||
++                            slot.prompt.checkpoints.back().n_tokens != n_tokens_start;
++
++                    do_checkpoint = do_checkpoint && checkpoint_well_spaced && checkpoint_not_duplicate;
+                     SLT_DBG(slot, "main/do_checkpoint = %s, pos_min = %d, pos_max = %d\n", do_checkpoint ? "yes" : "no", pos_min, pos_max);
+ 
+                     // note: we create the checkpoint before calling llama_decode(), so the current batch is not
diff --git a/llama/patches/0006-server-embed-native-server-jni.patch b/llama/patches/0006-server-embed-native-server-jni.patch
new file mode 100644
index 00000000..35a146d5
--- /dev/null
+++ b/llama/patches/0006-server-embed-native-server-jni.patch
@@ -0,0 +1,67 @@
+diff --git a/tools/server/server.cpp b/tools/server/server.cpp
+index 84c7f0b..5c9fac9 100644
+--- a/tools/server/server.cpp
++++ b/tools/server/server.cpp
+@@ -25,6 +25,28 @@
+ static std::function<void(int)> shutdown_handler;
+ static std::atomic_flag is_terminating = ATOMIC_FLAG_INIT;
+ 
++// [jllama] Embedded-mode support: when llama_server() is hosted inside libjllama and driven over
++// JNI (net.ladenthin.llama.server.NativeServer), it must NOT install process-wide signal handlers
++// (that would hijack the JVM's SIGINT/SIGTERM), and it must be stoppable out-of-band because
++// ctx_server is local to llama_server(). It also parses exactly the forwarded argv rather than
++// re-deriving it from the process command line (which would be java.exe's — the Windows bug the
++// 0001 patch fixes for the embedded path). These symbols are declared in
++// src/main/cpp/native_server_bridge.h and called by native_server.cpp.
++static std::atomic<bool> g_llama_server_embedded{false};
++
++void llama_server_set_embedded(bool embedded) {
++    g_llama_server_embedded.store(embedded);
++}
++
++void llama_server_request_shutdown() {
++    // Mirrors the SIGTERM path: invoke the installed shutdown_handler, which unblocks
++    // ctx_server.start_loop() (single-model) / ctx_http.stop() (router). No-op if the server has
++    // not finished starting (handler not yet installed) — stop after /health reports ready.
++    if (shutdown_handler) {
++        shutdown_handler(SIGTERM);
++    }
++}
++
+ static inline void signal_handler(int signal) {
+     if (is_terminating.test_and_set()) {
+         // in case it hangs, we can force terminate the server by hitting Ctrl+C twice
+@@ -87,7 +109,13 @@ int llama_server(int argc, char ** argv) {
+     // touch it. lifecycle is symmetric, stop_gc() runs in clean_up() before backend free
+     g_stream_sessions.start_gc();
+ 
+-    if (!common_params_parse_main(argc, argv, params, LLAMA_EXAMPLE_SERVER)) {
++    // [jllama] embedded (JNI) callers forward a clean UTF-8 argv, so honor it exactly via
++    // common_params_parse; only the standalone tool needs common_params_parse_main's
++    // process-command-line (GetCommandLineW) UTF-8 recovery.
++    const bool parsed_ok = g_llama_server_embedded.load()
++        ? common_params_parse(argc, argv, params, LLAMA_EXAMPLE_SERVER)
++        : common_params_parse_main(argc, argv, params, LLAMA_EXAMPLE_SERVER);
++    if (!parsed_ok) {
+         return 1;
+     }
+ 
+@@ -412,6 +440,10 @@ int llama_server(int argc, char ** argv) {
+     }
+ 
+     // TODO: refactor in common/console
++    // [jllama] skip installing process-wide signal handlers when embedded in the JVM (they would
++    // hijack the JVM's own SIGINT/SIGTERM). NativeServer stops the embedded server via
++    // llama_server_request_shutdown() instead.
++    if (!g_llama_server_embedded.load()) {
+ #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
+     struct sigaction sigint_action;
+     sigint_action.sa_handler = signal_handler;
+@@ -425,6 +457,7 @@ int llama_server(int argc, char ** argv) {
+     };
+     SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
+ #endif
++    }
+ 
+     SRV_INF("listening on %s\n", ctx_http.listening_address.c_str());
+ 
diff --git a/llama/pom.xml b/llama/pom.xml
index 67e6e563..da5a58d2 100644
--- a/llama/pom.xml
+++ b/llama/pom.xml
@@ -1149,14 +1149,180 @@ SPDX-License-Identifier: MIT
 				</plugins>
 			</build>
 		</profile>
+		<profile>
+			<!-- Linux x86_64 Vulkan GPU natives, shipped as the `vulkan-linux-x86-64`
+			     classifier JAR. Built with -DGGML_VULKAN=ON and routed by CMakeLists.txt to
+			     src/main/resources_linux_vulkan/.../Linux/x86_64/. The Vulkan runtime
+			     (libvulkan.so.1) ships with the consumer's GPU driver - nothing is bundled. The
+			     resource copy includes ONLY the Linux/x86_64 subtree so the other arch's natives
+			     (staged into the same tree by the sibling job) do not leak into this JAR. Staged by
+			     CI before this profile runs. -->
+			<id>vulkan-linux</id>
+			<build>
+				<plugins>
+					<plugin>
+						<groupId>org.apache.maven.plugins</groupId>
+						<artifactId>maven-compiler-plugin</artifactId>
+						<executions>
+						<execution>
+							<id>vulkan-linux</id>
+							<phase>compile</phase>
+							<goals>
+								<goal>compile</goal>
+							</goals>
+							<configuration>
+								<excludes>
+									<exclude>module-info.java</exclude>
+								</excludes>
+								<compilerArgs>
+									<arg>-h</arg>
+									<arg>src/main/cpp</arg>
+								</compilerArgs>
+								<outputDirectory>
+									${project.build.outputDirectory}_linux_vulkan</outputDirectory>
+							</configuration>
+						</execution>
+						</executions>
+					</plugin>
+					<plugin>
+						<artifactId>maven-resources-plugin</artifactId>
+						<executions>
+						<execution>
+							<id>copy-resources-vulkan-linux</id>
+							<phase>process-classes</phase>
+							<goals>
+								<goal>copy-resources</goal>
+							</goals>
+							<configuration>
+								<outputDirectory>
+									${project.build.outputDirectory}_linux_vulkan</outputDirectory>
+								<resources>
+									<resource>
+										<directory>
+											${basedir}/src/main/resources_linux_vulkan/</directory>
+										<includes>
+											<include>net/ladenthin/llama/Linux/x86_64/**</include>
+										</includes>
+									</resource>
+								</resources>
+							</configuration>
+						</execution>
+						</executions>
+					</plugin>
+					<plugin>
+						<groupId>org.apache.maven.plugins</groupId>
+						<artifactId>maven-jar-plugin</artifactId>
+						<executions>
+							<execution>
+								<id>vulkan-linux</id>
+								<phase>package</phase>
+								<goals>
+									<goal>jar</goal>
+								</goals>
+								<configuration>
+									<classifier>vulkan-linux-x86-64</classifier>
+									<classesDirectory>
+										${project.build.outputDirectory}_linux_vulkan</classesDirectory>
+								</configuration>
+							</execution>
+						</executions>
+					</plugin>
+				</plugins>
+			</build>
+		</profile>
+
+		<profile>
+			<!-- Linux aarch64 Vulkan GPU natives, shipped as the `vulkan-linux-aarch64`
+			     classifier JAR. Built with -DGGML_VULKAN=ON and routed by CMakeLists.txt to
+			     src/main/resources_linux_vulkan/.../Linux/aarch64/. The Vulkan runtime
+			     (libvulkan.so.1) ships with the consumer's GPU driver - nothing is bundled. The
+			     resource copy includes ONLY the Linux/aarch64 subtree so the other arch's natives
+			     (staged into the same tree by the sibling job) do not leak into this JAR. Staged by
+			     CI before this profile runs. -->
+			<id>vulkan-linux-aarch64</id>
+			<build>
+				<plugins>
+					<plugin>
+						<groupId>org.apache.maven.plugins</groupId>
+						<artifactId>maven-compiler-plugin</artifactId>
+						<executions>
+						<execution>
+							<id>vulkan-linux-aarch64</id>
+							<phase>compile</phase>
+							<goals>
+								<goal>compile</goal>
+							</goals>
+							<configuration>
+								<excludes>
+									<exclude>module-info.java</exclude>
+								</excludes>
+								<compilerArgs>
+									<arg>-h</arg>
+									<arg>src/main/cpp</arg>
+								</compilerArgs>
+								<outputDirectory>
+									${project.build.outputDirectory}_linux_vulkan_aarch64</outputDirectory>
+							</configuration>
+						</execution>
+						</executions>
+					</plugin>
+					<plugin>
+						<artifactId>maven-resources-plugin</artifactId>
+						<executions>
+						<execution>
+							<id>copy-resources-vulkan-linux-aarch64</id>
+							<phase>process-classes</phase>
+							<goals>
+								<goal>copy-resources</goal>
+							</goals>
+							<configuration>
+								<outputDirectory>
+									${project.build.outputDirectory}_linux_vulkan_aarch64</outputDirectory>
+								<resources>
+									<resource>
+										<directory>
+											${basedir}/src/main/resources_linux_vulkan/</directory>
+										<includes>
+											<include>net/ladenthin/llama/Linux/aarch64/**</include>
+										</includes>
+									</resource>
+								</resources>
+							</configuration>
+						</execution>
+						</executions>
+					</plugin>
+					<plugin>
+						<groupId>org.apache.maven.plugins</groupId>
+						<artifactId>maven-jar-plugin</artifactId>
+						<executions>
+							<execution>
+								<id>vulkan-linux-aarch64</id>
+								<phase>package</phase>
+								<goals>
+									<goal>jar</goal>
+								</goals>
+								<configuration>
+									<classifier>vulkan-linux-aarch64</classifier>
+									<classesDirectory>
+										${project.build.outputDirectory}_linux_vulkan_aarch64</classesDirectory>
+								</configuration>
+							</execution>
+						</executions>
+					</plugin>
+				</plugins>
+			</build>
+		</profile>
 
 		<profile>
-			<!-- Windows OpenCL GPU natives (x86_64 only), shipped as the
+			<!-- Windows OpenCL GPU natives (x86_64), shipped as the
 			     `opencl-windows-x86-64` classifier JAR. Built with -DGGML_OPENCL=ON and
 			     routed by CMakeLists.txt to src/main/resources_windows_opencl/. The OpenCL
 			     ICD (System32\OpenCL.dll) ships with the GPU driver — nothing is bundled.
 			     NOTE: the GGML OpenCL backend is Adreno-tuned; on desktop GPUs CUDA/Vulkan
-			     are better supported. Staged by CI before this profile runs. -->
+			     are better supported. The resource copy includes ONLY the Windows/x86_64
+			     subtree so the aarch64 natives (opencl-windows-aarch64, staged into the same
+			     tree by the sibling job) do not leak into this JAR. Staged by CI before this
+			     profile runs. -->
 			<id>opencl-windows</id>
 			<build>
 				<plugins>
@@ -1201,7 +1367,7 @@ SPDX-License-Identifier: MIT
 										<directory>
 											${basedir}/src/main/resources_windows_opencl/</directory>
 										<includes>
-											<include>**/*.*</include>
+											<include>net/ladenthin/llama/Windows/x86_64/**</include>
 										</includes>
 									</resource>
 								</resources>
@@ -1231,6 +1397,633 @@ SPDX-License-Identifier: MIT
 			</build>
 		</profile>
 
+		<profile>
+			<!-- Linux x86_64 AMD ROCm/HIP GPU natives, shipped as the `rocm-linux-x86-64` classifier JAR.
+			     Built with -DGGML_HIP=ON and routed by CMakeLists.txt to src/main/resources_linux_rocm/.
+			     The ROCm/HIP runtime ships with the consumer's ROCm install - nothing is bundled. Staged by
+			     CI before this profile runs. -->
+			<id>rocm-linux</id>
+			<build>
+				<plugins>
+					<plugin>
+						<groupId>org.apache.maven.plugins</groupId>
+						<artifactId>maven-compiler-plugin</artifactId>
+						<executions>
+						<execution>
+							<id>rocm-linux</id>
+							<phase>compile</phase>
+							<goals>
+								<goal>compile</goal>
+							</goals>
+							<configuration>
+								<excludes>
+									<exclude>module-info.java</exclude>
+								</excludes>
+								<compilerArgs>
+									<arg>-h</arg>
+									<arg>src/main/cpp</arg>
+								</compilerArgs>
+								<outputDirectory>
+									${project.build.outputDirectory}_linux_rocm</outputDirectory>
+							</configuration>
+						</execution>
+						</executions>
+					</plugin>
+					<plugin>
+						<artifactId>maven-resources-plugin</artifactId>
+						<executions>
+						<execution>
+							<id>copy-resources-rocm-linux</id>
+							<phase>process-classes</phase>
+							<goals>
+								<goal>copy-resources</goal>
+							</goals>
+							<configuration>
+								<outputDirectory>
+									${project.build.outputDirectory}_linux_rocm</outputDirectory>
+								<resources>
+									<resource>
+										<directory>
+											${basedir}/src/main/resources_linux_rocm/</directory>
+										<includes>
+											<include>net/ladenthin/llama/Linux/x86_64/**</include>
+										</includes>
+									</resource>
+								</resources>
+							</configuration>
+						</execution>
+						</executions>
+					</plugin>
+					<plugin>
+						<groupId>org.apache.maven.plugins</groupId>
+						<artifactId>maven-jar-plugin</artifactId>
+						<executions>
+							<execution>
+								<id>rocm-linux</id>
+								<phase>package</phase>
+								<goals>
+									<goal>jar</goal>
+								</goals>
+								<configuration>
+									<classifier>rocm-linux-x86-64</classifier>
+									<classesDirectory>
+										${project.build.outputDirectory}_linux_rocm</classesDirectory>
+								</configuration>
+							</execution>
+						</executions>
+					</plugin>
+				</plugins>
+			</build>
+		</profile>
+		<profile>
+			<!-- Windows x86_64 AMD ROCm/HIP GPU natives, shipped as the `rocm-windows-x86-64` classifier JAR.
+			     Built with -DGGML_HIP=ON and routed by CMakeLists.txt to src/main/resources_windows_rocm/.
+			     The HIP SDK runtime DLLs ship with the consumer's AMD HIP SDK - nothing is bundled. Staged by
+			     CI before this profile runs. -->
+			<id>rocm-windows</id>
+			<build>
+				<plugins>
+					<plugin>
+						<groupId>org.apache.maven.plugins</groupId>
+						<artifactId>maven-compiler-plugin</artifactId>
+						<executions>
+						<execution>
+							<id>rocm-windows</id>
+							<phase>compile</phase>
+							<goals>
+								<goal>compile</goal>
+							</goals>
+							<configuration>
+								<excludes>
+									<exclude>module-info.java</exclude>
+								</excludes>
+								<compilerArgs>
+									<arg>-h</arg>
+									<arg>src/main/cpp</arg>
+								</compilerArgs>
+								<outputDirectory>
+									${project.build.outputDirectory}_windows_rocm</outputDirectory>
+							</configuration>
+						</execution>
+						</executions>
+					</plugin>
+					<plugin>
+						<artifactId>maven-resources-plugin</artifactId>
+						<executions>
+						<execution>
+							<id>copy-resources-rocm-windows</id>
+							<phase>process-classes</phase>
+							<goals>
+								<goal>copy-resources</goal>
+							</goals>
+							<configuration>
+								<outputDirectory>
+									${project.build.outputDirectory}_windows_rocm</outputDirectory>
+								<resources>
+									<resource>
+										<directory>
+											${basedir}/src/main/resources_windows_rocm/</directory>
+										<includes>
+											<include>net/ladenthin/llama/Windows/x86_64/**</include>
+										</includes>
+									</resource>
+								</resources>
+							</configuration>
+						</execution>
+						</executions>
+					</plugin>
+					<plugin>
+						<groupId>org.apache.maven.plugins</groupId>
+						<artifactId>maven-jar-plugin</artifactId>
+						<executions>
+							<execution>
+								<id>rocm-windows</id>
+								<phase>package</phase>
+								<goals>
+									<goal>jar</goal>
+								</goals>
+								<configuration>
+									<classifier>rocm-windows-x86-64</classifier>
+									<classesDirectory>
+										${project.build.outputDirectory}_windows_rocm</classesDirectory>
+								</configuration>
+							</execution>
+						</executions>
+					</plugin>
+				</plugins>
+			</build>
+		</profile>
+		<profile>
+			<!-- Linux x86_64 Intel SYCL (oneAPI, fp16) GPU natives, shipped as the `sycl-fp16-linux-x86-64`
+			     classifier JAR. Built with -DGGML_SYCL=ON -DGGML_SYCL_F16=ON and routed by CMakeLists.txt to
+			     src/main/resources_linux_sycl_fp16/. The oneAPI runtime ships with the consumer's oneAPI
+			     install - nothing is bundled. Staged by CI before this profile runs. -->
+			<id>sycl-fp16-linux</id>
+			<build>
+				<plugins>
+					<plugin>
+						<groupId>org.apache.maven.plugins</groupId>
+						<artifactId>maven-compiler-plugin</artifactId>
+						<executions>
+						<execution>
+							<id>sycl-fp16-linux</id>
+							<phase>compile</phase>
+							<goals>
+								<goal>compile</goal>
+							</goals>
+							<configuration>
+								<excludes>
+									<exclude>module-info.java</exclude>
+								</excludes>
+								<compilerArgs>
+									<arg>-h</arg>
+									<arg>src/main/cpp</arg>
+								</compilerArgs>
+								<outputDirectory>
+									${project.build.outputDirectory}_linux_sycl_fp16</outputDirectory>
+							</configuration>
+						</execution>
+						</executions>
+					</plugin>
+					<plugin>
+						<artifactId>maven-resources-plugin</artifactId>
+						<executions>
+						<execution>
+							<id>copy-resources-sycl-fp16-linux</id>
+							<phase>process-classes</phase>
+							<goals>
+								<goal>copy-resources</goal>
+							</goals>
+							<configuration>
+								<outputDirectory>
+									${project.build.outputDirectory}_linux_sycl_fp16</outputDirectory>
+								<resources>
+									<resource>
+										<directory>
+											${basedir}/src/main/resources_linux_sycl_fp16/</directory>
+										<includes>
+											<include>net/ladenthin/llama/Linux/x86_64/**</include>
+										</includes>
+									</resource>
+								</resources>
+							</configuration>
+						</execution>
+						</executions>
+					</plugin>
+					<plugin>
+						<groupId>org.apache.maven.plugins</groupId>
+						<artifactId>maven-jar-plugin</artifactId>
+						<executions>
+							<execution>
+								<id>sycl-fp16-linux</id>
+								<phase>package</phase>
+								<goals>
+									<goal>jar</goal>
+								</goals>
+								<configuration>
+									<classifier>sycl-fp16-linux-x86-64</classifier>
+									<classesDirectory>
+										${project.build.outputDirectory}_linux_sycl_fp16</classesDirectory>
+								</configuration>
+							</execution>
+						</executions>
+					</plugin>
+				</plugins>
+			</build>
+		</profile>
+		<profile>
+			<!-- Linux x86_64 Intel SYCL (oneAPI, fp32) GPU natives, shipped as the `sycl-fp32-linux-x86-64`
+			     classifier JAR. Built with -DGGML_SYCL=ON (F16 off) and routed by CMakeLists.txt to
+			     src/main/resources_linux_sycl_fp32/. The oneAPI runtime ships with the consumer's oneAPI
+			     install - nothing is bundled. Staged by CI before this profile runs. -->
+			<id>sycl-fp32-linux</id>
+			<build>
+				<plugins>
+					<plugin>
+						<groupId>org.apache.maven.plugins</groupId>
+						<artifactId>maven-compiler-plugin</artifactId>
+						<executions>
+						<execution>
+							<id>sycl-fp32-linux</id>
+							<phase>compile</phase>
+							<goals>
+								<goal>compile</goal>
+							</goals>
+							<configuration>
+								<excludes>
+									<exclude>module-info.java</exclude>
+								</excludes>
+								<compilerArgs>
+									<arg>-h</arg>
+									<arg>src/main/cpp</arg>
+								</compilerArgs>
+								<outputDirectory>
+									${project.build.outputDirectory}_linux_sycl_fp32</outputDirectory>
+							</configuration>
+						</execution>
+						</executions>
+					</plugin>
+					<plugin>
+						<artifactId>maven-resources-plugin</artifactId>
+						<executions>
+						<execution>
+							<id>copy-resources-sycl-fp32-linux</id>
+							<phase>process-classes</phase>
+							<goals>
+								<goal>copy-resources</goal>
+							</goals>
+							<configuration>
+								<outputDirectory>
+									${project.build.outputDirectory}_linux_sycl_fp32</outputDirectory>
+								<resources>
+									<resource>
+										<directory>
+											${basedir}/src/main/resources_linux_sycl_fp32/</directory>
+										<includes>
+											<include>net/ladenthin/llama/Linux/x86_64/**</include>
+										</includes>
+									</resource>
+								</resources>
+							</configuration>
+						</execution>
+						</executions>
+					</plugin>
+					<plugin>
+						<groupId>org.apache.maven.plugins</groupId>
+						<artifactId>maven-jar-plugin</artifactId>
+						<executions>
+							<execution>
+								<id>sycl-fp32-linux</id>
+								<phase>package</phase>
+								<goals>
+									<goal>jar</goal>
+								</goals>
+								<configuration>
+									<classifier>sycl-fp32-linux-x86-64</classifier>
+									<classesDirectory>
+										${project.build.outputDirectory}_linux_sycl_fp32</classesDirectory>
+								</configuration>
+							</execution>
+						</executions>
+					</plugin>
+				</plugins>
+			</build>
+		</profile>
+		<profile>
+			<!-- Windows x86_64 Intel SYCL (oneAPI) GPU natives, shipped as the `sycl-windows-x86-64` classifier
+			     JAR. Built with -DGGML_SYCL=ON and routed by CMakeLists.txt to src/main/resources_windows_sycl/.
+			     The oneAPI runtime ships with the consumer's oneAPI install - nothing is bundled. Staged by CI
+			     before this profile runs. -->
+			<id>sycl-windows</id>
+			<build>
+				<plugins>
+					<plugin>
+						<groupId>org.apache.maven.plugins</groupId>
+						<artifactId>maven-compiler-plugin</artifactId>
+						<executions>
+						<execution>
+							<id>sycl-windows</id>
+							<phase>compile</phase>
+							<goals>
+								<goal>compile</goal>
+							</goals>
+							<configuration>
+								<excludes>
+									<exclude>module-info.java</exclude>
+								</excludes>
+								<compilerArgs>
+									<arg>-h</arg>
+									<arg>src/main/cpp</arg>
+								</compilerArgs>
+								<outputDirectory>
+									${project.build.outputDirectory}_windows_sycl</outputDirectory>
+							</configuration>
+						</execution>
+						</executions>
+					</plugin>
+					<plugin>
+						<artifactId>maven-resources-plugin</artifactId>
+						<executions>
+						<execution>
+							<id>copy-resources-sycl-windows</id>
+							<phase>process-classes</phase>
+							<goals>
+								<goal>copy-resources</goal>
+							</goals>
+							<configuration>
+								<outputDirectory>
+									${project.build.outputDirectory}_windows_sycl</outputDirectory>
+								<resources>
+									<resource>
+										<directory>
+											${basedir}/src/main/resources_windows_sycl/</directory>
+										<includes>
+											<include>net/ladenthin/llama/Windows/x86_64/**</include>
+										</includes>
+									</resource>
+								</resources>
+							</configuration>
+						</execution>
+						</executions>
+					</plugin>
+					<plugin>
+						<groupId>org.apache.maven.plugins</groupId>
+						<artifactId>maven-jar-plugin</artifactId>
+						<executions>
+							<execution>
+								<id>sycl-windows</id>
+								<phase>package</phase>
+								<goals>
+									<goal>jar</goal>
+								</goals>
+								<configuration>
+									<classifier>sycl-windows-x86-64</classifier>
+									<classesDirectory>
+										${project.build.outputDirectory}_windows_sycl</classesDirectory>
+								</configuration>
+							</execution>
+						</executions>
+					</plugin>
+				</plugins>
+			</build>
+		</profile>
+		<profile>
+			<!-- Windows aarch64 OpenCL (Adreno/Snapdragon) GPU natives, shipped as the `opencl-windows-aarch64`
+			     classifier JAR. Built with -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON and routed by
+			     CMakeLists.txt to src/main/resources_windows_opencl/.../Windows/aarch64/ (same tree as the x86_64
+			     OpenCL build). The OpenCL ICD (System32\OpenCL.dll) ships with the Adreno driver - nothing is
+			     bundled. The resource copy includes ONLY the Windows/aarch64 subtree so the x86_64 natives do not
+			     leak into this JAR. Staged by CI before this profile runs. -->
+			<id>opencl-windows-aarch64</id>
+			<build>
+				<plugins>
+					<plugin>
+						<groupId>org.apache.maven.plugins</groupId>
+						<artifactId>maven-compiler-plugin</artifactId>
+						<executions>
+						<execution>
+							<id>opencl-windows-aarch64</id>
+							<phase>compile</phase>
+							<goals>
+								<goal>compile</goal>
+							</goals>
+							<configuration>
+								<excludes>
+									<exclude>module-info.java</exclude>
+								</excludes>
+								<compilerArgs>
+									<arg>-h</arg>
+									<arg>src/main/cpp</arg>
+								</compilerArgs>
+								<outputDirectory>
+									${project.build.outputDirectory}_windows_opencl_aarch64</outputDirectory>
+							</configuration>
+						</execution>
+						</executions>
+					</plugin>
+					<plugin>
+						<artifactId>maven-resources-plugin</artifactId>
+						<executions>
+						<execution>
+							<id>copy-resources-opencl-windows-aarch64</id>
+							<phase>process-classes</phase>
+							<goals>
+								<goal>copy-resources</goal>
+							</goals>
+							<configuration>
+								<outputDirectory>
+									${project.build.outputDirectory}_windows_opencl_aarch64</outputDirectory>
+								<resources>
+									<resource>
+										<directory>
+											${basedir}/src/main/resources_windows_opencl/</directory>
+										<includes>
+											<include>net/ladenthin/llama/Windows/aarch64/**</include>
+										</includes>
+									</resource>
+								</resources>
+							</configuration>
+						</execution>
+						</executions>
+					</plugin>
+					<plugin>
+						<groupId>org.apache.maven.plugins</groupId>
+						<artifactId>maven-jar-plugin</artifactId>
+						<executions>
+							<execution>
+								<id>opencl-windows-aarch64</id>
+								<phase>package</phase>
+								<goals>
+									<goal>jar</goal>
+								</goals>
+								<configuration>
+									<classifier>opencl-windows-aarch64</classifier>
+									<classesDirectory>
+										${project.build.outputDirectory}_windows_opencl_aarch64</classesDirectory>
+								</configuration>
+							</execution>
+						</executions>
+					</plugin>
+				</plugins>
+			</build>
+		</profile>
+		<profile>
+			<!-- Linux x86_64 Intel OpenVINO GPU/NPU natives, shipped as the `openvino-linux-x86-64` classifier
+			     JAR. Built with -DGGML_OPENVINO=ON and routed by CMakeLists.txt to
+			     src/main/resources_linux_openvino/. The OpenVINO runtime ships with the consumer's OpenVINO
+			     install - nothing is bundled. Staged by CI before this profile runs. -->
+			<id>openvino-linux</id>
+			<build>
+				<plugins>
+					<plugin>
+						<groupId>org.apache.maven.plugins</groupId>
+						<artifactId>maven-compiler-plugin</artifactId>
+						<executions>
+						<execution>
+							<id>openvino-linux</id>
+							<phase>compile</phase>
+							<goals>
+								<goal>compile</goal>
+							</goals>
+							<configuration>
+								<excludes>
+									<exclude>module-info.java</exclude>
+								</excludes>
+								<compilerArgs>
+									<arg>-h</arg>
+									<arg>src/main/cpp</arg>
+								</compilerArgs>
+								<outputDirectory>
+									${project.build.outputDirectory}_linux_openvino</outputDirectory>
+							</configuration>
+						</execution>
+						</executions>
+					</plugin>
+					<plugin>
+						<artifactId>maven-resources-plugin</artifactId>
+						<executions>
+						<execution>
+							<id>copy-resources-openvino-linux</id>
+							<phase>process-classes</phase>
+							<goals>
+								<goal>copy-resources</goal>
+							</goals>
+							<configuration>
+								<outputDirectory>
+									${project.build.outputDirectory}_linux_openvino</outputDirectory>
+								<resources>
+									<resource>
+										<directory>
+											${basedir}/src/main/resources_linux_openvino/</directory>
+										<includes>
+											<include>net/ladenthin/llama/Linux/x86_64/**</include>
+										</includes>
+									</resource>
+								</resources>
+							</configuration>
+						</execution>
+						</executions>
+					</plugin>
+					<plugin>
+						<groupId>org.apache.maven.plugins</groupId>
+						<artifactId>maven-jar-plugin</artifactId>
+						<executions>
+							<execution>
+								<id>openvino-linux</id>
+								<phase>package</phase>
+								<goals>
+									<goal>jar</goal>
+								</goals>
+								<configuration>
+									<classifier>openvino-linux-x86-64</classifier>
+									<classesDirectory>
+										${project.build.outputDirectory}_linux_openvino</classesDirectory>
+								</configuration>
+							</execution>
+						</executions>
+					</plugin>
+				</plugins>
+			</build>
+		</profile>
+		<profile>
+			<!-- Windows x86_64 Intel OpenVINO GPU/NPU natives, shipped as the `openvino-windows-x86-64`
+			     classifier JAR. Built with -DGGML_OPENVINO=ON and routed by CMakeLists.txt to
+			     src/main/resources_windows_openvino/. The OpenVINO runtime ships with the consumer's OpenVINO
+			     install - nothing is bundled. Staged by CI before this profile runs. -->
+			<id>openvino-windows</id>
+			<build>
+				<plugins>
+					<plugin>
+						<groupId>org.apache.maven.plugins</groupId>
+						<artifactId>maven-compiler-plugin</artifactId>
+						<executions>
+						<execution>
+							<id>openvino-windows</id>
+							<phase>compile</phase>
+							<goals>
+								<goal>compile</goal>
+							</goals>
+							<configuration>
+								<excludes>
+									<exclude>module-info.java</exclude>
+								</excludes>
+								<compilerArgs>
+									<arg>-h</arg>
+									<arg>src/main/cpp</arg>
+								</compilerArgs>
+								<outputDirectory>
+									${project.build.outputDirectory}_windows_openvino</outputDirectory>
+							</configuration>
+						</execution>
+						</executions>
+					</plugin>
+					<plugin>
+						<artifactId>maven-resources-plugin</artifactId>
+						<executions>
+						<execution>
+							<id>copy-resources-openvino-windows</id>
+							<phase>process-classes</phase>
+							<goals>
+								<goal>copy-resources</goal>
+							</goals>
+							<configuration>
+								<outputDirectory>
+									${project.build.outputDirectory}_windows_openvino</outputDirectory>
+								<resources>
+									<resource>
+										<directory>
+											${basedir}/src/main/resources_windows_openvino/</directory>
+										<includes>
+											<include>net/ladenthin/llama/Windows/x86_64/**</include>
+										</includes>
+									</resource>
+								</resources>
+							</configuration>
+						</execution>
+						</executions>
+					</plugin>
+					<plugin>
+						<groupId>org.apache.maven.plugins</groupId>
+						<artifactId>maven-jar-plugin</artifactId>
+						<executions>
+							<execution>
+								<id>openvino-windows</id>
+								<phase>package</phase>
+								<goals>
+									<goal>jar</goal>
+								</goals>
+								<configuration>
+									<classifier>openvino-windows-x86-64</classifier>
+									<classesDirectory>
+										${project.build.outputDirectory}_windows_openvino</classesDirectory>
+								</configuration>
+							</execution>
+						</executions>
+					</plugin>
+				</plugins>
+			</build>
+		</profile>
+
 		<profile>
 			<id>vmlens</id>
 			<build>
@@ -1296,8 +2089,10 @@ SPDX-License-Identifier: MIT
 			<!--
 				Builds the fat jar-with-dependencies uber JAR: the library classes, the
 				default-platform native libs from src/main/resources, and all runtime Java
-				dependencies in one drop-on-classpath JAR, runnable via the OpenAiCompatServer
-				Main-Class (set below) to start the OpenAI-compatible HTTP server. Off by
+				dependencies in one drop-on-classpath JAR, with ServerLauncher as the fat-jar
+				Main-Class (set below), which dispatches on a `jllama-openai-compat` selector flag: with the flag it
+				runs OpenAiCompatServer (Java OpenAI API); without it, the default NativeServer (native
+				server, embedded WebUI, all flags forwarded). Both mains stay runnable by class name via `java -cp <jar> …`. Off by
 				default; the CI `package` job activates it so the uber JAR rides along in the
 				`llama-jars` upload-artifact bundle. Documented in CLAUDE.md "Build Commands"
 				as `mvn -P assembly package`.
@@ -1314,7 +2109,7 @@ SPDX-License-Identifier: MIT
 							</descriptorRefs>
 							<archive>
 								<manifest>
-									<mainClass>net.ladenthin.llama.server.OpenAiCompatServer</mainClass>
+									<mainClass>net.ladenthin.llama.server.ServerLauncher</mainClass>
 								</manifest>
 							</archive>
 						</configuration>
diff --git a/llama/spotbugs-exclude.xml b/llama/spotbugs-exclude.xml
index d64bed38..eb2fdcfa 100644
--- a/llama/spotbugs-exclude.xml
+++ b/llama/spotbugs-exclude.xml
@@ -622,4 +622,95 @@ SPDX-License-Identifier: MIT
         <Method name="~\$default\$.*"/>
     </Match>
 
+    <!--
+        NativeServer (net.ladenthin.llama.server.NativeServer) is the native-transport server-mode
+        entry point: it owns a background worker thread, three native JNI methods and a blocking
+        main(). Its SpotBugs findings are all fb-contrib style artifacts, not defects:
+          - IMC_IMMATURE_CLASS_NO_EQUALS: an identity/lifecycle-managed handle class, not a value
+            type — same rationale as the other server identity classes above.
+          - MDM_THREAD_YIELD: main() polls `while (isRunning()) Thread.sleep(...)` to keep the JVM
+            alive until the native server stops — a legitimate blocking wait in a process entry
+            point, not manual thread scheduling.
+          - UVA_USE_VAR_ARGS: startNativeServer(String[]) is a `native` method; its signature must
+            match the C/JNI side exactly and therefore cannot be varargs.
+          - WEM_WEAK_EXCEPTION_MESSAGING: the two start() guards throw with fixed messages
+            (single-instance-per-process / already-running), which carry no extra state worth
+            formatting — same category as the server request-parser WEM suppressions above.
+    -->
+    <Match>
+        <Class name="net.ladenthin.llama.server.NativeServer"/>
+        <Or>
+            <Bug pattern="IMC_IMMATURE_CLASS_NO_EQUALS"/>
+            <Bug pattern="MDM_THREAD_YIELD"/>
+            <Bug pattern="UVA_USE_VAR_ARGS"/>
+            <Bug pattern="WEM_WEAK_EXCEPTION_MESSAGING"/>
+        </Or>
+    </Match>
+
+    <!--
+        ServerLauncher.main(String[]) is the fat-jar dispatcher: it forwards to
+        OpenAiCompatServer.main / NativeServer.main, both of which declare `throws Exception` (they
+        block until server shutdown and surface whatever the underlying server throws). Declaring
+        `throws Exception` on the dispatcher mirrors that contract; narrowing it would force the
+        dispatcher to wrap arbitrary checked exceptions for no benefit — same rationale as the
+        ToolHandler.invoke THROWS suppression above.
+    -->
+    <Match>
+        <Class name="net.ladenthin.llama.server.ServerLauncher"/>
+        <Bug pattern="THROWS_METHOD_THROWS_CLAUSE_BASIC_EXCEPTION"/>
+        <Method name="main"/>
+    </Match>
+
+    <!--
+        OpenAiServerCli additions (the cache-type-k / cache-type-v, jinja and chat-template-kwargs
+        flags plus the extended usage() help). All fb-contrib/findsecbugs artifacts, not defects; scoped
+        to the specific methods that trigger them:
+          - parse(): ENMI_NULL_ENUM_VALUE — `@Nullable CacheType cacheTypeK/V = null`; null is the
+            documented "unset -> use the llama.cpp default" sentinel, explicitly @Nullable and
+            NullAway-checked.
+          - usage(): POTENTIAL_XML_INJECTION / PRMC_POSSIBLY_REDUNDANT_METHOD_CALLS — usage() builds
+            the plain-text console help string, appending the fixed cacheTypeChoices() enum list for
+            the -ctk and -ctv lines. There is no XML anywhere (false positive), and cacheTypeChoices()
+            is a cheap pure helper returning a constant, so calling it twice is harmless.
+          - parseChatTemplateKwargs(): EXS_EXCEPTION_SOFTENING_NO_CONSTRAINTS converts Jackson's
+            checked JsonProcessingException into a CLI usage error (cause chained) — the same intended
+            boundary softening already accepted for LlamaModel's JSON methods above; and
+            PSC_PRESIZE_COLLECTIONS on the tiny short-lived LinkedHashMap of CLI kwargs is not
+            worthwhile.
+    -->
+    <Match>
+        <Class name="net.ladenthin.llama.server.OpenAiServerCli"/>
+        <Bug pattern="ENMI_NULL_ENUM_VALUE"/>
+        <Method name="parse"/>
+    </Match>
+    <Match>
+        <Class name="net.ladenthin.llama.server.OpenAiServerCli"/>
+        <Or>
+            <Bug pattern="POTENTIAL_XML_INJECTION"/>
+            <Bug pattern="PRMC_POSSIBLY_REDUNDANT_METHOD_CALLS"/>
+        </Or>
+        <Method name="usage"/>
+    </Match>
+    <Match>
+        <Class name="net.ladenthin.llama.server.OpenAiServerCli"/>
+        <Or>
+            <Bug pattern="EXS_EXCEPTION_SOFTENING_NO_CONSTRAINTS"/>
+            <Bug pattern="PSC_PRESIZE_COLLECTIONS"/>
+        </Or>
+        <Method name="parseChatTemplateKwargs"/>
+    </Match>
+
+    <!--
+        OpenAiServerCli$Options.getChatTemplateKwargs() returns the chatTemplateKwargs field
+        directly, which SpotBugs flags as EI_EXPOSE_REP. The field only ever holds the
+        Collections.unmodifiableMap(...) returned by parseChatTemplateKwargs (or null when unset),
+        so the getter cannot leak a mutable map — the detector does not track that the stored map is
+        already unmodifiable.
+    -->
+    <Match>
+        <Class name="net.ladenthin.llama.server.OpenAiServerCli$Options"/>
+        <Bug pattern="EI_EXPOSE_REP"/>
+        <Method name="getChatTemplateKwargs"/>
+    </Match>
+
 </FindBugsFilter>
diff --git a/llama/src/main/cpp/jllama.cpp b/llama/src/main/cpp/jllama.cpp
index f3871f89..faa8fadd 100644
--- a/llama/src/main/cpp/jllama.cpp
+++ b/llama/src/main/cpp/jllama.cpp
@@ -802,6 +802,7 @@ JNIEXPORT jstring JNICALL Java_net_ladenthin_llama_LlamaModel_getModelMetaJson(J
         {"modalities", {{"vision", m.has_inp_image}, {"audio", m.has_inp_audio}}},
         {"name", m.model_name},
         {"architecture", std::string(arch_buf)},
+        {"ftype", m.model_ftype},
     };
     // Resolved default chat template (Jinja); empty when the model ships none.
     const char *chat_tmpl = mdl != nullptr ? llama_model_chat_template(mdl, /*name*/ nullptr) : nullptr;
diff --git a/llama/src/main/cpp/native_server.cpp b/llama/src/main/cpp/native_server.cpp
new file mode 100644
index 00000000..d9cfa527
--- /dev/null
+++ b/llama/src/main/cpp/native_server.cpp
@@ -0,0 +1,154 @@
+// SPDX-FileCopyrightText: 2026 Bernard Ladenthin <bernard.ladenthin@gmail.com>
+//
+// SPDX-License-Identifier: MIT
+
+// JNI bridge for net.ladenthin.llama.server.NativeServer: runs the full upstream llama.cpp HTTP
+// server (llama_server(), including its embedded WebUI) inside libjllama, driven over JNI. The
+// argv is forwarded verbatim from Java, so every llama-server flag is supported. This is an
+// independent server lifecycle (it loads its own model from the argv), distinct from LlamaModel
+// and the Java-side OpenAiCompatServer.
+//
+// Only ONE native server may run per process: server.cpp keeps its shutdown_handler /
+// is_terminating state in file-scope globals, so a second concurrent llama_server() would clobber
+// them. NativeServer enforces this on the Java side.
+
+#include "native_server_bridge.h"
+
+#include <jni.h>
+
+#include <atomic>
+#include <chrono>
+#include <cstdio>
+#include <exception>
+#include <memory>
+#include <string>
+#include <thread>
+#include <vector>
+
+namespace {
+
+// Owns the argv storage for the lifetime of the running server plus the worker thread that runs
+// llama_server(). The argv pointers reference the std::string storage in `args`, which is filled
+// once (with reserve) and never mutated afterwards, so the pointers stay valid.
+struct native_server {
+    std::vector<std::string> args; // args[0] is the program name ("llama-server")
+    std::vector<char *> argv;      // points into `args`, then one terminating nullptr
+    std::thread worker;
+    std::atomic<bool> finished{false};
+    int exit_code = -1;
+};
+
+// Raises java.lang.IllegalStateException with `message` unless a Java exception is already
+// pending, in which case the pending one is left in place for the caller to see.
+void throw_java_exception(JNIEnv *env, const char *message) {
+    if (env->ExceptionCheck()) {
+        return;
+    }
+    jclass cls = env->FindClass("java/lang/IllegalStateException");
+    if (cls != nullptr) {
+        env->ThrowNew(cls, message);
+        env->DeleteLocalRef(cls);
+    }
+}
+
+} // namespace
+
+extern "C" {
+
+// Builds the argv ("llama-server" followed by the Java strings), switches the upstream server to
+// embedded mode and starts llama_server() on a worker thread.
+//
+// Returns an opaque handle (a native_server pointer) to pass to stopNativeServer /
+// isRunningNative. On failure returns 0 with a Java exception pending; nothing is left running.
+JNIEXPORT jlong JNICALL Java_net_ladenthin_llama_server_NativeServer_startNativeServer(JNIEnv *env, jclass,
+                                                                                       jobjectArray jargs) {
+    // A C++ exception must never unwind through a JNI frame, so allocation and thread-creation
+    // failures are caught here and reported to Java instead.
+    try {
+        std::unique_ptr<native_server> srv(new native_server());
+
+        const jsize n = (jargs != nullptr) ? env->GetArrayLength(jargs) : 0;
+        srv->args.reserve(static_cast<size_t>(n) + 1);
+        srv->args.emplace_back("llama-server"); // argv[0]
+        for (jsize i = 0; i < n; ++i) {
+            auto js = static_cast<jstring>(env->GetObjectArrayElement(jargs, i));
+            if (js == nullptr) {
+                srv->args.emplace_back(""); // null element is forwarded as an empty argument
+                continue;
+            }
+            const char *chars = env->GetStringUTFChars(js, nullptr);
+            if (chars == nullptr) {
+                // GetStringUTFChars failed and left an exception pending. Abort the start so
+                // Java sees that exception and no server is left running without a handle.
+                env->DeleteLocalRef(js);
+                return 0;
+            }
+            srv->args.emplace_back(chars);
+            env->ReleaseStringUTFChars(js, chars);
+            env->DeleteLocalRef(js);
+        }
+
+        srv->argv.reserve(srv->args.size() + 1);
+        for (auto &arg : srv->args) {
+            srv->argv.push_back(const_cast<char *>(arg.c_str()));
+        }
+        srv->argv.push_back(nullptr); // argv[argc] is a null pointer, as for a real main()
+
+        // Embedded mode: no process signal handlers, honor the forwarded argv (see patches/0006).
+        llama_server_set_embedded(true);
+
+        native_server *raw = srv.get();
+        srv->worker = std::thread([raw]() {
+            try {
+                raw->exit_code = llama_server(static_cast<int>(raw->args.size()), raw->argv.data());
+            } catch (const std::exception &e) {
+                // Letting this escape the thread would call std::terminate and take the JVM down.
+                std::fprintf(stderr, "native_server: llama_server() threw: %s\n", e.what());
+            } catch (...) {
+                std::fprintf(stderr, "native_server: llama_server() threw an unknown exception\n");
+            }
+            raw->finished.store(true);
+        });
+
+        return reinterpret_cast<jlong>(srv.release());
+    } catch (const std::exception &e) {
+        throw_java_exception(env, e.what());
+    } catch (...) {
+        throw_java_exception(env, "failed to start the native server");
+    }
+    return 0;
+}
+
+// Requests shutdown of the server behind `handle`, waits for its worker thread to return and frees
+// the native state. A 0 handle is ignored. The handle must not be used again afterwards.
+JNIEXPORT void JNICALL Java_net_ladenthin_llama_server_NativeServer_stopNativeServer(JNIEnv *, jclass, jlong handle) {
+    auto *srv = reinterpret_cast<native_server *>(handle);
+    if (srv == nullptr) {
+        return;
+    }
+    // Signal shutdown, retrying until the worker actually returns: a stop issued before the server
+    // finished starting (shutdown_handler not yet installed by llama_server) would otherwise be
+    // lost. Once the handler is installed the first signal takes effect; if the model failed to
+    // load, llama_server has already returned and `finished` is set.
+    while (!srv->finished.load()) {
+        llama_server_request_shutdown();
+        if (srv->finished.load()) {
+            break;
+        }
+        std::this_thread::sleep_for(std::chrono::milliseconds(50));
+    }
+    if (srv->worker.joinable()) {
+        srv->worker.join();
+    }
+    delete srv;
+}
+
+// Returns JNI_TRUE while the worker behind `handle` has not yet returned from llama_server();
+// JNI_FALSE for a 0 handle or once it has finished.
+JNIEXPORT jboolean JNICALL Java_net_ladenthin_llama_server_NativeServer_isRunningNative(JNIEnv *, jclass,
+                                                                                        jlong handle) {
+    auto *srv = reinterpret_cast<native_server *>(handle);
+    return (srv != nullptr && !srv->finished.load()) ? JNI_TRUE : JNI_FALSE;
+}
+
+} // extern "C"
diff --git a/llama/src/main/cpp/native_server_bridge.h b/llama/src/main/cpp/native_server_bridge.h
new file mode 100644
index 00000000..1a40c766
--- /dev/null
+++ b/llama/src/main/cpp/native_server_bridge.h
@@ -0,0 +1,22 @@
+// SPDX-FileCopyrightText: 2026 Bernard Ladenthin <bernard.ladenthin@gmail.com>
+//
+// SPDX-License-Identifier: MIT
+
+#pragma once
+
+// Declarations for the upstream server entry point (llama.cpp tools/server/server.cpp) that
+// jllama's NativeServer JNI bridge (native_server.cpp) calls to run the full llama.cpp HTTP
+// server — WebUI included — inside libjllama, with no separate llama-server executable.
+//
+//  - llama_server: upstream's renamed main (b9859 already exposes `int llama_server(int, char**)`
+//    as a non-static, externally linkable function). Runs the server and blocks until shutdown,
+//    returning its process-style exit code (0 = clean).
+//  - llama_server_set_embedded / llama_server_request_shutdown: added by
+//    patches/0006-server-embed-native-server-jni.patch so the server can run embedded in the JVM
+//    (does not install process-wide signal handlers, and honors the forwarded argv instead of
+//    re-deriving it from the process command line) and can be stopped out-of-band (the SIGTERM
+//    path) since its server_context is local to llama_server().
+
+int  llama_server(int argc, char ** argv);
+void llama_server_set_embedded(bool embedded);
+void llama_server_request_shutdown();
diff --git a/llama/src/main/java/net/ladenthin/llama/LlamaModel.java b/llama/src/main/java/net/ladenthin/llama/LlamaModel.java
index 59644ebe..6dbe19d0 100644
--- a/llama/src/main/java/net/ladenthin/llama/LlamaModel.java
+++ b/llama/src/main/java/net/ladenthin/llama/LlamaModel.java
@@ -856,6 +856,18 @@ public boolean supportsAudio() {
         return getModelMeta().supportsAudio();
     }
 
+    /**
+     * Returns the loaded model's file type (quantization) as a human-readable string, e.g.
+     * {@code "Q8_0"} or {@code "Q4_K - Medium"} (llama.cpp {@code llama_ftype_name}); a guessed
+     * type is prefixed with {@code "(guessed) "}. Returns an empty string when the native layer does
+     * not report it.
+     *
+     * @return the quantization file-type label, or {@code ""} if absent
+     */
+    public String getModelFtype() {
+        return getModelMeta().getFtype();
+    }
+
     native String getModelMetaJson();
 
     /**
diff --git a/llama/src/main/java/net/ladenthin/llama/loader/OSInfo.java b/llama/src/main/java/net/ladenthin/llama/loader/OSInfo.java
index 21cf7a3b..138b5900 100644
--- a/llama/src/main/java/net/ladenthin/llama/loader/OSInfo.java
+++ b/llama/src/main/java/net/ladenthin/llama/loader/OSInfo.java
@@ -114,6 +114,8 @@ public OSInfo() {}
     public static final String PPC64 = "ppc64";
     /** Folder name for 64-bit RISC-V. */
     public static final String RISCV64 = "riscv64";
+    /** Folder name for 64-bit IBM Z (s390x, big-endian). */
+    public static final String S390X = "s390x";
 
     static {
         // x86 mappings
@@ -155,6 +157,8 @@ public OSInfo() {}
         archMapping.put("ppc64le", PPC64);
 
         archMapping.put(RISCV64, RISCV64);
+
+        archMapping.put(S390X, S390X);
     }
 
     /**
diff --git a/llama/src/main/java/net/ladenthin/llama/parameters/InferenceParameters.java b/llama/src/main/java/net/ladenthin/llama/parameters/InferenceParameters.java
index a47ee190..a831234e 100644
--- a/llama/src/main/java/net/ladenthin/llama/parameters/InferenceParameters.java
+++ b/llama/src/main/java/net/ladenthin/llama/parameters/InferenceParameters.java
@@ -61,6 +61,7 @@ public final class InferenceParameters extends JsonParameters {
     private static final String PARAM_CACHE_REUSE = "n_cache_reuse";
     private static final String PARAM_SLOT_ID = "id_slot";
     private static final String PARAM_STREAM_OPTIONS = "stream_options";
+    private static final String PARAM_SSE_PING_INTERVAL = "sse_ping_interval";
     private static final String PARAM_RESPONSE_FORMAT = "response_format";
     private static final String PARAM_N_PREDICT = "n_predict";
     private static final String PARAM_TOP_K = "top_k";
@@ -108,6 +109,16 @@ public final class InferenceParameters extends JsonParameters {
     private static final String PARAM_DRY_ALLOWED_LENGTH = "dry_allowed_length";
     private static final String PARAM_DRY_PENALTY_LAST_N = "dry_penalty_last_n";
     private static final String PARAM_DRY_SEQUENCE_BREAKERS = "dry_sequence_breakers";
+    // Additional completion-schema fields honored by the native parser (eval_llama_cmpl_schema)
+    // but previously not surfaced as withers. All plain scalars.
+    private static final String PARAM_XTC_PROBABILITY = "xtc_probability";
+    private static final String PARAM_XTC_THRESHOLD = "xtc_threshold";
+    private static final String PARAM_N_DISCARD = "n_discard";
+    private static final String PARAM_N_INDENT = "n_indent";
+    private static final String PARAM_T_MAX_PREDICT_MS = "t_max_predict_ms";
+    private static final String PARAM_POST_SAMPLING_PROBS = "post_sampling_probs";
+    private static final String PARAM_TIMINGS_PER_TOKEN = "timings_per_token";
+    private static final String PARAM_RETURN_TOKENS = "return_tokens";
 
     private static final InferenceParameters EMPTY = new InferenceParameters();
 
@@ -868,4 +879,113 @@ public InferenceParameters withContinueFinalMessage(ContinuationMode mode) {
     public InferenceParameters withStream(boolean stream) {
         return withScalar(PARAM_STREAM, stream);
     }
+
+    /**
+     * Returns a new request with the SSE ping interval replaced (llama.cpp {@code sse_ping_interval},
+     * added upstream in b9864). In {@code stream} mode the server emits an SSE comment ping every
+     * {@code seconds} while the stream stays silent (e.g. during long prompt processing), keeping the
+     * connection observable; this per-request value overrides the server's {@code --sse-ping-interval}
+     * setting. Use {@code -1} to disable pings. Default: the server setting (30&nbsp;s upstream).
+     *
+     * @param seconds interval in seconds between SSE comment pings, or {@code -1} to disable
+     * @return a new instance; this instance is unchanged
+     */
+    public InferenceParameters withSsePingInterval(int seconds) {
+        return withScalar(PARAM_SSE_PING_INTERVAL, seconds);
+    }
+
+    /**
+     * Returns a new request with the XTC (Exclude Top Choices) sampler probability replaced
+     * ({@code xtc_probability}, default 0.0 = disabled). At each step, with this probability the
+     * sampler removes all but the least-likely of the tokens above {@link #withXtcThreshold(float)},
+     * flattening over-confident distributions.
+     *
+     * @param xtcProbability the XTC trigger probability in {@code [0, 1]} (0 disables XTC)
+     * @return a new instance; this instance is unchanged
+     */
+    public InferenceParameters withXtcProbability(float xtcProbability) {
+        return withScalar(PARAM_XTC_PROBABILITY, xtcProbability);
+    }
+
+    /**
+     * Returns a new request with the XTC sampler threshold replaced ({@code xtc_threshold},
+     * default 0.1). Only tokens whose probability is at least this value are eligible for XTC removal.
+     *
+     * @param xtcThreshold the minimum token probability considered by XTC
+     * @return a new instance; this instance is unchanged
+     */
+    public InferenceParameters withXtcThreshold(float xtcThreshold) {
+        return withScalar(PARAM_XTC_THRESHOLD, xtcThreshold);
+    }
+
+    /**
+     * Returns a new request with the number of tokens discarded on a context shift replaced
+     * ({@code n_discard}, default 0 = discard half of {@code n_ctx - n_keep}). When the context fills,
+     * the oldest {@code n_discard} tokens after the kept prefix are dropped to make room.
+     *
+     * @param nDiscard tokens to discard on context shift (0 = half)
+     * @return a new instance; this instance is unchanged
+     */
+    public InferenceParameters withNDiscard(int nDiscard) {
+        return withScalar(PARAM_N_DISCARD, nDiscard);
+    }
+
+    /**
+     * Returns a new request with the infill indentation hint replaced ({@code n_indent}, default 0).
+     * Used with {@link #withInputPrefix(String)} / {@link #withInputSuffix(String)}: generated infill
+     * lines are required to be indented at least this many columns, which helps code models keep block
+     * structure.
+     *
+     * @param nIndent minimum indentation (columns) for infilled lines
+     * @return a new instance; this instance is unchanged
+     */
+    public InferenceParameters withNIndent(int nIndent) {
+        return withScalar(PARAM_N_INDENT, nIndent);
+    }
+
+    /**
+     * Returns a new request with a wall-clock generation-time budget replaced ({@code t_max_predict_ms},
+     * default -1 = no limit). Generation stops once it has run for this many milliseconds, regardless of
+     * {@link #withNPredict(int)} — useful as an agentic/interactive latency guard.
+     *
+     * @param tMaxPredictMs maximum generation time in milliseconds (-1 = no limit)
+     * @return a new instance; this instance is unchanged
+     */
+    public InferenceParameters withTMaxPredictMs(int tMaxPredictMs) {
+        return withScalar(PARAM_T_MAX_PREDICT_MS, tMaxPredictMs);
+    }
+
+    /**
+     * Returns a new request toggling post-sampling token probabilities ({@code post_sampling_probs},
+     * default false). When true, the {@code n_probs} probabilities are reported <em>after</em> the full
+     * sampling chain is applied rather than from the raw logits.
+     *
+     * @param postSamplingProbs whether to report probabilities after sampling
+     * @return a new instance; this instance is unchanged
+     */
+    public InferenceParameters withPostSamplingProbs(boolean postSamplingProbs) {
+        return withScalar(PARAM_POST_SAMPLING_PROBS, postSamplingProbs);
+    }
+
+    /**
+     * Returns a new request toggling per-token timing telemetry ({@code timings_per_token},
+     * default false). When true, streamed responses carry per-token timing information.
+     *
+     * @param timingsPerToken whether to include per-token timings
+     * @return a new instance; this instance is unchanged
+     */
+    public InferenceParameters withTimingsPerToken(boolean timingsPerToken) {
+        return withScalar(PARAM_TIMINGS_PER_TOKEN, timingsPerToken);
+    }
+
+    /**
+     * Returns a new request toggling raw token-id output ({@code return_tokens}, default false).
+     * When true, the response includes the generated token ids alongside the decoded text.
+     *
+     * @param returnTokens whether to include raw token ids in the response
+     * @return a new instance; this instance is unchanged
+     */
+    public InferenceParameters withReturnTokens(boolean returnTokens) {
+        return withScalar(PARAM_RETURN_TOKENS, returnTokens);
+    }
 }
diff --git a/llama/src/main/java/net/ladenthin/llama/parameters/ModelParameters.java b/llama/src/main/java/net/ladenthin/llama/parameters/ModelParameters.java
index ce62131b..a8c6965b 100644
--- a/llama/src/main/java/net/ladenthin/llama/parameters/ModelParameters.java
+++ b/llama/src/main/java/net/ladenthin/llama/parameters/ModelParameters.java
@@ -1142,6 +1142,42 @@ public ModelParameters setSlotPromptSimilarity(float similarity) {
         return putScalar("--slot-prompt-similarity", similarity);
     }
 
+    /**
+     * Set the maximum number of context checkpoints kept per slot (default: 32; 0 disables
+     * checkpointing).
+     *
+     * <p>Context checkpoints let the server roll a slot back to an earlier state instead of
+     * re-processing the whole prompt when a follow-up request diverges from the cached tokens.
+     * They are essential for models that cannot truncate their state to an arbitrary position:
+     * recurrent/hybrid architectures (e.g. Granite-4, Mamba, Jamba) and SWA models. Each
+     * checkpoint costs host memory proportional to the model's recurrent/SWA state size, so
+     * lower this value on memory-constrained machines or raise it for very long multi-turn
+     * (agentic tool-calling) sessions.</p>
+     *
+     * @param ctxCheckpoints the maximum number of context checkpoints per slot
+     * @return this builder
+     */
+    public ModelParameters setCtxCheckpoints(int ctxCheckpoints) {
+        return putScalar("--ctx-checkpoints", ctxCheckpoints);
+    }
+
+    /**
+     * Set the minimum spacing between context checkpoints in tokens (default: 8192; 0 = no
+     * minimum).
+     *
+     * <p>Smaller values create checkpoints more often, improving prompt-cache reuse for
+     * multi-turn conversations at the cost of more host memory (bounded by
+     * {@link #setCtxCheckpoints(int)}). This matters most for recurrent/hybrid models
+     * (e.g. Granite-4), whose state can only be rolled back to a checkpoint — with sparse
+     * checkpoints a follow-up request may have to re-process most of the conversation.</p>
+     *
+     * @param checkpointMinStep the minimum number of tokens between two checkpoints (must not be negative)
+     * @return this builder
+     */
+    public ModelParameters setCheckpointMinStep(int checkpointMinStep) {
+        return putScalar("--checkpoint-min-step", checkpointMinStep);
+    }
+
     /**
      * Load LoRA adapters without applying them (apply later via POST /lora-adapters).
      *
diff --git a/llama/src/main/java/net/ladenthin/llama/server/NativeServer.java b/llama/src/main/java/net/ladenthin/llama/server/NativeServer.java
index 024ac827..65caf6c8 100644
--- a/llama/src/main/java/net/ladenthin/llama/server/NativeServer.java
+++ b/llama/src/main/java/net/ladenthin/llama/server/NativeServer.java
@@ -5,105 +5,241 @@
 package net.ladenthin.llama.server;
 
 import java.util.Objects;
+import java.util.concurrent.CountDownLatch;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicBoolean;
 import lombok.ToString;
+import net.ladenthin.llama.loader.LlamaLoader;
 
 /**
- * Scaffold for the <em>native</em> HTTP server bridge — the planned counterpart to
- * {@link OpenAiCompatServer}.
+ * Runs the <em>full</em> upstream llama.cpp HTTP server — including its embedded
+ * <strong>WebUI</strong> — inside {@code libjllama}, driven over JNI, with no separate
+ * {@code llama-server} executable. It is the second of two server modes, the native counterpart to
+ * the Java-transport {@link OpenAiCompatServer}.
  *
- * <p>{@link OpenAiCompatServer} implements the HTTP transport in Java (on the JDK's
- * {@code com.sun.net.httpserver}) and drives the native llama.cpp server <em>core</em> over JNI. This
- * class is instead the entry point for the upstream <em>native</em> HTTP transport that is already
- * compiled into {@code libjllama} (llama.cpp's {@code server-http.cpp} plus its {@code cpp-httplib}
- * backend). That native transport is the only component able to serve the embedded llama.cpp
- * <strong>WebUI</strong> (the {@code ui.cpp}/{@code ui.h} asset table compiled in behind
- * {@code LLAMA_UI_HAS_ASSETS}).</p>
+ * <p>The constructor takes the raw llama-server command-line arguments and forwards them verbatim
+ * to the native entry point ({@code llama_server}), so <em>every</em> llama-server flag is supported
+ * ({@code -m}, {@code -c}, {@code -b}, {@code -ub}, {@code -ngl}, {@code -t}, {@code -tb},
+ * {@code -ctk}, {@code -ctv}, {@code --jinja}, {@code --chat-template-kwargs}, {@code --host},
+ * {@code --port}, {@code --ui}/{@code --no-ui}, …). Unlike {@link OpenAiCompatServer}, no per-flag
+ * Java mapping is involved.</p>
  *
- * <p><strong>Status: scaffold only.</strong> The route registration that upstream performs in
- * {@code server.cpp} (deliberately excluded from this build) is not yet wired to a JNI entry point, so
- * {@link #start()} throws {@link UnsupportedOperationException} for now. This class only fixes the
- * package structure and the public API shape; the native {@code startServer}/{@code stopServer}
- * methods, their C++ implementation, the server lifecycle/threading and WebUI serving are a separate,
- * detailed step (see {@code CLAUDE.md}, "WebUI (llama.cpp Svelte UI) embedding").</p>
+ * <p><strong>Independent lifecycle.</strong> {@code NativeServer} loads its <em>own</em> model from
+ * the forwarded arguments — exactly like running {@code llama-server.exe} — and is unrelated to any
+ * {@code net.ladenthin.llama.LlamaModel} you may also have open. Reusing an already-loaded
+ * {@code LlamaModel}'s context instead of loading a second copy is a possible future enhancement
+ * (see {@code TODO.md}). While the native server runs it owns the process-wide llama backend and
+ * routes llama.cpp logging to stderr/file (llama-server's own logging), not the JNI log callback.</p>
  *
- * <p>It is {@link AutoCloseable} so that, once implemented, callers can drive it with
- * try-with-resources exactly like {@link OpenAiCompatServer}.</p>
+ * <p><strong>Single instance per process.</strong> The upstream server keeps its shutdown state in
+ * file-scope globals, so only one {@code NativeServer} may run at a time; {@link #start()} throws if
+ * another instance is already running.</p>
+ *
+ * <p>Typical use:</p>
+ * <pre>{@code
+ * try (NativeServer server = new NativeServer(
+ *         "-m", "models/model.gguf", "--host", "127.0.0.1", "--port", "8080", "-c", "65536").start()) {
+ *     // Server (and WebUI at http://127.0.0.1:8080/) runs on a native worker thread.
+ *     // Readiness: poll GET /health until it returns {"status":"ok"}.
+ *     Thread.currentThread().join();
+ * }
+ * }</pre>
+ *
+ * <p><strong>Platform note.</strong> The native methods are compiled into {@code libjllama} on all
+ * platforms except Android (the upstream server pulls in {@code posix_spawn_*}, unavailable there);
+ * on Android use {@link OpenAiCompatServer}. No SSL: the embedded server is plain HTTP — bind
+ * localhost or front it with a TLS proxy.</p>
  */
 @ToString
 public final class NativeServer implements AutoCloseable {
 
-    /** Message thrown by {@link #start()} until the native route-wiring lands. */
-    static final String NOT_WIRED_MESSAGE =
-            "NativeServer is a scaffold: the upstream native HTTP routes (server-http.cpp) are "
-                    + "not yet wired to JNI. Use OpenAiCompatServer for now; the native server and "
-                    + "embedded WebUI are a planned step.";
+    /** Guards the process-wide single-instance invariant (upstream uses file-scope globals). */
+    private static final AtomicBoolean RUNNING = new AtomicBoolean(false);
+
+    /** Default bind host reported by {@link #getHost()} when {@code --host} is not passed. */
+    private static final String DEFAULT_HOST = "127.0.0.1";
+
+    /** Default port reported by {@link #getPort()} when no port flag is passed. */
+    private static final int DEFAULT_PORT = 8080;
+
+    /** The llama-server argument vector, forwarded verbatim to the native entry point. */
+    private final String[] args;
 
-    /** Immutable server configuration (bind host, port, ...) shared with {@link OpenAiCompatServer}. */
-    private final OpenAiServerConfig config;
+    /** Native handle (pointer) while running, or {@code 0} when not started / stopped. */
+    private volatile long handle;
 
     /**
-     * Creates a native-server bridge for the given configuration.
+     * Creates a native-server bridge for the given llama-server arguments.
      *
-     * <p>Construction performs no native work and binds no socket; it only captures the configuration.
-     * Call {@link #start()} to launch the server (not implemented yet).</p>
+     * <p>Construction performs no native work and binds no socket; it only captures the arguments.
+     * Call {@link #start()} to launch the server.</p>
      *
-     * @param config the server configuration (host, port, ...); must not be {@code null}
+     * @param args the llama-server command-line arguments (e.g. {@code "-m", "model.gguf",
+     *             "--port", "8080"}); must not be {@code null} and must not contain {@code null}
+     *             elements
      */
-    public NativeServer(OpenAiServerConfig config) {
-        this.config = Objects.requireNonNull(config, "config");
+    public NativeServer(String... args) {
+        Objects.requireNonNull(args, "args");
+        for (final String arg : args) {
+            Objects.requireNonNull(arg, "args element");
+        }
+        this.args = args.clone();
     }
 
     /**
-     * Starts the native HTTP server and begins serving the embedded WebUI.
+     * Starts the native HTTP server (and its embedded WebUI) on a background thread and returns
+     * immediately. The server binds and begins serving {@code GET /health} before the model finishes
+     * loading; poll {@code /health} for readiness.
      *
-     * <p><strong>Not implemented yet</strong> — this is a scaffold. The native route registration and
-     * its JNI binding are a planned step, so this method always throws until then.</p>
-     *
-     * @return this server instance (for fluent / try-with-resources use), once implemented
-     * @throws UnsupportedOperationException always, until the native routes are wired to JNI
+     * @return this server instance (for fluent / try-with-resources use)
+     * @throws IllegalStateException if this instance was already started, or another
+     *                               {@code NativeServer} is already running in this process
      */
-    // Scaffold: start() intentionally always throws for now, but must stay callable (not @DoNotCall)
-    // so the real implementation and its callers/tests keep the same signature.
-    @SuppressWarnings("DoNotCallSuggester")
     public NativeServer start() {
-        throw new UnsupportedOperationException(NOT_WIRED_MESSAGE);
+        if (handle != 0) {
+            throw new IllegalStateException("NativeServer already started");
+        }
+        if (!RUNNING.compareAndSet(false, true)) {
+            throw new IllegalStateException(
+                    "another NativeServer is already running in this process (only one is supported)");
+        }
+        try {
+            // Load libjllama lazily here (not in a static initializer) so construction, argument
+            // parsing and close() stay usable — and unit-testable — without the native library.
+            LlamaLoader.initialize();
+            handle = startNativeServer(args);
+        } catch (final RuntimeException | Error e) {
+            RUNNING.set(false);
+            throw e;
+        }
+        return this;
     }
 
     /**
-     * Reports whether the native server is currently running.
+     * Reports whether the native server worker is currently running.
+     *
+     * <p>Note: this becomes {@code true} as soon as the worker thread starts, which is before the
+     * socket is necessarily accepting connections — use {@code GET /health} to detect readiness.</p>
      *
-     * @return {@code false} — the scaffold never starts a server yet
+     * @return {@code true} if the server has been started and its worker has not yet exited
      */
     public boolean isRunning() {
-        return false;
+        final long h = handle;
+        return h != 0 && isRunningNative(h);
     }
 
     /**
-     * Returns the host the server is configured to bind to.
+     * Returns the bind host parsed from the arguments ({@code --host}), or {@code 127.0.0.1} when
+     * absent. Best-effort convenience for logging; the authoritative value is what the native server
+     * parsed.
      *
      * @return the configured bind host
      */
     public String getHost() {
-        return config.getHost();
+        for (int i = 0; i < args.length - 1; i++) {
+            if ("--host".equals(args[i])) {
+                return args[i + 1];
+            }
+        }
+        return DEFAULT_HOST;
     }
 
     /**
-     * Returns the port the server is configured to bind to.
+     * Returns the port parsed from the arguments ({@code --port} / {@code -p}), or {@code 8080} when
+     * absent or unparseable. Best-effort convenience for logging.
      *
      * @return the configured port
      */
     public int getPort() {
-        return config.getPort();
+        for (int i = 0; i < args.length - 1; i++) {
+            if ("--port".equals(args[i]) || "-p".equals(args[i])) {
+                try {
+                    return Integer.parseInt(args[i + 1].trim());
+                } catch (final NumberFormatException e) {
+                    return DEFAULT_PORT;
+                }
+            }
+        }
+        return DEFAULT_PORT;
     }
 
     /**
-     * Stops the native server if it is running.
-     *
-     * <p>No-op in the scaffold (nothing is ever started), so it is always safe to call, including from
-     * try-with-resources. Real lifecycle teardown is part of the planned native-server implementation.</p>
+     * Stops the native server if it is running and releases the native handle. Blocks until the
+     * server has fully shut down. Safe to call more than once and from try-with-resources even if
+     * {@link #start()} was never called (no-op then).
      */
     @Override
     public void close() {
-        // Nothing is started yet, so there is nothing to release.
+        final long h = handle;
+        if (h == 0) {
+            return;
+        }
+        handle = 0;
+        try {
+            stopNativeServer(h);
+        } finally {
+            RUNNING.set(false);
+        }
     }
+
+    /**
+     * Fat-jar entry point (the assembly JAR's {@code Main-Class}): starts the full native llama.cpp
+     * server — WebUI included — forwarding every argument to it verbatim, and blocks until the
+     * server exits or the JVM is asked to shut down (Ctrl-C / SIGTERM), stopping the server cleanly
+     * on the way out.
+     *
+     * <p>This is the default runnable server. The Java-transport {@link OpenAiCompatServer} remains
+     * available via its own {@code main} — run it explicitly with
+     * {@code java -cp <jar> net.ladenthin.llama.server.OpenAiCompatServer …}.</p>
+     *
+     * @param args the llama-server command-line arguments, forwarded verbatim (e.g. {@code -m
+     *             model.gguf --host 127.0.0.1 --port 8080}); pass {@code --help} for the full
+     *             llama-server option list
+     * @throws InterruptedException if interrupted while waiting for the server to exit
+     */
+    public static void main(String[] args) throws InterruptedException {
+        // Own the server in a try/finally so close() is guaranteed on normal or exceptional exit of
+        // the block (satisfies S2095 via the "close in a finally clause" option — try-with-resources
+        // is not used because the shutdown hook must also call close() explicitly, which javac flags
+        // under -Werror as an "explicit call to close() on an auto-closeable resource"). close() is
+        // idempotent (guards on a zero handle), so the finally and the hook both firing is safe.
+        final NativeServer server = new NativeServer(args);
+        try {
+            // Signalled by the shutdown hook so the main thread wakes immediately on Ctrl-C / SIGTERM
+            // rather than waiting out a poll tick — and so the wait uses a bounded latch await instead
+            // of Thread.sleep (banned by LlamaArchitectureTest.noThreadSleep).
+            final CountDownLatch stopSignal = new CountDownLatch(1);
+            // Graceful Ctrl-C / SIGTERM: the embedded server installs no signal handlers of its own
+            // (see patches/0006), so the JVM-level shutdown hook is what stops it before exit.
+            Runtime.getRuntime()
+                    .addShutdownHook(new Thread(
+                            () -> {
+                                server.close();
+                                stopSignal.countDown();
+                            },
+                            "jllama-native-server-shutdown"));
+            server.start();
+            // Keep the JVM alive until the native worker exits — on its own (e.g. a fatal startup/model
+            // error that llama_server has already logged) or because the shutdown hook stopped it. The
+            // bounded await returns early when the hook fires; on timeout we re-check isRunning() to
+            // catch a self-terminated worker.
+            while (server.isRunning() && !stopSignal.await(200L, TimeUnit.MILLISECONDS)) {
+                // wait for the native worker to exit or the shutdown hook to fire
+            }
+        } finally {
+            server.close();
+        }
+    }
+
+    /**
+     * Starts the native server on a worker thread and returns an opaque handle. The argv is
+     * forwarded verbatim (with a synthetic {@code argv[0]}).
+     */
+    private static native long startNativeServer(String[] args);
+
+    /** Signals shutdown, joins the worker thread, and frees the handle. */
+    private static native void stopNativeServer(long handle);
+
+    /** Whether the worker thread for the given handle is still running. */
+    private static native boolean isRunningNative(long handle);
 }
diff --git a/llama/src/main/java/net/ladenthin/llama/server/OpenAiCompatServer.java b/llama/src/main/java/net/ladenthin/llama/server/OpenAiCompatServer.java
index b93e7766..bdd9befd 100644
--- a/llama/src/main/java/net/ladenthin/llama/server/OpenAiCompatServer.java
+++ b/llama/src/main/java/net/ladenthin/llama/server/OpenAiCompatServer.java
@@ -748,7 +748,7 @@ private void handleModels(HttpExchange exchange) throws IOException {
                 sendError(exchange, HTTP_UNAUTHORIZED, ERROR_TYPE_REQUEST, "Missing or invalid API key");
                 return;
             }
-            sendJson(exchange, HTTP_OK, OpenAiSseFormatter.modelsJson(config.getModelId()));
+            sendJson(exchange, HTTP_OK, OpenAiSseFormatter.modelsJson(config.getModelId(), config.getModelFtype()));
         } finally {
             exchange.close();
         }
@@ -1064,7 +1064,7 @@ public static void main(String[] args) throws IOException {
                         "jllama-openai-shutdown"));
 
         try (LlamaModel model = new LlamaModel(options.toModelParameters())) {
-            OpenAiServerConfig config = options.toServerConfig(model.supportsVision());
+            OpenAiServerConfig config = options.toServerConfig(model.supportsVision(), model.getModelFtype());
             try (OpenAiCompatServer server = new OpenAiCompatServer(model, config)) {
                 server.start();
                 printReady(config, server.getPort());
diff --git a/llama/src/main/java/net/ladenthin/llama/server/OpenAiServerCli.java b/llama/src/main/java/net/ladenthin/llama/server/OpenAiServerCli.java
index f32728b8..3e04d5fe 100644
--- a/llama/src/main/java/net/ladenthin/llama/server/OpenAiServerCli.java
+++ b/llama/src/main/java/net/ladenthin/llama/server/OpenAiServerCli.java
@@ -4,8 +4,15 @@
 
 package net.ladenthin.llama.server;
 
+import com.fasterxml.jackson.core.JsonProcessingException;
+import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.ObjectMapper;
 import java.nio.file.Path;
 import java.nio.file.Paths;
+import java.util.Collections;
+import java.util.LinkedHashMap;
+import java.util.Map;
+import net.ladenthin.llama.args.CacheType;
 import net.ladenthin.llama.parameters.ModelParameters;
 import org.jspecify.annotations.Nullable;
 
@@ -19,8 +26,11 @@
  * via {@link #isHelpRequested(String[])} so callers can print help without it being treated as an error.
  *
  * <p>Flags mirror llama.cpp's own server where they overlap ({@code -m}, {@code -p}, {@code -c},
- * {@code -ngl}, {@code -t}); a few legacy spellings are accepted as aliases so earlier documented
- * invocations keep working.
+ * {@code -b}, {@code -ub}, {@code -ngl}, {@code -t}, {@code -tb}, {@code -ctk}, {@code -ctv},
+ * {@code --jinja}, {@code --chat-template-kwargs}); a few legacy spellings are accepted as aliases so
+ * earlier documented invocations keep working. The {@code --chat-template-kwargs} JSON is parsed here
+ * (the only JSON this otherwise dependency-light parser touches) so a malformed object fails fast with
+ * usage text rather than at native model load.
  */
 public final class OpenAiServerCli {
 
@@ -65,7 +75,14 @@ public static Options parse(String... args) {
         int ctxSize = 0;
         int gpuLayers = 0;
         int threads = 0;
+        int threadsBatch = 0;
         int parallel = 0;
+        int batchSize = 0;
+        int ubatchSize = 0;
+        @Nullable CacheType cacheTypeK = null;
+        @Nullable CacheType cacheTypeV = null;
+        boolean jinja = false;
+        @Nullable Map<String, String> chatTemplateKwargs = null;
         boolean embedding = false;
         boolean reranking = false;
 
@@ -97,6 +114,32 @@ public static Options parse(String... args) {
                 case "--threads":
                     threads = intValue(args, ++i, arg);
                     break;
+                case "-tb":
+                case "--threads-batch":
+                    threadsBatch = intValue(args, ++i, arg);
+                    break;
+                case "-b":
+                case "--batch-size":
+                    batchSize = intValue(args, ++i, arg);
+                    break;
+                case "-ub":
+                case "--ubatch-size":
+                    ubatchSize = intValue(args, ++i, arg);
+                    break;
+                case "-ctk":
+                case "--cache-type-k":
+                    cacheTypeK = cacheTypeValue(args, ++i, arg);
+                    break;
+                case "-ctv":
+                case "--cache-type-v":
+                    cacheTypeV = cacheTypeValue(args, ++i, arg);
+                    break;
+                case "--jinja":
+                    jinja = true;
+                    break;
+                case "--chat-template-kwargs":
+                    chatTemplateKwargs = parseChatTemplateKwargs(nextValue(args, ++i, arg), arg);
+                    break;
                 case "--parallel":
                     parallel = intValue(args, ++i, arg);
                     break;
@@ -131,7 +174,24 @@ public static Options parse(String... args) {
             throw error("Missing required argument: -m/--model <path-to-gguf>");
         }
         return new Options(
-                host, port, modelPath, modelId, apiKey, mmproj, ctxSize, gpuLayers, threads, parallel, embedding,
+                host,
+                port,
+                modelPath,
+                modelId,
+                apiKey,
+                mmproj,
+                ctxSize,
+                gpuLayers,
+                threads,
+                threadsBatch,
+                parallel,
+                batchSize,
+                ubatchSize,
+                cacheTypeK,
+                cacheTypeV,
+                jinja,
+                chatTemplateKwargs,
+                embedding,
                 reranking);
     }
 
@@ -155,8 +215,16 @@ public static String usage() {
                 "  --host <host>              Interface to bind (default: " + DEFAULT_HOST + ")",
                 "  -p,  --port <port>         TCP port to listen on (default: " + DEFAULT_PORT + ")",
                 "  -c,  --ctx-size <n>        Context window size (default: llama.cpp default)",
+                "  -b,  --batch-size <n>      Logical (prompt) batch size (default: llama.cpp default)",
+                "  -ub, --ubatch-size <n>     Physical (micro) batch size (default: llama.cpp default)",
                 "  -ngl,--n-gpu-layers <n>    Layers to offload to GPU (default: 0 = CPU only)",
                 "  -t,  --threads <n>         Inference thread count (default: llama.cpp default)",
+                "  -tb, --threads-batch <n>   Thread count for batch/prompt processing (default: same as -t)",
+                "  -ctk,--cache-type-k <t>    KV cache K quantization: " + cacheTypeChoices() + " (default: f16)",
+                "  -ctv,--cache-type-v <t>    KV cache V quantization: " + cacheTypeChoices() + " (default: f16)",
+                "  --jinja                    Use the model's Jinja chat template",
+                "  --chat-template-kwargs <j> JSON object of chat-template variables (requires --jinja),",
+                "                             e.g. {\"reasoning_effort\":\"low\"}",
                 "  --parallel <n>             Parallel inference slots (default: llama.cpp default)",
                 "  --model-id <name>          Model id reported by /v1/models (default: file name)",
                 "  --api-key <key>            Require an 'Authorization: Bearer <key>' header",
@@ -191,6 +259,53 @@ private static int intValue(String[] args, int valueIndex, String flag) {
         }
     }
 
+    /** Reusable parser for the {@code --chat-template-kwargs} JSON object; no state, thread-safe. */
+    private static final ObjectMapper CHAT_TEMPLATE_KWARGS_MAPPER = new ObjectMapper();
+
+    /** Resolves the (trimmed) value after {@code flag} to a {@link CacheType}, ignoring case. */
+    private static CacheType cacheTypeValue(String[] args, int valueIndex, String flag) {
+        final String requested = nextValue(args, valueIndex, flag).trim();
+        for (final CacheType candidate : CacheType.values()) {
+            if (candidate.getArgValue().equalsIgnoreCase(requested)) {
+                return candidate;
+            }
+        }
+        throw error(flag + " expects one of " + cacheTypeChoices() + ", got: " + requested);
+    }
+
+    /** Lists every {@link CacheType} arg value, separated by {@code ", "}, in declaration order. */
+    private static String cacheTypeChoices() {
+        final StringBuilder choices = new StringBuilder();
+        for (final CacheType candidate : CacheType.values()) {
+            final String separator = choices.length() > 0 ? ", " : "";
+            choices.append(separator).append(candidate.getArgValue());
+        }
+        return choices.toString();
+    }
+
+    /**
+     * Parse a {@code --chat-template-kwargs} JSON object into the raw-per-value map that
+     * {@link ModelParameters#setChatTemplateKwargs(Map)} expects: each entry's value is kept as its
+     * raw JSON text (a string stays quoted, a boolean/number stays bare) and insertion order is
+     * preserved. Anything but exactly one JSON object, trailing content included, is rejected.
+     */
+    private static Map<String, String> parseChatTemplateKwargs(String json, String flag) {
+        final JsonNode root;
+        try {
+            root = CHAT_TEMPLATE_KWARGS_MAPPER
+                    .reader(com.fasterxml.jackson.databind.DeserializationFeature.FAIL_ON_TRAILING_TOKENS)
+                    .readTree(json);
+        } catch (JsonProcessingException e) {
+            throw error(flag + " expects a JSON object (e.g. {\"reasoning_effort\":\"low\"}), got: " + json, e);
+        }
+        if (root == null || !root.isObject()) {
+            throw error(flag + " expects a JSON object (e.g. {\"reasoning_effort\":\"low\"}), got: " + json);
+        }
+        final Map<String, String> kwargs = new LinkedHashMap<>();
+        root.properties().forEach(field -> kwargs.put(field.getKey(), field.getValue().toString()));
+        return Collections.unmodifiableMap(kwargs);
+    }
+
     private static IllegalArgumentException error(String message) {
         return error(message, null);
     }
@@ -200,10 +315,12 @@ private static IllegalArgumentException error(String message, @Nullable Throwabl
     }
 
     /**
-     * Immutable, parsed launcher options. {@code ctxSize}, {@code threads} and {@code parallel} use
-     * {@code 0} as a sentinel meaning "leave the llama.cpp default" — they are only applied to
-     * {@link ModelParameters} when positive. {@code gpuLayers} is always applied (its own default of
-     * {@code 0} already means CPU-only).
+     * Immutable, parsed launcher options. The integer tuning knobs — {@code ctxSize},
+     * {@code threads}, {@code threadsBatch}, {@code parallel}, {@code batchSize} and
+     * {@code ubatchSize} — use {@code 0} as a sentinel meaning "leave the llama.cpp default", and are
+     * only applied to {@link ModelParameters} when positive. {@code cacheTypeK}/{@code cacheTypeV}
+     * and {@code chatTemplateKwargs} use {@code null} as the same "leave the default" sentinel.
+     * {@code gpuLayers} is always applied (its own default of {@code 0} already means CPU-only).
      */
     public static final class Options {
 
@@ -216,7 +333,14 @@ public static final class Options {
         private final int ctxSize;
         private final int gpuLayers;
         private final int threads;
+        private final int threadsBatch;
         private final int parallel;
+        private final int batchSize;
+        private final int ubatchSize;
+        private final @Nullable CacheType cacheTypeK;
+        private final @Nullable CacheType cacheTypeV;
+        private final boolean jinja;
+        private final @Nullable Map<String, String> chatTemplateKwargs;
         private final boolean embedding;
         private final boolean reranking;
 
@@ -230,7 +354,14 @@ private Options(
                 int ctxSize,
                 int gpuLayers,
                 int threads,
+                int threadsBatch,
                 int parallel,
+                int batchSize,
+                int ubatchSize,
+                @Nullable CacheType cacheTypeK,
+                @Nullable CacheType cacheTypeV,
+                boolean jinja,
+                @Nullable Map<String, String> chatTemplateKwargs,
                 boolean embedding,
                 boolean reranking) {
             this.host = host;
@@ -242,7 +373,14 @@ private Options(
             this.ctxSize = ctxSize;
             this.gpuLayers = gpuLayers;
             this.threads = threads;
+            this.threadsBatch = threadsBatch;
             this.parallel = parallel;
+            this.batchSize = batchSize;
+            this.ubatchSize = ubatchSize;
+            this.cacheTypeK = cacheTypeK;
+            this.cacheTypeV = cacheTypeV;
+            this.jinja = jinja;
+            this.chatTemplateKwargs = chatTemplateKwargs;
             this.embedding = embedding;
             this.reranking = reranking;
         }
@@ -341,6 +479,72 @@ public int getParallel() {
             return parallel;
         }
 
+        /**
+         * The batch/prompt-processing thread count, or {@code 0} for the llama.cpp default (same as
+         * {@link #getThreads()}).
+         *
+         * @return the batch thread count
+         */
+        public int getThreadsBatch() {
+            return threadsBatch;
+        }
+
+        /**
+         * The logical (prompt) batch size, or {@code 0} for the llama.cpp default.
+         *
+         * @return the batch size
+         */
+        public int getBatchSize() {
+            return batchSize;
+        }
+
+        /**
+         * The physical (micro) batch size, or {@code 0} for the llama.cpp default.
+         *
+         * @return the micro-batch size
+         */
+        public int getUbatchSize() {
+            return ubatchSize;
+        }
+
+        /**
+         * The KV cache K quantization type, or {@code null} for the llama.cpp default.
+         *
+         * @return the K cache type, or {@code null} when unset
+         */
+        public @Nullable CacheType getCacheTypeK() {
+            return cacheTypeK;
+        }
+
+        /**
+         * The KV cache V quantization type, or {@code null} for the llama.cpp default.
+         *
+         * @return the V cache type, or {@code null} when unset
+         */
+        public @Nullable CacheType getCacheTypeV() {
+            return cacheTypeV;
+        }
+
+        /**
+         * Whether the model's Jinja chat template is enabled.
+         *
+         * @return {@code true} if {@code --jinja} was requested
+         */
+        public boolean isJinja() {
+            return jinja;
+        }
+
+        /**
+         * The parsed {@code --chat-template-kwargs} as a raw-per-value map (see
+         * {@link ModelParameters#setChatTemplateKwargs(Map)}), or {@code null} when unset. The map is
+         * unmodifiable.
+         *
+         * @return the chat-template variables, or {@code null} when unset
+         */
+        public @Nullable Map<String, String> getChatTemplateKwargs() {
+            return chatTemplateKwargs;
+        }
+
         /**
          * Whether to load the model in embedding mode.
          *
@@ -376,9 +580,30 @@ public ModelParameters toModelParameters() {
             if (threads > 0) {
                 params.setThreads(threads);
             }
+            if (threadsBatch > 0) {
+                params.setThreadsBatch(threadsBatch);
+            }
             if (parallel > 0) {
                 params.setParallel(parallel);
             }
+            if (batchSize > 0) {
+                params.setBatchSize(batchSize);
+            }
+            if (ubatchSize > 0) {
+                params.setUbatchSize(ubatchSize);
+            }
+            if (cacheTypeK != null) {
+                params.setCacheTypeK(cacheTypeK);
+            }
+            if (cacheTypeV != null) {
+                params.setCacheTypeV(cacheTypeV);
+            }
+            if (jinja) {
+                params.enableJinja();
+            }
+            if (chatTemplateKwargs != null) {
+                params.setChatTemplateKwargs(chatTemplateKwargs);
+            }
             if (embedding) {
                 params.enableEmbedding();
             }
@@ -395,7 +620,7 @@ public ModelParameters toModelParameters() {
          * @return the server configuration
          */
         public OpenAiServerConfig toServerConfig() {
-            return toServerConfig(mmproj != null);
+            return toServerConfig(mmproj != null, "");
         }
 
         /**
@@ -407,11 +632,25 @@ public OpenAiServerConfig toServerConfig() {
          * @return the server configuration
          */
         public OpenAiServerConfig toServerConfig(boolean supportsVision) {
+            return toServerConfig(supportsVision, "");
+        }
+
+        /**
+         * Build the server configuration with capability + metadata values obtained from the loaded
+         * model. This overload lets the standalone launcher advertise the model's quantization file
+         * type in {@code /v1/models} alongside the vision capability.
+         *
+         * @param supportsVision whether the loaded model reports usable vision input
+         * @param modelFtype the model's file-type (quantization) label, or {@code ""} if unknown
+         * @return the server configuration
+         */
+        public OpenAiServerConfig toServerConfig(boolean supportsVision, String modelFtype) {
             final OpenAiServerConfig.Builder builder = OpenAiServerConfig.builder()
                     .host(host)
                     .port(port)
                     .modelId(getModelId())
-                    .supportsVision(supportsVision);
+                    .supportsVision(supportsVision)
+                    .modelFtype(modelFtype);
             if (apiKey != null) {
                 builder.apiKey(apiKey);
             }
diff --git a/llama/src/main/java/net/ladenthin/llama/server/OpenAiServerConfig.java b/llama/src/main/java/net/ladenthin/llama/server/OpenAiServerConfig.java
index e6d694e9..7f9c3701 100644
--- a/llama/src/main/java/net/ladenthin/llama/server/OpenAiServerConfig.java
+++ b/llama/src/main/java/net/ladenthin/llama/server/OpenAiServerConfig.java
@@ -55,6 +55,7 @@ public final class OpenAiServerConfig {
     private final String corsAllowOrigin;
     private final boolean supportsVision;
     private final int maxRequestBodyBytes;
+    private final String modelFtype;
 
     private OpenAiServerConfig(Builder builder) {
         this.host = builder.host;
@@ -67,6 +68,7 @@ private OpenAiServerConfig(Builder builder) {
         this.corsAllowOrigin = builder.corsAllowOrigin;
         this.supportsVision = builder.supportsVision;
         this.maxRequestBodyBytes = builder.maxRequestBodyBytes;
+        this.modelFtype = builder.modelFtype;
     }
 
     /**
@@ -169,6 +171,17 @@ public boolean isSupportsVision() {
         return supportsVision;
     }
 
+    /**
+     * The served model's file type (quantization) as a human-readable string, e.g. {@code "Q8_0"}
+     * or {@code "Q4_K - Medium"}, advertised in the {@code GET /v1/models} {@code data[].ftype} field
+     * (matching the upstream llama.cpp server). Empty when unknown.
+     *
+     * @return the quantization file-type label, or {@code ""} if unknown
+     */
+    public String getModelFtype() {
+        return modelFtype;
+    }
+
     /**
      * Whether bearer-token authentication is enabled (an API key is configured).
      *
@@ -217,6 +230,7 @@ public static final class Builder {
         private String corsAllowOrigin = DEFAULT_CORS_ALLOW_ORIGIN;
         private boolean supportsVision;
         private int maxRequestBodyBytes = DEFAULT_MAX_REQUEST_BODY_BYTES;
+        private String modelFtype = "";
 
         private Builder() {}
 
@@ -319,6 +333,18 @@ public Builder supportsVision(boolean supportsVision) {
             return this;
         }
 
+        /**
+         * Sets the served model's file type (quantization) label to advertise in {@code /v1/models}.
+         *
+         * @param modelFtype the quantization file-type label (e.g. {@code "Q4_K - Medium"}); {@code null}
+         *     is treated as empty (unknown)
+         * @return this builder
+         */
+        public Builder modelFtype(@Nullable String modelFtype) {
+            this.modelFtype = modelFtype == null ? "" : modelFtype;
+            return this;
+        }
+
         /**
          * Sets the maximum accepted request-body size in bytes. Bodies larger than this are rejected
          * with HTTP 413 before being buffered.
diff --git a/llama/src/main/java/net/ladenthin/llama/server/OpenAiSseFormatter.java b/llama/src/main/java/net/ladenthin/llama/server/OpenAiSseFormatter.java
index f87c2599..764475be 100644
--- a/llama/src/main/java/net/ladenthin/llama/server/OpenAiSseFormatter.java
+++ b/llama/src/main/java/net/ladenthin/llama/server/OpenAiSseFormatter.java
@@ -122,10 +122,26 @@ static String ensureUsageCachedTokens(String chunkJson) {
      * @return an OpenAI model-list object serialized as JSON
      */
     static String modelsJson(String modelId) {
+        return modelsJson(modelId, "");
+    }
+
+    /**
+     * Build the {@code GET /v1/models} body advertising a single model, including the model's file
+     * type (quantization) as a {@code data[].ftype} field when known — mirroring the upstream
+     * llama.cpp server's {@code get_model_info()}.
+     *
+     * @param modelId the model id to advertise
+     * @param ftype the model's file-type (quantization) label, or {@code ""}/{@code null} to omit it
+     * @return an OpenAI model-list object serialized as JSON
+     */
+    static String modelsJson(String modelId, @Nullable String ftype) {
         ObjectNode model = OBJECT_MAPPER.createObjectNode();
         model.put("id", modelId);
         model.put("object", "model");
         model.put("owned_by", "llama.cpp");
+        if (ftype != null && !ftype.isEmpty()) {
+            model.put("ftype", ftype);
+        }
         ArrayNode data = OBJECT_MAPPER.createArrayNode();
         data.add(model);
         ObjectNode root = OBJECT_MAPPER.createObjectNode();
diff --git a/llama/src/main/java/net/ladenthin/llama/server/ServerLauncher.java b/llama/src/main/java/net/ladenthin/llama/server/ServerLauncher.java
new file mode 100644
index 00000000..964ba4e4
--- /dev/null
+++ b/llama/src/main/java/net/ladenthin/llama/server/ServerLauncher.java
@@ -0,0 +1,80 @@
+// SPDX-FileCopyrightText: 2026 Bernard Ladenthin <bernard.ladenthin@gmail.com>
+//
+// SPDX-License-Identifier: MIT
+
+package net.ladenthin.llama.server;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Single {@code java -jar} entry point for the fat jar, choosing between the two server modes by
+ * one selector flag: with {@value #OPENAI_COMPAT_FLAG} it starts {@link OpenAiCompatServer} (the
+ * Java-transport, OpenAI-compatible JSON API); without it, {@link NativeServer} (the full native
+ * llama.cpp server with embedded WebUI), which is therefore the default.
+ *
+ * <p>Selection relies on one primitive, {@link #withoutFlag(String[], String)}. The selector is not
+ * a llama.cpp flag and {@code llama_server} rejects unknown flags, so it is filtered out of the
+ * argument list; the flag was present iff the filtered list came out shorter. Every other argument
+ * is forwarded verbatim.</p>
+ *
+ * <p><strong>Flag sets differ.</strong> {@link NativeServer} hands <em>every</em> llama-server flag
+ * to {@code llama_server}, while {@link OpenAiServerCli}, the CLI of {@link OpenAiCompatServer},
+ * accepts a curated subset and rejects unknown flags. Native-only flags (e.g. {@code --ui},
+ * {@code -fa}) therefore cannot be combined with {@value #OPENAI_COMPAT_FLAG}.</p>
+ *
+ * <p>Either server's {@code main} can still be run directly by class name via {@code java -cp};
+ * this launcher only exists so that a single {@code java -jar} covers both.</p>
+ */
+public final class ServerLauncher {
+
+    /**
+     * The mode selector: if this token appears among the arguments, {@link OpenAiCompatServer} is
+     * run in place of the default {@link NativeServer}.
+     *
+     * <p>The {@code jllama} prefix (this project's native-library name) namespaces the flag so it
+     * cannot clash with any present or future llama.cpp / llama-server flag: upstream owns the
+     * {@code --*} space, this launcher owns {@code --jllama-*}. It is removed before the arguments
+     * are forwarded, so {@code llama_server} (which rejects unknown flags) never sees it.</p>
+     */
+    public static final String OPENAI_COMPAT_FLAG = "--jllama-openai-compat";
+
+    private ServerLauncher() {}
+
+    /**
+     * Runs {@link NativeServer#main(String[])} with the arguments exactly as given unless
+     * {@value #OPENAI_COMPAT_FLAG} is among them; in that case the marker is dropped and the rest
+     * goes to {@link OpenAiCompatServer#main(String[])}. Whether the flag was present is inferred
+     * from the stripped argument array being shorter than the original.
+     *
+     * @param args the process arguments
+     * @throws Exception if the selected server's {@code main} throws (it blocks until shutdown)
+     */
+    public static void main(String[] args) throws Exception {
+        final String[] remaining = withoutFlag(args, OPENAI_COMPAT_FLAG);
+        if (remaining.length == args.length) {
+            NativeServer.main(args);
+        } else {
+            OpenAiCompatServer.main(remaining);
+        }
+    }
+
+    /**
+     * Builds a new array with the elements of {@code args} that do not equal {@code flag}, keeping
+     * their order; every occurrence is dropped. The result is shorter than {@code args} only when
+     * {@code flag} occurred, which is what {@link #main(String[])} keys the server mode on.
+     *
+     * @param args the arguments
+     * @param flag the flag token to strip
+     * @return a new array without {@code flag}
+     */
+    static String[] withoutFlag(String[] args, String flag) {
+        final List<String> kept = new ArrayList<>(args.length);
+        for (int i = 0; i < args.length; i++) {
+            if (!flag.equals(args[i])) {
+                kept.add(args[i]);
+            }
+        }
+        return kept.toArray(new String[0]);
+    }
+}
diff --git a/llama/src/main/java/net/ladenthin/llama/value/ModelMeta.java b/llama/src/main/java/net/ladenthin/llama/value/ModelMeta.java
index 91059e71..6f6f8975 100644
--- a/llama/src/main/java/net/ladenthin/llama/value/ModelMeta.java
+++ b/llama/src/main/java/net/ladenthin/llama/value/ModelMeta.java
@@ -129,6 +129,18 @@ public String getModelName() {
         return node.path("name").asText("");
     }
 
+    /**
+     * The model file type (quantization) as a human-readable string, e.g. {@code "Q8_0"} or
+     * {@code "Q4_K - Medium"}, from the GGUF {@code general.file_type} the model was loaded with
+     * (llama.cpp {@code llama_ftype_name}). A guessed type is prefixed with {@code "(guessed) "}.
+     * Returns an empty string if the native layer does not report it (older native builds).
+     *
+     * @return the quantization file-type label, or {@code ""} if absent
+     */
+    public String getFtype() {
+        return node.path("ftype").asText("");
+    }
+
     /**
      * The model's resolved default chat template (Jinja), from GGUF
      * {@code tokenizer.chat_template} metadata.
diff --git a/llama/src/test/cpp/test_server.cpp b/llama/src/test/cpp/test_server.cpp
index 546b618b..5ff9d894 100644
--- a/llama/src/test/cpp/test_server.cpp
+++ b/llama/src/test/cpp/test_server.cpp
@@ -1732,6 +1732,25 @@ TEST(ParamsFromJsonCmpl, SimpleFields_RoundTrip) {
     EXPECT_EQ(p.n_predict, 128);
 }
 
+// b9864: per-request sse_ping_interval overrides the server --sse-ping-interval setting; -1 disables
+// pings. Pins that the JSON key emitted by InferenceParameters.withSsePingInterval is honored by the
+// schema (field_num with set_hard_limits(-1, INT32_MAX)).
+TEST(ParamsFromJsonCmpl, SsePingInterval_RoundTrip) {
+    EXPECT_EQ(parse_params({{"sse_ping_interval", 5}}).sse_ping_interval, 5);
+    EXPECT_EQ(parse_params({{"sse_ping_interval", -1}}).sse_ping_interval, -1); // -1 disables pings
+}
+
+TEST(ParamsFromJsonCmpl, SsePingInterval_BelowHardLimit_Throws) {
+    // hard lower bound is -1; anything below throws (wrapped in std::invalid_argument by the schema).
+    EXPECT_THROW(parse_params({{"sse_ping_interval", -2}}), std::invalid_argument);
+}
+
+TEST(ParamsFromJsonCmpl, SsePingInterval_Absent_InheritsServerSetting) {
+    // When omitted from the request, the value inherits params_base.sse_ping_interval (the server setting).
+    common_params defaults;
+    EXPECT_EQ(parse_params({}).sse_ping_interval, defaults.sse_ping_interval);
+}
+
 TEST(ParamsFromJsonCmpl, RepeatLastN_MinusOne_ExpandsToNCtxSlot) {
     const auto p = parse_params({{"repeat_last_n", -1}}, /*n_ctx=*/256);
     EXPECT_EQ(p.sampling.penalty_last_n, 256);
diff --git a/llama/src/test/java/net/ladenthin/llama/LlamaArchitectureTest.java b/llama/src/test/java/net/ladenthin/llama/LlamaArchitectureTest.java
index 28897996..a3100e57 100644
--- a/llama/src/test/java/net/ladenthin/llama/LlamaArchitectureTest.java
+++ b/llama/src/test/java/net/ladenthin/llama/LlamaArchitectureTest.java
@@ -94,8 +94,10 @@ public class LlamaArchitectureTest {
      * intend it. Conceptual tiers (informational): {@code Server} &gt; {@code Api} (root) &gt;
      * {@code Loader} &gt; {@code Json}/{@code Parameters} &gt;
      * {@code Value}/{@code Callback}/{@code Exception}/{@code Args}. The {@code Server} layer is the
-     * optional OpenAI-compatible HTTP entry point; it is the only layer permitted to access the
-     * {@code Api} root.
+     * optional OpenAI-compatible HTTP / native-server entry point; it is the only layer permitted to
+     * access the {@code Api} root, and it also reaches the {@code Loader} ({@code NativeServer}
+     * triggers {@code LlamaLoader.initialize()} before starting the embedded native server) and the
+     * {@code Args} enums ({@code OpenAiServerCli} maps {@code -ctk}/{@code -ctv} to {@code CacheType}).
      */
     @ArchTest
     static final ArchRule layeredArchitecture = layeredArchitecture()
@@ -121,7 +123,7 @@ public class LlamaArchitectureTest {
             .whereLayer("Api")
             .mayOnlyBeAccessedByLayers("Server")
             .whereLayer("Loader")
-            .mayOnlyBeAccessedByLayers("Api")
+            .mayOnlyBeAccessedByLayers("Api", "Server")
             .whereLayer("Json")
             .mayOnlyBeAccessedByLayers("Api")
             .whereLayer("Parameters")
@@ -133,7 +135,7 @@ public class LlamaArchitectureTest {
             .whereLayer("Exception")
             .mayOnlyBeAccessedByLayers("Api", "Loader")
             .whereLayer("Args")
-            .mayOnlyBeAccessedByLayers("Api", "Loader", "Parameters")
+            .mayOnlyBeAccessedByLayers("Api", "Loader", "Parameters", "Server")
             .whereLayer("Server")
             .mayNotBeAccessedByAnyLayer();
 
diff --git a/llama/src/test/java/net/ladenthin/llama/parameters/InferenceParametersTest.java b/llama/src/test/java/net/ladenthin/llama/parameters/InferenceParametersTest.java
index 0faa2626..e0ed445c 100644
--- a/llama/src/test/java/net/ladenthin/llama/parameters/InferenceParametersTest.java
+++ b/llama/src/test/java/net/ladenthin/llama/parameters/InferenceParametersTest.java
@@ -73,6 +73,36 @@ public void testSetNPredict() {
         assertThat(params.parameters.get("n_predict"), is("42"));
     }
 
+    @Test
+    public void testSetSsePingInterval() {
+        InferenceParameters params = new InferenceParameters("").withSsePingInterval(1);
+        assertThat(params.parameters.get("sse_ping_interval"), is("1"));
+        // -1 disables pings and must be accepted (the schema's hard lower bound is -1).
+        assertThat(
+                InferenceParameters.empty().withSsePingInterval(-1).parameters.get("sse_ping_interval"), is("-1"));
+    }
+
+    @Test
+    public void testAdditionalCompletionScalarsFromB9864Audit() {
+        // Plain scalars honored by eval_llama_cmpl_schema but previously not surfaced as withers.
+        assertThat(
+                InferenceParameters.empty().withXtcProbability(0.5f).parameters.get("xtc_probability"), is("0.5"));
+        assertThat(InferenceParameters.empty().withXtcThreshold(0.1f).parameters.get("xtc_threshold"), is("0.1"));
+        assertThat(InferenceParameters.empty().withNDiscard(64).parameters.get("n_discard"), is("64"));
+        assertThat(InferenceParameters.empty().withNIndent(4).parameters.get("n_indent"), is("4"));
+        assertThat(
+                InferenceParameters.empty().withTMaxPredictMs(2000).parameters.get("t_max_predict_ms"), is("2000"));
+        assertThat(
+                InferenceParameters.empty()
+                        .withPostSamplingProbs(true)
+                        .parameters
+                        .get("post_sampling_probs"),
+                is("true"));
+        assertThat(
+                InferenceParameters.empty().withTimingsPerToken(true).parameters.get("timings_per_token"), is("true"));
+        assertThat(InferenceParameters.empty().withReturnTokens(true).parameters.get("return_tokens"), is("true"));
+    }
+
     @Test
     public void testSetCacheReuse() {
         InferenceParameters params = InferenceParameters.empty().withCacheReuse(256);
diff --git a/llama/src/test/java/net/ladenthin/llama/parameters/ModelParametersExtendedTest.java b/llama/src/test/java/net/ladenthin/llama/parameters/ModelParametersExtendedTest.java
index bc4dc3aa..7bf7b476 100644
--- a/llama/src/test/java/net/ladenthin/llama/parameters/ModelParametersExtendedTest.java
+++ b/llama/src/test/java/net/ladenthin/llama/parameters/ModelParametersExtendedTest.java
@@ -891,6 +891,18 @@ public void testSetSlotPromptSimilarity() {
         assertThat(p.parameters.get("--slot-prompt-similarity"), is("0.8"));
     }
 
+    @Test
+    public void testSetCtxCheckpoints() {
+        ModelParameters p = new ModelParameters().setCtxCheckpoints(8);
+        assertThat(p.parameters.get("--ctx-checkpoints"), is("8"));
+    }
+
+    @Test
+    public void testSetCheckpointMinStep() {
+        ModelParameters p = new ModelParameters().setCheckpointMinStep(0);
+        assertThat(p.parameters.get("--checkpoint-min-step"), is("0"));
+    }
+
     // -------------------------------------------------------------------------
     // Override KV
     // -------------------------------------------------------------------------
diff --git a/llama/src/test/java/net/ladenthin/llama/server/NativeServerSmokeTest.java b/llama/src/test/java/net/ladenthin/llama/server/NativeServerSmokeTest.java
index 7e74dec4..389136f9 100644
--- a/llama/src/test/java/net/ladenthin/llama/server/NativeServerSmokeTest.java
+++ b/llama/src/test/java/net/ladenthin/llama/server/NativeServerSmokeTest.java
@@ -5,44 +5,61 @@
 package net.ladenthin.llama.server;
 
 import static org.hamcrest.MatcherAssert.assertThat;
-import static org.hamcrest.Matchers.containsString;
 import static org.hamcrest.Matchers.is;
 import static org.junit.jupiter.api.Assertions.assertThrows;
 
 import org.junit.jupiter.api.Test;
 
 /**
- * Model-free smoke test for the {@link NativeServer} scaffold: it must construct without any native
- * work, expose its configured host/port, never report itself running, throw a clear
- * {@link UnsupportedOperationException} from {@link NativeServer#start()} until the native routes are
- * wired, and be a safe no-op {@link AutoCloseable}. No model and no {@code libjllama} required.
+ * Model-free, library-free unit tests for {@link NativeServer}'s pure-Java surface: the server must
+ * construct without any native work (libjllama is loaded lazily in {@link NativeServer#start()},
+ * not in a static initializer), best-effort parse host/port from the forwarded arguments, reject
+ * null arguments, report itself not running before {@code start()}, and be a safe no-op {@link AutoCloseable}
+ * when never started. Actually starting the native server is exercised by CI / manual runs with a real model.
  */
 public class NativeServerSmokeTest {
 
-    private static OpenAiServerConfig config() {
-        return OpenAiServerConfig.builder().host("127.0.0.1").port(1234).build();
+    @Test
+    public void parsesHostAndPortFromArgs() {
+        NativeServer server = new NativeServer("-m", "m.gguf", "--host", "0.0.0.0", "--port", "1234");
+        assertThat(server.getHost(), is("0.0.0.0"));
+        assertThat(server.getPort(), is(1234));
+        assertThat(server.isRunning(), is(false));
+    }
+
+    @Test
+    public void shortPortFlagParsed() {
+        NativeServer server = new NativeServer("-m", "m.gguf", "-p", "9099");
+        assertThat(server.getPort(), is(9099));
     }
 
     @Test
-    public void exposesConfiguredHostAndPortWithoutStarting() {
-        NativeServer server = new NativeServer(config());
+    public void defaultsWhenFlagsAbsent() {
+        NativeServer server = new NativeServer("-m", "m.gguf");
         assertThat(server.getHost(), is("127.0.0.1"));
-        assertThat(server.getPort(), is(1234));
-        assertThat(server.isRunning(), is(false));
+        assertThat(server.getPort(), is(8080));
     }
 
     @Test
-    public void startThrowsUntilNativeRoutesAreWired() {
-        NativeServer server = new NativeServer(config());
-        UnsupportedOperationException ex = assertThrows(UnsupportedOperationException.class, server::start);
-        assertThat(ex.getMessage(), containsString("not yet wired"));
-        assertThat(server.isRunning(), is(false));
+    public void nonIntegerPortFallsBackToDefault() {
+        NativeServer server = new NativeServer("-m", "m.gguf", "--port", "abc");
+        assertThat(server.getPort(), is(8080));
     }
 
     @Test
-    public void closeIsSafeNoOpEvenViaTryWithResources() {
-        try (NativeServer server = new NativeServer(config())) {
+    public void closeBeforeStartIsSafeNoOpViaTryWithResources() {
+        try (NativeServer server = new NativeServer("-m", "m.gguf")) {
             assertThat(server.isRunning(), is(false));
         }
     }
+
+    @Test
+    public void nullArgsRejected() {
+        assertThrows(NullPointerException.class, () -> new NativeServer((String[]) null));
+    }
+
+    @Test
+    public void nullArgElementRejected() {
+        assertThrows(NullPointerException.class, () -> new NativeServer("-m", null));
+    }
 }
diff --git a/llama/src/test/java/net/ladenthin/llama/server/OpenAiServerCliTest.java b/llama/src/test/java/net/ladenthin/llama/server/OpenAiServerCliTest.java
index ff3dcd11..30204d6a 100644
--- a/llama/src/test/java/net/ladenthin/llama/server/OpenAiServerCliTest.java
+++ b/llama/src/test/java/net/ladenthin/llama/server/OpenAiServerCliTest.java
@@ -9,6 +9,7 @@
 import static org.hamcrest.Matchers.is;
 import static org.junit.jupiter.api.Assertions.assertThrows;
 
+import net.ladenthin.llama.args.CacheType;
 import org.junit.jupiter.api.Test;
 
 /**
@@ -207,4 +208,131 @@ public void modelParametersIncludeModelPath() {
                 OpenAiServerCli.parse("-m", "models/m.gguf").toModelParameters().toString();
         assertThat(json, containsString("models/m.gguf"));
     }
+
+    @Test
+    public void tuningFlagsDefaultToSentinels() {
+        OpenAiServerCli.Options options = OpenAiServerCli.parse("-m", "m.gguf");
+        assertThat(options.getBatchSize(), is(0));
+        assertThat(options.getUbatchSize(), is(0));
+        assertThat(options.getThreadsBatch(), is(0));
+        assertThat(options.getCacheTypeK(), is((CacheType) null));
+        assertThat(options.getCacheTypeV(), is((CacheType) null));
+        assertThat(options.isJinja(), is(false));
+        assertThat(options.getChatTemplateKwargs(), is((Object) null));
+    }
+
+    @Test
+    public void tuningShortFlagsParsed() {
+        OpenAiServerCli.Options options = OpenAiServerCli.parse(
+                "-m", "m.gguf", "-b", "4096", "-ub", "2048", "-tb", "16", "-ctk", "q8_0", "-ctv", "q4_0");
+        assertThat(options.getBatchSize(), is(4096));
+        assertThat(options.getUbatchSize(), is(2048));
+        assertThat(options.getThreadsBatch(), is(16));
+        assertThat(options.getCacheTypeK(), is(CacheType.Q8_0));
+        assertThat(options.getCacheTypeV(), is(CacheType.Q4_0)); // differs from K so a K/V mix-up fails
+    }
+
+    @Test
+    public void tuningLongFlagsParsed() {
+        OpenAiServerCli.Options options = OpenAiServerCli.parse(
+                "-m",
+                "m.gguf",
+                "--batch-size",
+                "512",
+                "--ubatch-size",
+                "256",
+                "--threads-batch",
+                "6",
+                "--cache-type-k",
+                "f16",
+                "--cache-type-v",
+                "q4_0",
+                "--jinja");
+        assertThat(options.getBatchSize(), is(512));
+        assertThat(options.getUbatchSize(), is(256));
+        assertThat(options.getThreadsBatch(), is(6));
+        assertThat(options.getCacheTypeK(), is(CacheType.F16));
+        assertThat(options.getCacheTypeV(), is(CacheType.Q4_0));
+        assertThat(options.isJinja(), is(true));
+    }
+
+    @Test
+    public void cacheTypeIsCaseInsensitive() {
+        OpenAiServerCli.Options options = OpenAiServerCli.parse("-m", "m.gguf", "-ctk", "Q8_0");
+        assertThat(options.getCacheTypeK(), is(CacheType.Q8_0));
+    }
+
+    @Test
+    public void unknownCacheTypeThrows() {
+        IllegalArgumentException ex = assertThrows(
+                IllegalArgumentException.class, () -> OpenAiServerCli.parse("-m", "m.gguf", "-ctk", "q3_k"));
+        assertThat(ex.getMessage(), containsString("expects one of"));
+        assertThat(ex.getMessage(), containsString("q8_0"));
+        assertThat(ex.getMessage(), containsString("q3_k"));
+    }
+
+    @Test
+    public void chatTemplateKwargsParsedToRawJsonValues() {
+        OpenAiServerCli.Options options = OpenAiServerCli.parse(
+                "-m", "m.gguf", "--chat-template-kwargs", "{\"reasoning_effort\":\"low\",\"enable_thinking\":true}");
+        assertThat(options.getChatTemplateKwargs().get("reasoning_effort"), is("\"low\""));
+        assertThat(options.getChatTemplateKwargs().get("enable_thinking"), is("true"));
+    }
+
+    @Test
+    public void chatTemplateKwargsInvalidJsonThrows() {
+        IllegalArgumentException ex = assertThrows(
+                IllegalArgumentException.class,
+                () -> OpenAiServerCli.parse("-m", "m.gguf", "--chat-template-kwargs", "{not json"));
+        assertThat(ex.getMessage(), containsString("--chat-template-kwargs expects a JSON object"));
+    }
+
+    @Test
+    public void chatTemplateKwargsNonObjectThrows() {
+        IllegalArgumentException ex = assertThrows(
+                IllegalArgumentException.class,
+                () -> OpenAiServerCli.parse("-m", "m.gguf", "--chat-template-kwargs", "\"low\""));
+        assertThat(ex.getMessage(), containsString("--chat-template-kwargs expects a JSON object"));
+    }
+
+    @Test
+    public void toModelParametersCarriesTuningFlags() {
+        String argv = OpenAiServerCli.parse(
+                        "-m",
+                        "m.gguf",
+                        "-b",
+                        "4096",
+                        "-ub",
+                        "2048",
+                        "-tb",
+                        "16",
+                        "-ctk",
+                        "q8_0",
+                        "-ctv",
+                        "q4_0", // differs from -ctk so swapped K/V serialization is detected
+                        "--jinja",
+                        "--chat-template-kwargs",
+                        "{\"reasoning_effort\":\"low\"}")
+                .toModelParameters()
+                .toString();
+        assertThat(argv, containsString("--batch-size 4096"));
+        assertThat(argv, containsString("--ubatch-size 2048"));
+        assertThat(argv, containsString("--threads-batch 16"));
+        assertThat(argv, containsString("--cache-type-k q8_0"));
+        assertThat(argv, containsString("--cache-type-v q4_0"));
+        assertThat(argv, containsString("--jinja"));
+        assertThat(argv, containsString("--chat-template-kwargs"));
+        assertThat(argv, containsString("reasoning_effort"));
+    }
+
+    @Test
+    public void usageMentionsNewTuningFlags() {
+        String usage = OpenAiServerCli.usage();
+        assertThat(usage, containsString("--batch-size"));
+        assertThat(usage, containsString("--ubatch-size"));
+        assertThat(usage, containsString("--threads-batch"));
+        assertThat(usage, containsString("--cache-type-k"));
+        assertThat(usage, containsString("--jinja"));
+        assertThat(usage, containsString("--chat-template-kwargs"));
+    }
 }
diff --git a/llama/src/test/java/net/ladenthin/llama/server/OpenAiServerConfigTest.java b/llama/src/test/java/net/ladenthin/llama/server/OpenAiServerConfigTest.java
index 43c43ddb..f30c8fb6 100644
--- a/llama/src/test/java/net/ladenthin/llama/server/OpenAiServerConfigTest.java
+++ b/llama/src/test/java/net/ladenthin/llama/server/OpenAiServerConfigTest.java
@@ -29,10 +29,19 @@ public void builderAppliesLocalhostDefaults() {
         assertThat(config.getHeartbeatMillis(), is(OpenAiServerConfig.DEFAULT_HEARTBEAT_MILLIS));
         assertThat(config.getCorsAllowOrigin(), is(OpenAiServerConfig.DEFAULT_CORS_ALLOW_ORIGIN));
         assertThat(config.isSupportsVision(), is(false));
+        assertThat(config.getModelFtype(), is(""));
         assertThat(config.getApiKey(), is((String) null));
         assertThat(config.isAuthenticationEnabled(), is(false));
     }
 
+    @Test
+    public void modelFtypeIsConfigurableAndNullBecomesEmpty() {
+        assertThat(
+                OpenAiServerConfig.builder().modelFtype("Q4_K - Medium").build().getModelFtype(), is("Q4_K - Medium"));
+        // null is normalized to the empty "unknown" marker
+        assertThat(OpenAiServerConfig.builder().modelFtype(null).build().getModelFtype(), is(""));
+    }
+
     @Test
     public void authenticationEnabledOnlyForNonEmptyKey() {
         assertThat(OpenAiServerConfig.builder().build().isAuthenticationEnabled(), is(false));
diff --git a/llama/src/test/java/net/ladenthin/llama/server/OpenAiSseFormatterTest.java b/llama/src/test/java/net/ladenthin/llama/server/OpenAiSseFormatterTest.java
index 866efd25..265d1f75 100644
--- a/llama/src/test/java/net/ladenthin/llama/server/OpenAiSseFormatterTest.java
+++ b/llama/src/test/java/net/ladenthin/llama/server/OpenAiSseFormatterTest.java
@@ -103,6 +103,28 @@ public void modelsJsonAdvertisesTheConfiguredModel() throws IOException {
         assertThat(root.path("object").asText(), is("list"));
         assertThat(root.path("data").get(0).path("id").asText(), is("gemma-local"));
         assertThat(root.path("data").get(0).path("object").asText(), is("model"));
+        // no ftype supplied -> the field is omitted entirely
+        assertThat(root.path("data").get(0).has("ftype"), is(false));
+    }
+
+    @Test
+    public void modelsJsonIncludesFtypeWhenKnownAndOmitsWhenBlank() throws IOException {
+        JsonNode withFtype = MAPPER.readTree(OpenAiSseFormatter.modelsJson("gemma-local", "Q4_K - Medium"));
+        assertThat(withFtype.path("data").get(0).path("ftype").asText(), is("Q4_K - Medium"));
+
+        // empty and null are treated as "unknown" -> field omitted
+        assertThat(
+                MAPPER.readTree(OpenAiSseFormatter.modelsJson("gemma-local", ""))
+                        .path("data")
+                        .get(0)
+                        .has("ftype"),
+                is(false));
+        assertThat(
+                MAPPER.readTree(OpenAiSseFormatter.modelsJson("gemma-local", null))
+                        .path("data")
+                        .get(0)
+                        .has("ftype"),
+                is(false));
     }
 
     @Test
diff --git a/llama/src/test/java/net/ladenthin/llama/server/ServerLauncherTest.java b/llama/src/test/java/net/ladenthin/llama/server/ServerLauncherTest.java
new file mode 100644
index 00000000..1c3fbada
--- /dev/null
+++ b/llama/src/test/java/net/ladenthin/llama/server/ServerLauncherTest.java
@@ -0,0 +1,67 @@
+// SPDX-FileCopyrightText: 2026 Bernard Ladenthin <bernard.ladenthin@gmail.com>
+//
+// SPDX-License-Identifier: MIT
+
+package net.ladenthin.llama.server;
+
+import static org.hamcrest.MatcherAssert.assertThat;
+import static org.hamcrest.Matchers.arrayContaining;
+import static org.hamcrest.Matchers.emptyArray;
+import static org.hamcrest.Matchers.is;
+
+import org.junit.jupiter.api.Test;
+
+/**
+ * Pure-Java unit tests for {@link ServerLauncher}'s single dispatch primitive,
+ * {@link ServerLauncher#withoutFlag(String[], String)}. Selection is derived from the length change
+ * (result shorter iff the flag was present), so these tests cover both the stripping behaviour and
+ * that selection signal. No server is started and no native library is required.
+ */
+public class ServerLauncherTest {
+
+    private static final String FLAG = ServerLauncher.OPENAI_COMPAT_FLAG;
+
+    // --- selection signal: shorter iff the flag was present ---
+
+    @Test
+    public void resultIsShorterWhenFlagPresent() {
+        String[] in = {FLAG, "-m", "m.gguf", "--port", "8080"};
+        assertThat(ServerLauncher.withoutFlag(in, FLAG).length < in.length, is(true));
+    }
+
+    @Test
+    public void resultKeepsLengthWhenFlagAbsent() {
+        String[] in = {"-m", "m.gguf", "--port", "8080"};
+        assertThat(ServerLauncher.withoutFlag(in, FLAG).length, is(in.length)); // failure shows both lengths
+    }
+
+    @Test
+    public void flagPositionDoesNotMatter() {
+        String[] in = {"-m", "m.gguf", FLAG};
+        assertThat(ServerLauncher.withoutFlag(in, FLAG).length < in.length, is(true));
+    }
+
+    // --- stripping behaviour ---
+
+    @Test
+    public void stripsTheSelectorAndPreservesTheRest() {
+        String[] out = ServerLauncher.withoutFlag(new String[] {FLAG, "-m", "m.gguf", "--port", "8080"}, FLAG);
+        assertThat(out, arrayContaining("-m", "m.gguf", "--port", "8080"));
+    }
+
+    @Test
+    public void removesEveryOccurrence() {
+        String[] out = ServerLauncher.withoutFlag(new String[] {FLAG, "-m", "m.gguf", FLAG}, FLAG);
+        assertThat(out, arrayContaining("-m", "m.gguf"));
+    }
+
+    @Test
+    public void isNoOpWhenAbsent() {
+        assertThat(ServerLauncher.withoutFlag(new String[] {"-m", "m.gguf"}, FLAG), arrayContaining("-m", "m.gguf"));
+    }
+
+    @Test
+    public void emptyArgsStayEmpty() {
+        assertThat(ServerLauncher.withoutFlag(new String[] {}, FLAG), is(emptyArray()));
+    }
+}
diff --git a/llama/src/test/java/net/ladenthin/llama/value/ModelMetaTest.java b/llama/src/test/java/net/ladenthin/llama/value/ModelMetaTest.java
index 5552e2a7..19f41dc4 100644
--- a/llama/src/test/java/net/ladenthin/llama/value/ModelMetaTest.java
+++ b/llama/src/test/java/net/ladenthin/llama/value/ModelMetaTest.java
@@ -80,6 +80,25 @@ public void testGetModelName() throws Exception {
         assertThat(meta.getModelName(), is("Mistral-7B-v0.1"));
     }
 
+    @Test
+    public void testGetFtype() throws Exception {
+        ModelMeta meta = parse("{\"vocab_type\":1,\"n_vocab\":32016,\"n_ctx_train\":16384,"
+                + "\"n_embd\":4096,\"n_params\":6738546688,\"size\":2825274880,"
+                + "\"modalities\":{\"vision\":false,\"audio\":false},"
+                + "\"architecture\":\"mistral\",\"name\":\"Mistral-7B-v0.1\",\"ftype\":\"Q4_K - Medium\"}");
+
+        assertThat(meta.getFtype(), is("Q4_K - Medium"));
+    }
+
+    @Test
+    public void testGetFtypeEmptyWhenAbsent() throws Exception {
+        ModelMeta meta = parse("{\"vocab_type\":1,\"n_vocab\":100,\"n_ctx_train\":4096,"
+                + "\"n_embd\":512,\"n_params\":1000000,\"size\":500000,"
+                + "\"modalities\":{\"vision\":false,\"audio\":false}}");
+
+        assertThat(meta.getFtype(), is(""));
+    }
+
     @Test
     public void testGetArchitectureEmptyWhenAbsent() throws Exception {
         ModelMeta meta = parse("{\"vocab_type\":1,\"n_vocab\":100,\"n_ctx_train\":4096,"