diff --git a/.github/build.bat b/.github/build.bat
index a9df77f5..d2c755b3 100755
--- a/.github/build.bat
+++ b/.github/build.bat
@@ -60,6 +60,32 @@ REM was wired in as the launcher.
 if defined LAUNCH (
     echo build.bat: sccache --show-stats
     sccache --show-stats
+    REM KISS per-job cache summary in the GitHub Actions job summary (like upstream llama.cpp's
+    REM ccache-action table). Parse the text stats: the top-level "Compile requests" line is the
+    REM total and the top-level "Cache hits" line is the hits (the per-language "Cache hits (C/C++)"
+    REM line has "(" after the label, so the digit-anchored findstr regex skips it). Only in CI
+    REM (GITHUB_STEP_SUMMARY set); local runs are untouched. Best-effort -- skipped if the two
+    REM numbers can't be parsed or there were no requests. Integer math with rounding to one decimal.
+    if defined GITHUB_STEP_SUMMARY (
+        set "SCC_REQ="
+        set "SCC_HITS="
+        REM tokens=3 selects the number: each label is two words, so the count is the third token.
+        for /f "tokens=3" %%a in ('sccache --show-stats 2^>nul ^| findstr /r /c:"^Compile requests *[0-9]"') do set "SCC_REQ=%%a"
+        for /f "tokens=3" %%a in ('sccache --show-stats 2^>nul ^| findstr /r /c:"^Cache hits *[0-9]"') do set "SCC_HITS=%%a"
+        if defined SCC_REQ if defined SCC_HITS if !SCC_REQ! gtr 0 (
+            REM SCC_RATE10 is the hit rate in tenths of a percent, rounded to nearest: half the divisor is added before the integer division.
+            set /a SCC_RATE10=^(!SCC_HITS! * 1000 + !SCC_REQ! / 2^) / !SCC_REQ!
+            set /a SCC_WHOLE=!SCC_RATE10! / 10
+            set /a SCC_DEC=!SCC_RATE10! %% 10
+            >>"%GITHUB_STEP_SUMMARY%" echo ### sccache statistics
+            >>"%GITHUB_STEP_SUMMARY%" echo.
+            REM The carets escape the pipe characters so that echo prints them as Markdown table borders.
+            >>"%GITHUB_STEP_SUMMARY%" echo ^| Cache hits ^| Requests ^| Hit rate ^|
+            >>"%GITHUB_STEP_SUMMARY%" echo ^|------------^|----------^|----------^|
+            REM The doubled percent sign is how a batch file writes one literal percent sign.
+            >>"%GITHUB_STEP_SUMMARY%" echo ^| !SCC_HITS! ^| !SCC_REQ! ^| !SCC_WHOLE!.!SCC_DEC!%% ^|
+        )
+    )
 )
 
 REM Propagate a build failure as a non-zero exit (a prior bug let a failed `cmake
diff --git a/.github/build.sh b/.github/build.sh
index 7a47ab65..6257904b 100755
--- a/.github/build.sh
+++ b/.github/build.sh
@@ -160,5 +160,27 @@ rm -f "$build_log"
 # crashing sccache (or the mid-build retry disabled it), re-invoking it here would just repeat
 # the crash output (harmless but noisy).
 if [ -n "$LAUNCH" ] && command -v sccache >/dev/null 2>&1; then
-    sccache --show-stats || true
+    # Run sccache once and keep its output: it is echoed to the log and, in CI, parsed for the summary.
+    sccache_stats="$(sccache --show-stats 2>/dev/null || true)"
+    printf '%s\n' "$sccache_stats"
+    # KISS per-job cache summary in the GitHub Actions job summary (like upstream llama.cpp's
+    # ccache-action table). Parse the text stats: the top-level "Compile requests" line is the
+    # total and the top-level "Cache hits" line is the hits (the per-language "Cache hits (C/C++)"
+    # line has "(" after the label, so the digit-anchored regex skips it). Only runs in CI
+    # (GITHUB_STEP_SUMMARY set); local runs are untouched. Best-effort — skips silently if the two
+    # numbers can't be parsed or there were no requests.
+    if [ -n "${GITHUB_STEP_SUMMARY:-}" ] && [ -n "$sccache_stats" ]; then
+        sccache_req="$(printf '%s\n' "$sccache_stats" | awk '/^Compile requests[[:space:]]+[0-9]/{print $NF; exit}')"
+        sccache_hits="$(printf '%s\n' "$sccache_stats" | awk '/^Cache hits[[:space:]]+[0-9]/{print $NF; exit}')"
+        if [ -n "$sccache_req" ] && [ -n "$sccache_hits" ] && [ "$sccache_req" -gt 0 ] 2>/dev/null; then
+            sccache_rate="$(awk -v hits="$sccache_hits" -v req="$sccache_req" 'BEGIN{printf "%.1f", (hits/req)*100}')"
+            {
+                echo "### sccache statistics"
+                echo ""
+                echo "| Cache hits | Requests | Hit rate |"
+                echo "|------------|----------|----------|"
+                echo "| ${sccache_hits} | ${sccache_req} | ${sccache_rate}% |"
+            } >> "$GITHUB_STEP_SUMMARY"
+        fi
+    fi
 fi
diff --git a/.github/scripts/llama-next-version.sh b/.github/scripts/llama-next-version.sh
new file mode 100755
index 00000000..517d2e9b
--- /dev/null
+++ b/.github/scripts/llama-next-version.sh
@@ -0,0 +1,137 @@
+#!/usr/bin/env bash
+# SPDX-FileCopyrightText: 2026 Bernard Ladenthin
+#
+# SPDX-License-Identifier: MIT
+#
+# Pick the NEXT llama.cpp tag to bump the pin to, one reviewable chunk at a time.
+#
+# The runbook this supports is docs/upgrade/llama-cpp-version-bump.md. Strategy:
+#   * TARGET  = the topmost RELEASE on the GitHub releases page (read from the release atom feed),
+#               or an explicit "bNNNN" passed as $1.
+#   * CURRENT = the pinned tag in llama/CMakeLists.txt (GIT_TAG bNNNN).
+#   * If `git diff CURRENT..TARGET` is smaller than the threshold (default 100 KiB), bump straight
+#     to TARGET. Otherwise CHUNK: pick the largest intermediate bNNNN tag whose diff from CURRENT
+#     is still under the threshold, so each bump stays a small, reviewable patch. Re-run after each
+#     bump to walk the remaining chunks up to TARGET.
+#
+# This tool only READS (a cached mirror clone + the pin file); it never edits the repo. Apply the
+# bump by hand per the runbook. It prints the compare/.patch URLs for the chosen step.
+#
+# Env:
+#   LLAMA_BUMP_MAX_DIFF_KB    per-step diff-size threshold in KiB (default 100)
+#   LLAMA_BUMP_EXCLUDE_WEBUI  if "1", size the diff EXCLUDING tools/ui (the auto-followed WebUI, which
+#                             does not need per-bump review); default 0 = the full diff you paste/review
+#   LLAMA_BUMP_CACHE          mirror-clone location (default ~/.cache/jllama-llamacpp-mirror)
+#
+# Network: needs read access to github.com (git clone/fetch + the release atom feed). No token.
+#
+# Explicit exit codes: 1 = pin unreadable or bad argument, 2 = release feed unreachable,
+# 3 = no release parsed or a tag is missing from the mirror, 4 = a diff could not be computed.
+
+set -euo pipefail
+
+THRESHOLD_KB="${LLAMA_BUMP_MAX_DIFF_KB:-100}"
+THRESHOLD=$((THRESHOLD_KB * 1024))
+EXCLUDE_WEBUI="${LLAMA_BUMP_EXCLUDE_WEBUI:-0}"
+REPO="ggml-org/llama.cpp"
+GIT_URL="https://github.com/${REPO}.git"
+CACHE="${LLAMA_BUMP_CACHE:-$HOME/.cache/jllama-llamacpp-mirror}"
+ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
+CMAKELISTS="$ROOT/llama/CMakeLists.txt"
+
+# --- current pinned tag number, e.g. "GIT_TAG b9866" -> 9866 -----------------------------------
+# `|| true` (here and on the target parses below): with `set -e` + pipefail a grep that matches nothing would abort the script before the error message can print.
+cur="$(grep -oE 'GIT_TAG[[:space:]]+b[0-9]+' "$CMAKELISTS" | grep -oE '[0-9]+' | head -1 || true)"
+[ -n "$cur" ] || { echo "ERROR: could not read 'GIT_TAG bNNNN' from $CMAKELISTS" >&2; exit 1; }
+
+# --- cached blobless mirror of llama.cpp (clone once, then fetch tags) --------------------------
+if [ -d "$CACHE/.git" ]; then
+  git -C "$CACHE" fetch --quiet --tags --prune origin || true
+else
+  echo "cloning ${REPO} (blobless) into $CACHE (one-time) ..." >&2
+  git clone --filter=blob:none --no-checkout --quiet "$GIT_URL" "$CACHE"
+fi
+
+# --- target: explicit "$1" (bNNNN) or the latest RELEASE from the atom feed -------------------
+if [ "${1:-}" != "" ]; then
+  target="$(printf '%s' "$1" | grep -oE '[0-9]+' | head -1 || true)"
+  [ -n "$target" ] || { echo "ERROR: '$1' is not a bNNNN tag" >&2; exit 1; }
+else
+  feed="$(curl -sSL --fail --retry 4 --retry-delay 2 "https://github.com/${REPO}/releases.atom" 2>/dev/null || true)"
+  [ -n "$feed" ] || { echo "ERROR: cannot fetch the releases feed (network/rate limit). Read the topmost release at https://github.com/${REPO}/releases and pass it: $0 bNNNN" >&2; exit 2; }
+  target="$(printf '%s' "$feed" | grep -oE 'releases/tag/b[0-9]+' | grep -oE '[0-9]+' | sort -un | tail -1 || true)"
+  [ -n "$target" ] || { echo "ERROR: parsed no release tags from the feed." >&2; exit 3; }
+fi
+
+git -C "$CACHE" rev-parse -q --verify "b${cur}^{commit}" >/dev/null 2>&1 || { echo "ERROR: b$cur is not a tag in the mirror" >&2; exit 3; }
+git -C "$CACHE" rev-parse -q --verify "b${target}^{commit}" >/dev/null 2>&1 || { echo "ERROR: b$target is not a tag in the mirror" >&2; exit 3; }
+
+# diff byte size between two tag numbers, honoring the WebUI-exclusion toggle.
+# Prints the size on stdout; with pipefail the return status is non-zero when `git diff` fails.
+diffsize() {
+  if [ "$EXCLUDE_WEBUI" = "1" ]; then
+    git -C "$CACHE" diff "b$1" "b$2" -- . ':(exclude)tools/ui' 2>/dev/null | wc -c
+  else
+    git -C "$CACHE" diff "b$1" "b$2" 2>/dev/null | wc -c
+  fi
+}
+
+# true when the diff from the current pin to tag number $1 is under the threshold. A failed
+# `git diff` aborts the script here: piped into `wc -c` it would otherwise count as 0 bytes and
+# make any jump look small enough to take in one step.
+fits() {
+  local size
+  size="$(diffsize "$cur" "$1")" || { echo "ERROR: could not compute the diff b$cur..b$1 in $CACHE" >&2; exit 4; }
+  [ "$size" -lt "$THRESHOLD" ]
+}
+
+scope="full diff"
+[ "$EXCLUDE_WEBUI" = "1" ] && scope="diff excluding tools/ui"
+echo "current pin : b$cur"
+echo "latest release : b$target"
+echo "threshold : ${THRESHOLD_KB} KiB per step (${scope})"
+
+if [ "$cur" -ge "$target" ]; then
+  echo "=> up to date — no bump needed."
+  exit 0
+fi
+
+# --- choose next step: TARGET if it fits, else the largest intermediate tag under the threshold -
+if fits "$target"; then
+  next="$target"
+else
+  # existing b-tags strictly after cur, up to and including target, ascending
+  # shellcheck disable=SC2207
+  cands=($(git -C "$CACHE" tag -l 'b*' | grep -oE 'b[0-9]+' | grep -oE '[0-9]+' | sort -un \
+    | awk -v c="$cur" -v t="$target" '$1 > c && $1 <= t'))
+  # binary search for the largest candidate whose diff from cur is under the threshold
+  # (diff size grows monotonically enough with the tag number for this to be a safe heuristic)
+  lo=0; hi=$(( ${#cands[@]} - 1 )); best=""
+  while [ "$lo" -le "$hi" ]; do
+    mid=$(( (lo + hi) / 2 )); T="${cands[$mid]}"
+    if fits "$T"; then best="$T"; lo=$(( mid + 1 )); else hi=$(( mid - 1 )); fi
+  done
+  if [ -n "$best" ]; then
+    next="$best"
+  else
+    next="${cands[0]}"
+    echo "NOTE: even b$cur..b$next exceeds ${THRESHOLD_KB} KiB — a single-commit step this large is unavoidable." >&2
+  fi
+fi
+
+full=$(git -C "$CACHE" diff "b$cur" "b$next" | wc -c)
+noui=$(git -C "$CACHE" diff "b$cur" "b$next" -- . ':(exclude)tools/ui' | wc -c)
+commits=$(git -C "$CACHE" rev-list --count "b$cur".."b$next")
+echo
+echo "next step : b$cur -> b$next"
+echo " diff size : $((full / 1024)) KiB full / $((noui / 1024)) KiB excluding tools/ui (auto-followed WebUI)"
+echo " commits : $commits"
+if [ "$next" -eq "$target" ]; then
+  echo " progress : reaches the latest release — final chunk"
+else
+  echo " progress : intermediate chunk — re-run this script after the bump for the next one"
+fi
+echo " review diff : https://github.com/${REPO}/compare/b$cur...b$next"
+echo " raw .patch : https://github.com/${REPO}/compare/b$cur...b$next.patch"
+echo
+echo "Apply this bump per docs/upgrade/llama-cpp-version-bump.md (b$cur -> b$next)."
diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 4966652c..c8cc9b2d 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -437,6 +437,137 @@ jobs: name: Linux-aarch64-libraries path: ${{ github.workspace }}/llama/src/main/resources/net/ladenthin/llama/ + build-linux-s390x: + name: Build and Test Linux s390x (big-endian, qemu) + needs: [startgate, build-webui] + # Cross-compile for IBM Z (s390x, BIG-ENDIAN) with the GCC cross toolchain, then run the full + # C++ unit suite under qemu-user — a real big-endian correctness gate for our helpers and + # serializers (esp. the little-endian WAV writer, JSON/token/embedding transforms). The BUILD + # is native speed (x86 cross-gcc); only the tiny test binary is emulated. s390x is a DEFAULT-jar + # CPU platform (like aarch64), so the artifact merges via the `*-libraries` glob (no classifier / + # pom profile). Model-backed Java tests are NOT run under emulation (a JVM + GGUF inference under + # qemu-user is slow/flaky); the C++ gate covers the actual byte-order risk since the Java<->JNI + # boundary uses host-native array copies. GGML_OPENMP=OFF avoids cross-libgomp issues (ggml uses + # its own std::thread pool). CMAKE_CROSSCOMPILING_EMULATOR makes ctest run the s390x exe via qemu; + # QEMU_LD_PREFIX lets the emulated binary find the s390x sysroot libs. 
+ runs-on: ubuntu-latest + env: + QEMU_LD_PREFIX: /usr/s390x-linux-gnu + USE_CACHE: ${{ github.event_name != 'workflow_dispatch' || inputs.use_cache }} + SCCACHE_WEBDAV_ENDPOINT: https://cache.depot.dev + SCCACHE_WEBDAV_TOKEN: ${{ secrets.DEPOT_TOKEN }} + steps: + - uses: actions/checkout@v7 + - name: Download shared WebUI assets + uses: actions/download-artifact@v8 + with: + name: webui-generated + path: ${{ github.workspace }}/llama/webui-generated/ + - uses: actions/setup-java@v5 + with: + distribution: 'temurin' + java-version: ${{ env.JAVA_VERSION }} + - name: Install s390x cross toolchain + qemu-user + run: | + sudo apt-get update + sudo apt-get install -y gcc-s390x-linux-gnu g++-s390x-linux-gnu qemu-user-static + - name: Build libraries (cross-compile s390x) + shell: bash + run: | + mvn --no-transfer-progress -f llama/pom.xml compile + .github/build.sh "-DGGML_NATIVE=OFF -DGGML_OPENMP=OFF -DBUILD_TESTING=ON -DCMAKE_SYSTEM_NAME=Linux -DCMAKE_SYSTEM_PROCESSOR=s390x -DCMAKE_C_COMPILER=s390x-linux-gnu-gcc -DCMAKE_CXX_COMPILER=s390x-linux-gnu-g++ -DCMAKE_CROSSCOMPILING_EMULATOR=/usr/bin/qemu-s390x-static -DOS_NAME=Linux -DOS_ARCH=s390x" + - name: Run C++ unit tests under qemu-s390x (big-endian gate) + run: ctest --test-dir llama/build --output-on-failure + - name: Upload artifacts + uses: actions/upload-artifact@v7 + with: + name: Linux-s390x-libraries + path: ${{ github.workspace }}/llama/src/main/resources/net/ladenthin/llama/ + + build-linux-x86_64-vulkan: + name: Build Linux x86_64 Vulkan + needs: [startgate, build-webui] + # Native ubuntu build (NOT dockcross) — the Vulkan SDK is trivial to apt-install here, and + # upstream llama.cpp builds its ubuntu-vulkan artifact the same way. GPU runtime libvulkan.so.1 + # is supplied by the consumer's driver (nothing bundled). 
GitHub runners have NO GPU, so this + # is a BUILD-ONLY job (no -DBUILD_TESTING/ctest: a Vulkan-linked jllama_test errors enumerating + # devices on a GPU-less runner — same rationale as the Windows GPU jobs). GGML_NATIVE=OFF keeps + # the artifact portable across x86_64 CPU generations. Trade-off vs the manylinux CPU jar: the + # glibc floor rises to the ubuntu-latest baseline (same as the native aarch64 job). build.sh + # self-fetches sccache; the probe guards it (a miss just builds uncached). + runs-on: ubuntu-latest + env: + USE_CACHE: ${{ github.event_name != 'workflow_dispatch' || inputs.use_cache }} + SCCACHE_WEBDAV_ENDPOINT: https://cache.depot.dev + SCCACHE_WEBDAV_TOKEN: ${{ secrets.DEPOT_TOKEN }} + steps: + - uses: actions/checkout@v7 + - name: Download shared WebUI assets + uses: actions/download-artifact@v8 + with: + name: webui-generated + path: ${{ github.workspace }}/llama/webui-generated/ + - uses: actions/setup-java@v5 + with: + distribution: 'temurin' + java-version: ${{ env.JAVA_VERSION }} + - name: Install Vulkan SDK (headers + loader + glslc shader compiler) + run: | + sudo apt-get update + sudo apt-get install -y libvulkan-dev glslc glslang-tools spirv-headers + - name: Build libraries + shell: bash + run: | + mvn --no-transfer-progress -f llama/pom.xml compile + .github/build.sh "-DGGML_VULKAN=ON -DGGML_NATIVE=OFF -DOS_NAME=Linux -DOS_ARCH=x86_64" + - name: Upload artifacts + uses: actions/upload-artifact@v7 + with: + name: Linux-x86_64-vulkan + path: ${{ github.workspace }}/llama/src/main/resources_linux_vulkan/net/ladenthin/llama/ + if-no-files-found: error + + build-linux-aarch64-vulkan: + name: Build Linux aarch64 Vulkan + needs: [startgate, build-webui] + # Native ARM64 Vulkan build on GitHub's free arm64 runner (same runner as the aarch64 CPU job). + # Build-only (GPU-less runner); GGML_NATIVE=OFF for portability across ARMv8 generations; GCC 14 + # to match the aarch64 CPU job. 
Reuses the resources_linux_vulkan tree (arch subdir Linux/aarch64); + # the vulkan-linux-aarch64 Maven profile packages only that subtree. + runs-on: ubuntu-24.04-arm + env: + USE_CACHE: ${{ github.event_name != 'workflow_dispatch' || inputs.use_cache }} + SCCACHE_WEBDAV_ENDPOINT: https://cache.depot.dev + SCCACHE_WEBDAV_TOKEN: ${{ secrets.DEPOT_TOKEN }} + steps: + - uses: actions/checkout@v7 + - name: Download shared WebUI assets + uses: actions/download-artifact@v8 + with: + name: webui-generated + path: ${{ github.workspace }}/llama/webui-generated/ + - uses: actions/setup-java@v5 + with: + distribution: 'temurin' + java-version: ${{ env.JAVA_VERSION }} + - name: Install toolchain (GCC 14) + Vulkan SDK + run: | + sudo apt-get update + sudo apt-get install -y gcc-14 g++-14 libvulkan-dev glslc glslang-tools spirv-headers + echo "CC=gcc-14" >> "$GITHUB_ENV" + echo "CXX=g++-14" >> "$GITHUB_ENV" + - name: Build libraries + shell: bash + run: | + mvn --no-transfer-progress -f llama/pom.xml compile + .github/build.sh "-DGGML_VULKAN=ON -DGGML_NATIVE=OFF -DOS_NAME=Linux -DOS_ARCH=aarch64" + - name: Upload artifacts + uses: actions/upload-artifact@v7 + with: + name: Linux-aarch64-vulkan + path: ${{ github.workspace }}/llama/src/main/resources_linux_vulkan/net/ladenthin/llama/ + if-no-files-found: error + crosscompile-android-aarch64: name: Cross-Compile Android aarch64 needs: [startgate, build-webui] @@ -788,6 +919,57 @@ jobs: name: Windows-x86-libraries path: ${{ github.workspace }}/llama/src/main/resources/net/ladenthin/llama/ + build-windows-arm64: + name: Build and Test Windows 11 arm64 (Ninja Multi-Config, default) + needs: [startgate, build-webui] + # Native arm64 build on GitHub's free windows-11-arm runner. Goes into the DEFAULT JAR (no + # classifier): OSInfo maps a Windows-on-ARM JVM (os.arch=aarch64) to Windows/aarch64, the same + # path CMake emits here, and the `*-libraries` glob in the package/publish jobs merges it into + # src/main/resources. 
sccache is intentionally omitted (the existing install step pulls the + # x86_64 sccache zip; an arm64 build would need the aarch64 release — not worth the extra path + # for one CPU job, so build.bat just builds uncached when sccache is absent). + # + # Compiler: clang-cl, NOT MSVC cl.exe. ggml's ggml-cpu/CMakeLists.txt aborts with "MSVC is not + # supported for ARM, use clang" via `if (MSVC AND NOT CMAKE_C_COMPILER_ID STREQUAL "Clang")`. + # clang-cl (LLVM's MSVC-compatible driver) satisfies that guard (its compiler id is "Clang") + # while still leaving CMake's MSVC=TRUE, so our static /MT CRT block (CMAKE_MSVC_RUNTIME_LIBRARY + # in CMakeLists.txt) keeps applying and the generator stays Ninja Multi-Config. msvc-dev-cmd + # (arm64) supplies the MSVC headers/libs/linker AND the bundled clang-cl / lld-link under + # VC\Tools\Llvm\ARM64, so no separate LLVM install is needed. + # + # GGML_OPENMP=OFF: with clang-cl, ggml links LLVM's OpenMP (libomp.lib -> needs libomp140.aarch64.dll + # at runtime), which is NOT on PATH like MSVC's ambient vcomp140.dll on x64 — so gtest_discover_tests + # (and any consumer) failed to launch the binary with 0xc0000135 STATUS_DLL_NOT_FOUND. Turning OpenMP + # off makes ggml use its own std::thread threadpool, so the arm64 jllama.dll (and the test exe) are + # self-contained with no libomp dependency to ship. The x86_64/x86 jobs keep OpenMP (MSVC vcomp). 
+ runs-on: windows-11-arm + steps: + - uses: actions/checkout@v7 + - name: Download shared WebUI assets + uses: actions/download-artifact@v8 + with: + name: webui-generated + path: ${{ github.workspace }}/llama/webui-generated/ + - name: Set up MSVC developer environment (arm64) + uses: ilammy/msvc-dev-cmd@v1 + with: + arch: arm64 + - name: Build libraries + shell: cmd + # No mvn compile needed: the JNI header (jllama.h) is committed and the native build + # uses the bundled JNI headers in .github/include, and OS_NAME/OS_ARCH are passed + # explicitly (so the OSInfo-class OS-detection path is skipped) — same as the x86_64 job. + # clang-cl (see the job comment) is required: ggml refuses MSVC cl.exe on ARM. + run: | + .github\build.bat -G "Ninja Multi-Config" -DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl -DGGML_OPENMP=OFF -DOS_NAME=Windows -DOS_ARCH=aarch64 -DBUILD_TESTING=ON + - name: Run C++ unit tests + run: ctest --test-dir llama/build --output-on-failure + - name: Upload artifacts + uses: actions/upload-artifact@v7 + with: + name: Windows-aarch64-libraries + path: ${{ github.workspace }}/llama/src/main/resources/net/ladenthin/llama/ + # --------------------------------------------------------------------------- # Windows GPU classifiers (x86_64 only) — CUDA, Vulkan, OpenCL. # All three use the same Ninja Multi-Config + MSVC + sccache toolchain as the @@ -951,6 +1133,328 @@ jobs: path: ${{ github.workspace }}/llama/src/main/resources_windows_opencl/net/ladenthin/llama/ if-no-files-found: error + # --------------------------------------------------------------------------- + # Additional GPU-backend classifiers (fail-loud, same wiring as the CUDA/Vulkan/ + # OpenCL jobs): AMD ROCm/HIP, Intel SYCL (oneAPI), Windows-on-ARM OpenCL (Adreno), + # Intel OpenVINO. All BUILD-ONLY (GitHub runners have no AMD/Intel/Adreno GPU, and + # no ctest — a GPU-linked jllama_test can't enumerate a device). 
GPU runtime libs + # are NOT bundled — the consumer's driver/toolkit supplies them. CMakeLists.txt + # routes each backend to its own src/main/resources_* tree; the matching Maven + # profile turns it into a classifier JAR. Toolchain install steps are first-pass — + # if a vendor URL/version 404s in CI, adjust it (the failure is intentional signal). + # --------------------------------------------------------------------------- + + build-linux-x86_64-rocm: + name: Build Linux x86_64 ROCm/HIP (AMD) + needs: [startgate, build-webui] + runs-on: ubuntu-latest + env: + USE_CACHE: ${{ github.event_name != 'workflow_dispatch' || inputs.use_cache }} + SCCACHE_WEBDAV_ENDPOINT: https://cache.depot.dev + SCCACHE_WEBDAV_TOKEN: ${{ secrets.DEPOT_TOKEN }} + steps: + - uses: actions/checkout@v7 + - name: Download shared WebUI assets + uses: actions/download-artifact@v8 + with: + name: webui-generated + path: ${{ github.workspace }}/llama/webui-generated/ + - uses: actions/setup-java@v5 + with: + distribution: 'temurin' + java-version: ${{ env.JAVA_VERSION }} + - name: Install ROCm/HIP (AMD apt repo) + run: | + sudo mkdir --parents --mode=0755 /etc/apt/keyrings + wget -qO- https://repo.radeon.com/rocm/rocm.gpg.key | gpg --dearmor | sudo tee /etc/apt/keyrings/rocm.gpg > /dev/null + echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/6.3.4 noble main" | sudo tee /etc/apt/sources.list.d/rocm.list + printf 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600\n' | sudo tee /etc/apt/preferences.d/rocm-pin-600 + sudo apt-get update + sudo apt-get install -y rocm-hip-sdk rocblas-dev hipblas-dev + echo "/opt/rocm/bin" >> "$GITHUB_PATH" + echo "ROCM_PATH=/opt/rocm" >> "$GITHUB_ENV" + - name: Build libraries + shell: bash + run: | + mvn --no-transfer-progress -f llama/pom.xml compile + .github/build.sh "-DGGML_HIP=ON -DAMDGPU_TARGETS=gfx900;gfx906;gfx908;gfx90a;gfx1030;gfx1100;gfx1101;gfx1102 -DCMAKE_C_COMPILER=/opt/rocm/llvm/bin/clang 
-DCMAKE_CXX_COMPILER=/opt/rocm/llvm/bin/clang++ -DGGML_NATIVE=OFF -DOS_NAME=Linux -DOS_ARCH=x86_64" + - name: Upload artifacts + uses: actions/upload-artifact@v7 + with: + name: Linux-x86_64-rocm + path: ${{ github.workspace }}/llama/src/main/resources_linux_rocm/net/ladenthin/llama/ + if-no-files-found: error + + build-windows-x86_64-rocm: + name: Build Windows x86_64 ROCm/HIP (AMD) + needs: [startgate, build-webui] + # windows-2022 (MSVC 14.4x), NOT windows-2025-vs2026 (VS 2026 / MSVC 14.51): ROCm 7.1's + # HIP clang headers (__clang_hip_cmath.h) cannot overload the __host__ __device__ + # isgreater/isless/... that the very new MSVC declares via _CLANG_BUILTIN2, so the + # device-code compile fails. Upstream llama.cpp builds win-hip on windows-2022 for the same + # reason (it drives the HIP SDK's own clang and relies on the older MSVC STL). + runs-on: windows-2022 + steps: + - uses: actions/checkout@v7 + - name: Download shared WebUI assets + uses: actions/download-artifact@v8 + with: + name: webui-generated + path: ${{ github.workspace }}/llama/webui-generated/ + - name: Set up MSVC developer environment (x64) + uses: ilammy/msvc-dev-cmd@v1 + with: + arch: x64 + - name: Install AMD HIP SDK for Windows + shell: pwsh + # Mirrors upstream llama.cpp's windows-hip release job: HIP SDK 26.Q1, then + # resolve HIP_PATH from the installed ROCm dir and point the compilers + + # CMAKE_PREFIX_PATH at it so ggml-hip's find_package(hip) resolves. 
+ run: | + $url = "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-26.Q1-Win11-For-HIP.exe" + Invoke-WebRequest -Uri $url -OutFile "$env:RUNNER_TEMP\rocm-install.exe" + $proc = Start-Process "$env:RUNNER_TEMP\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -PassThru -Wait + if ($proc.ExitCode -ne 0) { Write-Error "HIP SDK install failed with exit code $($proc.ExitCode)"; exit 1 } + $hip = $(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | Split-Path | Split-Path) + "HIP_PATH=$hip" | Out-File -FilePath $env:GITHUB_ENV -Append + "$hip\bin" | Out-File -FilePath $env:GITHUB_PATH -Append + - name: Build libraries + shell: cmd + run: | + .github\build.bat -G "Ninja Multi-Config" -DGGML_HIP=ON -DGPU_TARGETS=gfx1030;gfx1100;gfx1101;gfx1102 -DCMAKE_PREFIX_PATH="%HIP_PATH%" -DCMAKE_C_COMPILER="%HIP_PATH%\bin\clang.exe" -DCMAKE_CXX_COMPILER="%HIP_PATH%\bin\clang++.exe" -DOS_NAME=Windows -DOS_ARCH=x86_64 + - name: Upload artifacts + uses: actions/upload-artifact@v7 + with: + name: Windows-x86_64-rocm + path: ${{ github.workspace }}/llama/src/main/resources_windows_rocm/net/ladenthin/llama/ + if-no-files-found: error + + build-linux-x86_64-sycl-fp16: + name: Build Linux x86_64 SYCL fp16 (Intel oneAPI) + needs: [startgate, build-webui] + runs-on: ubuntu-latest + env: + USE_CACHE: ${{ github.event_name != 'workflow_dispatch' || inputs.use_cache }} + SCCACHE_WEBDAV_ENDPOINT: https://cache.depot.dev + SCCACHE_WEBDAV_TOKEN: ${{ secrets.DEPOT_TOKEN }} + steps: + - uses: actions/checkout@v7 + - name: Download shared WebUI assets + uses: actions/download-artifact@v8 + with: + name: webui-generated + path: ${{ github.workspace }}/llama/webui-generated/ + - uses: actions/setup-java@v5 + with: + distribution: 'temurin' + java-version: ${{ env.JAVA_VERSION }} + - name: Install Intel oneAPI (DPC++ + MKL) + run: | + wget -qO- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | sudo tee 
/usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null + echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | sudo tee /etc/apt/sources.list.d/oneAPI.list + sudo apt-get update + sudo apt-get install -y intel-oneapi-compiler-dpcpp-cpp intel-oneapi-mkl-devel + - name: Build libraries + shell: bash + run: | + source /opt/intel/oneapi/setvars.sh + mvn --no-transfer-progress -f llama/pom.xml compile + .github/build.sh "-DGGML_SYCL=ON -DGGML_SYCL_F16=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_NATIVE=OFF -DOS_NAME=Linux -DOS_ARCH=x86_64" + - name: Upload artifacts + uses: actions/upload-artifact@v7 + with: + name: Linux-x86_64-sycl-fp16 + path: ${{ github.workspace }}/llama/src/main/resources_linux_sycl_fp16/net/ladenthin/llama/ + if-no-files-found: error + + build-linux-x86_64-sycl-fp32: + name: Build Linux x86_64 SYCL fp32 (Intel oneAPI) + needs: [startgate, build-webui] + runs-on: ubuntu-latest + env: + USE_CACHE: ${{ github.event_name != 'workflow_dispatch' || inputs.use_cache }} + SCCACHE_WEBDAV_ENDPOINT: https://cache.depot.dev + SCCACHE_WEBDAV_TOKEN: ${{ secrets.DEPOT_TOKEN }} + steps: + - uses: actions/checkout@v7 + - name: Download shared WebUI assets + uses: actions/download-artifact@v8 + with: + name: webui-generated + path: ${{ github.workspace }}/llama/webui-generated/ + - uses: actions/setup-java@v5 + with: + distribution: 'temurin' + java-version: ${{ env.JAVA_VERSION }} + - name: Install Intel oneAPI (DPC++ + MKL) + run: | + wget -qO- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | sudo tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null + echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | sudo tee /etc/apt/sources.list.d/oneAPI.list + sudo apt-get update + sudo apt-get install -y intel-oneapi-compiler-dpcpp-cpp intel-oneapi-mkl-devel + - name: Build 
libraries + shell: bash + run: | + source /opt/intel/oneapi/setvars.sh + mvn --no-transfer-progress -f llama/pom.xml compile + .github/build.sh "-DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_NATIVE=OFF -DOS_NAME=Linux -DOS_ARCH=x86_64" + - name: Upload artifacts + uses: actions/upload-artifact@v7 + with: + name: Linux-x86_64-sycl-fp32 + path: ${{ github.workspace }}/llama/src/main/resources_linux_sycl_fp32/net/ladenthin/llama/ + if-no-files-found: error + + build-windows-x86_64-sycl: + name: Build Windows 2025 x86_64 SYCL (Intel oneAPI) + needs: [startgate, build-webui] + runs-on: windows-2025-vs2026 + steps: + - uses: actions/checkout@v7 + - name: Download shared WebUI assets + uses: actions/download-artifact@v8 + with: + name: webui-generated + path: ${{ github.workspace }}/llama/webui-generated/ + - name: Set up MSVC developer environment (x64) + uses: ilammy/msvc-dev-cmd@v1 + with: + arch: x64 + - name: Install Intel oneAPI (DPC++ + MKL + oneDNN + TBB) + shell: cmd + # Mirrors upstream llama.cpp's windows-sycl release job: extract the offline + # installer, then run its bootstrapper with the DPC++/MKL/oneDNN/TBB components. 
+ run: | + curl -fSL -o "%RUNNER_TEMP%\oneapi.exe" "https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe" + "%RUNNER_TEMP%\oneapi.exe" -s -x -f "%RUNNER_TEMP%\oneapi_extracted" --log "%RUNNER_TEMP%\extract.log" + "%RUNNER_TEMP%\oneapi_extracted\bootstrapper.exe" -s --action install --components=intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel --eula=accept -p=NEED_VS2022_INTEGRATION=0 --log-dir="%RUNNER_TEMP%" + - name: Build libraries + shell: cmd + run: | + call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force + .github\build.bat -G "Ninja Multi-Config" -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DOS_NAME=Windows -DOS_ARCH=x86_64 + - name: Upload artifacts + uses: actions/upload-artifact@v7 + with: + name: Windows-x86_64-sycl + path: ${{ github.workspace }}/llama/src/main/resources_windows_sycl/net/ladenthin/llama/ + if-no-files-found: error + + build-windows-arm64-opencl: + name: Build Windows 11 arm64 OpenCL (Adreno) + needs: [startgate, build-webui] + # Windows-on-ARM OpenCL (Snapdragon X / Adreno). Same clang-cl + GGML_OPENMP=OFF + # toolchain as the arm64 CPU job (ggml refuses MSVC cl.exe on ARM). Reuses the + # resources_windows_opencl tree under Windows/aarch64; the opencl-windows-aarch64 + # Maven profile packages only that subtree. build_opencl_windows.bat stages the + # OpenCL headers + ICD loader before delegating to build.bat. 
+ runs-on: windows-11-arm + steps: + - uses: actions/checkout@v7 + - name: Download shared WebUI assets + uses: actions/download-artifact@v8 + with: + name: webui-generated + path: ${{ github.workspace }}/llama/webui-generated/ + - name: Set up MSVC developer environment (arm64) + uses: ilammy/msvc-dev-cmd@v1 + with: + arch: arm64 + - name: Build libraries + shell: cmd + run: | + .github\build_opencl_windows.bat -G "Ninja Multi-Config" -DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl -DGGML_OPENMP=OFF -DGGML_OPENCL=ON -DGGML_OPENCL_EMBED_KERNELS=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON -DOS_NAME=Windows -DOS_ARCH=aarch64 + - name: Upload artifacts + uses: actions/upload-artifact@v7 + with: + name: Windows-aarch64-opencl + path: ${{ github.workspace }}/llama/src/main/resources_windows_opencl/net/ladenthin/llama/ + if-no-files-found: error + + build-linux-x86_64-openvino: + name: Build Linux x86_64 OpenVINO (Intel) + needs: [startgate, build-webui] + runs-on: ubuntu-latest + env: + USE_CACHE: ${{ github.event_name != 'workflow_dispatch' || inputs.use_cache }} + SCCACHE_WEBDAV_ENDPOINT: https://cache.depot.dev + SCCACHE_WEBDAV_TOKEN: ${{ secrets.DEPOT_TOKEN }} + steps: + - uses: actions/checkout@v7 + - name: Download shared WebUI assets + uses: actions/download-artifact@v8 + with: + name: webui-generated + path: ${{ github.workspace }}/llama/webui-generated/ + - uses: actions/setup-java@v5 + with: + distribution: 'temurin' + java-version: ${{ env.JAVA_VERSION }} + - name: Install OpenCL dev + Intel OpenVINO 2026.2.1 (archive) + run: | + # Intel's OpenVINO APT repo only publishes up to ~2025 (the /openvino/2026 path 404s), and + # 2025.x has the older ov::Allocator API that breaks ggml-openvino's template compile. So use + # the ARCHIVE for 2026.2.1 — exactly what upstream llama.cpp's linux-setup-openvino action does. + # OpenCL headers (incl. the C++ CL/cl2.hpp via opencl-clhpp-headers) come from Ubuntu's own repos. 
+ sudo apt-get update + sudo apt-get install -y ocl-icd-opencl-dev opencl-headers opencl-clhpp-headers intel-opencl-icd + url="https://storage.openvinotoolkit.org/repositories/openvino/packages/2026.2.1/linux/openvino_toolkit_ubuntu24_2026.2.1.21919.ede283a88e3_x86_64.tgz" + sudo mkdir -p /opt/intel/openvino + curl -fSL "$url" | sudo tar -xz --strip-components=1 -C /opt/intel/openvino + echo "OpenVINO_DIR=/opt/intel/openvino/runtime/cmake" >> "$GITHUB_ENV" + - name: Build libraries + shell: bash + run: | + source /opt/intel/openvino/setupvars.sh || true + mvn --no-transfer-progress -f llama/pom.xml compile + .github/build.sh "-DGGML_OPENVINO=ON -DOpenVINO_DIR=$OpenVINO_DIR -DGGML_NATIVE=OFF -DOS_NAME=Linux -DOS_ARCH=x86_64" + - name: Upload artifacts + uses: actions/upload-artifact@v7 + with: + name: Linux-x86_64-openvino + path: ${{ github.workspace }}/llama/src/main/resources_linux_openvino/net/ladenthin/llama/ + if-no-files-found: error + + build-windows-x86_64-openvino: + name: Build Windows 2025 x86_64 OpenVINO (Intel) + needs: [startgate, build-webui] + runs-on: windows-2025-vs2026 + steps: + - uses: actions/checkout@v7 + - name: Download shared WebUI assets + uses: actions/download-artifact@v8 + with: + name: webui-generated + path: ${{ github.workspace }}/llama/webui-generated/ + - name: Set up MSVC developer environment (x64) + uses: ilammy/msvc-dev-cmd@v1 + with: + arch: x64 + - name: Install OpenCL headers (vcpkg) + Intel OpenVINO 2026.2.1 + shell: pwsh + # vcpkg's opencl port ships the full C++ headers incl. CL/cl2.hpp that OpenVINO's + # ocl_wrapper.hpp needs (the Khronos OpenCL-Headers dropped cl2.hpp) — same as upstream + # llama.cpp's windows-openvino job. OpenVINO 2026.2.1 matches ggml-openvino's target API. 
+ run: | + C:\vcpkg\vcpkg install opencl:x64-windows + $url = "https://storage.openvinotoolkit.org/repositories/openvino/packages/2026.2.1/windows/openvino_toolkit_windows_2026.2.1.21919.ede283a88e3_x86_64.zip" + Invoke-WebRequest -Uri $url -OutFile "$env:RUNNER_TEMP\openvino.zip" + Expand-Archive -Path "$env:RUNNER_TEMP\openvino.zip" -DestinationPath "C:\openvino" -Force + # The archive extracts into a nested versioned folder; point OpenVINO_DIR at its runtime/cmake. + $root = (Get-ChildItem "C:\openvino" -Directory | Select-Object -First 1).FullName + "OpenVINO_DIR=$root\runtime\cmake" | Out-File -FilePath $env:GITHUB_ENV -Append + - name: Build libraries + shell: cmd + # vcpkg toolchain file wires in the OpenCL (incl. cl2.hpp) that ggml-openvino needs. + run: | + .github\build.bat -G "Ninja Multi-Config" -DGGML_OPENVINO=ON -DOpenVINO_DIR="%OpenVINO_DIR%" -DCMAKE_TOOLCHAIN_FILE=C:\vcpkg\scripts\buildsystems\vcpkg.cmake -DOS_NAME=Windows -DOS_ARCH=x86_64 + - name: Upload artifacts + uses: actions/upload-artifact@v7 + with: + name: Windows-x86_64-openvino + path: ${{ github.workspace }}/llama/src/main/resources_windows_openvino/net/ladenthin/llama/ + if-no-files-found: error + # --------------------------------------------------------------------------- # CI-only jobs — no release artifact, purely for test coverage # --------------------------------------------------------------------------- @@ -1521,15 +2025,27 @@ jobs: needs: - crosscompile-linux-x86_64-cuda - crosscompile-linux-aarch64 + - build-linux-s390x + - build-linux-x86_64-vulkan + - build-linux-aarch64-vulkan - crosscompile-android-aarch64 - crosscompile-android-aarch64-opencl - build-windows-x86_64 - build-windows-x86 + - build-windows-arm64 - build-windows-x86_64-msvc - build-windows-x86-msvc - build-windows-x86_64-cuda - build-windows-x86_64-vulkan - build-windows-x86_64-opencl + - build-linux-x86_64-rocm + - build-windows-x86_64-rocm + - build-linux-x86_64-sycl-fp16 + - build-linux-x86_64-sycl-fp32 + 
- build-windows-x86_64-sycl + - build-windows-arm64-opencl + - build-linux-x86_64-openvino + - build-windows-x86_64-openvino - test-cpp-linux-x86_64 - build-macos-arm64-metal-15 - test-java-linux-x86_64 @@ -1550,6 +2066,16 @@ jobs: with: name: linux-libraries-cuda path: ${{ github.workspace }}/llama/src/main/resources_linux_cuda/net/ladenthin/llama/ + # Linux Vulkan classifiers (x86_64 + aarch64) share one tree; the two Maven profiles + # split it by arch subdir into one single-arch classifier JAR each. + - uses: actions/download-artifact@v8 + with: + name: Linux-x86_64-vulkan + path: ${{ github.workspace }}/llama/src/main/resources_linux_vulkan/net/ladenthin/llama/ + - uses: actions/download-artifact@v8 + with: + name: Linux-aarch64-vulkan + path: ${{ github.workspace }}/llama/src/main/resources_linux_vulkan/net/ladenthin/llama/ - uses: actions/download-artifact@v8 with: name: android-libraries-opencl @@ -1577,6 +2103,38 @@ jobs: with: name: Windows-x86_64-opencl path: ${{ github.workspace }}/llama/src/main/resources_windows_opencl/net/ladenthin/llama/ + - uses: actions/download-artifact@v8 + with: + name: Linux-x86_64-rocm + path: ${{ github.workspace }}/llama/src/main/resources_linux_rocm/net/ladenthin/llama/ + - uses: actions/download-artifact@v8 + with: + name: Windows-x86_64-rocm + path: ${{ github.workspace }}/llama/src/main/resources_windows_rocm/net/ladenthin/llama/ + - uses: actions/download-artifact@v8 + with: + name: Linux-x86_64-sycl-fp16 + path: ${{ github.workspace }}/llama/src/main/resources_linux_sycl_fp16/net/ladenthin/llama/ + - uses: actions/download-artifact@v8 + with: + name: Linux-x86_64-sycl-fp32 + path: ${{ github.workspace }}/llama/src/main/resources_linux_sycl_fp32/net/ladenthin/llama/ + - uses: actions/download-artifact@v8 + with: + name: Windows-x86_64-sycl + path: ${{ github.workspace }}/llama/src/main/resources_windows_sycl/net/ladenthin/llama/ + - uses: actions/download-artifact@v8 + with: + name: Windows-aarch64-opencl + path: ${{ 
github.workspace }}/llama/src/main/resources_windows_opencl/net/ladenthin/llama/ + - uses: actions/download-artifact@v8 + with: + name: Linux-x86_64-openvino + path: ${{ github.workspace }}/llama/src/main/resources_linux_openvino/net/ladenthin/llama/ + - uses: actions/download-artifact@v8 + with: + name: Windows-x86_64-openvino + path: ${{ github.workspace }}/llama/src/main/resources_windows_openvino/net/ladenthin/llama/ - uses: actions/setup-java@v5 with: distribution: 'temurin' @@ -1590,7 +2148,7 @@ jobs: # Windows classifier JARs: `windows-msvc` (MSVC-built CPU natives) plus the GPU # backends `cuda-windows` / `vulkan-windows` / `opencl-windows`. The default JAR's # Windows natives are the Ninja `*-libraries` merged into src/main/resources/ above. - run: mvn --batch-mode --no-transfer-progress -P release,cuda,opencl-android,windows-msvc,cuda-windows,vulkan-windows,opencl-windows,assembly -Dmaven.test.skip=true -Dgpg.skip=true package + run: mvn --batch-mode --no-transfer-progress -P release,cuda,vulkan-linux,vulkan-linux-aarch64,opencl-android,windows-msvc,cuda-windows,vulkan-windows,opencl-windows,rocm-linux,rocm-windows,sycl-fp16-linux,sycl-fp32-linux,sycl-windows,opencl-windows-aarch64,openvino-linux,openvino-windows,assembly -Dmaven.test.skip=true -Dgpg.skip=true package - name: Upload JARs uses: actions/upload-artifact@v7 with: @@ -1664,6 +2222,14 @@ jobs: with: name: linux-libraries-cuda path: ${{ github.workspace }}/llama/src/main/resources_linux_cuda/net/ladenthin/llama/ + - uses: actions/download-artifact@v8 + with: + name: Linux-x86_64-vulkan + path: ${{ github.workspace }}/llama/src/main/resources_linux_vulkan/net/ladenthin/llama/ + - uses: actions/download-artifact@v8 + with: + name: Linux-aarch64-vulkan + path: ${{ github.workspace }}/llama/src/main/resources_linux_vulkan/net/ladenthin/llama/ - uses: actions/download-artifact@v8 with: name: android-libraries-opencl @@ -1688,6 +2254,38 @@ jobs: with: name: Windows-x86_64-opencl path: ${{ 
github.workspace }}/llama/src/main/resources_windows_opencl/net/ladenthin/llama/ + - uses: actions/download-artifact@v8 + with: + name: Linux-x86_64-rocm + path: ${{ github.workspace }}/llama/src/main/resources_linux_rocm/net/ladenthin/llama/ + - uses: actions/download-artifact@v8 + with: + name: Windows-x86_64-rocm + path: ${{ github.workspace }}/llama/src/main/resources_windows_rocm/net/ladenthin/llama/ + - uses: actions/download-artifact@v8 + with: + name: Linux-x86_64-sycl-fp16 + path: ${{ github.workspace }}/llama/src/main/resources_linux_sycl_fp16/net/ladenthin/llama/ + - uses: actions/download-artifact@v8 + with: + name: Linux-x86_64-sycl-fp32 + path: ${{ github.workspace }}/llama/src/main/resources_linux_sycl_fp32/net/ladenthin/llama/ + - uses: actions/download-artifact@v8 + with: + name: Windows-x86_64-sycl + path: ${{ github.workspace }}/llama/src/main/resources_windows_sycl/net/ladenthin/llama/ + - uses: actions/download-artifact@v8 + with: + name: Windows-aarch64-opencl + path: ${{ github.workspace }}/llama/src/main/resources_windows_opencl/net/ladenthin/llama/ + - uses: actions/download-artifact@v8 + with: + name: Linux-x86_64-openvino + path: ${{ github.workspace }}/llama/src/main/resources_linux_openvino/net/ladenthin/llama/ + - uses: actions/download-artifact@v8 + with: + name: Windows-x86_64-openvino + path: ${{ github.workspace }}/llama/src/main/resources_windows_openvino/net/ladenthin/llama/ - name: Set up Maven Central Repository uses: actions/setup-java@v5 with: @@ -1712,7 +2310,7 @@ jobs: # :llama-langchain4j. The `release` profile (GPG + Central Publishing) is inherited # from the parent, so every module — including the parent pom — is signed. 
- name: Publish snapshot (reactor - parent + llama + llama-langchain4j) - run: mvn --batch-mode --no-transfer-progress -P release,cuda,opencl-android,windows-msvc,cuda-windows,vulkan-windows,opencl-windows -Dmaven.test.skip=true deploy + run: mvn --batch-mode --no-transfer-progress -P release,cuda,vulkan-linux,vulkan-linux-aarch64,opencl-android,windows-msvc,cuda-windows,vulkan-windows,opencl-windows,rocm-linux,rocm-windows,sycl-fp16-linux,sycl-fp32-linux,sycl-windows,opencl-windows-aarch64,openvino-linux,openvino-windows -Dmaven.test.skip=true deploy env: MAVEN_USERNAME: ${{ secrets.CENTRAL_USERNAME }} MAVEN_PASSWORD: ${{ secrets.CENTRAL_TOKEN }} @@ -1774,6 +2372,14 @@ jobs: with: name: linux-libraries-cuda path: ${{ github.workspace }}/llama/src/main/resources_linux_cuda/net/ladenthin/llama/ + - uses: actions/download-artifact@v8 + with: + name: Linux-x86_64-vulkan + path: ${{ github.workspace }}/llama/src/main/resources_linux_vulkan/net/ladenthin/llama/ + - uses: actions/download-artifact@v8 + with: + name: Linux-aarch64-vulkan + path: ${{ github.workspace }}/llama/src/main/resources_linux_vulkan/net/ladenthin/llama/ - uses: actions/download-artifact@v8 with: name: android-libraries-opencl @@ -1798,6 +2404,38 @@ jobs: with: name: Windows-x86_64-opencl path: ${{ github.workspace }}/llama/src/main/resources_windows_opencl/net/ladenthin/llama/ + - uses: actions/download-artifact@v8 + with: + name: Linux-x86_64-rocm + path: ${{ github.workspace }}/llama/src/main/resources_linux_rocm/net/ladenthin/llama/ + - uses: actions/download-artifact@v8 + with: + name: Windows-x86_64-rocm + path: ${{ github.workspace }}/llama/src/main/resources_windows_rocm/net/ladenthin/llama/ + - uses: actions/download-artifact@v8 + with: + name: Linux-x86_64-sycl-fp16 + path: ${{ github.workspace }}/llama/src/main/resources_linux_sycl_fp16/net/ladenthin/llama/ + - uses: actions/download-artifact@v8 + with: + name: Linux-x86_64-sycl-fp32 + path: ${{ github.workspace 
}}/llama/src/main/resources_linux_sycl_fp32/net/ladenthin/llama/ + - uses: actions/download-artifact@v8 + with: + name: Windows-x86_64-sycl + path: ${{ github.workspace }}/llama/src/main/resources_windows_sycl/net/ladenthin/llama/ + - uses: actions/download-artifact@v8 + with: + name: Windows-aarch64-opencl + path: ${{ github.workspace }}/llama/src/main/resources_windows_opencl/net/ladenthin/llama/ + - uses: actions/download-artifact@v8 + with: + name: Linux-x86_64-openvino + path: ${{ github.workspace }}/llama/src/main/resources_linux_openvino/net/ladenthin/llama/ + - uses: actions/download-artifact@v8 + with: + name: Windows-x86_64-openvino + path: ${{ github.workspace }}/llama/src/main/resources_windows_openvino/net/ladenthin/llama/ - name: Set up Maven Central Repository uses: actions/setup-java@v5 with: @@ -1813,7 +2451,7 @@ jobs: # :llama-langchain4j. The `release` profile (GPG + Central Publishing) is inherited # from the parent, so every module — including the parent pom — is signed. 
- name: Publish release (reactor - parent + llama + llama-langchain4j) - run: mvn --batch-mode --no-transfer-progress -P release,cuda,opencl-android,windows-msvc,cuda-windows,vulkan-windows,opencl-windows -Dmaven.test.skip=true deploy + run: mvn --batch-mode --no-transfer-progress -P release,cuda,vulkan-linux,vulkan-linux-aarch64,opencl-android,windows-msvc,cuda-windows,vulkan-windows,opencl-windows,rocm-linux,rocm-windows,sycl-fp16-linux,sycl-fp32-linux,sycl-windows,opencl-windows-aarch64,openvino-linux,openvino-windows -Dmaven.test.skip=true deploy env: MAVEN_USERNAME: ${{ secrets.CENTRAL_USERNAME }} MAVEN_PASSWORD: ${{ secrets.CENTRAL_TOKEN }} diff --git a/.gitignore b/.gitignore index dfead6a8..d160476a 100644 --- a/.gitignore +++ b/.gitignore @@ -39,13 +39,21 @@ replay_pid* models/*.gguf llama/src/main/cpp/net_ladenthin_llama_*.h -llama/src/main/resources_cuda_linux/ +llama/src/main/resources_linux_cuda/ # Per-classifier native trees, staged by CI before the matching Maven profile runs, # never committed (same policy as the default-tree native libs below). +llama/src/main/resources_linux_vulkan/ llama/src/main/resources_windows_msvc/ llama/src/main/resources_windows_cuda/ llama/src/main/resources_windows_vulkan/ llama/src/main/resources_windows_opencl/ +llama/src/main/resources_linux_rocm/ +llama/src/main/resources_windows_rocm/ +llama/src/main/resources_linux_sycl_fp16/ +llama/src/main/resources_linux_sycl_fp32/ +llama/src/main/resources_windows_sycl/ +llama/src/main/resources_linux_openvino/ +llama/src/main/resources_windows_openvino/ llama/src/main/resources/**/*.so llama/src/main/resources/**/*.dylib llama/src/main/resources/**/*.dll diff --git a/CLAUDE.md b/CLAUDE.md index e6da18c3..1c861a53 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -6,7 +6,7 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co Java bindings for [llama.cpp](https://github.com/ggerganov/llama.cpp) via JNI, providing a high-level API for LLM inference in Java. 
The Java layer communicates with a native C++ library through JNI. -Current llama.cpp pinned version: **b9859** +Current llama.cpp pinned version: **b9870** ## Upgrading CUDA Version @@ -198,7 +198,8 @@ Wiring (mirrors the CUDA-Linux / OpenCL-Android classifier pattern): 1. **`llama/CMakeLists.txt`** — the `if(GGML_CUDA) … elseif(GGML_VULKAN) … elseif(GGML_OPENCL) … else()` chain is **OS-aware**: CUDA → `resources_windows_cuda` on Windows (else `resources_linux_cuda`), - Vulkan → `resources_windows_vulkan`, OpenCL → `resources_windows_opencl` on Windows (else + Vulkan → `resources_windows_vulkan` on Windows (else `resources_linux_vulkan` — see "Linux Vulkan + classifiers" above), OpenCL → `resources_windows_opencl` on Windows (else `resources_android_opencl`). The default CPU build (both generators) still emits to the canonical `src/main/resources/.../Windows/{x86_64,x86}/`, so the Ninja-vs-MSVC split is purely a CI-artifact-name + pom-profile concern (no CMake change for it). @@ -253,6 +254,95 @@ ctest --test-dir build --output-on-failure .github\build_opencl_windows.bat -G "Ninja Multi-Config" -DGGML_OPENCL=ON -DGGML_OPENCL_EMBED_KERNELS=ON -DOS_NAME=Windows -DOS_ARCH=x86_64 ``` +## Linux Vulkan classifiers + Windows arm64 CPU + +Three additional artifacts extend the matrix toward upstream llama.cpp's release set. They follow +the same classifier/resource-tree pattern as CUDA-Linux and Vulkan-Windows. + +**Linux Vulkan (`vulkan-linux-x86-64` + `vulkan-linux-aarch64`).** A vendor-neutral GPU jar for +Linux (NVIDIA / AMD / Intel) with no CUDA toolkit — the intersection of the existing Vulkan-Windows +and CUDA-Linux wiring. Four places: + +1. **`llama/CMakeLists.txt`** — the `elseif(GGML_VULKAN)` branch is now **OS-aware** (mirrors + `GGML_CUDA`): Windows → `resources_windows_vulkan`, else → `resources_linux_vulkan` + (`.../Linux/${OS_ARCH}/`). One tree holds both arches under `Linux/{x86_64,aarch64}`. +2. 
**`.github/workflows/publish.yml`** — `build-linux-x86_64-vulkan` (native `ubuntu-latest`, **not** + dockcross — the Vulkan SDK is a trivial apt install and upstream builds ubuntu-vulkan the same way) + and `build-linux-aarch64-vulkan` (`ubuntu-24.04-arm` + GCC 14). Both `apt-get install libvulkan-dev + glslc glslang-tools`, build `-DGGML_VULKAN=ON -DGGML_NATIVE=OFF`, and are **build-only** (no + `ctest`: a Vulkan-linked `jllama_test` errors enumerating devices on a GPU-less runner — same as the + Windows GPU jobs). Artifacts `Linux-{x86_64,aarch64}-vulkan` → both downloaded into the **one** + `resources_linux_vulkan/` tree by `package`/`publish-*`. Glibc floor rises to the ubuntu baseline + (like the aarch64 CPU jar); acceptable for a GPU artifact. +3. **`llama/pom.xml`** — profiles `vulkan-linux` (classifier `vulkan-linux-x86-64`) and + `vulkan-linux-aarch64` (classifier `vulkan-linux-aarch64`). Both read the shared + `resources_linux_vulkan` tree but the resource-copy `include` pattern is **arch-scoped** + (`net/ladenthin/llama/Linux/{x86_64,aarch64}/**`), so each classifier JAR carries only its own + arch (verified: each jar contains exactly one `libjllama.so`). Separate output dirs + `_linux_vulkan` / `_linux_vulkan_aarch64` avoid collision. Activated in CI via + `-P …,vulkan-linux,vulkan-linux-aarch64,…`. +4. **`README.md`** — classifier table + dependency snippets. + +`src/main/resources_linux_vulkan/` is git-ignored (staged by CI, never committed). GPU runtime +`libvulkan.so.1` is supplied by the consumer's driver — nothing is bundled (same policy as every GPU +classifier). + +**Windows arm64 CPU (default JAR, no classifier).** `build-windows-arm64` runs natively on GitHub's +free `windows-11-arm` runner (`ilammy/msvc-dev-cmd` `arch: arm64`, Ninja Multi-Config, `-DOS_ARCH=aarch64`, +build + `ctest`). 
It emits to the **canonical** `resources/.../Windows/aarch64/` and uploads +`Windows-aarch64-libraries`, which the `package`/`publish-*` `*-libraries` glob merges into the default +tree — so it ships in the **default** JAR alongside Windows x86-64 / x86 (like those, it is not a +classifier). No Java change was needed: `OSInfo` already maps a Windows-on-ARM JVM (`os.arch=aarch64`) +to `Windows/aarch64` (it isn't in `archMapping`, so it falls through `translateArchNameToFolderName`). +sccache is intentionally omitted (the shared install step pulls the x86_64 sccache zip; not worth an +arm64 path for one CPU job — `build.bat` just builds uncached). **Compiler: `clang-cl`, not MSVC +`cl.exe`.** ggml's `ggml-cpu/CMakeLists.txt` aborts with *"MSVC is not supported for ARM, use clang"* +via `if (MSVC AND NOT CMAKE_C_COMPILER_ID STREQUAL "Clang")`; `clang-cl` (LLVM's MSVC-compatible driver) +satisfies that guard (compiler id `"Clang"`) while keeping CMake's `MSVC=TRUE`, so the static `/MT` CRT +block still applies and the generator stays Ninja Multi-Config. The job passes +`-DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl`; `msvc-dev-cmd` supplies the MSVC +headers/libs/linker **and the bundled clang-cl/lld-link** (`VC\Tools\Llvm\ARM64`), so no separate +LLVM install is needed. It also passes **`-DGGML_OPENMP=OFF`**: with clang-cl, ggml links LLVM's +OpenMP (`libomp.lib` → `libomp140.aarch64.dll` at runtime), which — unlike MSVC's ambient +`vcomp140.dll` on x64 — is not on `PATH`, so the test exe (and any consumer) failed to launch with +`0xc0000135` (`STATUS_DLL_NOT_FOUND`). Disabling OpenMP makes ggml use its own `std::thread` +threadpool, leaving the arm64 `jllama.dll` self-contained (the x86_64/x86 jobs keep OpenMP via MSVC +`vcomp`). 
(Upstream llama.cpp instead cross-compiles arm64 from an +x64 runner with `vcvarsall amd64_arm64` + a `clang`/`clang++` toolchain file and no arm64 tests; the +native-runner + `clang-cl` route here keeps the `/MT` CRT and lets `ctest` run on real ARM hardware.) + +## Additional GPU-backend classifiers (ROCm/HIP, SYCL, Win-arm64 OpenCL, OpenVINO) + +Eight further GPU classifiers extend the matrix toward upstream llama.cpp's full release set. They +follow the **exact same 5-place wiring** as the CUDA/Vulkan classifiers (no special cases — KISS): a +`CMakeLists.txt` backend branch, a `publish.yml` build job (in `package.needs`, **fail-loud** — a +broken build reds the pipeline, same policy as every GPU job), a `pom.xml` classifier profile, a +`README.md` row, and a git-ignored `resources_*` tree. All are **build-only** (GitHub runners have no +matching GPU) and bundle **no** vendor runtime. + +| Classifier | GGML flag(s) | Job runner / toolchain | Tree | +|---|---|---|---| +| `rocm-linux-x86-64` | `GGML_HIP=ON -DAMDGPU_TARGETS=…` | `ubuntu-latest` + ROCm apt repo (`/opt/rocm/llvm/bin/clang`) | `resources_linux_rocm` | +| `rocm-windows-x86-64` | `GGML_HIP=ON` | `windows-2025-vs2026` + AMD HIP SDK | `resources_windows_rocm` | +| `sycl-fp16-linux-x86-64` | `GGML_SYCL=ON -DGGML_SYCL_F16=ON` (`icx`/`icpx`) | `ubuntu-latest` + Intel oneAPI apt | `resources_linux_sycl_fp16` | +| `sycl-fp32-linux-x86-64` | `GGML_SYCL=ON` (`icx`/`icpx`) | `ubuntu-latest` + Intel oneAPI apt | `resources_linux_sycl_fp32` | +| `sycl-windows-x86-64` | `GGML_SYCL=ON` (`icx`) | `windows-2025-vs2026` + oneAPI installer | `resources_windows_sycl` | +| `opencl-windows-aarch64` | `GGML_OPENCL=ON …ADRENO_KERNELS=ON` (clang-cl, `GGML_OPENMP=OFF`) | `windows-11-arm` (arm64 CPU job's toolchain) | `resources_windows_opencl` (arch subdir `aarch64`) | +| `openvino-linux-x86-64` | `GGML_OPENVINO=ON` | `ubuntu-latest` + OpenVINO archive | `resources_linux_openvino` | +| `openvino-windows-x86-64` | `GGML_OPENVINO=ON` 
| `windows-2025-vs2026` + OpenVINO archive | `resources_windows_openvino` | + +Two routing notes mirror existing precedent: **Linux SYCL** ships two precision variants at the *same* +arch, so `CMakeLists.txt` routes them to two *distinct* trees by `GGML_SYCL_F16` (fp16 vs fp32). +**Windows OpenCL** now holds both `x86_64` (desktop ICD) and `aarch64` (Snapdragon/Adreno) in the one +`resources_windows_opencl` tree, split by the `opencl-windows` / `opencl-windows-aarch64` profiles' +arch-scoped `include` patterns — exactly like the `vulkan-linux` / `vulkan-linux-aarch64` split. + +The vendor toolchain install steps in `publish.yml` are **first-pass** (apt repos / vendor installers +pinned to a specific version): if a URL/version 404s in CI, the job fails loud and the step is adjusted +— the failure is intentional signal, not a regression to hide behind `continue-on-error`. +`src/main/resources_{linux_rocm,windows_rocm,linux_sycl_fp16,linux_sycl_fp32,windows_sycl,linux_openvino,windows_openvino}/` +are all git-ignored (staged by CI, never committed). + ## WebUI (llama.cpp Svelte UI) embedding The llama.cpp WebUI is **built once in CI and shared to every native build**, then @@ -286,7 +376,7 @@ needs no extra step here, `build-webui` re-reads the tag and rebuilds the matchi ships no UI): ```bash # needs node/npm + network; embed.cpp is plain C++17 (no npm) -git clone --depth 1 --branch b9859 https://github.com/ggml-org/llama.cpp /tmp/lc +git clone --depth 1 --branch b9870 https://github.com/ggml-org/llama.cpp /tmp/lc ( cd /tmp/lc/tools/ui && npm ci && npm run build \ && ( cd dist && find . -type f -not -path './_gzip/*' \ | while read -r f; do mkdir -p "_gzip/$(dirname "$f")"; gzip -9 -c "$f" > "_gzip/$f"; done ) \ @@ -314,13 +404,19 @@ jobs therefore set `BUILD_JOBS: 2` to bound peak memory. 
**`sccache` → Depot Cache — shared compiler cache.** When `USE_CACHE=true` **and** `sccache` plus a cache token are present, `build.sh` adds `-DCMAKE_C_COMPILER_LAUNCHER=sccache -DCMAKE_CXX_COMPILER_LAUNCHER=sccache` and prints -`sccache --show-stats`. The cache lives in **Depot Cache** over sccache's **WebDAV** backend: +`sccache --show-stats`. **Per-job cache summary:** when running in CI (`GITHUB_STEP_SUMMARY` set), +`build.sh`/`build.bat` also parse those stats and append a small `### sccache statistics` table +(`Cache hits | Requests | Hit rate`) to the job summary — the sccache/Depot analogue of upstream +llama.cpp's `ccache-action` "CCache Statistics" table, per-job (GitHub does not merge job +summaries). It is best-effort (skipped silently if the numbers can't be parsed) and only emitted +when sccache was actually the launcher; local runs (no `GITHUB_STEP_SUMMARY`) are untouched. The +cache lives in **Depot Cache** over sccache's **WebDAV** backend: - `SCCACHE_WEBDAV_ENDPOINT: https://cache.depot.dev` - `SCCACHE_WEBDAV_TOKEN: ${{ secrets.DEPOT_TOKEN }}` — a Depot **organization** token, stored as the repo secret **`DEPOT_TOKEN`**. -Because `sccache` is **content-addressed** and llama.cpp is pinned (`GIT_TAG b9859`), the +Because `sccache` is **content-addressed** and llama.cpp is pinned (`GIT_TAG b9870`), the ~280 upstream object files are byte-identical every run, so a warm cache recompiles only the *changed* files. 
Depot's cache is **shared across all branches** (unlike GitHub's per-branch `actions/cache`), so every branch builds incrementally; a `b` version bump @@ -432,6 +528,8 @@ Current patches: | `0002-server-preserve-caller-load-progress-callback.patch` | Load-progress-callback regression introduced in llama.cpp **b9789**: `server_context::load_model` (`tools/server/server-context.cpp`) now **unconditionally** installs the server's own load-progress reporter on `params_base.load_progress_callback` immediately before `common_init_from_params`, clobbering any callback the embedding caller already set. libjllama's `LoadProgressCallback` feature wires `common_params.load_progress_callback` to a JNI trampoline *before* calling `load_model`, so the bump silently killed it — `LoadProgressCallbackTest` saw zero progress updates and the abort-on-`false` path never threw. The patch guards the assignment with `if (params_base.load_progress_callback == nullptr)`, so the server installs its own reporter **only when the caller hasn't** — a caller-supplied callback survives and fires during load. Standalone `llama-server` (no caller callback, so the field is null) is unaffected. Same JNI-vs-standalone divergence class as `0001`. | | `0003-pr22393-server-add-slot-prompt-similarity-getter-setter.patch` | **Upstream-PR carry** of [ggml-org/llama.cpp#22393](https://github.com/ggml-org/llama.cpp/pull/22393) ("server : add slot_prompt_similarity getter/setter") while it is still open upstream. Purely additive: adds `server_context::get_slot_prompt_similarity()` / `set_slot_prompt_similarity(float)` (`tools/server/server-context.{cpp,h}`) so an embedding/JNI caller can query and tune the slot-selection threshold at runtime without reloading the model. Verbatim copy of the PR — drop it once a pinned `b` includes the change. 
| | `0004-pr23116-server-per-request-reasoning-budget-tokens.patch` | **Upstream-PR carry** of [ggml-org/llama.cpp#23116](https://github.com/ggml-org/llama.cpp/pull/23116) ("server: honour per-request reasoning_budget_tokens in chat completions"), motivated by java-llama.cpp#140, while it is still open upstream. `oaicompat_chat_params_parse` (`tools/server/server-common.cpp`) only read the Anthropic `thinking_budget_tokens` alias and always wrote the server-level `reasoning_budget_message`, so a per-request `reasoning_budget_tokens` / `reasoning_budget_message` on a chat-completions request was ignored. The patch reads both overrides **before** the generic copy loop (precedence: `reasoning_budget_tokens` > `thinking_budget_tokens` alias > server default) and threads the per-request message through. Carries the upstream `tests/test-chat.cpp` additions verbatim so the patch is submittable as-is; like `0001`'s test/call-site flips they are **applied-but-not-compiled** here (`LLAMA_BUILD_TESTS` is OFF for the FetchContent subproject). Drop it once a pinned `b` includes the change. | +| `0005-server-recurrent-near-prompt-end-checkpoints.patch` | **Multi-turn tool-calling perf fix for recurrent/hybrid models (e.g. Granite-4)**, upstream-submittable. In `server_context::update_slots` (`tools/server/server-context.cpp`) the near-prompt-end context checkpoints are gated by `checkpoint_min_step` (default 8192 tokens). An agentic conversation that appends only assistant/tool messages never produces a new user-message checkpoint (`is_user_start`/`is_last_user_message` match `COMMON_CHAT_ROLE_USER` only), so after turn 1 no new checkpoint is ever created and — because recurrent state can only roll back to a checkpoint — **every turn re-prefills the whole conversation tail** (measured on a synthetic granitehybrid model: prefilled tokens grew 901 → 1544 → 2187 → 2830 → 3473 over turns 2–6). 
The patch (1) exempts near-prompt-end checkpoints from the min-step spacing when the memory can only roll back via checkpoints (`ctx_tgt_seq_rm_type` is `FULL` or `RS` — SWA-only models are unaffected), and (2) skips creating a checkpoint whose position equals the newest one (the last-user-message checkpoint was re-created identically on every turn, flooding the 32-entry list). After the patch each turn restores the previous turn's near-end checkpoint and prefill is constant (~new-turn-sized; 647 tokens/turn in the same measurement, ≈5.4× less prefill at turn 6 and growing with conversation length). Validated output-identical (`temperature=0`) vs. unpatched. Complements — not duplicates — open upstream PRs #24035/#24899/#24891 (they fix checkpoint *invalidation/retention*; this fixes checkpoint *starvation*). Drop once upstream solves agentic checkpoint placement (e.g. a merged role-boundary checkpointing design, cf. #21885 / #22826 discussion). | +| `0006-server-embed-native-server-jni.patch` | **Makes `server.cpp`'s `llama_server` embeddable in the JVM** so the `NativeServer` JNI bridge can run the full upstream HTTP server (WebUI included) inside `libjllama` — see "Two server modes" below. b9870 already exposes `int llama_server(int, char**)` (non-static; no `main` in the file), so the patch only adds embedded-mode support: (1) a `g_llama_server_embedded` flag + `llama_server_set_embedded()` / `llama_server_request_shutdown()` (declared in the committed `src/main/cpp/native_server_bridge.h`); (2) skips installing the process-wide SIGINT/SIGTERM handlers when embedded (they would hijack the JVM's); (3) in embedded mode parses the **forwarded** argv via `common_params_parse` instead of `common_params_parse_main` (whose `GetCommandLineW` recovery would pick up `java.exe`'s command line — the same Windows class of bug `0001` fixes). 
`llama_server_request_shutdown()` mirrors the SIGTERM path (invokes the installed `shutdown_handler` → `ctx_server.terminate()` unblocks `start_loop()`), giving JNI an out-of-band stop since `ctx_server` is loop-local. Applies **after `0001`** (which flips this call site to `common_params_parse_main`), so its context is the post-`0001` tree; regenerate against `0001`+source on a bump. Only touches `tools/server/server.cpp`. | ## OuteTTS build-time extraction (`cmake/generate-tts-upstream.cmake`) @@ -470,6 +568,13 @@ re-verify the generator the same way you re-verify `patches/`. ## Upgrading/Downgrading llama.cpp Version +**Runbook (documentation root):** [`docs/upgrade/llama-cpp-version-bump.md`](docs/upgrade/llama-cpp-version-bump.md) +covers the full bump process end-to-end — picking the target (topmost GitHub release, via the atom +feed), **chunking by `git diff` byte-size** (bump straight to the target when the diff is < 100 KiB, +else step through the largest intermediate tag still under the threshold), the +`.github/scripts/llama-next-version.sh` helper that computes the next reviewable step, and the +edit/verify/commit loop below. Use it for any non-trivial bump; the steps here are the mechanical core. + To change the llama.cpp version, update the following **three** files (and re-verify `patches/`): 1. **llama/CMakeLists.txt** — the `GIT_TAG` line for llama.cpp: `GIT_TAG b8831` @@ -834,7 +939,7 @@ If the local check passes (`BUILD SUCCESS`), the `mvn package` job in - `LlamaLoader` — Extracts the platform-specific native library from the JAR to a temp directory, or finds it on `java.library.path`. - `OSInfo` — Detects OS and architecture for library resolution. - **`server` package — OpenAI-compatible HTTP endpoint (a single implementation).** - - `server.OpenAiCompatServer` — built only on the JDK's `com.sun.net.httpserver` (no new dependency), both embeddable and the fat-jar `Main-Class`. 
Serves `POST /v1/chat/completions` (streaming via SSE + non-streaming), `POST /v1/completions`, `POST /v1/embeddings`, `POST /v1/rerank`, `POST /infill`, `GET /v1/models` and `GET /health` (every route is also reachable without the `/v1` prefix), so editors that speak the OpenAI protocol (e.g. VS Code Copilot "Custom Endpoint", Cline, Roo Code, Continue) can drive a local model. Streaming chat uses the native OAI chunk path (`LlamaModel.streamChatCompletion` → `requestChatCompletionStream` / `receiveChatCompletionChunk` + the C++ `wrap_stream_chunk` helper), preserving `delta.tool_calls`; completions/embeddings/infill forward verbatim to the matching `LlamaModel.handle*`; rerank reshapes `handleRerank` into the OAI `results`/`data` shape. The chat mapper forwards `stream_options` and `response_format` and defaults `cache_prompt=true`; a CORS `Filter` answers `OPTIONS` preflights; `OpenAiSseFormatter.ensureUsageCachedTokens` guarantees `usage.prompt_tokens_details.cached_tokens` on the streamed usage chunk (Copilot crash fix, microsoft/vscode #273482). **Agentic tool-calling is the primary target**; a C++ guard (`test_server.cpp`) pins `tool_calls.function.arguments` as a JSON string (llama.cpp #20198). + - `server.OpenAiCompatServer` — built only on the JDK's `com.sun.net.httpserver` (no new dependency), embeddable and runnable via `java -cp <jar> net.ladenthin.llama.server.OpenAiCompatServer …` (the fat jar's `Main-Class` is now the `ServerLauncher` dispatcher, which runs `NativeServer` by default — see "Two server modes"). Serves `POST /v1/chat/completions` (streaming via SSE + non-streaming), `POST /v1/completions`, `POST /v1/embeddings`, `POST /v1/rerank`, `POST /infill`, `GET /v1/models` and `GET /health` (every route is also reachable without the `/v1` prefix), so editors that speak the OpenAI protocol (e.g. VS Code Copilot "Custom Endpoint", Cline, Roo Code, Continue) can drive a local model.
Streaming chat uses the native OAI chunk path (`LlamaModel.streamChatCompletion` → `requestChatCompletionStream` / `receiveChatCompletionChunk` + the C++ `wrap_stream_chunk` helper), preserving `delta.tool_calls`; completions/embeddings/infill forward verbatim to the matching `LlamaModel.handle*`; rerank reshapes `handleRerank` into the OAI `results`/`data` shape. The chat mapper forwards `stream_options` and `response_format` and defaults `cache_prompt=true`; a CORS `Filter` answers `OPTIONS` preflights; `OpenAiSseFormatter.ensureUsageCachedTokens` guarantees `usage.prompt_tokens_details.cached_tokens` on the streamed usage chunk (Copilot crash fix, microsoft/vscode #273482). **Agentic tool-calling is the primary target**; a C++ guard (`test_server.cpp`) pins `tool_calls.function.arguments` as a JSON string (llama.cpp #20198). - **Alternative protocol surfaces** (pure translation over the OpenAI chat core — no second inference path; each reconstructs streamed tool calls via `ToolCallDeltaAccumulator`): **Ollama-native** (`GET /api/version`, `/api/tags`, `POST /api/show`, `/api/chat` with NDJSON streaming, `/api/generate` prompt-completion/FIM — `OllamaApiSupport`; `/api/show` advertises tools/insert/vision capabilities + context length for Copilot's Ollama provider), **Anthropic Messages** (`POST /v1/messages`, SSE event stream — `AnthropicApiSupport` + `AnthropicStreamTranslator`), and **OpenAI Responses** (`POST /v1/responses`, SSE event stream — `ResponsesApiSupport` + `ResponsesStreamTranslator`). The llama.cpp-native `GET /props` (context length + `modalities`) is served via `OpenAiSseFormatter.propsJson` for autocomplete clients that size their context from it. - Supporting classes: `OpenAiServerConfig` (builder; optional bearer auth; binds `127.0.0.1`; `corsAllowOrigin`; `supportsVision`), `OpenAiServerCli` (testable CLI arg parser → `ModelParameters` + `OpenAiServerConfig`; flags incl. 
`--mmproj`/`--embedding`/`--reranking`), `OpenAiRequestMapper` (OAI chat request → `InferenceParameters`), `OpenAiSseFormatter` (SSE/models/error JSON + usage normalization), `OaiRerankSupport` (pure rerank request/response shaping), and the model-free test seam `OpenAiBackend`/`ChunkSink` + `LlamaModelBackend`. The streaming envelope is parsed by `json.ChatStreamChunkParser`. - The `server` package is a dedicated top layer in the ArchUnit `layeredArchitecture` rule (the only layer allowed to access the root `Api`); `noInternalJdkImports` carries an explicit exception for the supported `com.sun.net.httpserver` (the exported `jdk.httpserver` module, which `module-info.java` `requires`). See README "OpenAI-compatible HTTP server". @@ -845,7 +950,14 @@ If the local check passes (`BUILD SUCCESS`), the `mvn package` job in - `json_helpers.hpp` — Pure JSON transformation helpers (no JNI, no llama state). Independently unit-testable. - `jni_helpers.hpp` — JNI bridge helpers (handle management + server orchestration). Includes `json_helpers.hpp`. - Uses `nlohmann/json` for JSON deserialization of parameters. -- The upstream server library (`server-context.cpp`, `server-queue.cpp`, `server-task.cpp`, `server-schema.cpp`, `server-models.cpp`, and — since b9829 — `server-stream.cpp`) is compiled directly into `jllama` via CMake — there is no hand-ported `server.hpp` fork. **`server-stream.cpp` is mandatory, not optional:** it defines the resumable-streaming SSE replay buffer (`g_stream_sessions`, `stream_session_attach_pipe`, `stream_aware_should_stop`, `stream_conv_id_from_headers`, the `stream_pipe_*` types) that `server-context.cpp` / `server-http.cpp` / `server-models.cpp` now `#include "server-stream.h"` and call, so omitting it fails the link with undefined references. It is platform-neutral (threads + std mutex/condvar, no `subprocess.h`/`posix_spawn_*`), so it builds on Android too and sits outside the `server-models.cpp` Android guard. 
`jllama` wires its own JNI routes and never calls `g_stream_sessions.start_gc()` (only the excluded standalone `server.cpp` `main()` does), so its GC thread stays dormant. **Phase 2:** the upstream HTTP transport (`tools/server/server-http.cpp`) and its `cpp-httplib` backend (`vendor/cpp-httplib/httplib.cpp`) are now compiled into `jllama` too, so the OpenAI-compatible server can be driven natively from JNI *inside* `libjllama` — no separate `llama-server` executable (a JNI shared library loads anywhere a JVM runs, which a standalone binary does not). `server-http.cpp` does `#include "ui.h"` (the WebUI asset table that `tools/ui`/`llama-ui` normally generates); since the Svelte WebUI is not shipped, `src/main/cpp/webui_stub/ui.h` supplies the upstream **empty-asset** interface and leaves `LLAMA_UI_HAS_ASSETS` undefined (all static-asset-serving blocks compile out). `` already resolves via `llama-common`'s `vendor/` include dir (same nlohmann/json 3.12.0 as the FetchContent copy). No SSL: `CPPHTTPLIB_OPENSSL_SUPPORT` is left undefined (plain-HTTP; bind localhost / front with a TLS proxy). Only `server.cpp` (the standalone `main()` + route wiring) remains excluded — wiring the routes to JNI is the next step. +- The upstream server library (`server-context.cpp`, `server-queue.cpp`, `server-task.cpp`, `server-schema.cpp`, `server-models.cpp`, and — since b9829 — `server-stream.cpp`) is compiled directly into `jllama` via CMake — there is no hand-ported `server.hpp` fork. **`server-stream.cpp` is mandatory, not optional:** it defines the resumable-streaming SSE replay buffer (`g_stream_sessions`, `stream_session_attach_pipe`, `stream_aware_should_stop`, `stream_conv_id_from_headers`, the `stream_pipe_*` types) that `server-context.cpp` / `server-http.cpp` / `server-models.cpp` now `#include "server-stream.h"` and call, so omitting it fails the link with undefined references. 
It is platform-neutral (threads + std mutex/condvar, no `subprocess.h`/`posix_spawn_*`), so it builds on Android too and sits outside the `server-models.cpp` Android guard. `jllama`'s own JNI routes never call `g_stream_sessions.start_gc()` (only `server.cpp`'s server entry point does — the former standalone `main()`, now `llama_server()`, which runs only under `NativeServer`), so outside `NativeServer` the GC thread stays dormant. **Phase 2:** the upstream HTTP transport (`tools/server/server-http.cpp`) and its `cpp-httplib` backend (`vendor/cpp-httplib/httplib.cpp`) are now compiled into `jllama` too, so the OpenAI-compatible server can be driven natively from JNI *inside* `libjllama` — no separate `llama-server` executable (a JNI shared library loads anywhere a JVM runs, which a standalone binary does not). `server-http.cpp` does `#include "ui.h"` (the WebUI asset table that `tools/ui`/`llama-ui` normally generates); when the Svelte WebUI assets are not generated (local `cmake` builds), `src/main/cpp/webui_stub/ui.h` supplies the upstream **empty-asset** interface and leaves `LLAMA_UI_HAS_ASSETS` undefined (all static-asset-serving blocks compile out). `<nlohmann/json.hpp>` already resolves via `llama-common`'s `vendor/` include dir (same nlohmann/json 3.12.0 as the FetchContent copy). No SSL: `CPPHTTPLIB_OPENSSL_SUPPORT` is left undefined (plain-HTTP; bind localhost / front with a TLS proxy). **`server.cpp` is now compiled in too** (on non-Android — it and `server-tools.cpp` pull in `subprocess.h`/`posix_spawn_*`, so they share `server-models.cpp`'s Android guard): b9870 exposes its entry as `int llama_server(int, char**)` (no `main` in the file), and `patches/0006` makes it embeddable (no process signal handlers, forwarded-argv parse, out-of-band shutdown). The `NativeServer` JNI bridge (`src/main/cpp/native_server.cpp`) calls `llama_server` on a worker thread, so the **full** upstream server — WebUI and all — runs inside `libjllama`. See "Two server modes" below.
+ +### Two server modes (`OpenAiCompatServer` vs `NativeServer`) + +The library exposes **two** ways to serve a model over HTTP, on two different transports. The fat jar's `Main-Class` is `server.ServerLauncher`, a tiny dispatcher: it runs `OpenAiCompatServer` when `--jllama-openai-compat` is present (that marker is stripped, the rest forwarded) and the default `NativeServer` otherwise. Both mains are also runnable directly by class name via `java -cp`. The two modes: + +1. **`server.OpenAiCompatServer` (Java transport).** OpenAI/Ollama/Anthropic-compatible JSON API on the JDK's `com.sun.net.httpserver`, driving the compiled server *core* over JNI. Embeddable, no extra dependency, and it can share/reuse a `LlamaModel`. It serves **no** static assets — its `/` route is a 404, so **no WebUI**. It has its own `main` (run via `java -cp <jar> net.ladenthin.llama.server.OpenAiCompatServer …`); its CLI (`OpenAiServerCli`) maps a curated flag subset (`-m/-c/-b/-ub/-ngl/-t/-tb/-ctk/-ctv/--jinja/--chat-template-kwargs/--host/--port/--parallel/--mmproj/--api-key/--embedding/--reranking`). +2. **`server.NativeServer` (native transport) — the default fat-jar server (when `--jllama-openai-compat` is absent).** Runs the **full upstream `llama_server`** (via `patches/0006` + `native_server.cpp`) inside `libjllama`, forwarding the raw llama-server argv verbatim — so **every** llama-server flag works and the **embedded WebUI is served** (when the assets are compiled in; CI's released jars have them, local `cmake` builds use the empty-asset stub). It has an **independent lifecycle** (loads its own model from the argv, like `llama-server.exe`; owns the process's llama backend + stderr logging while running), is **single-instance per process** (upstream keeps shutdown state in file-scope globals), and is **not available on Android** (the `subprocess.h` guard). Reusing an already-loaded `LlamaModel`'s context is a documented TODO.
`libjllama` loading anywhere a JVM runs is what makes this "no separate `llama-server.exe`" possible. ### Native Helper Architecture @@ -955,6 +1067,24 @@ Wiring (mirrors the macOS native jobs, not the dockcross jobs): - Branch protection: if a required check pinned the old name "Cross-Compile Linux aarch64 (LTS)", repoint it to "Build and Test Linux aarch64". +### Linux s390x: big-endian cross-build + qemu test gate + +`build-linux-s390x` extends the default JAR to **IBM Z (s390x, big-endian)** — the one target whose +byte order differs from every other platform. It **cross-compiles** with the GCC s390x toolchain +(`g++-s390x-linux-gnu`, native x86 speed — no emulated build) and then runs the **full C++ unit suite +under `qemu-user`** (`CMAKE_CROSSCOMPILING_EMULATOR=/usr/bin/qemu-s390x-static`, `QEMU_LD_PREFIX=/usr/s390x-linux-gnu`). +That `ctest` run is a **real big-endian correctness gate** for the byte-order-sensitive surface — the +little-endian WAV writer (`tts_wav.hpp`), the JSON/token/embedding transforms, and the JNI helpers — +which is where an endian bug in *our* code could hide. Model-backed **Java** tests are deliberately +**not** run under emulation (a JVM + GGUF inference under `qemu-user` is slow and flaky); the Java↔JNI +boundary uses host-native array copies (endian-transparent), so the C++ gate covers the actual risk. +`-DGGML_OPENMP=OFF` sidesteps cross-libgomp issues (ggml uses its own `std::thread` pool). s390x is a +CPU platform like aarch64, so it ships in the **default** JAR (`Linux-s390x-libraries` merges via the +`*-libraries` glob; `OSInfo` maps `os.arch=s390x` → `Linux/s390x`) — no classifier, no pom profile. +**Fail-loud** and in `package.needs` like every other build. (Upstream llama.cpp already supports s390x +— it ships `ubuntu-s390x` with GGUF big-endian handling — so the native inference path is upstream's +concern; this job validates only *our* layer's endian-safety.) 
+ ## Testing ### Java tests @@ -1017,17 +1147,17 @@ ctest --test-dir build --output-on-failure -R "ResultsToJson" | File | Tests | Scope | |------|-------|-------| | `src/test/cpp/test_utils.cpp` | 156 | Upstream helpers: `server_tokens`, `server_grammar_trigger`, `gen_tool_call_id`, `json_value`, `json_get_nested_values`, UTF-8 helpers, `format_response_rerank`, `format_embeddings_response_oaicompat`, `oaicompat_completion_params_parse`, `oaicompat_chat_params_parse`, `are_lora_equal`, `strip_flag_from_argv`, `token_piece_value`, `json_is_array_and_contains_numbers`, `format_oai_sse`, `format_oai_resp_sse`, `format_anthropic_sse` | -| `src/test/cpp/test_server.cpp` | 194 | Upstream result types: `result_timings`, `task_params::to_json()` (incl. `dry_sequence_breakers`, `preserved_tokens`, `timings_per_token`), `completion_token_output`, `server_task_result_cmpl_partial` (non-oaicompat + `to_json_oaicompat` + logprobs + `to_json_oaicompat_chat` + `to_json_anthropic` + dispatcher), `server_task_result_cmpl_final` (non-oaicompat + `to_json_oaicompat` + `to_json_oaicompat_chat` + `to_json_oaicompat_chat_stream` + `to_json_anthropic` + `to_json_anthropic_stream` + tool_calls + dispatcher), `server_task_result_embd`, `server_task_result_rerank`, `server_task_result_metrics`, `server_task_result_slot_save_load`, `server_task_result_slot_erase`, `server_task_result_apply_lora`, `server_task_result_error`, `format_error_response`, `server_task::need_sampling()`, `server_task::n_tokens()`, `server_schema::eval_llama_cmpl_schema()` (parsing pipeline + grammar routing + error paths + per-request `dry_*` field round-trips), `response_fields` projection | +| `src/test/cpp/test_server.cpp` | 197 | Upstream result types: `result_timings`, `task_params::to_json()` (incl. 
`dry_sequence_breakers`, `preserved_tokens`, `timings_per_token`), `completion_token_output`, `server_task_result_cmpl_partial` (non-oaicompat + `to_json_oaicompat` + logprobs + `to_json_oaicompat_chat` + `to_json_anthropic` + dispatcher), `server_task_result_cmpl_final` (non-oaicompat + `to_json_oaicompat` + `to_json_oaicompat_chat` + `to_json_oaicompat_chat_stream` + `to_json_anthropic` + `to_json_anthropic_stream` + tool_calls + dispatcher), `server_task_result_embd`, `server_task_result_rerank`, `server_task_result_metrics`, `server_task_result_slot_save_load`, `server_task_result_slot_erase`, `server_task_result_apply_lora`, `server_task_result_error`, `format_error_response`, `server_task::need_sampling()`, `server_task::n_tokens()`, `server_schema::eval_llama_cmpl_schema()` (parsing pipeline + grammar routing + error paths + per-request `dry_*` and `sse_ping_interval` field round-trips incl. hard-limit + server-default inheritance), `response_fields` projection | | `src/test/cpp/test_json_helpers.cpp` | 47 | All functions in `json_helpers.hpp`: `get_result_error_message`, `results_to_json`, `rerank_results_to_json`, `parse_encoding_format`, `extract_embedding_prompt`, `is_infill_request`, `parse_slot_prompt_similarity`, `parse_positive_int_config`, `wrap_stream_chunk` | | `src/test/cpp/test_log_helpers.cpp` | 13 | All functions in `log_helpers.hpp`: `log_level_name`, `format_log_as_json` | | `src/test/cpp/test_jni_helpers.cpp` | 47 | All functions in `jni_helpers.hpp` using a zero-filled `JNINativeInterface_` mock | | `src/test/cpp/test_tts_wav.cpp` | 2 | The in-memory WAV writer `pcm_to_wav16_bytes` in `tts_wav.hpp` (WAV header/payload + little-endian clamping). The OuteTTS DSP it pairs with is derived from upstream `tts.cpp` and covered end-to-end by the Java `TtsIntegrationTest`, not unit-tested here. 
| -**Current total: 459 tests (all passing).** +**Current total: 462 tests (all passing).** #### Upstream source location (in CMake build tree) -llama.cpp is fetched via CMake FetchContent, pinned to `GIT_TAG b9859`. +llama.cpp is fetched via CMake FetchContent, pinned to `GIT_TAG b9870`. **GoogleTest** is a separate `BUILD_TESTING`-only FetchContent (`GIT_TAG v1.17.0`), used solely by the `jllama_test` C++ unit-test binary — not by the shipped library, and not coupled to the diff --git a/README.md b/README.md index ec6f5e75..772dd18f 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ **Build:** ![Java 8+](https://img.shields.io/badge/Java-8%2B-informational) ![Platform](https://img.shields.io/badge/Platform-Linux%20%7C%20macOS%20%7C%20Windows%20%7C%20Android-lightgrey) -[![llama.cpp b9859](https://img.shields.io/badge/llama.cpp-%23b9859-informational)](https://github.com/ggml-org/llama.cpp/releases/tag/b9859) +[![llama.cpp b9870](https://img.shields.io/badge/llama.cpp-%23b9870-informational)](https://github.com/ggml-org/llama.cpp/releases/tag/b9870) [![JPMS](https://img.shields.io/badge/JPMS-modular%20JAR-25A162)](https://openjdk.org/projects/jigsaw/) ![JUnit](https://img.shields.io/badge/tested%20with-JUnit6-25A162) [![JSpecify](https://img.shields.io/badge/JSpecify-1.0.0%20%40NullMarked-25A162)](https://jspecify.dev) @@ -107,7 +107,7 @@ Inference of Meta's LLaMA model (and others) in pure C/C++. - **Infilling** (fill-in-the-middle) for code models. - **Tokenize / detokenize** and **JSON-schema → grammar** conversion. - **Raw JSON endpoint handlers** mirroring the upstream llama.cpp HTTP server (`/completions`, `/v1/completions`, `/embeddings`, `/infill`, `/tokenize`, `/detokenize`). -- **Runnable OpenAI-compatible HTTP server** (`OpenAiCompatServer`, the fat-jar `Main-Class`, streaming SSE, zero extra dependency): `java -jar …-jar-with-dependencies.jar --model model.gguf --port 8080`. 
+- **Two runnable HTTP server modes, one fat-jar entry.** The fat jar's `Main-Class` is `ServerLauncher`, which dispatches on the `--jllama-openai-compat` flag. Without it, `java -jar …-jar-with-dependencies.jar -m model.gguf --port 8080` runs the full upstream llama.cpp server (embedded **WebUI**, every llama-server flag forwarded) hosted inside `libjllama` over JNI — no separate `llama-server.exe`. With it, `java -jar … --jllama-openai-compat --model model.gguf --port 8080` runs the Java-transport, zero-extra-dependency **OpenAI-compatible** server (`OpenAiCompatServer`, streaming SSE) instead. Both are also runnable directly by class name via `java -cp … net.ladenthin.llama.server.{NativeServer,OpenAiCompatServer}`. - **Model metadata** access (`getModelMeta()`) and **server management** (metrics, slot save/restore, runtime thread reconfiguration). - Pre-built native binaries for Linux (x86-64, aarch64), macOS (x86-64, arm64), and Windows (x86-64, x86); CUDA, Metal, and Vulkan supported via local build. @@ -164,20 +164,40 @@ If any of these match your platform, you can include the Maven dependency and ge The Maven coordinate `net.ladenthin:llama` publishes one default JAR (CPU-only; its Windows natives are built with the Ninja Multi-Config + MSVC toolchain) plus -optional JARs selected via a Maven `<classifier>`: three Windows GPU builds -(CUDA / Vulkan / OpenCL), the Linux CUDA and Android OpenCL builds, and an -alternate-toolchain MSVC Windows CPU build. Pick at most one GPU/accelerator -classifier — those are mutually exclusive — and optionally a CPU Windows build. +optional JARs selected via a Maven `<classifier>`: NVIDIA CUDA (Linux / Windows), +Vulkan (Linux x86-64 / aarch64, Windows), AMD ROCm/HIP (Linux / Windows), Intel +SYCL (Linux fp16 / fp32, Windows) and OpenVINO (Linux / Windows) GPU builds, OpenCL +(Android Adreno, Windows x86-64 / Snapdragon-arm64), and an alternate-toolchain MSVC +Windows CPU build.
Pick at most one GPU/accelerator classifier — those are mutually +exclusive — and optionally a CPU Windows build. | Classifier | Backend | Target platform | Runtime requirement | |---|---|---|---| -| _(none)_ | CPU | Linux x86-64 / aarch64, macOS x86-64 / aarch64, Windows x86-64 / x86 (Ninja Multi-Config + MSVC), Android aarch64 (CPU) | A JDK 8+ JVM. **Linux `aarch64` additionally requires glibc ≥ 2.39** (e.g. Ubuntu 24.04+, Debian 13+) — it is built natively on `ubuntu-24.04-arm`, matching upstream llama.cpp's own ARM binaries; older-glibc ARM hosts (Ubuntu 22.04, Debian 12, RHEL 8/9, Amazon Linux 2023) are not supported. Linux x86-64 keeps a glibc 2.17 floor (manylinux2014). | +| _(none)_ | CPU | Linux x86-64 / aarch64 / s390x, macOS x86-64 / aarch64, Windows x86-64 / x86 / aarch64 (Ninja Multi-Config + MSVC), Android aarch64 (CPU) | A JDK 8+ JVM. **Linux `aarch64` additionally requires glibc ≥ 2.39** (e.g. Ubuntu 24.04+, Debian 13+) — it is built natively on `ubuntu-24.04-arm`, matching upstream llama.cpp's own ARM binaries; older-glibc ARM hosts (Ubuntu 22.04, Debian 12, RHEL 8/9, Amazon Linux 2023) are not supported. Linux x86-64 keeps a glibc 2.17 floor (manylinux2014). **Windows `aarch64`** (Windows on ARM — Snapdragon X / Surface) is built natively on `windows-11-arm` and ships in the default JAR alongside the x86-64 / x86 natives. | | `msvc-windows` | CPU (MSVC / Visual Studio generator) | Windows x86-64 and x86 | None beyond a JDK 8+ JVM. Same CPU backend as the default JAR's Windows natives, but compiled with the Visual Studio generator instead of `Ninja Multi-Config`. Both use the same MSVC toolchain (static `/MT` CRT), so they are functionally equivalent — provided as an alternate-toolchain option. | | `cuda13-windows-x86-64` | CUDA 13 | Windows x86-64 with NVIDIA GPU | NVIDIA driver + CUDA 13 Toolkit installed on the host (`cudart64_13.dll`, `cublas64_13.dll`, `cublasLt64_13.dll` resolvable on `PATH`). 
The runtime libraries are **not bundled** in the JAR; native-library load fails with `UnsatisfiedLinkError` if they are absent. No CPU fallback. | | `vulkan-windows-x86-64` | Vulkan | Windows x86-64 with a Vulkan 1.2+ GPU (NVIDIA / AMD / Intel) | A Vulkan runtime (`vulkan-1.dll`), which current GPU drivers install. No Vulkan SDK is needed at runtime. The most portable Windows GPU option (vendor-independent). | | `opencl-windows-x86-64` | OpenCL | Windows x86-64 with an OpenCL 2.0+ GPU | A vendor OpenCL ICD (`OpenCL.dll`, installed by the GPU driver). **Note:** the GGML OpenCL backend is Adreno-tuned; on desktop GPUs CUDA or Vulkan are better supported. | | `cuda13-linux-x86-64` | CUDA 13 | Linux x86-64 with NVIDIA GPU | NVIDIA driver + CUDA 13 runtime libraries (`libcudart.so.13`, `libcublas.so.13`) installed on the host. The shared library is dynamically linked against them and will fail to `dlopen` if they are absent — there is no automatic fallback to CPU. | +| `vulkan-linux-x86-64` | Vulkan | Linux x86-64 with a Vulkan 1.2+ GPU (NVIDIA / AMD / Intel) | A Vulkan runtime (`libvulkan.so.1`), which current GPU drivers install. No Vulkan SDK is needed at runtime. The most portable Linux GPU option (vendor-independent, no CUDA toolkit). Built natively on `ubuntu-latest`, so it shares the aarch64 build's higher glibc floor (≈ 2.39). | +| `vulkan-linux-aarch64` | Vulkan | Linux aarch64 with a Vulkan 1.2+ GPU | A Vulkan runtime (`libvulkan.so.1`) from the device/driver. glibc ≥ 2.39 (built on `ubuntu-24.04-arm`). | | `opencl-android-aarch64` | OpenCL (Adreno) | Android aarch64 with Qualcomm Adreno GPU | A device-supplied OpenCL ICD (`libOpenCL.so`). Devices without an ICD (e.g. most non-Snapdragon Android hardware) must use the default CPU JAR. | +| `rocm-linux-x86-64` | ROCm / HIP | Linux x86-64 with AMD GPU | An installed AMD ROCm runtime (`libamdhip64.so`, `librocblas.so`, `libhipblas.so`) on the host. Not bundled; native load fails without it. No CPU fallback. 
| +| `rocm-windows-x86-64` | ROCm / HIP | Windows x86-64 with AMD GPU | The AMD HIP SDK runtime DLLs (`amdhip64.dll`, `rocblas.dll`, `hipblas.dll`) on `PATH`. Not bundled. No CPU fallback. | +| `sycl-fp16-linux-x86-64` | SYCL (Intel oneAPI, fp16) | Linux x86-64 with Intel GPU (Arc / iGPU) | An installed Intel oneAPI / Level-Zero runtime. fp16 accumulation (faster, slightly lower precision). Not bundled. | +| `sycl-fp32-linux-x86-64` | SYCL (Intel oneAPI, fp32) | Linux x86-64 with Intel GPU (Arc / iGPU) | An installed Intel oneAPI / Level-Zero runtime. fp32 accumulation (higher precision). Not bundled. | +| `sycl-windows-x86-64` | SYCL (Intel oneAPI) | Windows x86-64 with Intel GPU (Arc / iGPU) | The Intel oneAPI / Level-Zero runtime DLLs on `PATH`. Not bundled. | +| `opencl-windows-aarch64` | OpenCL (Adreno) | Windows-on-ARM aarch64 (Snapdragon X) with Adreno GPU | A device-supplied OpenCL ICD (`OpenCL.dll`, from the Adreno driver). Not bundled. | +| `openvino-linux-x86-64` | OpenVINO | Linux x86-64 (Intel GPU / NPU / CPU) | An installed Intel OpenVINO runtime. Not bundled. | +| `openvino-windows-x86-64` | OpenVINO | Windows x86-64 (Intel GPU / NPU / CPU) | The Intel OpenVINO runtime DLLs on `PATH`. Not bundled. | + +> [!NOTE] +> The AMD (`rocm-*`), Intel SYCL (`sycl-*`), Windows-on-ARM OpenCL +> (`opencl-windows-aarch64`) and Intel OpenVINO (`openvino-*`) classifiers are +> newly added GPU backends. Like the other GPU classifiers they are validated +> **build-only** in CI (GitHub runners have no matching GPU), so end-to-end +> inference is verified locally / on self-hosted hardware. As with every GPU JAR, +> the vendor runtime is supplied by the consumer's driver/toolkit and is not bundled. 
```xml @@ -219,6 +239,22 @@ classifier — those are mutually exclusive — and optionally a CPU Windows bui vulkan-windows-x86-64 + + + net.ladenthin + llama + 5.0.4 + vulkan-linux-x86-64 + + + + + net.ladenthin + llama + 5.0.4 + vulkan-linux-aarch64 + + net.ladenthin @@ -234,6 +270,70 @@ classifier — those are mutually exclusive — and optionally a CPU Windows bui 5.0.4 msvc-windows + + + + net.ladenthin + llama + 5.0.4 + rocm-linux-x86-64 + + + + + net.ladenthin + llama + 5.0.4 + rocm-windows-x86-64 + + + + + net.ladenthin + llama + 5.0.4 + sycl-fp16-linux-x86-64 + + + + + net.ladenthin + llama + 5.0.4 + sycl-fp32-linux-x86-64 + + + + + net.ladenthin + llama + 5.0.4 + sycl-windows-x86-64 + + + + + net.ladenthin + llama + 5.0.4 + opencl-windows-aarch64 + + + + + net.ladenthin + llama + 5.0.4 + openvino-linux-x86-64 + + + + + net.ladenthin + llama + 5.0.4 + openvino-windows-x86-64 + ``` > [!IMPORTANT] @@ -591,7 +691,9 @@ array alone at `GET /slots`. OpenAI responses preserve `net.ladenthin.llama.server.OpenAiCompatServer` turns a loaded model into a local OpenAI-compatible HTTP endpoint using only the JDK's built-in `com.sun.net.httpserver` — no extra -dependency and no separate server process. It is both embeddable and the fat-jar `Main-Class`. It +dependency and no separate server process. It is embeddable, and runnable via +`java -cp <jar> net.ladenthin.llama.server.OpenAiCompatServer …` (the fat jar's +`Main-Class` is instead the `ServerLauncher` dispatcher, which runs `NativeServer` by default — see "Native server with the built-in WebUI" below). It serves: | Method & path | Backed by | @@ -646,23 +748,27 @@ try (LlamaModel model = new LlamaModel(modelParams); } ``` -…or run it standalone. The fat jar built by the `assembly` profile (`mvn -P assembly package`) is -runnable (its `Main-Class` is `net.ladenthin.llama.server.OpenAiCompatServer`); the plain library jar -works too via `-cp`: +…or run it standalone.
The fat jar's `Main-Class` is the `ServerLauncher` dispatcher, so add +`--jllama-openai-compat` to select this Java server (the launcher strips that flag and forwards the rest); +or name the class explicitly via `-cp`: ```bash -# fat jar (bundles the native lib + Java deps) -java -jar target/llama--jar-with-dependencies.jar \ +# fat jar (bundles the native lib + Java deps) — select the Java server with --jllama-openai-compat +java -jar target/llama--jar-with-dependencies.jar --jllama-openai-compat \ --model models/Qwen3-0.6B-Q4_K_M.gguf --host 0.0.0.0 --port 8080 --n-gpu-layers 99 -# or the plain jar +# or name the class explicitly (fat jar or plain library jar) java -cp target/llama-.jar net.ladenthin.llama.server.OpenAiCompatServer \ --model models/model.gguf --port 8080 --model-id local-model ``` Run with `--help` for the full option list (`-m/--model`, `--host`, `-p/--port`, `-c/--ctx-size`, -`-ngl/--n-gpu-layers`, `-t/--threads`, `--parallel`, `--model-id`, `--api-key`, `--mmproj`, -`--embedding`, `--reranking`). +`-b/--batch-size`, `-ub/--ubatch-size`, `-ngl/--n-gpu-layers`, `-t/--threads`, `-tb/--threads-batch`, +`-ctk/--cache-type-k`, `-ctv/--cache-type-v`, `--jinja`, `--chat-template-kwargs`, `--parallel`, +`--model-id`, `--api-key`, `--mmproj`, `--embedding`, `--reranking`). The tuning flags mirror +llama.cpp's server, so an invocation like +`--jinja --chat-template-kwargs '{"reasoning_effort":"low"}' -ctk q8_0 -ctv q8_0 -b 4096 -ub 2048` +works directly. Verify with curl (streaming chat): @@ -706,6 +812,45 @@ tool calling depends on the model's own tool-calling quality. Pass `--api-key` ( `OpenAiServerConfig.apiKey(...)`) to require an `Authorization: Bearer` token; the server binds to `127.0.0.1` by default. +### Native server with the built-in WebUI (`NativeServer`) + +`OpenAiCompatServer` above is a JSON **API** server (its `/` is a 404 — no web page). 
If you want +the **full upstream llama.cpp server, including its bundled Svelte WebUI**, use +`net.ladenthin.llama.server.NativeServer`. It runs the real `llama_server` inside `libjllama` over +JNI — no separate `llama-server.exe` — and **forwards the raw llama-server arguments verbatim**, so +every flag works exactly as it does for the standalone binary. The fat jar runs it **by default** +(when `--jllama-openai-compat` is absent), forwarding its args to the native server (pass `--help` for the +full llama-server option list): + +```bash +java -jar target/llama--jar-with-dependencies.jar \ + -m models/model.gguf --host 127.0.0.1 --port 8080 -c 65536 --jinja +# then open http://127.0.0.1:8080/ for the WebUI +``` + +Or embed it: + +```java +try (NativeServer server = new NativeServer( + "-m", "gpt-oss-20b-UD-Q4_K_XL.gguf", + "--host", "127.0.0.1", "--port", "8080", + "-c", "65536", "-b", "4096", "-ub", "2048", + "--jinja", "-ngl", "0", "-t", "8", "-tb", "16", + "-ctk", "q8_0", "-ctv", "q8_0", + "--chat-template-kwargs", "{\"reasoning_effort\":\"low\"}", + "--parallel", "1").start()) { + // Open http://127.0.0.1:8080/ in a browser for the WebUI; the OpenAI API is at /v1/... too. + Thread.currentThread().join(); +} +``` + +Differences from `OpenAiCompatServer`: it **loads its own model** from the arguments (an independent +lifecycle, like `llama-server.exe`, not a shared `LlamaModel`), it is **single-instance per +process**, it serves the **WebUI** (in released jars — local `cmake` builds ship the empty-asset +stub, so no UI there), and it is **not available on Android** (the upstream server needs +`posix_spawn`). Readiness: poll `GET /health`. No SSL (plain HTTP — bind localhost or front with a +TLS proxy). + ### LangChain4j integration A separate artifact, **`net.ladenthin:llama-langchain4j`**, adapts a `LlamaModel` to diff --git a/TODO.md b/TODO.md index 66de274f..e6390529 100644 --- a/TODO.md +++ b/TODO.md @@ -13,6 +13,36 @@ cross-cutting initiative. 
## Open — jllama-specific +### NativeServer — reuse an already-loaded `LlamaModel` (open, enhancement) + +`net.ladenthin.llama.server.NativeServer` (the native-transport server mode that runs the full +upstream `llama_server` — WebUI included — inside `libjllama` over JNI) currently loads its **own** +model from the forwarded argv, exactly like running `llama-server.exe`. This is the "independent +lifecycle" v1: simple, and every llama-server flag is forwarded verbatim. + +**Enhancement:** let `NativeServer` optionally attach to an **already-loaded** `LlamaModel`'s +`server_context` instead of loading a second copy of the weights (saves the RAM/VRAM and load time +of a duplicate model when a caller already has a `LlamaModel` open). Feasibility notes from the +initial investigation: + +- The upstream HTTP transport (`server_http_context`) and the route bundle + (`server_routes routes(params, ctx_server)`) only need a reference to a `server_context`. A + `LlamaModel` already owns and drives one (`jllama_context` in `jni_helpers.hpp`), and its JNI + methods already post tasks to that context's queue — so a second driver (the HTTP routes) posting + to the same queue is plausible; the queue is the synchronization point. +- The real work is **lifecycle/ownership**: today `llama_server()` owns the whole flow (parse → + backend init → `ctx_server.load_model` → `start_loop` on its own thread → cleanup). Reuse would + need a *different* entry that skips model loading and the `start_loop`/backend ownership (the + existing `LlamaModel` worker already runs the loop), registers the HTTP routes against the shared + `server_context`, and starts only `server_http_context`. That is a separate, smaller C++ entry + point (not `llama_server`), plus reconciling params (the loaded model's params vs. server params) + and ensuring only one thread drives `update_slots`. 
+- Logging: `llama_server` calls `common_init()` which routes llama.cpp logging to stderr/file; a + reuse path must not clobber the JNI log callback a `LlamaModel` consumer may rely on. + +Until then, run `NativeServer` standalone (it owns the process's llama backend + logging while +running), or use the Java-transport `OpenAiCompatServer` when sharing a `LlamaModel`. + ### PIT gate not hermetic — `value.ContentPart.audioFile(Path)` (open) The PIT mutation gate reaches 100% **only when the audio test fixture is present**. Without it the diff --git a/docs/history/llama-cpp-breaking-changes.md b/docs/history/llama-cpp-breaking-changes.md index 6c6885af..cf745f86 100644 --- a/docs/history/llama-cpp-breaking-changes.md +++ b/docs/history/llama-cpp-breaking-changes.md @@ -412,3 +412,14 @@ Used during `llama.cpp` version bumps: when upgrading, scan this file from the r | b9842–b9859 | `common/arg.cpp` + `common/http.h` + `tools/server/server-{http,models}.cpp` + `tools/server/server-cors-proxy.h` | **IPv6 URL handling + hf-split primary fix**, all inside upstream-compiled TUs the project already builds. (1) `common/http.h` gains a `common_http_format_host()` helper that brackets an IPv6 literal host (`[::1]`) per RFC 3986, and `common_http_parse_url` now splits the authority so a bracketed IPv6 literal keeps its inner colons; `server-http.cpp` (listening-address string), `server-models.cpp` (proxy `Host` header) and `server-cors-proxy.h` (proxy log) each `#include "http.h"` and route the host through it. `server-http.cpp`/`server-models.cpp`/`server-cors-proxy.h` are already compiled into `jllama`; the project binds none of these symbols and passes host/port as plain params, so behaviour is unchanged for localhost binds. 
(2) `common/arg.cpp` `common_models_handler_apply` now threads a `primary` hf-split file (the `00001-of` part) through the `add_tasks` lambda instead of assuming index 0 — internal to the `--hf`/`--hf-repo-v`/`--spec-draft-hf` download planner, which the project never calls (`grep -rn "common_models_handler\|common_http_format_host" src/main/cpp src/test/cpp` → zero matches). No project source changes required. | | b9842–b9859 | `ggml/src/ggml-cpu/` + `ggml/src/ggml-cuda/` + `ggml/src/ggml-opencl/` + `ggml/src/ggml-vulkan/` + `ggml/src/ggml-webgpu/` + `ggml/src/ggml-hexagon/` + `ggml/src/ggml-backend.cpp` + `src/models/qwen3next.cpp` + `tools/ui/**` | Backend-internal only, no API surface visible to `jllama.cpp`. CPU adds an AVX2/AVX `ggml_vec_dot_nvfp4_q8_0` + a UE4M3 lookup table (`kvalues_mxfp4` renamed to shared `kvalues_fp4`); CUDA adds head-dim-512 flash-attention MMA/tile instances, a strided `get_rows_back` grid-clamp fix (new `test-backend-ops` case for row count > 65535), a gfx900 MMQ gate, and drops the CPU→CUDA async-copy path (scheduler now copies inputs synchronously); OpenCL adds full Q1_0 mul_mat/mul_mv + a `GGML_OPENCL_USE_ADRENO_BIN_KERNELS` prebuilt-binary-kernel loader (OFF by default; affects only the `opencl-*` classifiers); Vulkan rolls the mul_mm BK loop on Asahi/Honeykrisp; WebGPU adds NVFP4 support; Hexagon reworks HVX/HMX flash-attention (new `flash-attn-ops.h`/`hmx-fa-kernels.h`, MUL_MAT_ADD fusion). `qwen3next.cpp` records `t_layer_inp[il]` for MTP. All internal to upstream-compiled `libllama`/`ggml`/backends; the WebUI **auto-follows** the pinned `GIT_TAG` (the `build-webui` CI job rebuilds it), so its edits (PWA navigate-fallback, chat-store foreign-conversation guards) need no manual step. No project source changes required. 
| | b9842–b9859 | upstream verification (sandbox) | All four patches (`0001`–`0004`) re-verified to **apply cleanly** against b9859 via `git apply --check` over the actual b9859 sources fetched from `raw.githubusercontent.com` (github.com git-clone is blocked in this sandbox, so a full `FetchContent` build could not run — exit 0 for `common/arg.{cpp,h}`, `tests/test-arg-parser.cpp`, `tools/server/server-context.{cpp,h}`, `server-common.cpp`, `tests/test-chat.cpp`). The only patch-target file that changed in this range is `common/arg.cpp`, whose b9859 edit is in `common_models_handler_apply` (~L496) — disjoint from patch 0001's `make_utf8_argv`/`common_params_parse` hunks (~L931/L971) and the ~34 standalone-main flips (unchanged in this range), so patch 0001 still applies. Patches 0002/0003/0004 target files untouched in b9842→b9859, so their hunks are byte-identical to b9842. OuteTTS generator anchors hold (`tools/tts/tts.cpp` unchanged). Full build + `ctest` (target 459/459) to be confirmed by the CI pipeline. | +| b9859–b9862 | `include/llama.h` + `src/llama-model-loader.cpp` + `src/llama-model.{cpp,h}` + `tools/server/server-context.{cpp,h}` + `tools/cli/cli.cpp` | **New feature (additive C API), no break.** Upstream promoted the previously-`static` `llama_model_ftype_name(llama_ftype)` (in `llama-model-loader.cpp`) to a **public** `LLAMA_API const char * llama_ftype_name(enum llama_ftype)` and added `LLAMA_API enum llama_ftype llama_model_ftype(const llama_model *)` (backed by a new `llama_model::ftype()` / `impl::ftype` cached from `ml.ftype` at `load_hparams`). `server_context::get_meta()` now fills a **new `std::string model_ftype`** field on `server_context_meta` (`server-context.h`) and `server_routes::get_model_info()` emits a `"ftype"` key — so the **NativeServer** mode's model-info/`/props` surface gains the quant type automatically (WebUI + `llama-server` clients). `cli.cpp` prints an `ftype :` line. 
**All inside upstream-compiled `libllama`/server TUs the project already links** — the project binds none of the new symbols (`grep` → only a *comment* mentions `server_context_meta` in `jllama.cpp`; nothing constructs it, and adding a trailing field is source-additive). No project source changes required for the bump itself. **Follow-up (done):** the quant type is now also surfaced through the Java layer — `getModelMetaJson` emits `"ftype"` (from `server_context_meta::model_ftype`), `ModelMeta.getFtype()` / `LlamaModel.getModelFtype()` expose it, and the Java `OpenAiCompatServer` advertises it as `data[].ftype` in `GET /v1/models` (threaded through `OpenAiServerConfig.modelFtype`, mirroring how `supportsVision` is threaded), matching the upstream `get_model_info()` key. | +| b9859–b9862 | `ggml/src/ggml-cuda/gated_delta_net.{cu,cuh}` + `ggml/src/ggml-cuda/ggml-cuda.cu` + `vendor/cpp-httplib/httplib.{cpp,h}` (v0.48.0→v0.49.0) | Backend/vendor-internal only, no API surface visible to `jllama.cpp`. (1) **CUDA gated-delta-net perf**: a fused `gated_delta_net → cpy` path (`ggml_cuda_op_gated_delta_net_fused_cache` + `ggml_cuda_try_gdn_cache_fusion`) lets the kernel scatter recurrent-state snapshots straight into the rollback cache and skip the follow-up strided copy (a decode win for gated-delta / hybrid-recurrent models, e.g. Qwen3-Next); plus a `ggml_cuda_is_view_or_noop` refactor. Affects only the `cuda13-*` classifiers. 
(2) **cpp-httplib bumped to v0.49.0** (the vendored copy inside llama.cpp, compiled into `jllama` via `server-http.cpp`): locale-independent ASCII classifiers (`is_ascii_digit/alpha/alnum` replacing `std::isdigit`/`isalnum`), a new additive `MultipartFormDataWriter` + `is_valid_multipart_boundary`, multipart field-name/filename escaping (WHATWG), an unsigned base64 accumulator (UB fix), a `ThreadPool` `idle_timeout_sec` ctor param (defaulted — backward-compatible), a `perform_websocket_handshake` `is_ssl` arg (internal), and a `path_encode_`-gated query-normalization skip. All internal to the compiled TU; the project binds no httplib symbol directly (it uses the upstream `server-http.cpp` transport). No project source changes required. | +| b9859–b9862 | upstream verification (sandbox) | All **six** patches (`0001`–`0006`) re-verified against b9862. The b9859→b9862 diff touches only two patch-target files — `tools/server/server-context.cpp` and `server-context.h` (the `model_ftype`/`get_meta`/`get_model_info` additions at ~L3989/~L5121 and the new struct field at ~L50). Patches **0002** (load-progress guard, ~L1152), **0003** (slot-prompt-similarity getter/setter, ~L3965 + `server_context` struct ~L106) and **0005** (near-prompt-end checkpoints, `update_slots` ~L3560) were **applied in sequence** against the actual b9862 `server-context.{cpp,h}` fetched from `raw.githubusercontent.com` — all three applied cleanly (their regions are disjoint from and far from the b9862 additions). Patches **0001** (`common/arg.{cpp,h}`, `test-arg-parser.cpp`, ~34 standalone mains), **0004** (`server-common.cpp`, `test-chat.cpp`) and **0006** (`server.cpp`) target files **not present** in the b9859→b9862 changed-file list, so their hunks are byte-identical to b9859 and apply unchanged. OuteTTS generator anchors hold (`tools/tts/tts.cpp` unchanged in this range). Full build + `ctest` (target 459/459) to be confirmed by the CI pipeline. 
| +| b9862–b9864 | `tools/server/server-context.cpp` + `server-schema.cpp` + `server-task.h` + `tools/server/README.md` + `tools/ui/**` | **New feature (additive), no break.** Adds a **per-request `sse_ping_interval`** to the completion API: `task_params` gains `int32_t sse_ping_interval = 30` (`server-task.h`), `make_llama_cmpl_schema` exposes it as a `field_num` with hard limits `[-1, INT32_MAX]` and `eval_llama_cmpl_schema` seeds it from `params_base.sse_ping_interval` (`server-schema.cpp`), and `handle_completions_impl` (`server-context.cpp`, ~L4089) captures the per-task value (instead of the server-level `params.sse_ping_interval`) into the SSE `next` lambda so a request can override the server `--sse-ping-interval` (`-1` disables pings). All inside upstream-compiled server TUs the project already links; the project binds no new symbol. **NativeServer** mode gets it for free (full `llama_server`). The rest of the diff is the **Svelte WebUI** (`tools/ui/**`: MCP server recommendations dialog, a bearer-token Authorization field, migration of the MCP default-enabled key into settings config, `STREAM_VISIBILITY_KICK_MS` 1000→3000, + Vitest units) — the WebUI **auto-follows** the pinned `GIT_TAG` (the `build-webui` CI job rebuilds it), so no manual step. No project source changes required for the bump itself. **Follow-up (done):** `InferenceParameters.withSsePingInterval(int)` now emits the `sse_ping_interval` key (it flows through the OAI-compat completion path via `eval_llama_cmpl_schema`), covered by a Java wither test + three C++ schema round-trip guards (round-trip, `-1` disables, below-hard-limit throws, absent inherits the server default). The same follow-up **audited the completion schema for other already-parseable-but-unexposed fields** and added the plain-scalar wins as withers: `withXtcProbability`/`withXtcThreshold` (XTC sampler), `withNDiscard`, `withNIndent`, `withTMaxPredictMs`, `withPostSamplingProbs`, `withTimingsPerToken`, `withReturnTokens`. 
(`t_max_prompt_ms` was deliberately skipped — it is commented out `// TODO: implement` in b9864's `make_llama_cmpl_schema`, so it is not parseable.) Remaining schema fields left unexposed on purpose: OAI aliases already covered (`max_tokens`/`max_completion_tokens` → `n_predict`), OAI/server-internal or array-shaped/advanced knobs (`n`/`n_cmpl`, `logprobs`, `echo`, `verbose`, `include_usage`, `return_progress`, `response_fields`, `lora`, `grammar_lazy`/`grammar_triggers`/`preserved_tokens`, `chat_format`, `parse_tool_calls`, `reasoning_control`, `backend_sampling`, `adaptive_*`). | +| b9862–b9864 | upstream verification (sandbox) | All **six** patches (`0001`–`0006`) re-verified against b9864. The b9862→b9864 diff touches exactly one patch-target file — `tools/server/server-context.cpp` — and only in `handle_completions_impl` (~L4089), far below every patched region (0002 load-progress guard ~L1152, 0005 near-prompt-end checkpoints ~L3560, 0003 slot-prompt-similarity getter/setter ~L3965). Patches **0002/0003/0005** were **applied in sequence** against the actual b9864 `server-context.{cpp,h}` fetched from `raw.githubusercontent.com` — all clean. `server-context.h` is unchanged in this range (so 0003's `.h` hunk is byte-identical); `server-schema.cpp`/`server-task.h` are **not** patch targets. Patches **0001** (`common/arg.*`, `test-arg-parser.cpp`, ~34 mains), **0004** (`server-common.cpp`, `test-chat.cpp`) and **0006** (`server.cpp`) target files **not** in the changed-file list, so they apply unchanged. Confirmed end-to-end by a clean `cmake` configure: b9864 fetched and **all six patches applied via the fail-loud `PATCH_COMMAND`** (exit 0; 0005's `is_ckpt_only_rollback` marker present), OuteTTS generator anchors held (`tools/tts/tts.cpp` unchanged). Full build + `ctest` (target 459/459) to be confirmed by the CI pipeline. | +| b9864–b9866 | `ggml/src/ggml-cuda/topk-moe.cu` + `tests/test-backend-ops.cpp` + `tools/ui/**` | Backend/WebUI-only, no API surface. 
(1) **CUDA topk-moe** gains a `case 288` instantiation (`topk_moe_cuda<288>`) and `ggml_cuda_should_use_topk_moe` now also accepts `n_expert == 288` (the non-power-of-2 expert count of **StepFun 3.7**) — a device-side kernel add, internal to `ggml-cuda`, affecting only the `cuda13-*` classifiers (a StepFun-3.7 MoE GGUF now uses the fused topk-moe path on CUDA instead of the generic fallback). (2) `test-backend-ops.cpp` adds the matching `test_topk_moe({288,22,1,1}, …)` case — **not built here** (`LLAMA_BUILD_TESTS` OFF for the FetchContent subproject). (3) **WebUI** (`tools/ui/**`): a `config-type-normalization-v1` migration coercing legacy string-encoded booleans in persisted config back to real booleans (the strict server schema now rejects `"true"`/`"false"` strings), and a thinking-enabled default flip to `true` — the WebUI **auto-follows** the pinned `GIT_TAG` (the `build-webui` CI job rebuilds it), so no manual step. No project source changes required. | +| b9864–b9866 | upstream verification (sandbox) | All **six** patches (`0001`–`0006`) re-verified against b9866. The b9864→b9866 diff touches **no** patch-target file (`common/arg.*`, `tools/server/server-context.{cpp,h}`, `server-common.cpp`, `server-schema.cpp`, `server-task.h`, `server.cpp`, `test-arg-parser.cpp`, `test-chat.cpp`, the ~34 standalone mains) and **no** OuteTTS generator anchor (`tools/tts/tts.cpp` unchanged) — the only edits are `ggml-cuda/topk-moe.cu`, `tests/test-backend-ops.cpp` and `tools/ui/**` — so every patch hunk/offset is byte-identical to b9864. Confirmed end-to-end by a clean `cmake` configure: b9866 fetched and **all six patches applied via the fail-loud `PATCH_COMMAND`** (exit 0; 0005's `is_ckpt_only_rollback` marker present), OuteTTS generator anchors held. Full build + `ctest` (target 462/462) to be confirmed by the CI pipeline. | +| b9866–b9867 | `common/speculative.cpp` | Internal-only, no API surface. 
A tweak to the **DFlash** block-diffusion speculative draft path (`common_speculative_impl_draft_dflash`, from the b9829–b9839 DFlash feature): (1) the block-size clamp now also clamps `params.n_min` (not just `n_max`) to `block_size - 1` and logs both; (2) the per-step draft sampler's `top_k` goes `1 → 10`; (3) drafting now **stops early** when the top candidate's probability drops below `params.p_min` (upstream b9867 title "spec: support spec-draft-p-min in DFlash"), and a step that produced fewer than `params.n_min` tokens is discarded (`result.clear()`). All three use **already-existing** `common_speculative_params` fields (`n_min`/`n_max`/`p_min`) — no struct/header/API change (`common/speculative.h` untouched). Entirely inside upstream-compiled `common`; the project binds no `common_speculative_*` symbol and exposes no `--spec-*` inference param, so it flows through `libllama` unchanged. No project source changes required. | +| b9866–b9867 | upstream verification (sandbox) | All **six** patches (`0001`–`0006`) re-verified against b9867. The b9866→b9867 diff touches **no** patch-target file (`common/arg.*`, `tools/server/server-context.{cpp,h}`, `server-common.cpp`, `server-schema.cpp`, `server-task.h`, `server.cpp`, `test-arg-parser.cpp`, `test-chat.cpp`, the ~34 standalone mains) and **no** OuteTTS generator anchor (`tools/tts/tts.cpp` unchanged) — the only edit is `common/speculative.cpp` — so every patch hunk/offset is byte-identical to b9866. Confirmed end-to-end by a clean `cmake` configure: b9867 fetched and **all six patches applied via the fail-loud `PATCH_COMMAND`** (exit 0; 0005's `is_ckpt_only_rollback` and 0006's `g_llama_server_embedded` markers present), OuteTTS generator anchors held. First bump driven by `.github/scripts/llama-next-version.sh` (b9866→b9867, 2 KiB single-commit final chunk). Full build + `ctest` (target 462/462) to be confirmed by the CI pipeline. 
| +| b9867–b9870 | `common/chat.cpp` + `models/templates/stepfun-ai-Step-3.5-Flash.jinja` (removed) + `tests/test-chat*.cpp` | Internal-only, no API surface. Adds a **StepFun** message-content whitespace workaround (issue #24181): `common_chat_templates_apply_jinja` detects a StepFun template (`src.find("You have access to the following functions in JSONSchema format")`) and, before rendering, trims leading/trailing whitespace from each `common_chat_msg`'s `content`/`reasoning_content` and its `"text"` `content_parts` via a new `static` `workaround::trim_all_content(...)` — otherwise leftover whitespace drove the model into reasoning loops. Uses only existing `common_chat_msg` fields; `common/chat.h` is untouched (no struct/API change). The removed `stepfun-ai-Step-3.5-Flash.jinja` embedded template and the `test-chat*.cpp` additions are **not built here** (`LLAMA_BUILD_TESTS` OFF for the FetchContent subproject). All inside upstream-compiled `common`, flowing through the embedded server / `LlamaModel` chat path automatically. No project source changes required. | +| b9867–b9870 | upstream verification (sandbox) | All **six** patches (`0001`–`0006`) re-verified against b9870. The b9867→b9870 diff touches **no** patch-target file (`common/arg.*`, `tools/server/server-context.{cpp,h}`, `server-common.cpp`, `server-schema.cpp`, `server-task.h`, `server.cpp`, `test-arg-parser.cpp`, the ~34 standalone mains) and **no** OuteTTS generator anchor (`tools/tts/tts.cpp` unchanged) — the only source edit is `common/chat.cpp` (a StepFun whitespace workaround), plus `tools/ui/**` (WebUI, auto-followed) and `tests/test-chat*.cpp` (not built) — so every patch hunk/offset is byte-identical to b9867. **Note:** patch `0004` also targets `tests/test-chat.cpp`, which b9870 edits, but `0004`'s hunks add the reasoning-budget cases in a disjoint region (verified clean by the configure below). 
Confirmed end-to-end by a clean `cmake` configure: b9870 fetched and **all six patches applied via the fail-loud `PATCH_COMMAND`** (exit 0; 0005's `is_ckpt_only_rollback` and 0006's `g_llama_server_embedded` markers present, b9870's `trim_all_content` present), OuteTTS generator anchors held. Full build + `ctest` (target 462/462) to be confirmed by the CI pipeline. | diff --git a/docs/upgrade/llama-cpp-version-bump.md b/docs/upgrade/llama-cpp-version-bump.md new file mode 100644 index 00000000..c12c4d57 --- /dev/null +++ b/docs/upgrade/llama-cpp-version-bump.md @@ -0,0 +1,138 @@ + + +# llama.cpp version-bump runbook + +This is the **documentation root** for bumping the pinned llama.cpp version. It links the +mechanical edit steps in [`../../CLAUDE.md`](../../CLAUDE.md#upgradingdowngrading-llamacpp-version) +together with a repeatable **target-selection + chunking** strategy so a bump never lands an +unreviewably large diff in one step. + +The current pin lives in `llama/CMakeLists.txt` as `GIT_TAG b<NNNN>`. llama.cpp tags **every** +master commit as `b<NNNN>`, but only a subset get GitHub *Releases*. + +--- + +## TL;DR + +```bash +# From the repo root. Prints the next reviewable step (b<cur> -> b<next>) and its compare/.patch URLs. +.github/scripts/llama-next-version.sh # target = latest RELEASE (atom feed) +.github/scripts/llama-next-version.sh b9900 # target = an explicit tag +``` + +Then apply the printed `b<cur> -> b<next>` step per [§ Applying a bump](#applying-a-bump) and re-run +the script to walk the next chunk, until it prints **"reaches the latest release — final chunk"**. + +--- + +## 1. Pick the target (topmost release) + +The **target candidate is the topmost release** on +<https://github.com/ggml-org/llama.cpp/releases>. Read it from the release **atom feed**, which is +reachable from restricted sandboxes where the ggml-org REST API is blocked: + +``` +https://github.com/ggml-org/llama.cpp/releases.atom +``` + +The first `<entry>`'s `releases/tag/b<NNNN>` is the latest release. 
`llama-next-version.sh` does this +for you; if the feed is rate-limited (repeated unauthenticated fetches can return empty), open the +releases page in a browser and pass the tag explicitly: `llama-next-version.sh b<NNNN>`. + +> **Why releases, not just the newest `b<NNNN>` tag:** releases are the versions upstream deems +> shippable; an arbitrary master commit tag may be mid-refactor. Intermediate **chunk** steps +> (below) are allowed to land on non-release tags — they are transient waypoints, not the target. + +## 2. Chunk by diff **byte-size**, not commit count + +The step size is governed by the **size of `git diff` between the pinned tag and the target**, not by +how many commits separate them: + +- If `git diff b<cur> b<target>` is **< 100 KiB**, bump straight to the target in one step. +- If it is **≥ 100 KiB**, pick an **intermediate** `b<NNNN>` tag whose diff from the current pin is the + largest still **under** the threshold, bump to that first, then repeat. Each step stays a small, + reviewable patch. + +The threshold is a knob (`LLAMA_BUMP_MAX_DIFF_KB`, default `100`). This is a heuristic: diff size grows +monotonically enough with the tag number that the helper binary-searches the intermediate tags safely. + +> **`tools/ui` (the WebUI) dominates the full diff** and is *auto-followed* — CI rebuilds the matching +> Svelte UI from the pinned `GIT_TAG`, so it needs no per-bump source review. To size the diff on the +> code you actually review, set `LLAMA_BUMP_EXCLUDE_WEBUI=1` (the helper prints both figures regardless). + +### The helper: `.github/scripts/llama-next-version.sh` + +It only **reads** — a cached blobless mirror clone of llama.cpp plus `llama/CMakeLists.txt`; it never +edits the repo. It prints the chosen `b<cur> -> b<next>` step, its full and WebUI-excluded diff size, +the commit count, and the `compare` / `.patch` URLs. Environment: + +| Var | Default | Meaning | +|---|---|---| +| `LLAMA_BUMP_MAX_DIFF_KB` | `100` | Per-step diff-size threshold, in KiB. 
| +| `LLAMA_BUMP_EXCLUDE_WEBUI` | `0` | `1` = size the diff **excluding** `tools/ui`. | +| `LLAMA_BUMP_CACHE` | `~/.cache/jllama-llamacpp-mirror` | Mirror-clone location (cloned once, then fetched). | + +Worked example — pin `b9859`, latest release `b9866` (full diff 133 KiB ≥ 100 KiB, so it chunks): + +``` +$ .github/scripts/llama-next-version.sh b9866 +current pin : b9859 +latest release : b9866 +threshold : 100 KiB per step (full diff) + +next step : b9859 -> b9862 + diff size : 45 KiB full / ... KiB excluding tools/ui (auto-followed WebUI) + commits : 3 + progress : intermediate chunk — re-run this script after the bump for the next one + review diff : https://github.com/ggml-org/llama.cpp/compare/b9859...b9862 + raw .patch : https://github.com/ggml-org/llama.cpp/compare/b9859...b9862.patch +``` + +## 3. Review the chunk's diff + +Fetch the printed `compare/...patch` URL (or open the `compare` page). Walk it against the +**priority-ordered API-compatibility review list** in +[`../../CLAUDE.md`](../../CLAUDE.md#files-to-check-for-api-compatibility) — the 8 header rows that have +historically caused breaks (`common.h`, `chat.h`, `speculative.h`, `mtmd.h`, `llama-cpp.h`, `arg.h`, +`llama.h`, `download.h`), plus the project `CMakeLists.txt` for renamed link targets. Note any new +API surface worth wiring through the Java layer (e.g. a new completion param or model-metadata getter). + +--- + +## Applying a bump + +Once you have the `b<cur> -> b<next>` step, apply it exactly as +[`CLAUDE.md § Upgrading/Downgrading`](../../CLAUDE.md#upgradingdowngrading-llamacpp-version) describes. +Concretely: + +1. **Edit the pin — three files:** + - `llama/CMakeLists.txt` — the `GIT_TAG b<NNNN>` line **and** the `-DLLAMA_TAG=b<NNNN>` used by the + WebUI/TTS extraction (both must move together). + - `README.md` — the llama.cpp badge and link (version appears twice). + - `CLAUDE.md` — the "Current llama.cpp pinned version" line (and any build-example `b<NNNN>`). +2. 
**Re-verify `patches/`** — a clean configure re-runs the fail-loud `PATCH_COMMAND`, so every patch + `0001`–`0006` must still apply. Use a **fresh** build dir (a stale one re-applies over an + already-patched tree and reports a false "does not apply"): + ```bash + cd llama && mvn -q compile # generates the OSInfo class CMake's OS-detection needs + rm -rf build && cmake -B build # fail-loud: aborts here if any patch no longer applies + ``` + If a patch no longer applies, refresh its diff against the new source and recommit it. +3. **Append the history rows** — add a pair of rows to + [`../history/llama-cpp-breaking-changes.md`](../history/llama-cpp-breaking-changes.md) covering the + `b<cur> -> b<next>` range (what broke / what was new; "no source change" is a valid row). +4. **Commit + push** on the working branch (do not open a new PR if one already tracks the branch): + ```bash + git add llama/CMakeLists.txt README.md CLAUDE.md docs/history/llama-cpp-breaking-changes.md + git commit -m "Upgrade llama.cpp from b<cur> to b<next>" + git push -u origin <branch> + ``` +5. **Re-run the helper** for the next chunk. Repeat until it reports the **final chunk** (target + reached). + +CI builds every native classifier from the new pin; the full model-backed Java + C++ suites gate the +result. A build failure at the configure step almost always means a patch needs refreshing (step 2). diff --git a/llama/CMakeLists.txt b/llama/CMakeLists.txt index 4c480f81..523d4b3b 100644 --- a/llama/CMakeLists.txt +++ b/llama/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.15) +cmake_minimum_required(VERSION 3.22) project(jllama CXX) @@ -8,7 +8,14 @@ project(jllama CXX) # Must be set before any FetchContent_MakeAvailable() so that llama.cpp and all # other subprojects inherit the same CRT choice (mixing /MT and /MD in a single # link is a linker error). 
-if(MSVC) +# +# EXCEPTION: the Intel oneAPI SYCL and OpenVINO backends must use the DYNAMIC /MD +# runtime — `icx -fsycl` rejects /MT outright ("invalid argument 'MT' not allowed +# with '-fsycl'") and the OpenVINO import libraries are built /MD (mixing would be a +# link error). Those classifiers already require the vendor runtime on the host, so +# the self-contained-DLL rationale does not apply to them; the CPU + CUDA/Vulkan/OpenCL +# classifiers keep /MT. +if(MSVC AND NOT GGML_SYCL AND NOT GGML_OPENVINO) set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>" CACHE STRING "" FORCE) endif() @@ -143,7 +150,7 @@ set(LLAMA_BUILD_APP OFF CACHE BOOL "" FORCE) FetchContent_Declare( llama.cpp GIT_REPOSITORY https://github.com/ggerganov/llama.cpp.git - GIT_TAG b9859 + GIT_TAG b9870 PATCH_COMMAND ${CMAKE_COMMAND} -DPATCH_DIR=${CMAKE_CURRENT_SOURCE_DIR}/patches -DLLAMA_SRC= @@ -166,7 +173,7 @@ execute_process( COMMAND ${CMAKE_COMMAND} -DTTS_SRC=${llama.cpp_SOURCE_DIR}/tools/tts/tts.cpp -DOUT_CPP=${JLLAMA_TTS_GEN_CPP} - -DLLAMA_TAG=b9859 + -DLLAMA_TAG=b9870 -P ${CMAKE_CURRENT_SOURCE_DIR}/cmake/generate-tts-upstream.cmake RESULT_VARIABLE JLLAMA_TTS_GEN_RESULT ) @@ -247,10 +254,18 @@ endif() # under its own Maven classifier, so it must land in a backend-specific resource # root (the default CPU tree stays src/main/resources/). The GPU branches are # OS-aware because the same GGML flag is used on more than one platform: -# - GGML_CUDA -> Linux (resources_linux_cuda) AND Windows (resources_windows_cuda) -# - GGML_OPENCL -> Android (resources_android_opencl) AND Windows (resources_windows_opencl) -# - GGML_VULKAN -> Windows only (resources_windows_vulkan) -# The classifier->tree mapping is mirrored by the matching Maven profile in pom.xml. 
+# - GGML_CUDA -> Linux (resources_linux_cuda) AND Windows (resources_windows_cuda) +# - GGML_OPENCL -> Android (resources_android_opencl) AND Windows (resources_windows_opencl) +# - GGML_VULKAN -> Windows (resources_windows_vulkan) AND Linux (resources_linux_vulkan) +# - GGML_HIP -> Linux (resources_linux_rocm) AND Windows (resources_windows_rocm) [AMD ROCm/HIP] +# - GGML_SYCL -> Windows (resources_windows_sycl) AND Linux (fp16/fp32 split, see below) [Intel oneAPI] +# - GGML_OPENVINO -> Linux (resources_linux_openvino) AND Windows (resources_windows_openvino) [Intel OpenVINO] +# The classifier->tree mapping is mirrored by the matching Maven profile in pom.xml. The Linux +# Vulkan tree holds both x86_64 and aarch64 under Linux/${OS_ARCH}; two Maven profiles +# (vulkan-linux / vulkan-linux-aarch64) split it into one single-arch classifier JAR each. The +# Windows OpenCL tree likewise holds both x86_64 (desktop ICD) and aarch64 (Snapdragon/Adreno), +# split by the opencl-windows / opencl-windows-aarch64 profiles. Linux SYCL ships two precision +# variants at the SAME arch, so it is routed to two distinct trees by GGML_SYCL_F16 (fp16 vs fp32). 
if(GGML_CUDA) if(OS_NAME STREQUAL "Windows") set(JLLAMA_DIR ${CMAKE_SOURCE_DIR}/src/main/resources_windows_cuda/net/ladenthin/llama/${OS_NAME}/${OS_ARCH}) @@ -260,8 +275,13 @@ if(GGML_CUDA) message(STATUS "GPU (CUDA Linux) build - Installing files to ${JLLAMA_DIR}") endif() elseif(GGML_VULKAN) - set(JLLAMA_DIR ${CMAKE_SOURCE_DIR}/src/main/resources_windows_vulkan/net/ladenthin/llama/${OS_NAME}/${OS_ARCH}) - message(STATUS "GPU (Vulkan) build - Installing files to ${JLLAMA_DIR}") + if(OS_NAME STREQUAL "Windows") + set(JLLAMA_DIR ${CMAKE_SOURCE_DIR}/src/main/resources_windows_vulkan/net/ladenthin/llama/${OS_NAME}/${OS_ARCH}) + message(STATUS "GPU (Vulkan Windows) build - Installing files to ${JLLAMA_DIR}") + else() + set(JLLAMA_DIR ${CMAKE_SOURCE_DIR}/src/main/resources_linux_vulkan/net/ladenthin/llama/${OS_NAME}/${OS_ARCH}) + message(STATUS "GPU (Vulkan Linux) build - Installing files to ${JLLAMA_DIR}") + endif() elseif(GGML_OPENCL) if(OS_NAME STREQUAL "Windows") set(JLLAMA_DIR ${CMAKE_SOURCE_DIR}/src/main/resources_windows_opencl/net/ladenthin/llama/${OS_NAME}/${OS_ARCH}) @@ -270,6 +290,33 @@ elseif(GGML_OPENCL) set(JLLAMA_DIR ${CMAKE_SOURCE_DIR}/src/main/resources_android_opencl/net/ladenthin/llama/${OS_NAME}/${OS_ARCH}) message(STATUS "GPU (OpenCL Android) build - Installing files to ${JLLAMA_DIR}") endif() +elseif(GGML_HIP) + if(OS_NAME STREQUAL "Windows") + set(JLLAMA_DIR ${CMAKE_SOURCE_DIR}/src/main/resources_windows_rocm/net/ladenthin/llama/${OS_NAME}/${OS_ARCH}) + message(STATUS "GPU (ROCm/HIP Windows) build - Installing files to ${JLLAMA_DIR}") + else() + set(JLLAMA_DIR ${CMAKE_SOURCE_DIR}/src/main/resources_linux_rocm/net/ladenthin/llama/${OS_NAME}/${OS_ARCH}) + message(STATUS "GPU (ROCm/HIP Linux) build - Installing files to ${JLLAMA_DIR}") + endif() +elseif(GGML_SYCL) + if(OS_NAME STREQUAL "Windows") + set(JLLAMA_DIR ${CMAKE_SOURCE_DIR}/src/main/resources_windows_sycl/net/ladenthin/llama/${OS_NAME}/${OS_ARCH}) + message(STATUS "GPU (SYCL Windows) build - 
Installing files to ${JLLAMA_DIR}") + elseif(GGML_SYCL_F16) + set(JLLAMA_DIR ${CMAKE_SOURCE_DIR}/src/main/resources_linux_sycl_fp16/net/ladenthin/llama/${OS_NAME}/${OS_ARCH}) + message(STATUS "GPU (SYCL Linux fp16) build - Installing files to ${JLLAMA_DIR}") + else() + set(JLLAMA_DIR ${CMAKE_SOURCE_DIR}/src/main/resources_linux_sycl_fp32/net/ladenthin/llama/${OS_NAME}/${OS_ARCH}) + message(STATUS "GPU (SYCL Linux fp32) build - Installing files to ${JLLAMA_DIR}") + endif() +elseif(GGML_OPENVINO) + if(OS_NAME STREQUAL "Windows") + set(JLLAMA_DIR ${CMAKE_SOURCE_DIR}/src/main/resources_windows_openvino/net/ladenthin/llama/${OS_NAME}/${OS_ARCH}) + message(STATUS "GPU (OpenVINO Windows) build - Installing files to ${JLLAMA_DIR}") + else() + set(JLLAMA_DIR ${CMAKE_SOURCE_DIR}/src/main/resources_linux_openvino/net/ladenthin/llama/${OS_NAME}/${OS_ARCH}) + message(STATUS "GPU (OpenVINO Linux) build - Installing files to ${JLLAMA_DIR}") + endif() else() set(JLLAMA_DIR ${CMAKE_SOURCE_DIR}/src/main/resources/net/ladenthin/llama/${OS_NAME}/${OS_ARCH}) message(STATUS "CPU build - Installing files to ${JLLAMA_DIR}") @@ -355,6 +402,23 @@ if(NOT ANDROID_ABI AND NOT OS_NAME MATCHES "Android") ) endif() +# Native-server mode (net.ladenthin.llama.server.NativeServer): compile the standalone server +# entry point (server.cpp's `llama_server`, made embeddable by patches/0006) and its tools helper +# (server-tools.cpp); jllama's JNI bridge (native_server.cpp) then calls llama_server on a worker +# thread. This runs the *full* upstream HTTP server — WebUI included, every llama-server flag +# forwarded — inside libjllama, with no separate llama-server executable. server.cpp and +# server-tools.cpp both pull in vendor/sheredom/subprocess.h (posix_spawn_*), so they share the +# non-Android guard used for server-models.cpp above; native_server.cpp links against llama_server +# and is guarded too. 
On Android the NativeServer native methods are simply absent (its JNI calls +# throw UnsatisfiedLinkError) — use OpenAiCompatServer there. +if(NOT ANDROID_ABI AND NOT OS_NAME MATCHES "Android") + target_sources(jllama PRIVATE + ${llama.cpp_SOURCE_DIR}/tools/server/server-tools.cpp + ${llama.cpp_SOURCE_DIR}/tools/server/server.cpp + ${CMAKE_SOURCE_DIR}/src/main/cpp/native_server.cpp + ) +endif() + # Phase 2: also compile the upstream HTTP transport (server-http.cpp) and its # cpp-httplib backend directly into jllama, so the OpenAI-compatible server can be # driven natively from JNI — shipped inside libjllama, with no separate diff --git a/llama/patches/0005-server-recurrent-near-prompt-end-checkpoints.patch b/llama/patches/0005-server-recurrent-near-prompt-end-checkpoints.patch new file mode 100644 index 00000000..59f729ff --- /dev/null +++ b/llama/patches/0005-server-recurrent-near-prompt-end-checkpoints.patch @@ -0,0 +1,39 @@ +diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp +index 39aa20b..d3d5978 100644 +--- a/tools/server/server-context.cpp ++++ b/tools/server/server-context.cpp +@@ -3560,8 +3560,32 @@ private: + // do not checkpoint after mtmd chunks + do_checkpoint = do_checkpoint && !has_mtmd; + +- // no need to create checkpoints that are too close together, unless it's the last user message +- do_checkpoint = do_checkpoint && (slot.prompt.checkpoints.empty() || is_last_user_message || n_tokens_start > slot.prompt.checkpoints.back().n_tokens + params_base.checkpoint_min_step); ++ // recurrent (and hybrid) models cannot partially roll back their state, so the only way to ++ // avoid re-processing an entire multi-turn conversation on the next request is a checkpoint ++ // near the end of the current prompt. 
without this, a conversation that appends only ++ // assistant/tool messages (agentic tool-calling) re-processes the whole tail every turn, ++ // because no new user-message checkpoint is ever created and the min-step spacing blocks ++ // the near-prompt-end ones. exempt those models' near-end checkpoints from the spacing. ++ const bool is_ckpt_only_rollback = ++ ctx_tgt_seq_rm_type == COMMON_CONTEXT_SEQ_RM_TYPE_FULL || ++ ctx_tgt_seq_rm_type == COMMON_CONTEXT_SEQ_RM_TYPE_RS; ++ ++ // don't create checkpoints too close together, unless it's the last user message or a ++ // near-prompt-end checkpoint for a checkpoint-only-rollback model (leading empty() guards ++ // the checkpoints.back() access via short-circuit) ++ const bool checkpoint_well_spaced = ++ slot.prompt.checkpoints.empty() || ++ is_last_user_message || ++ (near_prompt_end && is_ckpt_only_rollback) || ++ n_tokens_start > slot.prompt.checkpoints.back().n_tokens + params_base.checkpoint_min_step; ++ ++ // and never duplicate the newest checkpoint's position (else the last-user-message ++ // checkpoint is re-created every turn, flooding the list until useful entries are evicted) ++ const bool checkpoint_not_duplicate = ++ slot.prompt.checkpoints.empty() || ++ slot.prompt.checkpoints.back().n_tokens != n_tokens_start; ++ ++ do_checkpoint = do_checkpoint && checkpoint_well_spaced && checkpoint_not_duplicate; + SLT_DBG(slot, "main/do_checkpoint = %s, pos_min = %d, pos_max = %d\n", do_checkpoint ? 
"yes" : "no", pos_min, pos_max); + + // note: we create the checkpoint before calling llama_decode(), so the current batch is not diff --git a/llama/patches/0006-server-embed-native-server-jni.patch b/llama/patches/0006-server-embed-native-server-jni.patch new file mode 100644 index 00000000..35a146d5 --- /dev/null +++ b/llama/patches/0006-server-embed-native-server-jni.patch @@ -0,0 +1,67 @@ +diff --git a/tools/server/server.cpp b/tools/server/server.cpp +index 84c7f0b..5c9fac9 100644 +--- a/tools/server/server.cpp ++++ b/tools/server/server.cpp +@@ -25,6 +25,28 @@ + static std::function shutdown_handler; + static std::atomic_flag is_terminating = ATOMIC_FLAG_INIT; + ++// [jllama] Embedded-mode support: when llama_server() is hosted inside libjllama and driven over ++// JNI (net.ladenthin.llama.server.NativeServer), it must NOT install process-wide signal handlers ++// (that would hijack the JVM's SIGINT/SIGTERM), and it must be stoppable out-of-band because ++// ctx_server is local to llama_server(). It also parses exactly the forwarded argv rather than ++// re-deriving it from the process command line (which would be java.exe's — the Windows bug the ++// 0001 patch fixes for the embedded path). These symbols are declared in ++// src/main/cpp/native_server_bridge.h and called by native_server.cpp. ++static std::atomic g_llama_server_embedded{false}; ++ ++void llama_server_set_embedded(bool embedded) { ++ g_llama_server_embedded.store(embedded); ++} ++ ++void llama_server_request_shutdown() { ++ // Mirrors the SIGTERM path: invoke the installed shutdown_handler, which unblocks ++ // ctx_server.start_loop() (single-model) / ctx_http.stop() (router). No-op if the server has ++ // not finished starting (handler not yet installed) — stop after /health reports ready. 
++ if (shutdown_handler) { ++ shutdown_handler(SIGTERM); ++ } ++} ++ + static inline void signal_handler(int signal) { + if (is_terminating.test_and_set()) { + // in case it hangs, we can force terminate the server by hitting Ctrl+C twice +@@ -87,7 +109,13 @@ int llama_server(int argc, char ** argv) { + // touch it. lifecycle is symmetric, stop_gc() runs in clean_up() before backend free + g_stream_sessions.start_gc(); + +- if (!common_params_parse_main(argc, argv, params, LLAMA_EXAMPLE_SERVER)) { ++ // [jllama] embedded (JNI) callers forward a clean UTF-8 argv, so honor it exactly via ++ // common_params_parse; only the standalone tool needs common_params_parse_main's ++ // process-command-line (GetCommandLineW) UTF-8 recovery. ++ const bool parsed_ok = g_llama_server_embedded.load() ++ ? common_params_parse(argc, argv, params, LLAMA_EXAMPLE_SERVER) ++ : common_params_parse_main(argc, argv, params, LLAMA_EXAMPLE_SERVER); ++ if (!parsed_ok) { + return 1; + } + +@@ -412,6 +440,10 @@ int llama_server(int argc, char ** argv) { + } + + // TODO: refactor in common/console ++ // [jllama] skip installing process-wide signal handlers when embedded in the JVM (they would ++ // hijack the JVM's own SIGINT/SIGTERM). NativeServer stops the embedded server via ++ // llama_server_request_shutdown() instead. 
++ if (!g_llama_server_embedded.load()) { + #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) + struct sigaction sigint_action; + sigint_action.sa_handler = signal_handler; +@@ -425,6 +457,7 @@ int llama_server(int argc, char ** argv) { + }; + SetConsoleCtrlHandler(reinterpret_cast(console_ctrl_handler), true); + #endif ++ } + + SRV_INF("listening on %s\n", ctx_http.listening_address.c_str()); + diff --git a/llama/pom.xml b/llama/pom.xml index 67e6e563..da5a58d2 100644 --- a/llama/pom.xml +++ b/llama/pom.xml @@ -1149,14 +1149,180 @@ SPDX-License-Identifier: MIT + + + vulkan-linux + + + + org.apache.maven.plugins + maven-compiler-plugin + + + vulkan-linux + compile + + compile + + + + module-info.java + + + -h + src/main/cpp + + + ${project.build.outputDirectory}_linux_vulkan + + + + + + maven-resources-plugin + + + copy-resources-vulkan-linux + process-classes + + copy-resources + + + + ${project.build.outputDirectory}_linux_vulkan + + + + ${basedir}/src/main/resources_linux_vulkan/ + + net/ladenthin/llama/Linux/x86_64/** + + + + + + + + + org.apache.maven.plugins + maven-jar-plugin + + + vulkan-linux + package + + jar + + + vulkan-linux-x86-64 + + ${project.build.outputDirectory}_linux_vulkan + + + + + + + + + + + vulkan-linux-aarch64 + + + + org.apache.maven.plugins + maven-compiler-plugin + + + vulkan-linux-aarch64 + compile + + compile + + + + module-info.java + + + -h + src/main/cpp + + + ${project.build.outputDirectory}_linux_vulkan_aarch64 + + + + + + maven-resources-plugin + + + copy-resources-vulkan-linux-aarch64 + process-classes + + copy-resources + + + + ${project.build.outputDirectory}_linux_vulkan_aarch64 + + + + ${basedir}/src/main/resources_linux_vulkan/ + + net/ladenthin/llama/Linux/aarch64/** + + + + + + + + + org.apache.maven.plugins + maven-jar-plugin + + + vulkan-linux-aarch64 + package + + jar + + + vulkan-linux-aarch64 + + ${project.build.outputDirectory}_linux_vulkan_aarch64 + + + + + + + - + are better supported. 
The resource copy includes ONLY the Windows/x86_64 + subtree so the aarch64 natives (opencl-windows-aarch64, staged into the same + tree by the sibling job) do not leak into this JAR. Staged by CI before this + profile runs. --> opencl-windows @@ -1201,7 +1367,7 @@ SPDX-License-Identifier: MIT ${basedir}/src/main/resources_windows_opencl/ - **/*.* + net/ladenthin/llama/Windows/x86_64/** @@ -1231,6 +1397,633 @@ SPDX-License-Identifier: MIT + + + rocm-linux + + + + org.apache.maven.plugins + maven-compiler-plugin + + + rocm-linux + compile + + compile + + + + module-info.java + + + -h + src/main/cpp + + + ${project.build.outputDirectory}_linux_rocm + + + + + + maven-resources-plugin + + + copy-resources-rocm-linux + process-classes + + copy-resources + + + + ${project.build.outputDirectory}_linux_rocm + + + + ${basedir}/src/main/resources_linux_rocm/ + + net/ladenthin/llama/Linux/x86_64/** + + + + + + + + + org.apache.maven.plugins + maven-jar-plugin + + + rocm-linux + package + + jar + + + rocm-linux-x86-64 + + ${project.build.outputDirectory}_linux_rocm + + + + + + + + + + rocm-windows + + + + org.apache.maven.plugins + maven-compiler-plugin + + + rocm-windows + compile + + compile + + + + module-info.java + + + -h + src/main/cpp + + + ${project.build.outputDirectory}_windows_rocm + + + + + + maven-resources-plugin + + + copy-resources-rocm-windows + process-classes + + copy-resources + + + + ${project.build.outputDirectory}_windows_rocm + + + + ${basedir}/src/main/resources_windows_rocm/ + + net/ladenthin/llama/Windows/x86_64/** + + + + + + + + + org.apache.maven.plugins + maven-jar-plugin + + + rocm-windows + package + + jar + + + rocm-windows-x86-64 + + ${project.build.outputDirectory}_windows_rocm + + + + + + + + + + sycl-fp16-linux + + + + org.apache.maven.plugins + maven-compiler-plugin + + + sycl-fp16-linux + compile + + compile + + + + module-info.java + + + -h + src/main/cpp + + + ${project.build.outputDirectory}_linux_sycl_fp16 + + + + + + 
maven-resources-plugin + + + copy-resources-sycl-fp16-linux + process-classes + + copy-resources + + + + ${project.build.outputDirectory}_linux_sycl_fp16 + + + + ${basedir}/src/main/resources_linux_sycl_fp16/ + + net/ladenthin/llama/Linux/x86_64/** + + + + + + + + + org.apache.maven.plugins + maven-jar-plugin + + + sycl-fp16-linux + package + + jar + + + sycl-fp16-linux-x86-64 + + ${project.build.outputDirectory}_linux_sycl_fp16 + + + + + + + + + + sycl-fp32-linux + + + + org.apache.maven.plugins + maven-compiler-plugin + + + sycl-fp32-linux + compile + + compile + + + + module-info.java + + + -h + src/main/cpp + + + ${project.build.outputDirectory}_linux_sycl_fp32 + + + + + + maven-resources-plugin + + + copy-resources-sycl-fp32-linux + process-classes + + copy-resources + + + + ${project.build.outputDirectory}_linux_sycl_fp32 + + + + ${basedir}/src/main/resources_linux_sycl_fp32/ + + net/ladenthin/llama/Linux/x86_64/** + + + + + + + + + org.apache.maven.plugins + maven-jar-plugin + + + sycl-fp32-linux + package + + jar + + + sycl-fp32-linux-x86-64 + + ${project.build.outputDirectory}_linux_sycl_fp32 + + + + + + + + + + sycl-windows + + + + org.apache.maven.plugins + maven-compiler-plugin + + + sycl-windows + compile + + compile + + + + module-info.java + + + -h + src/main/cpp + + + ${project.build.outputDirectory}_windows_sycl + + + + + + maven-resources-plugin + + + copy-resources-sycl-windows + process-classes + + copy-resources + + + + ${project.build.outputDirectory}_windows_sycl + + + + ${basedir}/src/main/resources_windows_sycl/ + + net/ladenthin/llama/Windows/x86_64/** + + + + + + + + + org.apache.maven.plugins + maven-jar-plugin + + + sycl-windows + package + + jar + + + sycl-windows-x86-64 + + ${project.build.outputDirectory}_windows_sycl + + + + + + + + + + opencl-windows-aarch64 + + + + org.apache.maven.plugins + maven-compiler-plugin + + + opencl-windows-aarch64 + compile + + compile + + + + module-info.java + + + -h + src/main/cpp + + + 
${project.build.outputDirectory}_windows_opencl_aarch64 + + + + + + maven-resources-plugin + + + copy-resources-opencl-windows-aarch64 + process-classes + + copy-resources + + + + ${project.build.outputDirectory}_windows_opencl_aarch64 + + + + ${basedir}/src/main/resources_windows_opencl/ + + net/ladenthin/llama/Windows/aarch64/** + + + + + + + + + org.apache.maven.plugins + maven-jar-plugin + + + opencl-windows-aarch64 + package + + jar + + + opencl-windows-aarch64 + + ${project.build.outputDirectory}_windows_opencl_aarch64 + + + + + + + + + + openvino-linux + + + + org.apache.maven.plugins + maven-compiler-plugin + + + openvino-linux + compile + + compile + + + + module-info.java + + + -h + src/main/cpp + + + ${project.build.outputDirectory}_linux_openvino + + + + + + maven-resources-plugin + + + copy-resources-openvino-linux + process-classes + + copy-resources + + + + ${project.build.outputDirectory}_linux_openvino + + + + ${basedir}/src/main/resources_linux_openvino/ + + net/ladenthin/llama/Linux/x86_64/** + + + + + + + + + org.apache.maven.plugins + maven-jar-plugin + + + openvino-linux + package + + jar + + + openvino-linux-x86-64 + + ${project.build.outputDirectory}_linux_openvino + + + + + + + + + + openvino-windows + + + + org.apache.maven.plugins + maven-compiler-plugin + + + openvino-windows + compile + + compile + + + + module-info.java + + + -h + src/main/cpp + + + ${project.build.outputDirectory}_windows_openvino + + + + + + maven-resources-plugin + + + copy-resources-openvino-windows + process-classes + + copy-resources + + + + ${project.build.outputDirectory}_windows_openvino + + + + ${basedir}/src/main/resources_windows_openvino/ + + net/ladenthin/llama/Windows/x86_64/** + + + + + + + + + org.apache.maven.plugins + maven-jar-plugin + + + openvino-windows + package + + jar + + + openvino-windows-x86-64 + + ${project.build.outputDirectory}_windows_openvino + + + + + + + + vmlens @@ -1296,8 +2089,10 @@ SPDX-License-Identifier: MIT + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/llama/src/main/cpp/jllama.cpp b/llama/src/main/cpp/jllama.cpp index f3871f89..faa8fadd 100644 --- a/llama/src/main/cpp/jllama.cpp +++ b/llama/src/main/cpp/jllama.cpp @@ -802,6 +802,7 @@ JNIEXPORT jstring JNICALL Java_net_ladenthin_llama_LlamaModel_getModelMetaJson(J {"modalities", {{"vision", m.has_inp_image}, {"audio", m.has_inp_audio}}}, {"name", m.model_name}, {"architecture", std::string(arch_buf)}, + {"ftype", m.model_ftype}, }; // Resolved default chat template (Jinja); empty when the model ships none. const char *chat_tmpl = mdl != nullptr ? llama_model_chat_template(mdl, /*name*/ nullptr) : nullptr; diff --git a/llama/src/main/cpp/native_server.cpp b/llama/src/main/cpp/native_server.cpp new file mode 100644 index 00000000..d9cfa527 --- /dev/null +++ b/llama/src/main/cpp/native_server.cpp @@ -0,0 +1,107 @@ +// SPDX-FileCopyrightText: 2026 Bernard Ladenthin +// +// SPDX-License-Identifier: MIT + +// JNI bridge for net.ladenthin.llama.server.NativeServer: runs the full upstream llama.cpp HTTP +// server (llama_server(), including its embedded WebUI) inside libjllama, driven over JNI. The +// argv is forwarded verbatim from Java, so every llama-server flag is supported. This is an +// independent server lifecycle (it loads its own model from the argv), distinct from LlamaModel +// and the Java-side OpenAiCompatServer. +// +// Only ONE native server may run per process: server.cpp keeps its shutdown_handler / +// is_terminating state in file-scope globals, so a second concurrent llama_server() would clobber +// them. NativeServer enforces this on the Java side. + +#include "native_server_bridge.h" + +#include + +#include +#include +#include +#include +#include + +namespace { + +// Owns the argv storage for the lifetime of the running server plus the worker thread that runs +// llama_server(). 
The argv pointers reference the std::string storage in `args`, which is filled +// once (with reserve) and never mutated afterwards, so the pointers stay valid. +struct native_server { + std::vector args; // args[0] is the program name ("llama-server") + std::vector argv; // points into `args` + std::thread worker; + std::atomic finished{false}; + int exit_code = -1; +}; + +} // namespace + +extern "C" { + +JNIEXPORT jlong JNICALL Java_net_ladenthin_llama_server_NativeServer_startNativeServer(JNIEnv *env, jclass, + jobjectArray jargs) { + auto *srv = new native_server(); + + const jsize n = (jargs != nullptr) ? env->GetArrayLength(jargs) : 0; + srv->args.reserve(static_cast(n) + 1); + srv->args.emplace_back("llama-server"); // argv[0] + for (jsize i = 0; i < n; ++i) { + auto js = static_cast(env->GetObjectArrayElement(jargs, i)); + if (js != nullptr) { + const char *chars = env->GetStringUTFChars(js, nullptr); + srv->args.emplace_back(chars != nullptr ? chars : ""); + if (chars != nullptr) { + env->ReleaseStringUTFChars(js, chars); + } + env->DeleteLocalRef(js); + } else { + srv->args.emplace_back(""); + } + } + + srv->argv.reserve(srv->args.size()); + for (auto &arg : srv->args) { + srv->argv.push_back(const_cast(arg.c_str())); + } + + // Embedded mode: no process signal handlers, honor the forwarded argv (see patches/0006). + llama_server_set_embedded(true); + + srv->worker = std::thread([srv]() { + srv->exit_code = llama_server(static_cast(srv->argv.size()), srv->argv.data()); + srv->finished.store(true); + }); + + return reinterpret_cast(srv); +} + +JNIEXPORT void JNICALL Java_net_ladenthin_llama_server_NativeServer_stopNativeServer(JNIEnv *, jclass, jlong handle) { + auto *srv = reinterpret_cast(handle); + if (srv == nullptr) { + return; + } + // Signal shutdown, retrying until the worker actually returns: a stop issued before the server + // finished starting (shutdown_handler not yet installed by llama_server) would otherwise be + // lost. 
Once the handler is installed the first signal takes effect; if the model failed to + // load, llama_server has already returned and `finished` is set. + while (!srv->finished.load()) { + llama_server_request_shutdown(); + if (srv->finished.load()) { + break; + } + std::this_thread::sleep_for(std::chrono::milliseconds(50)); + } + if (srv->worker.joinable()) { + srv->worker.join(); + } + delete srv; +} + +JNIEXPORT jboolean JNICALL Java_net_ladenthin_llama_server_NativeServer_isRunningNative(JNIEnv *, jclass, + jlong handle) { + auto *srv = reinterpret_cast(handle); + return (srv != nullptr && !srv->finished.load()) ? JNI_TRUE : JNI_FALSE; +} + +} // extern "C" diff --git a/llama/src/main/cpp/native_server_bridge.h b/llama/src/main/cpp/native_server_bridge.h new file mode 100644 index 00000000..1a40c766 --- /dev/null +++ b/llama/src/main/cpp/native_server_bridge.h @@ -0,0 +1,22 @@ +// SPDX-FileCopyrightText: 2026 Bernard Ladenthin +// +// SPDX-License-Identifier: MIT + +#pragma once + +// Declarations for the upstream server entry point (llama.cpp tools/server/server.cpp) that +// jllama's NativeServer JNI bridge (native_server.cpp) calls to run the full llama.cpp HTTP +// server — WebUI included — inside libjllama, with no separate llama-server executable. +// +// - llama_server: upstream's renamed main (b9859 already exposes `int llama_server(int, char**)` +// as a non-static, externally linkable function). Runs the server and blocks until shutdown, +// returning its process-style exit code (0 = clean). +// - llama_server_set_embedded / llama_server_request_shutdown: added by +// patches/0006-server-embed-native-server-jni.patch so the server can run embedded in the JVM +// (does not install process-wide signal handlers, and honors the forwarded argv instead of +// re-deriving it from the process command line) and can be stopped out-of-band (the SIGTERM +// path) since its server_context is local to llama_server(). 
+ +int llama_server(int argc, char ** argv); +void llama_server_set_embedded(bool embedded); +void llama_server_request_shutdown(); diff --git a/llama/src/main/java/net/ladenthin/llama/LlamaModel.java b/llama/src/main/java/net/ladenthin/llama/LlamaModel.java index 59644ebe..6dbe19d0 100644 --- a/llama/src/main/java/net/ladenthin/llama/LlamaModel.java +++ b/llama/src/main/java/net/ladenthin/llama/LlamaModel.java @@ -856,6 +856,18 @@ public boolean supportsAudio() { return getModelMeta().supportsAudio(); } + /** + * Returns the loaded model's file type (quantization) as a human-readable string, e.g. + * {@code "Q8_0"} or {@code "Q4_K - Medium"} (llama.cpp {@code llama_ftype_name}); a guessed + * type is prefixed with {@code "(guessed) "}. Returns an empty string when the native layer does + * not report it. + * + * @return the quantization file-type label, or {@code ""} if absent + */ + public String getModelFtype() { + return getModelMeta().getFtype(); + } + native String getModelMetaJson(); /** diff --git a/llama/src/main/java/net/ladenthin/llama/loader/OSInfo.java b/llama/src/main/java/net/ladenthin/llama/loader/OSInfo.java index 21cf7a3b..138b5900 100644 --- a/llama/src/main/java/net/ladenthin/llama/loader/OSInfo.java +++ b/llama/src/main/java/net/ladenthin/llama/loader/OSInfo.java @@ -114,6 +114,8 @@ public OSInfo() {} public static final String PPC64 = "ppc64"; /** Folder name for 64-bit RISC-V. */ public static final String RISCV64 = "riscv64"; + /** Folder name for 64-bit IBM Z (s390x, big-endian). 
*/ + public static final String S390X = "s390x"; static { // x86 mappings @@ -155,6 +157,8 @@ public OSInfo() {} archMapping.put("ppc64le", PPC64); archMapping.put(RISCV64, RISCV64); + + archMapping.put(S390X, S390X); } /** diff --git a/llama/src/main/java/net/ladenthin/llama/parameters/InferenceParameters.java b/llama/src/main/java/net/ladenthin/llama/parameters/InferenceParameters.java index a47ee190..a831234e 100644 --- a/llama/src/main/java/net/ladenthin/llama/parameters/InferenceParameters.java +++ b/llama/src/main/java/net/ladenthin/llama/parameters/InferenceParameters.java @@ -61,6 +61,7 @@ public final class InferenceParameters extends JsonParameters { private static final String PARAM_CACHE_REUSE = "n_cache_reuse"; private static final String PARAM_SLOT_ID = "id_slot"; private static final String PARAM_STREAM_OPTIONS = "stream_options"; + private static final String PARAM_SSE_PING_INTERVAL = "sse_ping_interval"; private static final String PARAM_RESPONSE_FORMAT = "response_format"; private static final String PARAM_N_PREDICT = "n_predict"; private static final String PARAM_TOP_K = "top_k"; @@ -108,6 +109,16 @@ public final class InferenceParameters extends JsonParameters { private static final String PARAM_DRY_ALLOWED_LENGTH = "dry_allowed_length"; private static final String PARAM_DRY_PENALTY_LAST_N = "dry_penalty_last_n"; private static final String PARAM_DRY_SEQUENCE_BREAKERS = "dry_sequence_breakers"; + // Additional completion-schema fields honored by the native parser (eval_llama_cmpl_schema) + // but previously not surfaced as withers. All plain scalars. 
+ private static final String PARAM_XTC_PROBABILITY = "xtc_probability"; + private static final String PARAM_XTC_THRESHOLD = "xtc_threshold"; + private static final String PARAM_N_DISCARD = "n_discard"; + private static final String PARAM_N_INDENT = "n_indent"; + private static final String PARAM_T_MAX_PREDICT_MS = "t_max_predict_ms"; + private static final String PARAM_POST_SAMPLING_PROBS = "post_sampling_probs"; + private static final String PARAM_TIMINGS_PER_TOKEN = "timings_per_token"; + private static final String PARAM_RETURN_TOKENS = "return_tokens"; private static final InferenceParameters EMPTY = new InferenceParameters(); @@ -868,4 +879,113 @@ public InferenceParameters withContinueFinalMessage(ContinuationMode mode) { public InferenceParameters withStream(boolean stream) { return withScalar(PARAM_STREAM, stream); } + + /** + * Returns a new request with the SSE ping interval replaced (llama.cpp {@code sse_ping_interval}, + * added upstream in b9864). In {@code stream} mode the server emits an SSE comment ping every + * {@code seconds} while the stream stays silent (e.g. during long prompt processing), keeping the + * connection observable; this per-request value overrides the server's {@code --sse-ping-interval} + * setting. Use {@code -1} to disable pings. Default: the server setting (30 s upstream). + * + * @param seconds interval in seconds between SSE comment pings, or {@code -1} to disable + * @return a new instance; this instance is unchanged + */ + public InferenceParameters withSsePingInterval(int seconds) { + return withScalar(PARAM_SSE_PING_INTERVAL, seconds); + } + + /** + * Returns a new request with the XTC (Exclude Top Choices) sampler probability replaced + * ({@code xtc_probability}, default 0.0 = disabled). At each step, with this probability the + * sampler removes all but the least-likely of the tokens above {@link #withXtcThreshold(float)}, + * flattening over-confident distributions. 
+ * + * @param xtcProbability the XTC trigger probability in {@code [0, 1]} (0 disables XTC) + * @return a new instance; this instance is unchanged + */ + public InferenceParameters withXtcProbability(float xtcProbability) { + return withScalar(PARAM_XTC_PROBABILITY, xtcProbability); + } + + /** + * Returns a new request with the XTC sampler threshold replaced ({@code xtc_threshold}, + * default 0.1). Only tokens whose probability is at least this value are eligible for XTC removal. + * + * @param xtcThreshold the minimum token probability considered by XTC + * @return a new instance; this instance is unchanged + */ + public InferenceParameters withXtcThreshold(float xtcThreshold) { + return withScalar(PARAM_XTC_THRESHOLD, xtcThreshold); + } + + /** + * Returns a new request with the number of tokens discarded on a context shift replaced + * ({@code n_discard}, default 0 = discard half of {@code n_ctx - n_keep}). When the context fills, + * the oldest {@code n_discard} tokens after the kept prefix are dropped to make room. + * + * @param nDiscard tokens to discard on context shift (0 = half) + * @return a new instance; this instance is unchanged + */ + public InferenceParameters withNDiscard(int nDiscard) { + return withScalar(PARAM_N_DISCARD, nDiscard); + } + + /** + * Returns a new request with the infill indentation hint replaced ({@code n_indent}, default 0). + * Used with {@link #withInputPrefix(String)} / {@link #withInputSuffix(String)}: generated infill + * lines are required to be indented at least this many columns, which helps code models keep block + * structure. + * + * @param nIndent minimum indentation (columns) for infilled lines + * @return a new instance; this instance is unchanged + */ + public InferenceParameters withNIndent(int nIndent) { + return withScalar(PARAM_N_INDENT, nIndent); + } + + /** + * Returns a new request with a wall-clock generation-time budget replaced ({@code t_max_predict_ms}, + * default -1 = no limit). 
Generation stops once it has run for this many milliseconds, regardless of + * {@link #withNPredict(int)} — useful as an agentic/interactive latency guard. + * + * @param tMaxPredictMs maximum generation time in milliseconds (-1 = no limit) + * @return a new instance; this instance is unchanged + */ + public InferenceParameters withTMaxPredictMs(int tMaxPredictMs) { + return withScalar(PARAM_T_MAX_PREDICT_MS, tMaxPredictMs); + } + + /** + * Returns a new request toggling post-sampling token probabilities ({@code post_sampling_probs}, + * default false). When true, the {@code n_probs} probabilities are reported after the full + * sampling chain is applied rather than from the raw logits. + * + * @param postSamplingProbs whether to report probabilities after sampling + * @return a new instance; this instance is unchanged + */ + public InferenceParameters withPostSamplingProbs(boolean postSamplingProbs) { + return withScalar(PARAM_POST_SAMPLING_PROBS, postSamplingProbs); + } + + /** + * Returns a new request toggling per-token timing telemetry ({@code timings_per_token}, + * default false). When true, streamed responses carry per-token timing information. + * + * @param timingsPerToken whether to include per-token timings + * @return a new instance; this instance is unchanged + */ + public InferenceParameters withTimingsPerToken(boolean timingsPerToken) { + return withScalar(PARAM_TIMINGS_PER_TOKEN, timingsPerToken); + } + + /** + * Returns a new request toggling raw token-id output ({@code return_tokens}, default false). + * When true, the response includes the generated token ids alongside the decoded text. 
+ * + * @param returnTokens whether to include raw token ids in the response + * @return a new instance; this instance is unchanged + */ + public InferenceParameters withReturnTokens(boolean returnTokens) { + return withScalar(PARAM_RETURN_TOKENS, returnTokens); + } } diff --git a/llama/src/main/java/net/ladenthin/llama/parameters/ModelParameters.java b/llama/src/main/java/net/ladenthin/llama/parameters/ModelParameters.java index ce62131b..a8c6965b 100644 --- a/llama/src/main/java/net/ladenthin/llama/parameters/ModelParameters.java +++ b/llama/src/main/java/net/ladenthin/llama/parameters/ModelParameters.java @@ -1142,6 +1142,42 @@ public ModelParameters setSlotPromptSimilarity(float similarity) { return putScalar("--slot-prompt-similarity", similarity); } + /** + * Set the maximum number of context checkpoints kept per slot (default: 32; 0 disables + * checkpointing). + * + *

Context checkpoints let the server roll a slot back to an earlier state instead of + * re-processing the whole prompt when a follow-up request diverges from the cached tokens. + * They are essential for models that cannot truncate their state to an arbitrary position: + * recurrent/hybrid architectures (e.g. Granite-4, Mamba, Jamba) and SWA models. Each + * checkpoint costs host memory proportional to the model's recurrent/SWA state size, so + * lower this value on memory-constrained machines or raise it for very long multi-turn + * (agentic tool-calling) sessions.

+ * + * @param ctxCheckpoints the maximum number of context checkpoints per slot + * @return this builder + */ + public ModelParameters setCtxCheckpoints(int ctxCheckpoints) { + return putScalar("--ctx-checkpoints", ctxCheckpoints); + } + + /** + * Set the minimum spacing between context checkpoints in tokens (default: 8192; 0 = no + * minimum). + * + *

Smaller values create checkpoints more often, improving prompt-cache reuse for + * multi-turn conversations at the cost of more host memory (bounded by + * {@link #setCtxCheckpoints(int)}). This matters most for recurrent/hybrid models + * (e.g. Granite-4), whose state can only be rolled back to a checkpoint — with sparse + * checkpoints a follow-up request may have to re-process most of the conversation.

+ * + * @param checkpointMinStep the minimum number of tokens between two checkpoints (must not be negative) + * @return this builder + */ + public ModelParameters setCheckpointMinStep(int checkpointMinStep) { + return putScalar("--checkpoint-min-step", checkpointMinStep); + } + /** * Load LoRA adapters without applying them (apply later via POST /lora-adapters). * diff --git a/llama/src/main/java/net/ladenthin/llama/server/NativeServer.java b/llama/src/main/java/net/ladenthin/llama/server/NativeServer.java index 024ac827..65caf6c8 100644 --- a/llama/src/main/java/net/ladenthin/llama/server/NativeServer.java +++ b/llama/src/main/java/net/ladenthin/llama/server/NativeServer.java @@ -5,105 +5,241 @@ package net.ladenthin.llama.server; import java.util.Objects; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicBoolean; import lombok.ToString; +import net.ladenthin.llama.loader.LlamaLoader; /** - * Scaffold for the native HTTP server bridge — the planned counterpart to - * {@link OpenAiCompatServer}. + * Runs the full upstream llama.cpp HTTP server — including its embedded + * WebUI — inside {@code libjllama}, driven over JNI, with no separate + * {@code llama-server} executable. It is the second of two server modes, the native counterpart to + * the Java-transport {@link OpenAiCompatServer}. * - *

{@link OpenAiCompatServer} implements the HTTP transport in Java (on the JDK's - * {@code com.sun.net.httpserver}) and drives the native llama.cpp server core over JNI. This - * class is instead the entry point for the upstream native HTTP transport that is already - * compiled into {@code libjllama} (llama.cpp's {@code server-http.cpp} plus its {@code cpp-httplib} - * backend). That native transport is the only component able to serve the embedded llama.cpp - * WebUI (the {@code ui.cpp}/{@code ui.h} asset table compiled in behind - * {@code LLAMA_UI_HAS_ASSETS}).

+ *

The constructor takes the raw llama-server command-line arguments and forwards them verbatim + * to the native entry point ({@code llama_server}), so every llama-server flag is supported + * ({@code -m}, {@code -c}, {@code -b}, {@code -ub}, {@code -ngl}, {@code -t}, {@code -tb}, + * {@code -ctk}, {@code -ctv}, {@code --jinja}, {@code --chat-template-kwargs}, {@code --host}, + * {@code --port}, {@code --ui}/{@code --no-ui}, …). Unlike {@link OpenAiCompatServer}, no per-flag + * Java mapping is involved.

* - *

Status: scaffold only. The route registration that upstream performs in - * {@code server.cpp} (deliberately excluded from this build) is not yet wired to a JNI entry point, so - * {@link #start()} throws {@link UnsupportedOperationException} for now. This class only fixes the - * package structure and the public API shape; the native {@code startServer}/{@code stopServer} - * methods, their C++ implementation, the server lifecycle/threading and WebUI serving are a separate, - * detailed step (see {@code CLAUDE.md}, "WebUI (llama.cpp Svelte UI) embedding").

+ *

Independent lifecycle. {@code NativeServer} loads its own model from + * the forwarded arguments — exactly like running {@code llama-server.exe} — and is unrelated to any + * {@code net.ladenthin.llama.LlamaModel} you may also have open. Reusing an already-loaded + * {@code LlamaModel}'s context instead of loading a second copy is a possible future enhancement + * (see {@code TODO.md}). While the native server runs it owns the process-wide llama backend and + * routes llama.cpp logging to stderr/file (llama-server's own logging), not the JNI log callback.

* - *

It is {@link AutoCloseable} so that, once implemented, callers can drive it with - * try-with-resources exactly like {@link OpenAiCompatServer}.

+ *

Single instance per process. The upstream server keeps its shutdown state in + * file-scope globals, so only one {@code NativeServer} may run at a time; {@link #start()} throws if + * another instance is already running.

+ * + *

Typical use:

+ *
{@code
+ * try (NativeServer server = new NativeServer(
+ *         "-m", "models/model.gguf", "--host", "127.0.0.1", "--port", "8080", "-c", "65536").start()) {
+ *     // Server (and WebUI at http://127.0.0.1:8080/) runs on a native worker thread.
+ *     // Readiness: poll GET /health until it returns {"status":"ok"}.
+ *     Thread.currentThread().join();
+ * }
+ * }
+ * + *

Platform note. The native methods are compiled into {@code libjllama} on all + * platforms except Android (the upstream server pulls in {@code posix_spawn_*}, unavailable there); + * on Android use {@link OpenAiCompatServer}. No SSL: the embedded server is plain HTTP — bind + * to localhost or front it with a TLS proxy.

*/ @ToString public final class NativeServer implements AutoCloseable { - /** Message thrown by {@link #start()} until the native route-wiring lands. */ - static final String NOT_WIRED_MESSAGE = - "NativeServer is a scaffold: the upstream native HTTP routes (server-http.cpp) are " - + "not yet wired to JNI. Use OpenAiCompatServer for now; the native server and " - + "embedded WebUI are a planned step."; + /** Guards the process-wide single-instance invariant (upstream uses file-scope globals). */ + private static final AtomicBoolean RUNNING = new AtomicBoolean(false); + + /** Default bind host reported by {@link #getHost()} when {@code --host} is not passed. */ + private static final String DEFAULT_HOST = "127.0.0.1"; + + /** Default port reported by {@link #getPort()} when no port flag is passed. */ + private static final int DEFAULT_PORT = 8080; + + /** The llama-server argument vector, forwarded verbatim to the native entry point. */ + private final String[] args; - /** Immutable server configuration (bind host, port, ...) shared with {@link OpenAiCompatServer}. */ - private final OpenAiServerConfig config; + /** Native handle (pointer) while running, or {@code 0} when not started / stopped. */ + private volatile long handle; /** - * Creates a native-server bridge for the given configuration. + * Creates a native-server bridge for the given llama-server arguments. * - *

Construction performs no native work and binds no socket; it only captures the configuration. - * Call {@link #start()} to launch the server (not implemented yet).

+ *

Construction performs no native work and binds no socket; it only captures the arguments. + * Call {@link #start()} to launch the server.

* - * @param config the server configuration (host, port, ...); must not be {@code null} + * @param args the llama-server command-line arguments (e.g. {@code "-m", "model.gguf", + * "--port", "8080"}); must not be {@code null} and must not contain {@code null} + * elements */ - public NativeServer(OpenAiServerConfig config) { - this.config = Objects.requireNonNull(config, "config"); + public NativeServer(String... args) { + Objects.requireNonNull(args, "args"); + for (final String arg : args) { + Objects.requireNonNull(arg, "args element"); + } + this.args = args.clone(); } /** - * Starts the native HTTP server and begins serving the embedded WebUI. + * Starts the native HTTP server (and its embedded WebUI) on a background thread and returns + * immediately. The server binds and begins serving {@code GET /health} before the model finishes + * loading; poll {@code /health} for readiness. * - *

Not implemented yet — this is a scaffold. The native route registration and - * its JNI binding are a planned step, so this method always throws until then.

- * - * @return this server instance (for fluent / try-with-resources use), once implemented - * @throws UnsupportedOperationException always, until the native routes are wired to JNI + * @return this server instance (for fluent / try-with-resources use) + * @throws IllegalStateException if this instance was already started, or another + * {@code NativeServer} is already running in this process */ - // Scaffold: start() intentionally always throws for now, but must stay callable (not @DoNotCall) - // so the real implementation and its callers/tests keep the same signature. - @SuppressWarnings("DoNotCallSuggester") public NativeServer start() { - throw new UnsupportedOperationException(NOT_WIRED_MESSAGE); + if (handle != 0) { + throw new IllegalStateException("NativeServer already started"); + } + if (!RUNNING.compareAndSet(false, true)) { + throw new IllegalStateException( + "another NativeServer is already running in this process (only one is supported)"); + } + try { + // Load libjllama lazily here (not in a static initializer) so construction, argument + // parsing and close() stay usable — and unit-testable — without the native library. + LlamaLoader.initialize(); + handle = startNativeServer(args); + } catch (final RuntimeException | Error e) { + RUNNING.set(false); + throw e; + } + return this; } /** - * Reports whether the native server is currently running. + * Reports whether the native server worker is currently running. + * + *

Note: this becomes {@code true} as soon as the worker thread starts, which may be before the + * socket is accepting connections — use {@code GET /health} to detect readiness.

* - * @return {@code false} — the scaffold never starts a server yet + * @return {@code true} if the server has been started and its worker has not yet exited */ public boolean isRunning() { - return false; + final long h = handle; + return h != 0 && isRunningNative(h); } /** - * Returns the host the server is configured to bind to. + * Returns the bind host parsed from the arguments ({@code --host}), or {@code 127.0.0.1} when + * absent. Best-effort convenience for logging; the authoritative value is what the native server + * parsed. * * @return the configured bind host */ public String getHost() { - return config.getHost(); + for (int i = 0; i < args.length - 1; i++) { + if ("--host".equals(args[i])) { + return args[i + 1]; + } + } + return DEFAULT_HOST; } /** - * Returns the port the server is configured to bind to. + * Returns the port parsed from the arguments ({@code --port} / {@code -p}), or {@code 8080} when + * absent or unparseable. Best-effort convenience for logging. * * @return the configured port */ public int getPort() { - return config.getPort(); + for (int i = 0; i < args.length - 1; i++) { + if ("--port".equals(args[i]) || "-p".equals(args[i])) { + try { + return Integer.parseInt(args[i + 1].trim()); + } catch (final NumberFormatException e) { + return DEFAULT_PORT; + } + } + } + return DEFAULT_PORT; } /** - * Stops the native server if it is running. - * - *

No-op in the scaffold (nothing is ever started), so it is always safe to call, including from - * try-with-resources. Real lifecycle teardown is part of the planned native-server implementation.

+ * Stops the native server if it is running and releases the native handle. Blocks until the + * server has fully shut down. Safe to call more than once and from try-with-resources even if + * {@link #start()} was never called (no-op then). */ @Override public void close() { - // Nothing is started yet, so there is nothing to release. + final long h = handle; + if (h == 0) { + return; + } + handle = 0; + try { + stopNativeServer(h); + } finally { + RUNNING.set(false); + } } + + /** + * Fat-jar entry point (the assembly JAR's {@code Main-Class}): starts the full native llama.cpp + * server — WebUI included — forwarding every argument to it verbatim, and blocks until the + * server exits or the JVM is asked to shut down (Ctrl-C / SIGTERM), stopping the server cleanly + * on the way out. + * + *

This is the default runnable server. The Java-transport {@link OpenAiCompatServer} remains + * available via its own {@code main} — run it explicitly with + * {@code java -cp net.ladenthin.llama.server.OpenAiCompatServer …}.

+ * + * @param args the llama-server command-line arguments, forwarded verbatim (e.g. {@code -m + * model.gguf --host 127.0.0.1 --port 8080}); pass {@code --help} for the full + * llama-server option list + * @throws InterruptedException if interrupted while waiting for the server to exit + */ + public static void main(String[] args) throws InterruptedException { + // Own the server in a try/finally so close() is guaranteed on normal or exceptional exit of + // the block (satisfies S2095 via the "close in a finally clause" option — try-with-resources + // is not used because the shutdown hook must also call close() explicitly, which javac flags + // under -Werror as an "explicit call to close() on an auto-closeable resource"). close() is + // idempotent (guards on a zero handle), so the finally and the hook both firing is safe. + final NativeServer server = new NativeServer(args); + try { + // Signalled by the shutdown hook so the main thread wakes immediately on Ctrl-C / SIGTERM + // rather than waiting out a poll tick — and so the wait uses a bounded latch await instead + // of Thread.sleep (banned by LlamaArchitectureTest.noThreadSleep). + final CountDownLatch stopSignal = new CountDownLatch(1); + // Graceful Ctrl-C / SIGTERM: the embedded server installs no signal handlers of its own + // (see patches/0006), so the JVM-level shutdown hook is what stops it before exit. + Runtime.getRuntime() + .addShutdownHook(new Thread( + () -> { + server.close(); + stopSignal.countDown(); + }, + "jllama-native-server-shutdown")); + server.start(); + // Keep the JVM alive until the native worker exits — on its own (e.g. a fatal startup/model + // error that llama_server has already logged) or because the shutdown hook stopped it. The + // bounded await returns early when the hook fires; on timeout we re-check isRunning() to + // catch a self-terminated worker. 
+ while (server.isRunning() && !stopSignal.await(200L, TimeUnit.MILLISECONDS)) { + // wait for the native worker to exit or the shutdown hook to fire + } + } finally { + server.close(); + } + } + + /** + * Starts the native server on a worker thread and returns an opaque handle. The argv is + * forwarded verbatim (with a synthetic {@code argv[0]}). + */ + private static native long startNativeServer(String[] args); + + /** Signals shutdown, joins the worker thread, and frees the handle. */ + private static native void stopNativeServer(long handle); + + /** Whether the worker thread for the given handle is still running. */ + private static native boolean isRunningNative(long handle); } diff --git a/llama/src/main/java/net/ladenthin/llama/server/OpenAiCompatServer.java b/llama/src/main/java/net/ladenthin/llama/server/OpenAiCompatServer.java index b93e7766..bdd9befd 100644 --- a/llama/src/main/java/net/ladenthin/llama/server/OpenAiCompatServer.java +++ b/llama/src/main/java/net/ladenthin/llama/server/OpenAiCompatServer.java @@ -748,7 +748,7 @@ private void handleModels(HttpExchange exchange) throws IOException { sendError(exchange, HTTP_UNAUTHORIZED, ERROR_TYPE_REQUEST, "Missing or invalid API key"); return; } - sendJson(exchange, HTTP_OK, OpenAiSseFormatter.modelsJson(config.getModelId())); + sendJson(exchange, HTTP_OK, OpenAiSseFormatter.modelsJson(config.getModelId(), config.getModelFtype())); } finally { exchange.close(); } @@ -1064,7 +1064,7 @@ public static void main(String[] args) throws IOException { "jllama-openai-shutdown")); try (LlamaModel model = new LlamaModel(options.toModelParameters())) { - OpenAiServerConfig config = options.toServerConfig(model.supportsVision()); + OpenAiServerConfig config = options.toServerConfig(model.supportsVision(), model.getModelFtype()); try (OpenAiCompatServer server = new OpenAiCompatServer(model, config)) { server.start(); printReady(config, server.getPort()); diff --git 
a/llama/src/main/java/net/ladenthin/llama/server/OpenAiServerCli.java b/llama/src/main/java/net/ladenthin/llama/server/OpenAiServerCli.java index f32728b8..3e04d5fe 100644 --- a/llama/src/main/java/net/ladenthin/llama/server/OpenAiServerCli.java +++ b/llama/src/main/java/net/ladenthin/llama/server/OpenAiServerCli.java @@ -4,8 +4,15 @@ package net.ladenthin.llama.server; +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; import java.nio.file.Path; import java.nio.file.Paths; +import java.util.Collections; +import java.util.LinkedHashMap; +import java.util.Map; +import net.ladenthin.llama.args.CacheType; import net.ladenthin.llama.parameters.ModelParameters; import org.jspecify.annotations.Nullable; @@ -19,8 +26,11 @@ * via {@link #isHelpRequested(String[])} so callers can print help without it being treated as an error. * *

Flags mirror llama.cpp's own server where they overlap ({@code -m}, {@code -p}, {@code -c}, - * {@code -ngl}, {@code -t}); a few legacy spellings are accepted as aliases so earlier documented - * invocations keep working. + * {@code -b}, {@code -ub}, {@code -ngl}, {@code -t}, {@code -tb}, {@code -ctk}, {@code -ctv}, + * {@code --jinja}, {@code --chat-template-kwargs}); a few legacy spellings are accepted as aliases so + * earlier documented invocations keep working. The {@code --chat-template-kwargs} JSON is parsed here + * (the only JSON this otherwise dependency-light parser touches) so a malformed object fails fast with + * usage text rather than at native model load. */ public final class OpenAiServerCli { @@ -65,7 +75,14 @@ public static Options parse(String... args) { int ctxSize = 0; int gpuLayers = 0; int threads = 0; + int threadsBatch = 0; int parallel = 0; + int batchSize = 0; + int ubatchSize = 0; + @Nullable CacheType cacheTypeK = null; + @Nullable CacheType cacheTypeV = null; + boolean jinja = false; + @Nullable Map chatTemplateKwargs = null; boolean embedding = false; boolean reranking = false; @@ -97,6 +114,32 @@ public static Options parse(String... args) { case "--threads": threads = intValue(args, ++i, arg); break; + case "-tb": + case "--threads-batch": + threadsBatch = intValue(args, ++i, arg); + break; + case "-b": + case "--batch-size": + batchSize = intValue(args, ++i, arg); + break; + case "-ub": + case "--ubatch-size": + ubatchSize = intValue(args, ++i, arg); + break; + case "-ctk": + case "--cache-type-k": + cacheTypeK = cacheTypeValue(args, ++i, arg); + break; + case "-ctv": + case "--cache-type-v": + cacheTypeV = cacheTypeValue(args, ++i, arg); + break; + case "--jinja": + jinja = true; + break; + case "--chat-template-kwargs": + chatTemplateKwargs = parseChatTemplateKwargs(nextValue(args, ++i, arg), arg); + break; case "--parallel": parallel = intValue(args, ++i, arg); break; @@ -131,7 +174,24 @@ public static Options parse(String... 
args) { throw error("Missing required argument: -m/--model "); } return new Options( - host, port, modelPath, modelId, apiKey, mmproj, ctxSize, gpuLayers, threads, parallel, embedding, + host, + port, + modelPath, + modelId, + apiKey, + mmproj, + ctxSize, + gpuLayers, + threads, + threadsBatch, + parallel, + batchSize, + ubatchSize, + cacheTypeK, + cacheTypeV, + jinja, + chatTemplateKwargs, + embedding, reranking); } @@ -155,8 +215,16 @@ public static String usage() { " --host Interface to bind (default: " + DEFAULT_HOST + ")", " -p, --port TCP port to listen on (default: " + DEFAULT_PORT + ")", " -c, --ctx-size Context window size (default: llama.cpp default)", + " -b, --batch-size Logical (prompt) batch size (default: llama.cpp default)", + " -ub, --ubatch-size Physical (micro) batch size (default: llama.cpp default)", " -ngl,--n-gpu-layers Layers to offload to GPU (default: 0 = CPU only)", " -t, --threads Inference thread count (default: llama.cpp default)", + " -tb, --threads-batch Thread count for batch/prompt processing (default: same as -t)", + " -ctk,--cache-type-k KV cache K quantization: " + cacheTypeChoices() + " (default: f16)", + " -ctv,--cache-type-v KV cache V quantization: " + cacheTypeChoices() + " (default: f16)", + " --jinja Use the model's Jinja chat template", + " --chat-template-kwargs JSON object of chat-template variables (requires --jinja),", + " e.g. {\"reasoning_effort\":\"low\"}", " --parallel Parallel inference slots (default: llama.cpp default)", " --model-id Model id reported by /v1/models (default: file name)", " --api-key Require an 'Authorization: Bearer ' header", @@ -191,6 +259,53 @@ private static int intValue(String[] args, int valueIndex, String flag) { } } + /** Reusable parser for the {@code --chat-template-kwargs} JSON object; no state, thread-safe. 
*/ + private static final ObjectMapper CHAT_TEMPLATE_KWARGS_MAPPER = new ObjectMapper(); + + private static CacheType cacheTypeValue(String[] args, int valueIndex, String flag) { + final String raw = nextValue(args, valueIndex, flag).trim(); + for (final CacheType type : CacheType.values()) { + if (type.getArgValue().equalsIgnoreCase(raw)) { + return type; + } + } + throw error(flag + " expects one of " + cacheTypeChoices() + ", got: " + raw); + } + + private static String cacheTypeChoices() { + final StringBuilder sb = new StringBuilder(); + for (final CacheType type : CacheType.values()) { + if (sb.length() > 0) { + sb.append(", "); + } + sb.append(type.getArgValue()); + } + return sb.toString(); + } + + /** + * Parse a {@code --chat-template-kwargs} JSON object into the raw-per-value map that + * {@link ModelParameters#setChatTemplateKwargs(Map)} expects: each entry's value is kept as its + * raw JSON text (a string stays quoted, a boolean/number stays bare), so the object is + * reconstructed verbatim for the native flag. Insertion order is preserved. + */ + private static Map parseChatTemplateKwargs(String json, String flag) { + final JsonNode root; + try { + root = CHAT_TEMPLATE_KWARGS_MAPPER.readTree(json); + } catch (JsonProcessingException e) { + throw error(flag + " expects a JSON object (e.g. {\"reasoning_effort\":\"low\"}), got: " + json, e); + } + if (root == null || !root.isObject()) { + throw error(flag + " expects a JSON object (e.g. {\"reasoning_effort\":\"low\"}), got: " + json); + } + final Map kwargs = new LinkedHashMap<>(); + for (final Map.Entry field : root.properties()) { + kwargs.put(field.getKey(), field.getValue().toString()); + } + return Collections.unmodifiableMap(kwargs); + } + private static IllegalArgumentException error(String message) { return error(message, null); } @@ -200,10 +315,12 @@ private static IllegalArgumentException error(String message, @Nullable Throwabl } /** - * Immutable, parsed launcher options. 
{@code ctxSize}, {@code threads} and {@code parallel} use - * {@code 0} as a sentinel meaning "leave the llama.cpp default" — they are only applied to - * {@link ModelParameters} when positive. {@code gpuLayers} is always applied (its own default of - * {@code 0} already means CPU-only). + * Immutable, parsed launcher options. The integer tuning knobs — {@code ctxSize}, + * {@code threads}, {@code threadsBatch}, {@code parallel}, {@code batchSize} and + * {@code ubatchSize} — use {@code 0} as a sentinel meaning "leave the llama.cpp default", and are + * only applied to {@link ModelParameters} when positive. {@code cacheTypeK}/{@code cacheTypeV} + * and {@code chatTemplateKwargs} use {@code null} as the same "leave the default" sentinel. + * {@code gpuLayers} is always applied (its own default of {@code 0} already means CPU-only). */ public static final class Options { @@ -216,7 +333,14 @@ public static final class Options { private final int ctxSize; private final int gpuLayers; private final int threads; + private final int threadsBatch; private final int parallel; + private final int batchSize; + private final int ubatchSize; + private final @Nullable CacheType cacheTypeK; + private final @Nullable CacheType cacheTypeV; + private final boolean jinja; + private final @Nullable Map chatTemplateKwargs; private final boolean embedding; private final boolean reranking; @@ -230,7 +354,14 @@ private Options( int ctxSize, int gpuLayers, int threads, + int threadsBatch, int parallel, + int batchSize, + int ubatchSize, + @Nullable CacheType cacheTypeK, + @Nullable CacheType cacheTypeV, + boolean jinja, + @Nullable Map chatTemplateKwargs, boolean embedding, boolean reranking) { this.host = host; @@ -242,7 +373,14 @@ private Options( this.ctxSize = ctxSize; this.gpuLayers = gpuLayers; this.threads = threads; + this.threadsBatch = threadsBatch; this.parallel = parallel; + this.batchSize = batchSize; + this.ubatchSize = ubatchSize; + this.cacheTypeK = cacheTypeK; + 
this.cacheTypeV = cacheTypeV; + this.jinja = jinja; + this.chatTemplateKwargs = chatTemplateKwargs; this.embedding = embedding; this.reranking = reranking; } @@ -341,6 +479,72 @@ public int getParallel() { return parallel; } + /** + * The batch/prompt-processing thread count, or {@code 0} for the llama.cpp default (same as + * {@link #getThreads()}). + * + * @return the batch thread count + */ + public int getThreadsBatch() { + return threadsBatch; + } + + /** + * The logical (prompt) batch size, or {@code 0} for the llama.cpp default. + * + * @return the batch size + */ + public int getBatchSize() { + return batchSize; + } + + /** + * The physical (micro) batch size, or {@code 0} for the llama.cpp default. + * + * @return the micro-batch size + */ + public int getUbatchSize() { + return ubatchSize; + } + + /** + * The KV cache K quantization type, or {@code null} for the llama.cpp default. + * + * @return the K cache type, or {@code null} when unset + */ + public @Nullable CacheType getCacheTypeK() { + return cacheTypeK; + } + + /** + * The KV cache V quantization type, or {@code null} for the llama.cpp default. + * + * @return the V cache type, or {@code null} when unset + */ + public @Nullable CacheType getCacheTypeV() { + return cacheTypeV; + } + + /** + * Whether the model's Jinja chat template is enabled. + * + * @return {@code true} if {@code --jinja} was requested + */ + public boolean isJinja() { + return jinja; + } + + /** + * The parsed {@code --chat-template-kwargs} as a raw-per-value map (see + * {@link ModelParameters#setChatTemplateKwargs(Map)}), or {@code null} when unset. The map is + * unmodifiable. + * + * @return the chat-template variables, or {@code null} when unset + */ + public @Nullable Map getChatTemplateKwargs() { + return chatTemplateKwargs; + } + /** * Whether to load the model in embedding mode. 
* @@ -376,9 +580,30 @@ public ModelParameters toModelParameters() { if (threads > 0) { params.setThreads(threads); } + if (threadsBatch > 0) { + params.setThreadsBatch(threadsBatch); + } if (parallel > 0) { params.setParallel(parallel); } + if (batchSize > 0) { + params.setBatchSize(batchSize); + } + if (ubatchSize > 0) { + params.setUbatchSize(ubatchSize); + } + if (cacheTypeK != null) { + params.setCacheTypeK(cacheTypeK); + } + if (cacheTypeV != null) { + params.setCacheTypeV(cacheTypeV); + } + if (jinja) { + params.enableJinja(); + } + if (chatTemplateKwargs != null) { + params.setChatTemplateKwargs(chatTemplateKwargs); + } if (embedding) { params.enableEmbedding(); } @@ -395,7 +620,7 @@ public ModelParameters toModelParameters() { * @return the server configuration */ public OpenAiServerConfig toServerConfig() { - return toServerConfig(mmproj != null); + return toServerConfig(mmproj != null, ""); } /** @@ -407,11 +632,25 @@ public OpenAiServerConfig toServerConfig() { * @return the server configuration */ public OpenAiServerConfig toServerConfig(boolean supportsVision) { + return toServerConfig(supportsVision, ""); + } + + /** + * Build the server configuration with capability + metadata values obtained from the loaded + * model. This overload lets the standalone launcher advertise the model's quantization file + * type in {@code /v1/models} alongside the vision capability. 
+ * + * @param supportsVision whether the loaded model reports usable vision input + * @param modelFtype the model's file-type (quantization) label, or {@code ""} if unknown + * @return the server configuration + */ + public OpenAiServerConfig toServerConfig(boolean supportsVision, String modelFtype) { final OpenAiServerConfig.Builder builder = OpenAiServerConfig.builder() .host(host) .port(port) .modelId(getModelId()) - .supportsVision(supportsVision); + .supportsVision(supportsVision) + .modelFtype(modelFtype); if (apiKey != null) { builder.apiKey(apiKey); } diff --git a/llama/src/main/java/net/ladenthin/llama/server/OpenAiServerConfig.java b/llama/src/main/java/net/ladenthin/llama/server/OpenAiServerConfig.java index e6d694e9..7f9c3701 100644 --- a/llama/src/main/java/net/ladenthin/llama/server/OpenAiServerConfig.java +++ b/llama/src/main/java/net/ladenthin/llama/server/OpenAiServerConfig.java @@ -55,6 +55,7 @@ public final class OpenAiServerConfig { private final String corsAllowOrigin; private final boolean supportsVision; private final int maxRequestBodyBytes; + private final String modelFtype; private OpenAiServerConfig(Builder builder) { this.host = builder.host; @@ -67,6 +68,7 @@ private OpenAiServerConfig(Builder builder) { this.corsAllowOrigin = builder.corsAllowOrigin; this.supportsVision = builder.supportsVision; this.maxRequestBodyBytes = builder.maxRequestBodyBytes; + this.modelFtype = builder.modelFtype; } /** @@ -169,6 +171,17 @@ public boolean isSupportsVision() { return supportsVision; } + /** + * The served model's file type (quantization) as a human-readable string, e.g. {@code "Q8_0"} + * or {@code "Q4_K - Medium"}, advertised in the {@code GET /v1/models} {@code data[].ftype} field + * (matching the upstream llama.cpp server). Empty when unknown. 
+ * + * @return the quantization file-type label, or {@code ""} if unknown + */ + public String getModelFtype() { + return modelFtype; + } + /** * Whether bearer-token authentication is enabled (an API key is configured). * @@ -217,6 +230,7 @@ public static final class Builder { private String corsAllowOrigin = DEFAULT_CORS_ALLOW_ORIGIN; private boolean supportsVision; private int maxRequestBodyBytes = DEFAULT_MAX_REQUEST_BODY_BYTES; + private String modelFtype = ""; private Builder() {} @@ -319,6 +333,18 @@ public Builder supportsVision(boolean supportsVision) { return this; } + /** + * Sets the served model's file type (quantization) label to advertise in {@code /v1/models}. + * + * @param modelFtype the quantization file-type label (e.g. {@code "Q4_K - Medium"}); {@code null} + * is treated as empty (unknown) + * @return this builder + */ + public Builder modelFtype(@Nullable String modelFtype) { + this.modelFtype = modelFtype == null ? "" : modelFtype; + return this; + } + /** * Sets the maximum accepted request-body size in bytes. Bodies larger than this are rejected * with HTTP 413 before being buffered. diff --git a/llama/src/main/java/net/ladenthin/llama/server/OpenAiSseFormatter.java b/llama/src/main/java/net/ladenthin/llama/server/OpenAiSseFormatter.java index f87c2599..764475be 100644 --- a/llama/src/main/java/net/ladenthin/llama/server/OpenAiSseFormatter.java +++ b/llama/src/main/java/net/ladenthin/llama/server/OpenAiSseFormatter.java @@ -122,10 +122,26 @@ static String ensureUsageCachedTokens(String chunkJson) { * @return an OpenAI model-list object serialized as JSON */ static String modelsJson(String modelId) { + return modelsJson(modelId, ""); + } + + /** + * Build the {@code GET /v1/models} body advertising a single model, including the model's file + * type (quantization) as a {@code data[].ftype} field when known — mirroring the upstream + * llama.cpp server's {@code get_model_info()}. 
+ * + * @param modelId the model id to advertise + * @param ftype the model's file-type (quantization) label, or {@code ""}/{@code null} to omit it + * @return an OpenAI model-list object serialized as JSON + */ + static String modelsJson(String modelId, @Nullable String ftype) { ObjectNode model = OBJECT_MAPPER.createObjectNode(); model.put("id", modelId); model.put("object", "model"); model.put("owned_by", "llama.cpp"); + if (ftype != null && !ftype.isEmpty()) { + model.put("ftype", ftype); + } ArrayNode data = OBJECT_MAPPER.createArrayNode(); data.add(model); ObjectNode root = OBJECT_MAPPER.createObjectNode(); diff --git a/llama/src/main/java/net/ladenthin/llama/server/ServerLauncher.java b/llama/src/main/java/net/ladenthin/llama/server/ServerLauncher.java new file mode 100644 index 00000000..964ba4e4 --- /dev/null +++ b/llama/src/main/java/net/ladenthin/llama/server/ServerLauncher.java @@ -0,0 +1,80 @@ +// SPDX-FileCopyrightText: 2026 Bernard Ladenthin +// +// SPDX-License-Identifier: MIT + +package net.ladenthin.llama.server; + +import java.util.ArrayList; +import java.util.List; + +/** + * Fat-jar entry point that dispatches to one of the two server modes based on a single selector + * flag. With {@value #OPENAI_COMPAT_FLAG} present it runs {@link OpenAiCompatServer} (the + * Java-transport, OpenAI-compatible JSON API); without it, {@link NativeServer} (the full native + * llama.cpp server with embedded WebUI, the default). + * + *

The dispatch uses a single primitive, {@link #withoutFlag(String[], String)}: it strips the + * selector from the arguments (the flag is not a llama.cpp flag, and {@code llama_server} rejects + * unknown flags), and the mode is chosen purely by whether that shortened the list — present iff the + * result is smaller. Every other argument is forwarded verbatim.

+ * + *

Flag sets differ. {@link NativeServer} forwards every llama-server + * flag to {@code llama_server}, whereas {@link OpenAiCompatServer}'s CLI ({@link OpenAiServerCli}) + * accepts a curated subset and rejects unknown flags — so native-only flags (e.g. {@code --ui}, + * {@code -fa}) cannot be combined with {@value #OPENAI_COMPAT_FLAG}.

+ * + *

Both underlying mains remain directly runnable by class name via {@code java -cp}; this + * launcher is purely a convenience so a single {@code java -jar} covers both.

+ */ +public final class ServerLauncher { + + /** + * Selector flag: when present, run {@link OpenAiCompatServer} instead of the default + * {@link NativeServer}. + * + *

Namespaced with the {@code jllama} prefix (this project's native-library name) so it can + * never collide with a current or future llama.cpp / llama-server flag — upstream owns the + * {@code --*} space, this launcher owns {@code --jllama-*}. The launcher strips it before + * forwarding, so it never reaches {@code llama_server} (which rejects unknown flags).

+ */ + public static final String OPENAI_COMPAT_FLAG = "--jllama-openai-compat"; + + private ServerLauncher() {} + + /** + * Dispatches to {@link OpenAiCompatServer#main(String[])} when {@value #OPENAI_COMPAT_FLAG} is + * present (with that marker removed), otherwise to {@link NativeServer#main(String[])} with all + * arguments forwarded unchanged. Selection is derived from whether stripping the flag shortened + * the argument list. + * + * @param args the process arguments + * @throws Exception if the selected server's {@code main} throws (it blocks until shutdown) + */ + public static void main(String[] args) throws Exception { + final String[] forwarded = withoutFlag(args, OPENAI_COMPAT_FLAG); + if (forwarded.length != args.length) { + OpenAiCompatServer.main(forwarded); + } else { + NativeServer.main(args); + } + } + + /** + * Returns a copy of {@code args} with every occurrence of {@code flag} removed, preserving the + * order of the remaining arguments. The result is shorter than {@code args} exactly when + * {@code flag} was present — which is how {@link #main(String[])} selects the server mode. + * + * @param args the arguments + * @param flag the flag token to strip + * @return a new array without {@code flag} + */ + static String[] withoutFlag(String[] args, String flag) { + final List filtered = new ArrayList<>(args.length); + for (final String arg : args) { + if (!flag.equals(arg)) { + filtered.add(arg); + } + } + return filtered.toArray(new String[0]); + } +} diff --git a/llama/src/main/java/net/ladenthin/llama/value/ModelMeta.java b/llama/src/main/java/net/ladenthin/llama/value/ModelMeta.java index 91059e71..6f6f8975 100644 --- a/llama/src/main/java/net/ladenthin/llama/value/ModelMeta.java +++ b/llama/src/main/java/net/ladenthin/llama/value/ModelMeta.java @@ -129,6 +129,18 @@ public String getModelName() { return node.path("name").asText(""); } + /** + * The model file type (quantization) as a human-readable string, e.g. 
{@code "Q8_0"} or + * {@code "Q4_K - Medium"}, from the GGUF {@code general.file_type} the model was loaded with + * (llama.cpp {@code llama_ftype_name}). A guessed type is prefixed with {@code "(guessed) "}. + * Returns an empty string if the native layer does not report it (older native builds). + * + * @return the quantization file-type label, or {@code ""} if absent + */ + public String getFtype() { + return node.path("ftype").asText(""); + } + /** * The model's resolved default chat template (Jinja), from GGUF * {@code tokenizer.chat_template} metadata. diff --git a/llama/src/test/cpp/test_server.cpp b/llama/src/test/cpp/test_server.cpp index 546b618b..5ff9d894 100644 --- a/llama/src/test/cpp/test_server.cpp +++ b/llama/src/test/cpp/test_server.cpp @@ -1732,6 +1732,25 @@ TEST(ParamsFromJsonCmpl, SimpleFields_RoundTrip) { EXPECT_EQ(p.n_predict, 128); } +// b9864: per-request sse_ping_interval overrides the server --sse-ping-interval setting; -1 disables +// pings. Pins that the JSON key emitted by InferenceParameters.withSsePingInterval is honored by the +// schema (field_num with set_hard_limits(-1, INT32_MAX)). +TEST(ParamsFromJsonCmpl, SsePingInterval_RoundTrip) { + EXPECT_EQ(parse_params({{"sse_ping_interval", 5}}).sse_ping_interval, 5); + EXPECT_EQ(parse_params({{"sse_ping_interval", -1}}).sse_ping_interval, -1); // -1 disables pings +} + +TEST(ParamsFromJsonCmpl, SsePingInterval_BelowHardLimit_Throws) { + // hard lower bound is -1; anything below throws (wrapped in std::invalid_argument by the schema). + EXPECT_THROW(parse_params({{"sse_ping_interval", -2}}), std::invalid_argument); +} + +TEST(ParamsFromJsonCmpl, SsePingInterval_Absent_InheritsServerSetting) { + // When omitted from the request, the value inherits params_base.sse_ping_interval (the server setting). 
+ common_params defaults; + EXPECT_EQ(parse_params({}).sse_ping_interval, defaults.sse_ping_interval); +} + TEST(ParamsFromJsonCmpl, RepeatLastN_MinusOne_ExpandsToNCtxSlot) { const auto p = parse_params({{"repeat_last_n", -1}}, /*n_ctx=*/256); EXPECT_EQ(p.sampling.penalty_last_n, 256); diff --git a/llama/src/test/java/net/ladenthin/llama/LlamaArchitectureTest.java b/llama/src/test/java/net/ladenthin/llama/LlamaArchitectureTest.java index 28897996..a3100e57 100644 --- a/llama/src/test/java/net/ladenthin/llama/LlamaArchitectureTest.java +++ b/llama/src/test/java/net/ladenthin/llama/LlamaArchitectureTest.java @@ -94,8 +94,10 @@ public class LlamaArchitectureTest { * intend it. Conceptual tiers (informational): {@code Server} > {@code Api} (root) > * {@code Loader} > {@code Json}/{@code Parameters} > * {@code Value}/{@code Callback}/{@code Exception}/{@code Args}. The {@code Server} layer is the - * optional OpenAI-compatible HTTP entry point; it is the only layer permitted to access the - * {@code Api} root. + * optional OpenAI-compatible HTTP / native-server entry point; it is the only layer permitted to + * access the {@code Api} root, and it also reaches the {@code Loader} ({@code NativeServer} + * triggers {@code LlamaLoader.initialize()} before starting the embedded native server) and the + * {@code Args} enums ({@code OpenAiServerCli} maps {@code -ctk}/{@code -ctv} to {@code CacheType}). 
*/ @ArchTest static final ArchRule layeredArchitecture = layeredArchitecture() @@ -121,7 +123,7 @@ public class LlamaArchitectureTest { .whereLayer("Api") .mayOnlyBeAccessedByLayers("Server") .whereLayer("Loader") - .mayOnlyBeAccessedByLayers("Api") + .mayOnlyBeAccessedByLayers("Api", "Server") .whereLayer("Json") .mayOnlyBeAccessedByLayers("Api") .whereLayer("Parameters") @@ -133,7 +135,7 @@ public class LlamaArchitectureTest { .whereLayer("Exception") .mayOnlyBeAccessedByLayers("Api", "Loader") .whereLayer("Args") - .mayOnlyBeAccessedByLayers("Api", "Loader", "Parameters") + .mayOnlyBeAccessedByLayers("Api", "Loader", "Parameters", "Server") .whereLayer("Server") .mayNotBeAccessedByAnyLayer(); diff --git a/llama/src/test/java/net/ladenthin/llama/parameters/InferenceParametersTest.java b/llama/src/test/java/net/ladenthin/llama/parameters/InferenceParametersTest.java index 0faa2626..e0ed445c 100644 --- a/llama/src/test/java/net/ladenthin/llama/parameters/InferenceParametersTest.java +++ b/llama/src/test/java/net/ladenthin/llama/parameters/InferenceParametersTest.java @@ -73,6 +73,36 @@ public void testSetNPredict() { assertThat(params.parameters.get("n_predict"), is("42")); } + @Test + public void testSetSsePingInterval() { + InferenceParameters params = new InferenceParameters("").withSsePingInterval(1); + assertThat(params.parameters.get("sse_ping_interval"), is("1")); + // -1 disables pings and must be accepted (the schema's hard lower bound is -1). + assertThat( + InferenceParameters.empty().withSsePingInterval(-1).parameters.get("sse_ping_interval"), is("-1")); + } + + @Test + public void testAdditionalCompletionScalarsFromB9864Audit() { + // Plain scalars honored by eval_llama_cmpl_schema but previously not surfaced as withers. 
+ assertThat( + InferenceParameters.empty().withXtcProbability(0.5f).parameters.get("xtc_probability"), is("0.5")); + assertThat(InferenceParameters.empty().withXtcThreshold(0.1f).parameters.get("xtc_threshold"), is("0.1")); + assertThat(InferenceParameters.empty().withNDiscard(64).parameters.get("n_discard"), is("64")); + assertThat(InferenceParameters.empty().withNIndent(4).parameters.get("n_indent"), is("4")); + assertThat( + InferenceParameters.empty().withTMaxPredictMs(2000).parameters.get("t_max_predict_ms"), is("2000")); + assertThat( + InferenceParameters.empty() + .withPostSamplingProbs(true) + .parameters + .get("post_sampling_probs"), + is("true")); + assertThat( + InferenceParameters.empty().withTimingsPerToken(true).parameters.get("timings_per_token"), is("true")); + assertThat(InferenceParameters.empty().withReturnTokens(true).parameters.get("return_tokens"), is("true")); + } + @Test public void testSetCacheReuse() { InferenceParameters params = InferenceParameters.empty().withCacheReuse(256); diff --git a/llama/src/test/java/net/ladenthin/llama/parameters/ModelParametersExtendedTest.java b/llama/src/test/java/net/ladenthin/llama/parameters/ModelParametersExtendedTest.java index bc4dc3aa..7bf7b476 100644 --- a/llama/src/test/java/net/ladenthin/llama/parameters/ModelParametersExtendedTest.java +++ b/llama/src/test/java/net/ladenthin/llama/parameters/ModelParametersExtendedTest.java @@ -891,6 +891,18 @@ public void testSetSlotPromptSimilarity() { assertThat(p.parameters.get("--slot-prompt-similarity"), is("0.8")); } + @Test + public void testSetCtxCheckpoints() { + ModelParameters p = new ModelParameters().setCtxCheckpoints(8); + assertThat(p.parameters.get("--ctx-checkpoints"), is("8")); + } + + @Test + public void testSetCheckpointMinStep() { + ModelParameters p = new ModelParameters().setCheckpointMinStep(0); + assertThat(p.parameters.get("--checkpoint-min-step"), is("0")); + } + // 
------------------------------------------------------------------------- // Override KV // ------------------------------------------------------------------------- diff --git a/llama/src/test/java/net/ladenthin/llama/server/NativeServerSmokeTest.java b/llama/src/test/java/net/ladenthin/llama/server/NativeServerSmokeTest.java index 7e74dec4..389136f9 100644 --- a/llama/src/test/java/net/ladenthin/llama/server/NativeServerSmokeTest.java +++ b/llama/src/test/java/net/ladenthin/llama/server/NativeServerSmokeTest.java @@ -5,44 +5,61 @@ package net.ladenthin.llama.server; import static org.hamcrest.MatcherAssert.assertThat; -import static org.hamcrest.Matchers.containsString; import static org.hamcrest.Matchers.is; import static org.junit.jupiter.api.Assertions.assertThrows; import org.junit.jupiter.api.Test; /** - * Model-free smoke test for the {@link NativeServer} scaffold: it must construct without any native - * work, expose its configured host/port, never report itself running, throw a clear - * {@link UnsupportedOperationException} from {@link NativeServer#start()} until the native routes are - * wired, and be a safe no-op {@link AutoCloseable}. No model and no {@code libjllama} required. + * Model-free, library-free unit tests for {@link NativeServer}'s pure-Java surface: it must + * construct without any native work (libjllama is loaded lazily in {@link NativeServer#start()}, + * not in a static initializer), best-effort parse host/port from the forwarded arguments, report + * itself not running before {@code start()}, and be a safe no-op {@link AutoCloseable} when never + * started. Actually starting the native server is exercised by CI / manual runs with a real model. 
*/ public class NativeServerSmokeTest { - private static OpenAiServerConfig config() { - return OpenAiServerConfig.builder().host("127.0.0.1").port(1234).build(); + @Test + public void parsesHostAndPortFromArgs() { + NativeServer server = new NativeServer("-m", "m.gguf", "--host", "0.0.0.0", "--port", "1234"); + assertThat(server.getHost(), is("0.0.0.0")); + assertThat(server.getPort(), is(1234)); + assertThat(server.isRunning(), is(false)); + } + + @Test + public void shortPortFlagParsed() { + NativeServer server = new NativeServer("-m", "m.gguf", "-p", "9099"); + assertThat(server.getPort(), is(9099)); } @Test - public void exposesConfiguredHostAndPortWithoutStarting() { - NativeServer server = new NativeServer(config()); + public void defaultsWhenFlagsAbsent() { + NativeServer server = new NativeServer("-m", "m.gguf"); assertThat(server.getHost(), is("127.0.0.1")); - assertThat(server.getPort(), is(1234)); - assertThat(server.isRunning(), is(false)); + assertThat(server.getPort(), is(8080)); } @Test - public void startThrowsUntilNativeRoutesAreWired() { - NativeServer server = new NativeServer(config()); - UnsupportedOperationException ex = assertThrows(UnsupportedOperationException.class, server::start); - assertThat(ex.getMessage(), containsString("not yet wired")); - assertThat(server.isRunning(), is(false)); + public void nonIntegerPortFallsBackToDefault() { + NativeServer server = new NativeServer("-m", "m.gguf", "--port", "abc"); + assertThat(server.getPort(), is(8080)); } @Test - public void closeIsSafeNoOpEvenViaTryWithResources() { - try (NativeServer server = new NativeServer(config())) { + public void closeBeforeStartIsSafeNoOpViaTryWithResources() { + try (NativeServer server = new NativeServer("-m", "m.gguf")) { assertThat(server.isRunning(), is(false)); } } + + @Test + public void nullArgsRejected() { + assertThrows(NullPointerException.class, () -> new NativeServer((String[]) null)); + } + + @Test + public void nullArgElementRejected() { + 
assertThrows(NullPointerException.class, () -> new NativeServer("-m", null)); + } } diff --git a/llama/src/test/java/net/ladenthin/llama/server/OpenAiServerCliTest.java b/llama/src/test/java/net/ladenthin/llama/server/OpenAiServerCliTest.java index ff3dcd11..30204d6a 100644 --- a/llama/src/test/java/net/ladenthin/llama/server/OpenAiServerCliTest.java +++ b/llama/src/test/java/net/ladenthin/llama/server/OpenAiServerCliTest.java @@ -9,6 +9,7 @@ import static org.hamcrest.Matchers.is; import static org.junit.jupiter.api.Assertions.assertThrows; +import net.ladenthin.llama.args.CacheType; import org.junit.jupiter.api.Test; /** @@ -207,4 +208,131 @@ public void modelParametersIncludeModelPath() { OpenAiServerCli.parse("-m", "models/m.gguf").toModelParameters().toString(); assertThat(json, containsString("models/m.gguf")); } + + @Test + public void tuningFlagsDefaultToSentinels() { + OpenAiServerCli.Options options = OpenAiServerCli.parse("-m", "m.gguf"); + assertThat(options.getBatchSize(), is(0)); + assertThat(options.getUbatchSize(), is(0)); + assertThat(options.getThreadsBatch(), is(0)); + assertThat(options.getCacheTypeK(), is((CacheType) null)); + assertThat(options.getCacheTypeV(), is((CacheType) null)); + assertThat(options.isJinja(), is(false)); + assertThat(options.getChatTemplateKwargs(), is((Object) null)); + } + + @Test + public void tuningShortFlagsParsed() { + OpenAiServerCli.Options options = OpenAiServerCli.parse( + "-m", "m.gguf", "-b", "4096", "-ub", "2048", "-tb", "16", "-ctk", "q8_0", "-ctv", "q8_0"); + assertThat(options.getBatchSize(), is(4096)); + assertThat(options.getUbatchSize(), is(2048)); + assertThat(options.getThreadsBatch(), is(16)); + assertThat(options.getCacheTypeK(), is(CacheType.Q8_0)); + assertThat(options.getCacheTypeV(), is(CacheType.Q8_0)); + } + + @Test + public void tuningLongFlagsParsed() { + OpenAiServerCli.Options options = OpenAiServerCli.parse( + "-m", + "m.gguf", + "--batch-size", + "512", + "--ubatch-size", + "256", + 
"--threads-batch", + "6", + "--cache-type-k", + "f16", + "--cache-type-v", + "q4_0", + "--jinja"); + assertThat(options.getBatchSize(), is(512)); + assertThat(options.getUbatchSize(), is(256)); + assertThat(options.getThreadsBatch(), is(6)); + assertThat(options.getCacheTypeK(), is(CacheType.F16)); + assertThat(options.getCacheTypeV(), is(CacheType.Q4_0)); + assertThat(options.isJinja(), is(true)); + } + + @Test + public void cacheTypeIsCaseInsensitive() { + OpenAiServerCli.Options options = OpenAiServerCli.parse("-m", "m.gguf", "-ctk", "Q8_0"); + assertThat(options.getCacheTypeK(), is(CacheType.Q8_0)); + } + + @Test + public void unknownCacheTypeThrows() { + IllegalArgumentException ex = assertThrows( + IllegalArgumentException.class, () -> OpenAiServerCli.parse("-m", "m.gguf", "-ctk", "q3_k")); + assertThat(ex.getMessage(), containsString("expects one of")); + assertThat(ex.getMessage(), containsString("q8_0")); + assertThat(ex.getMessage(), containsString("q3_k")); + } + + @Test + public void chatTemplateKwargsParsedToRawJsonValues() { + OpenAiServerCli.Options options = OpenAiServerCli.parse( + "-m", "m.gguf", "--chat-template-kwargs", "{\"reasoning_effort\":\"low\",\"enable_thinking\":true}"); + assertThat(options.getChatTemplateKwargs().get("reasoning_effort"), is("\"low\"")); + assertThat(options.getChatTemplateKwargs().get("enable_thinking"), is("true")); + } + + @Test + public void chatTemplateKwargsInvalidJsonThrows() { + IllegalArgumentException ex = assertThrows( + IllegalArgumentException.class, + () -> OpenAiServerCli.parse("-m", "m.gguf", "--chat-template-kwargs", "{not json")); + assertThat(ex.getMessage(), containsString("--chat-template-kwargs expects a JSON object")); + } + + @Test + public void chatTemplateKwargsNonObjectThrows() { + IllegalArgumentException ex = assertThrows( + IllegalArgumentException.class, + () -> OpenAiServerCli.parse("-m", "m.gguf", "--chat-template-kwargs", "\"low\"")); + assertThat(ex.getMessage(), 
containsString("--chat-template-kwargs expects a JSON object")); + } + + @Test + public void toModelParametersCarriesTuningFlags() { + String argv = OpenAiServerCli.parse( + "-m", + "m.gguf", + "-b", + "4096", + "-ub", + "2048", + "-tb", + "16", + "-ctk", + "q8_0", + "-ctv", + "q8_0", + "--jinja", + "--chat-template-kwargs", + "{\"reasoning_effort\":\"low\"}") + .toModelParameters() + .toString(); + assertThat(argv, containsString("--batch-size 4096")); + assertThat(argv, containsString("--ubatch-size 2048")); + assertThat(argv, containsString("--threads-batch 16")); + assertThat(argv, containsString("--cache-type-k q8_0")); + assertThat(argv, containsString("--cache-type-v q8_0")); + assertThat(argv, containsString("--jinja")); + assertThat(argv, containsString("--chat-template-kwargs")); + assertThat(argv, containsString("reasoning_effort")); + } + + @Test + public void usageMentionsNewTuningFlags() { + String usage = OpenAiServerCli.usage(); + assertThat(usage, containsString("--batch-size")); + assertThat(usage, containsString("--ubatch-size")); + assertThat(usage, containsString("--threads-batch")); + assertThat(usage, containsString("--cache-type-k")); + assertThat(usage, containsString("--jinja")); + assertThat(usage, containsString("--chat-template-kwargs")); + } } diff --git a/llama/src/test/java/net/ladenthin/llama/server/OpenAiServerConfigTest.java b/llama/src/test/java/net/ladenthin/llama/server/OpenAiServerConfigTest.java index 43c43ddb..f30c8fb6 100644 --- a/llama/src/test/java/net/ladenthin/llama/server/OpenAiServerConfigTest.java +++ b/llama/src/test/java/net/ladenthin/llama/server/OpenAiServerConfigTest.java @@ -29,10 +29,19 @@ public void builderAppliesLocalhostDefaults() { assertThat(config.getHeartbeatMillis(), is(OpenAiServerConfig.DEFAULT_HEARTBEAT_MILLIS)); assertThat(config.getCorsAllowOrigin(), is(OpenAiServerConfig.DEFAULT_CORS_ALLOW_ORIGIN)); assertThat(config.isSupportsVision(), is(false)); + assertThat(config.getModelFtype(), is("")); 
assertThat(config.getApiKey(), is((String) null)); assertThat(config.isAuthenticationEnabled(), is(false)); } + @Test + public void modelFtypeIsConfigurableAndNullBecomesEmpty() { + assertThat( + OpenAiServerConfig.builder().modelFtype("Q4_K - Medium").build().getModelFtype(), is("Q4_K - Medium")); + // null is normalized to the empty "unknown" marker + assertThat(OpenAiServerConfig.builder().modelFtype(null).build().getModelFtype(), is("")); + } + @Test public void authenticationEnabledOnlyForNonEmptyKey() { assertThat(OpenAiServerConfig.builder().build().isAuthenticationEnabled(), is(false)); diff --git a/llama/src/test/java/net/ladenthin/llama/server/OpenAiSseFormatterTest.java b/llama/src/test/java/net/ladenthin/llama/server/OpenAiSseFormatterTest.java index 866efd25..265d1f75 100644 --- a/llama/src/test/java/net/ladenthin/llama/server/OpenAiSseFormatterTest.java +++ b/llama/src/test/java/net/ladenthin/llama/server/OpenAiSseFormatterTest.java @@ -103,6 +103,28 @@ public void modelsJsonAdvertisesTheConfiguredModel() throws IOException { assertThat(root.path("object").asText(), is("list")); assertThat(root.path("data").get(0).path("id").asText(), is("gemma-local")); assertThat(root.path("data").get(0).path("object").asText(), is("model")); + // no ftype supplied -> the field is omitted entirely + assertThat(root.path("data").get(0).has("ftype"), is(false)); + } + + @Test + public void modelsJsonIncludesFtypeWhenKnownAndOmitsWhenBlank() throws IOException { + JsonNode withFtype = MAPPER.readTree(OpenAiSseFormatter.modelsJson("gemma-local", "Q4_K - Medium")); + assertThat(withFtype.path("data").get(0).path("ftype").asText(), is("Q4_K - Medium")); + + // empty and null are treated as "unknown" -> field omitted + assertThat( + MAPPER.readTree(OpenAiSseFormatter.modelsJson("gemma-local", "")) + .path("data") + .get(0) + .has("ftype"), + is(false)); + assertThat( + MAPPER.readTree(OpenAiSseFormatter.modelsJson("gemma-local", null)) + .path("data") + .get(0) + 
.has("ftype"), + is(false)); } @Test diff --git a/llama/src/test/java/net/ladenthin/llama/server/ServerLauncherTest.java b/llama/src/test/java/net/ladenthin/llama/server/ServerLauncherTest.java new file mode 100644 index 00000000..1c3fbada --- /dev/null +++ b/llama/src/test/java/net/ladenthin/llama/server/ServerLauncherTest.java @@ -0,0 +1,67 @@ +// SPDX-FileCopyrightText: 2026 Bernard Ladenthin +// +// SPDX-License-Identifier: MIT + +package net.ladenthin.llama.server; + +import static org.hamcrest.MatcherAssert.assertThat; +import static org.hamcrest.Matchers.arrayContaining; +import static org.hamcrest.Matchers.emptyArray; +import static org.hamcrest.Matchers.is; + +import org.junit.jupiter.api.Test; + +/** + * Pure-Java unit tests for {@link ServerLauncher}'s single dispatch primitive, + * {@link ServerLauncher#withoutFlag(String[], String)}. Selection is derived from the length change + * (result shorter iff the flag was present), so these tests cover both the stripping behaviour and + * that selection signal. No server is started and no native library is required. 
+ */ +public class ServerLauncherTest { + + private static final String FLAG = ServerLauncher.OPENAI_COMPAT_FLAG; + + // --- selection signal: shorter iff the flag was present --- + + @Test + public void resultIsShorterWhenFlagPresent() { + String[] in = {FLAG, "-m", "m.gguf", "--port", "8080"}; + assertThat(ServerLauncher.withoutFlag(in, FLAG).length < in.length, is(true)); + } + + @Test + public void resultKeepsLengthWhenFlagAbsent() { + String[] in = {"-m", "m.gguf", "--port", "8080"}; + assertThat(ServerLauncher.withoutFlag(in, FLAG).length == in.length, is(true)); + } + + @Test + public void flagPositionDoesNotMatter() { + String[] in = {"-m", "m.gguf", FLAG}; + assertThat(ServerLauncher.withoutFlag(in, FLAG).length < in.length, is(true)); + } + + // --- stripping behaviour --- + + @Test + public void stripsTheSelectorAndPreservesTheRest() { + String[] out = ServerLauncher.withoutFlag(new String[] {FLAG, "-m", "m.gguf", "--port", "8080"}, FLAG); + assertThat(out, arrayContaining("-m", "m.gguf", "--port", "8080")); + } + + @Test + public void removesEveryOccurrence() { + String[] out = ServerLauncher.withoutFlag(new String[] {FLAG, "-m", "m.gguf", FLAG}, FLAG); + assertThat(out, arrayContaining("-m", "m.gguf")); + } + + @Test + public void isNoOpWhenAbsent() { + assertThat(ServerLauncher.withoutFlag(new String[] {"-m", "m.gguf"}, FLAG), arrayContaining("-m", "m.gguf")); + } + + @Test + public void emptyArgsStayEmpty() { + assertThat(ServerLauncher.withoutFlag(new String[] {}, FLAG), is(emptyArray())); + } +} diff --git a/llama/src/test/java/net/ladenthin/llama/value/ModelMetaTest.java b/llama/src/test/java/net/ladenthin/llama/value/ModelMetaTest.java index 5552e2a7..19f41dc4 100644 --- a/llama/src/test/java/net/ladenthin/llama/value/ModelMetaTest.java +++ b/llama/src/test/java/net/ladenthin/llama/value/ModelMetaTest.java @@ -80,6 +80,25 @@ public void testGetModelName() throws Exception { assertThat(meta.getModelName(), is("Mistral-7B-v0.1")); } + @Test + 
public void testGetFtype() throws Exception { + ModelMeta meta = parse("{\"vocab_type\":1,\"n_vocab\":32016,\"n_ctx_train\":16384," + + "\"n_embd\":4096,\"n_params\":6738546688,\"size\":2825274880," + + "\"modalities\":{\"vision\":false,\"audio\":false}," + + "\"architecture\":\"mistral\",\"name\":\"Mistral-7B-v0.1\",\"ftype\":\"Q4_K - Medium\"}"); + + assertThat(meta.getFtype(), is("Q4_K - Medium")); + } + + @Test + public void testGetFtypeEmptyWhenAbsent() throws Exception { + ModelMeta meta = parse("{\"vocab_type\":1,\"n_vocab\":100,\"n_ctx_train\":4096," + + "\"n_embd\":512,\"n_params\":1000000,\"size\":500000," + + "\"modalities\":{\"vision\":false,\"audio\":false}}"); + + assertThat(meta.getFtype(), is("")); + } + @Test public void testGetArchitectureEmptyWhenAbsent() throws Exception { ModelMeta meta = parse("{\"vocab_type\":1,\"n_vocab\":100,\"n_ctx_train\":4096,"